Re: [PATCH v3 2/4] mei: gsc_proxy: add gsc proxy driver

2023-05-03 Thread Teres Alexis, Alan Previn
We only had nits before and they're all sorted now, so...
Reviewed-by: Alan Previn 

On Tue, 2023-05-02 at 09:38 -0700, Ceraolo Spurio, Daniele wrote:
> From: Alexander Usyskin 
> 
> Add GSC proxy driver. It allows messaging between the GSC component
> on the Intel graphics card and the CSE device.
> 
> Cc: Alan Previn 
> Signed-off-by: Alexander Usyskin 
> Signed-off-by: Tomas Winkler 
> Signed-off-by: Daniele Ceraolo Spurio 
> Acked-by: Greg Kroah-Hartman 
> ---
> 
> v2: re-order includes, drop reference to "on board" card in commit
> message and comments.



Re: [RFC PATCH 04/10] drm/sched: Add generic scheduler message interface

2023-05-03 Thread Luben Tuikov
On 2023-04-03 20:22, Matthew Brost wrote:
> Add generic schedule message interface which sends messages to backend
> from the drm_gpu_scheduler main submission thread. The idea is some of
> these messages modify some state in drm_sched_entity which is also
> modified during submission. By scheduling these messages and submission
> in the same thread their is not race changing states in
> drm_sched_entity.

"... there is no race when changing ..." or better yet,
"... we eliminate races due to drm_sched_entity state changes."

> 
> This interface will be used in XE, new Intel GPU driver, to cleanup,

"Xe"?

Regards,
Luben

> suspend, resume, and change scheduling properties of a drm_sched_entity.
> 
> The interface is designed to be generic and extendable with only the
> backend understanding the messages.
> 
> Signed-off-by: Matthew Brost 
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  include/drm/gpu_scheduler.h            | 29 ++++++++++++++++++++++++++++-
>  2 files changed, 84 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> b/drivers/gpu/drm/scheduler/sched_main.c
> index 2795021efe7b..9dc3378e9c5e 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -1055,6 +1055,54 @@ drm_sched_pick_best(struct drm_gpu_scheduler 
> **sched_list,
>  }
>  EXPORT_SYMBOL(drm_sched_pick_best);
>  
> +/**
> + * drm_sched_add_msg - add scheduler message
> + *
> + * @sched: scheduler instance
> + * @msg: message to be added
> + *
> + * Can and will pass jobs waiting on dependencies or in a runnable queue.
> + * Message processing will stop if the scheduler run wq is stopped and
> + * resume when the run wq is started.
> + */
> +void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
> +struct drm_sched_msg *msg)
> +{
> + spin_lock(&sched->job_list_lock);
> + list_add_tail(&msg->link, &sched->msgs);
> + spin_unlock(&sched->job_list_lock);
> +
> + /*
> +  * Same as above in drm_sched_run_wq_queue, try to kick worker if
> +  * paused, harmless if this races
> +  */
> + if (!sched->pause_run_wq)
> + queue_work(sched->run_wq, &sched->work_run);
> +}
> +EXPORT_SYMBOL(drm_sched_add_msg);
> +
> +/**
> + * drm_sched_get_msg - get scheduler message
> + *
> + * @sched: scheduler instance
> + *
> + * Returns NULL or message
> + */
> +static struct drm_sched_msg *
> +drm_sched_get_msg(struct drm_gpu_scheduler *sched)
> +{
> + struct drm_sched_msg *msg;
> +
> + spin_lock(&sched->job_list_lock);
> + msg = list_first_entry_or_null(&sched->msgs,
> +        struct drm_sched_msg, link);
> + if (msg)
> + list_del(&msg->link);
> + spin_unlock(&sched->job_list_lock);
> +
> + return msg;
> +}
> +
>  /**
>   * drm_sched_main - main scheduler thread
>   *
> @@ -1068,6 +1116,7 @@ static void drm_sched_main(struct work_struct *w)
>  
>   while (!READ_ONCE(sched->pause_run_wq)) {
>   struct drm_sched_entity *entity;
> + struct drm_sched_msg *msg;
>   struct drm_sched_fence *s_fence;
>   struct drm_sched_job *sched_job;
>   struct dma_fence *fence;
> @@ -1075,12 +1124,16 @@ static void drm_sched_main(struct work_struct *w)
>  
>   cleanup_job = drm_sched_get_cleanup_job(sched);
>   entity = drm_sched_select_entity(sched);
> + msg = drm_sched_get_msg(sched);
>  
>   if (cleanup_job)
>   sched->ops->free_job(cleanup_job);
>  
> + if (msg)
> + sched->ops->process_msg(msg);
> +
>   if (!entity) {
> - if (!cleanup_job)
> + if (!cleanup_job && !msg)
>   break;
>   continue;
>   }
> @@ -1089,7 +1142,7 @@ static void drm_sched_main(struct work_struct *w)
>  
>   if (!sched_job) {
>   complete_all(&entity->entity_idle);
> - if (!cleanup_job)
> + if (!cleanup_job && !msg)
>   break;
>   continue;
>   }
> @@ -1181,6 +1234,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>  
>   init_waitqueue_head(&sched->job_scheduled);
>   INIT_LIST_HEAD(&sched->pending_list);
> + INIT_LIST_HEAD(&sched->msgs);
>   spin_lock_init(&sched->job_list_lock);
>   atomic_set(&sched->hw_rq_count, 0);
>   INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 3e421f5a710c..18172ae63ab7 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -398,6 +398,23 @@ enum drm_gpu_sched_stat {
>   DRM_GPU_SCHED_STAT_ENODEV,
>  };
>  
> +/**
> + * struct drm_sched_msg - an in-band (relative to GPU scheduler run queue)
> + * message
> + *
> + * Generic enough for backend defined messages, backend can expand if needed.
> + 
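
For context, a backend using this interface would look roughly like the
sketch below. Only struct drm_sched_msg, drm_sched_add_msg() and the
->process_msg() hook come from the patch above; the opcode enum and the
wrapper struct are made up for illustration:

    enum my_msg_op { MY_MSG_SUSPEND, MY_MSG_RESUME };

    struct my_msg {
            struct drm_sched_msg base;   /* embeds the generic message */
            enum my_msg_op op;           /* backend-defined payload */
    };

    /* Hooked up via drm_sched_backend_ops::process_msg; runs in the
     * scheduler's submission thread, so it cannot race with submission
     * code touching drm_sched_entity state. */
    static void my_process_msg(struct drm_sched_msg *msg)
    {
            struct my_msg *m = container_of(msg, struct my_msg, base);

            switch (m->op) {
            case MY_MSG_SUSPEND:
                    /* quiesce the entity's backend queue here */
                    break;
            case MY_MSG_RESUME:
                    /* bring it back up */
                    break;
            }
    }

    /* Sender side: drm_sched_add_msg(sched, &m->base);
     * The caller must keep the message alive until process_msg() has
     * run, e.g. by embedding a completion and waiting on it. */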

Re: [RFC PATCH 07/10] drm/sched: Add helper to set TDR timeout

2023-05-03 Thread Luben Tuikov
On 2023-04-03 20:22, Matthew Brost wrote:
> Add helper to set TDR timeout and restart the TDR with new timeout
> value. This will be used in XE, new Intel GPU driver, to trigger the TDR
> to cleanup drm_sched_entity that encounter errors.
> 
> Signed-off-by: Matthew Brost 
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 18 ++++++++++++++++++
>  include/drm/gpu_scheduler.h            |  1 +
>  2 files changed, 19 insertions(+)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> b/drivers/gpu/drm/scheduler/sched_main.c
> index 4eac02d212c1..d61880315d8d 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -370,6 +370,24 @@ static void drm_sched_start_timeout(struct 
> drm_gpu_scheduler *sched)
>   queue_delayed_work(sched->timeout_wq, &sched->work_tdr,
> sched->timeout);
>  }
>  
> +/**
> + * drm_sched_set_timeout - set timeout for reset worker
> + *
> + * @sched: scheduler instance to set and (re)-start the worker for
> + * @timeout: timeout period
> + *
> + * Set and (re)-start the timeout for the given scheduler.
> + */
> +void drm_sched_set_timeout(struct drm_gpu_scheduler *sched, long timeout)
> +{
> + spin_lock(&sched->job_list_lock);
> + sched->timeout = timeout;
> + cancel_delayed_work(&sched->work_tdr);

I see that the comment says "(re-)start"(sic). Is the rest of the logic
stable in that we don't need to use _sync() version, and/or at least
inspect the return value of the one currently used?

Regards,
Luben

> + drm_sched_start_timeout(sched);
> + spin_unlock(&sched->job_list_lock);
> +}
> +EXPORT_SYMBOL(drm_sched_set_timeout);
> +
>  /**
>   * drm_sched_fault - immediately start timeout handler
>   *
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 18172ae63ab7..6258e324bd7c 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -593,6 +593,7 @@ void drm_sched_entity_modify_sched(struct 
> drm_sched_entity *entity,
>   struct drm_gpu_scheduler **sched_list,
> unsigned int num_sched_list);
>  
> +void drm_sched_set_timeout(struct drm_gpu_scheduler *sched, long timeout);
>  void drm_sched_job_cleanup(struct drm_sched_job *job);
>  void drm_sched_wakeup(struct drm_gpu_scheduler *sched);
>  void drm_sched_add_msg(struct drm_gpu_scheduler *sched,
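
For reference, a call site for this helper could look roughly like the
sketch below; the 1 ms value is made up, but it matches the stated use
of forcing already-queued jobs on a failed entity to time out quickly:

    /* Hypothetical driver snippet: collapse the TDR window so that
     * anything still pending on a banned entity times out immediately. */
    drm_sched_set_timeout(sched, msecs_to_jiffies(1));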



Re: [PATCH v3 1/4] drm/i915/mtl: Define GSC Proxy component interface

2023-05-03 Thread Teres Alexis, Alan Previn
LGTM
Reviewed-by: Alan Previn 

On Tue, 2023-05-02 at 09:38 -0700, Ceraolo Spurio, Daniele wrote:
> From: Alexander Usyskin 
> 
> GSC Proxy component is used for communication between the
> Intel graphics driver and MEI driver.
> 
> Cc: Alan Previn 
> Signed-off-by: Alexander Usyskin 
> Signed-off-by: Tomas Winkler 
> Signed-off-by: Daniele Ceraolo Spurio 
> Acked-by: Greg Kroah-Hartman 
> ---
> 
> v2: Improve documentation, remove unneeded includes



Re: [RFC PATCH 06/10] drm/sched: Submit job before starting TDR

2023-05-03 Thread Luben Tuikov
On 2023-04-03 20:22, Matthew Brost wrote:
> If the TDR is set to a value, it can fire before a job is submitted in
> drm_sched_main. The job should be always be submitted before the TDR
> fires, fix this ordering.
> 
> Signed-off-by: Matthew Brost 
> ---
>  drivers/gpu/drm/scheduler/sched_main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> b/drivers/gpu/drm/scheduler/sched_main.c
> index 6ae710017024..4eac02d212c1 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -1150,10 +1150,10 @@ static void drm_sched_main(struct work_struct *w)
>   s_fence = sched_job->s_fence;
>  
>   atomic_inc(&sched->hw_rq_count);
> - drm_sched_job_begin(sched_job);
>  
>   trace_drm_run_job(sched_job, entity);
>   fence = sched->ops->run_job(sched_job);
> + drm_sched_job_begin(sched_job);
>   complete_all(&entity->entity_idle);
>   drm_sched_fence_scheduled(s_fence);
>  

Not sure if this is correct. In drm_sched_job_begin() we add the job to the 
"pending_list"
(meaning it is pending execution in the hardware) and we also start a timeout 
timer. Both
of those should be started before the job is given to the hardware.

If the timeout is set to too small a value, then that should probably be fixed 
instead.

Regards,
Luben
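
For context, drm_sched_job_begin() at this point in time does roughly
the following (paraphrased from sched_main.c, not a verbatim copy):

    static void drm_sched_job_begin(struct drm_sched_job *s_job)
    {
            struct drm_gpu_scheduler *sched = s_job->sched;

            spin_lock(&sched->job_list_lock);
            /* Make the job visible to the timeout handler... */
            list_add_tail(&s_job->list, &sched->pending_list);
            /* ...and arm the TDR before the hardware can complete
             * (or hang on) the job. */
            drm_sched_start_timeout(sched);
            spin_unlock(&sched->job_list_lock);
    }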


Re: drm/sched: Replacement for drm_sched_resubmit_jobs() is deprecated

2023-05-03 Thread Matthew Brost
On Wed, May 03, 2023 at 10:47:43AM +0200, Christian König wrote:
> Adding Luben as well.
> 
> Am 03.05.23 um 10:16 schrieb Boris Brezillon:
> > [SNIP]
> > > To sum-up, we shouldn't call drm_sched_{start,stop,resubmit_jobs}().
> > After the discussion I had with Matthew yesterday on IRC, I
> > realized there was no clear agreement on this. Matthew uses those 3
> > helpers in the Xe driver right now, and given he intends to use a
> > multi-threaded wq for its 1:1 schedulers run queue, there's no way he
> > can get away without calling drm_sched_{start,stop}().
> > drm_sched_resubmit_jobs() can be open-coded in each driver, but I'm
> > wondering if it wouldn't be preferable to add a ::resubmit_job() method
> > or extend the ::run_job() one to support the resubmit semantics, which,
> > AFAIU, is just about enforcing that the job done fence (the one returned by
> > ::run_job()) doesn't transition from a signaled to an unsignaled state.
> > 
> > But probably more important than providing a generic helper, we should
> > document the resubmit semantics (AKA, what should and/or shouldn't be
> > done with pending jobs when a recovery happens). Because forbidding
> > people to use a generic helper function doesn't give any guarantee that
> > they'll do the right thing when coding their own logic, unless we give
> > clues about what's considered right/wrong, and the current state of the
> > doc is pretty unclear in this regard.
> 
> I should probably talk about the history of the re-submit feature a bit
> more.
> 
> Basically AMD came up with re-submission as a cheap way of increasing the
> reliability of GPU resets. Problem is that it turned into an absolutely
> nightmare. We tried for the last 5 years or so to get that stable and it's
> still crashing.
> 
> The first and most major problem is that the kernel doesn't even have the
> information whether re-submitting jobs is possible or not. For example a job
> which has already been pushed to the hw could have grabbed a binary
> semaphore and re-submitting it will just wait forever for the semaphore to
> be released.
> 
> The second problem is that the dma_fence semantics don't allow ever
> transitioning the state of a fence from signaled back to unsignaled. This means
> that you can't re-use the hw fence and need to allocate a new one, but since
> memory allocation is forbidden inside a reset handler as well (YES we need
> to better document that part) you actually need to keep a bunch of hw fences
> pre-allocated around to make this work. Amdgpu chose to illegally re-use
> the hw fence instead, which only works with quite extreme hacks.
> 
> The third problem is that the lifetime of the job object was actually
> defined very well before we tried to use re-submission. Basically it's just
> an intermediate state used between the IOCTL and pushing things to the hw;
> introducing this re-submit feature completely messed that up and caused quite
> a number of use-after-free errors in the past, which are again only solved by
> quite some hacks.
> 
> What we should do in the GPU scheduler instead is the following:
> 
> 1. Don't support re-submission at all!
>     Instead we can provide help to drivers to query which fences (scheduler
> or hw) are still not signaled yet.
>     This can then be used to re-create hw state if (and only if!) the driver
> knows what it's doing and can actually guarantee that this will work.
>     E.g. the case for XE where the kernel driver knows the contexts which
> were not running at the time and can re-create their queues.
> 
> 2. We can provide both a wq to use for single threaded application as well
> as start/stop semantics.
>     It's just that the start/stop semantics should never touch what was
> already submitted, but rather just make sure that we don't get any new
> submissions.
> 

I pretty much agree with everything Christian has said here and Xe
aligns with this. Let me explain what Xe does.

1. Entity hang (TDR timeout of a job on an entity, firmware notifies Xe that an
entity hung, entity IOMMU CAT error, etc...):
- No re-submission at all
- ban the entity
- notify the UMD
- cleanup all pending jobs / fences
2. Entire GPU hang (worth mentioning with good HW + KMD this *should*
never happen):
- stop all schedulers (same as an entity in Xe because 1 to 1)
- cleanup odd entity state related to communication with the
  firmware
- check if an entity has a job that started but not finished, if
  so ban it (same mechanism as above)
- resubmit all jobs from good entities
- start all schedulers (same as an entity in Xe because 1 to 1)

The implementation for this in the following file [1]. Search for the
drm scheduler functions and you should be able to find implementation
easily.

If you want to use an ordered work queue to avoid the stop / start dance,
great, do that; in Xe the stop / start dance works. I have extensively
tested this and the flow is rock solid and please 
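
A rough sketch of Christian's point 1, a query-style helper instead of
generic re-submission (field names follow the current scheduler; locking
and error handling are omitted, and it assumes the scheduler is stopped
so pending_list is stable):

    struct drm_sched_job *job;

    list_for_each_entry(job, &sched->pending_list, list) {
            /* s_fence->parent is the hardware fence from ->run_job() */
            struct dma_fence *hw_fence = job->s_fence->parent;

            if (!hw_fence || !dma_fence_is_signaled(hw_fence)) {
                    /* This job never completed on the hardware; only
                     * the driver can decide whether the context/queue
                     * behind it can safely be re-created. */
            }
    }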

Re: [PATCH v1] drm/mipi-dsi: Set the fwnode for mipi_dsi_device

2023-05-03 Thread Saravana Kannan
On Fri, Mar 17, 2023 at 3:36 PM Saravana Kannan  wrote:
>
> On Sun, Mar 12, 2023 at 7:45 AM Martin Kepplinger
>  wrote:
> >
> > Am Donnerstag, dem 09.03.2023 um 22:39 -0800 schrieb Saravana Kannan:
> > > After commit 3fb16866b51d ("driver core: fw_devlink: Make cycle
> > > detection more robust"), fw_devlink prints an error when consumer
> > > devices don't have their fwnode set. This used to be ignored
> > > silently.
> > >
> > > Set the fwnode of the mipi_dsi_device so fw_devlink can find them
> > > and properly track their dependencies.
> > >
> > > This fixes errors like this:
> > > [0.334054] nwl-dsi 30a0.mipi-dsi: Failed to create device
> > > link with regulator-lcd-1v8
> > > [0.346964] nwl-dsi 30a0.mipi-dsi: Failed to create device
> > > link with backlight-dsi
> > >
> > > Reported-by: Martin Kepplinger 
> >
> > Reported-and-tested-by: Martin Kepplinger 
>
> Maintainers,
>
> Nudge nudge. Will this be picked up for 6.3-rcX?

Greg,

Can you pick this up please? It's a fix that hasn't been picked up for
a few months.

Here's the link to the actual patch for your convenience:
https://lore.kernel.org/lkml/20230310063910.2474472-1-sarava...@google.com/#t

-Saravana

>
> -Saravana
>
> >
> > thanks,
> >  martin
> >
> > > Link:
> > > https://lore.kernel.org/lkml/2a8e407f4f18c9350f8629a2b5fa18673355b2ae.ca...@puri.sm/
> > > Fixes: 068a00233969 ("drm: Add MIPI DSI bus support")
> > > Signed-off-by: Saravana Kannan 
> > > ---
> > >  drivers/gpu/drm/drm_mipi_dsi.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/drm_mipi_dsi.c
> > > b/drivers/gpu/drm/drm_mipi_dsi.c
> > > index b41aaf2bb9f1..7923cc21b78e 100644
> > > --- a/drivers/gpu/drm/drm_mipi_dsi.c
> > > +++ b/drivers/gpu/drm/drm_mipi_dsi.c
> > > @@ -221,7 +221,7 @@ mipi_dsi_device_register_full(struct
> > > mipi_dsi_host *host,
> > > return dsi;
> > > }
> > >
> > > -   dsi->dev.of_node = info->node;
> > > +   device_set_node(&dsi->dev, of_fwnode_handle(info->node));
> > > dsi->channel = info->channel;
> > > strlcpy(dsi->name, info->type, sizeof(dsi->name));
> > >
> >
> >
> > --
> > To unsubscribe from this group and stop receiving emails from it, send an 
> > email to kernel-team+unsubscr...@android.com.
> >
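
For clarity, the one-liner works because device_set_node() keeps both
pointers consistent where the old assignment only set of_node; roughly
(see drivers/base/core.c for the real implementation):

    /* Approximately what device_set_node(&dsi->dev,
     * of_fwnode_handle(info->node)) does: */
    dsi->dev.fwnode = of_fwnode_handle(info->node);
    dsi->dev.of_node = info->node;  /* i.e. to_of_node(dev->fwnode) */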


[pull] amdgpu drm-fixes-6.4

2023-05-03 Thread Alex Deucher
Hi Dave, Daniel,

Fixes for 6.4.

The following changes since commit d893f39320e1248d1c97fde0d6e51e5ea008a76b:

  drm/amd/display: Lowering min Z8 residency time (2023-04-26 22:53:58 -0400)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-fixes-6.4-2023-05-03

for you to fetch changes up to 1253685f0d3eb3eab0bfc4bf15ab341a5f3da0c8:

  drm/amdgpu: drop redundant sched job cleanup when cs is aborted (2023-05-03 
23:10:02 -0400)


amd-drm-fixes-6.4-2023-05-03:

amdgpu:
- GPU reset fixes
- Doorbell fix when resizing BARs
- Fix spurious warnings in gmc
- Locking fix for AMDGPU_SCHED IOCTL
- SR-IOV fix
- DCN 3.1.4 fix
- DCN 3.2 fix
- Fix job cleanup when CS is aborted


Chia-I Wu (1):
  drm/amdgpu: add a missing lock for AMDGPU_SCHED

Guchun Chen (1):
  drm/amdgpu: drop redundant sched job cleanup when cs is aborted

Hamza Mahfooz (1):
  drm/amdgpu: fix an amdgpu_irq_put() issue in gmc_v9_0_hw_fini()

Horace Chen (1):
  drm/amdgpu: disable SDMA WPTR_POLL_ENABLE for SR-IOV

Horatio Zhang (2):
  drm/amdgpu: fix amdgpu_irq_put call trace in gmc_v11_0_hw_fini
  drm/amdgpu: fix amdgpu_irq_put call trace in gmc_v10_0_hw_fini

Leo Chen (1):
  drm/amd/display: Change default Z8 watermark values

Samson Tam (1):
  drm/amd/display: filter out invalid bits in pipe_fuses

Shane Xiao (1):
  drm/amdgpu: Enable doorbell selfring after resize FB BAR

lyndonli (2):
  drm/amdgpu: Fix mode2 reset for sienna cichlid
  drm/amdgpu: Use the default reset when loading or reloading the driver

 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 13 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  7 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c  |  6 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/nv.c| 23 +++-
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c |  5 +
 drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/soc15.c | 25 +-
 drivers/gpu/drm/amd/amdgpu/soc21.c | 23 +++-
 .../gpu/drm/amd/display/dc/dcn32/dcn32_resource.c  | 10 -
 .../drm/amd/display/dc/dcn321/dcn321_resource.c| 10 -
 .../gpu/drm/amd/display/dc/dml/dcn314/dcn314_fpu.c |  4 ++--
 14 files changed, 78 insertions(+), 53 deletions(-)


Re: [PATCH v2] accel/habanalabs: Make use of rhashtable

2023-05-03 Thread Cai Huoqing
On 30 Apr 23 09:36:29, Oded Gabbay wrote:
> On Fri, Apr 28, 2023 at 5:49 PM Cai Huoqing  wrote:
> >
> > Use an rhashtable to accelerate the search for a userptr by address,
> > instead of using a list.
> >
> > Typically, the lookup complexity of a hash table is O(1).
> >
> > This patch speeds up hl_userptr_is_pinned() by using
> > rhashtable_lookup_fast().
> >
> > Signed-off-by: Cai Huoqing 
> 
> Thanks for the patch, but the reason we never optimized this path is
> because this code path is only relevant for Goya, which we don't want
> to change the code for anymore.
> For Gaudi we don't pin the memory in the host during submission. It is
> done much earlier, when the user maps the memory to the device. The
> code path in Gaudi is only in case the device is loaded with its MMU
> disabled. This mode was used only for debug/bring-up of the ASIC many
> years ago. As you can see in Gaudi2, that mode was dropped even for

Do you mean that userspace directly calls HL_MEM_OP_MAP/HL_MEM_OP_UNMAP
with the HL_MEM_USERPTR flag instead of pinning host memory at submission?

> debug/bring-up.
> 
> Therefore, I prefer not to take this patch as validation for both
> functionality and performance will take time which will be better
> spent elsewhere.
> 
> Thanks,
> Oded
> 
> > ---
> > v1->v2:
> > Use rhashtable_free_and_destroy in hl_userptr_delete_list.
> >
> >  .../habanalabs/common/command_submission.c| 16 ++--
> >  drivers/accel/habanalabs/common/habanalabs.h  | 19 +
> >  drivers/accel/habanalabs/common/memory.c  | 39 +++
> >  drivers/accel/habanalabs/gaudi/gaudi.c| 16 +---
> >  drivers/accel/habanalabs/goya/goya.c  | 14 ---
> >  5 files changed, 65 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/accel/habanalabs/common/command_submission.c 
> > b/drivers/accel/habanalabs/common/command_submission.c
> > index af9d2e22c6e7..35c2ab934396 100644
> > --- a/drivers/accel/habanalabs/common/command_submission.c
> > +++ b/drivers/accel/habanalabs/common/command_submission.c
> > @@ -312,7 +312,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct 
> > hl_cs_job *job)
> > parser.job_id = job->id;
> >
> > parser.hw_queue_id = job->hw_queue_id;
> > -   parser.job_userptr_list = &job->userptr_list;
> > +   parser.job_userptr_ht = &job->userptr_ht;
> > parser.patched_cb = NULL;
> > parser.user_cb = job->user_cb;
> > parser.user_cb_size = job->user_cb_size;
> > @@ -351,7 +351,7 @@ static void hl_complete_job(struct hl_device *hdev, 
> > struct hl_cs_job *job)
> > struct hl_cs *cs = job->cs;
> >
> > if (is_cb_patched(hdev, job)) {
> > -   hl_userptr_delete_list(hdev, &job->userptr_list);
> > +   hl_userptr_delete_list(hdev, &job->userptr_ht);
> >
> > /*
> >  * We might arrive here from rollback and patched CB wasn't
> > @@ -1284,6 +1284,7 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device 
> > *hdev,
> > enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
> >  {
> > struct hl_cs_job *job;
> > +   int rc;
> >
> > job = kzalloc(sizeof(*job), GFP_ATOMIC);
> > if (!job)
> > @@ -1296,13 +1297,20 @@ struct hl_cs_job *hl_cs_allocate_job(struct 
> > hl_device *hdev,
> > job->queue_type = queue_type;
> > job->is_kernel_allocated_cb = is_kernel_allocated_cb;
> >
> > -   if (is_cb_patched(hdev, job))
> > -   INIT_LIST_HEAD(&job->userptr_list);
> > +   if (is_cb_patched(hdev, job)) {
> > +   rc = rhashtable_init(&job->userptr_ht,
> > &hl_userptr_rht_params);
> > +   if (rc)
> > +   goto free_job;
> > +   }
> >
> > if (job->queue_type == QUEUE_TYPE_EXT)
> > INIT_WORK(&job->finish_work, job_wq_completion);
> >
> > return job;
> > +
> > +free_job:
> > +   kfree(job);
> > +   return NULL;
> >  }
> >
> >  static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
> > diff --git a/drivers/accel/habanalabs/common/habanalabs.h 
> > b/drivers/accel/habanalabs/common/habanalabs.h
> > index eaae69a9f817..9c876d1480d2 100644
> > --- a/drivers/accel/habanalabs/common/habanalabs.h
> > +++ b/drivers/accel/habanalabs/common/habanalabs.h
> > @@ -19,6 +19,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -540,6 +541,8 @@ struct hl_hints_range {
> > u64 end_addr;
> >  };
> >
> > +extern const struct rhashtable_params hl_userptr_rht_params;
> > +
> >  /**
> >   * struct asic_fixed_properties - ASIC specific immutable properties.
> >   * @hw_queues_props: H/W queues properties.
> > @@ -1915,7 +1918,7 @@ struct hl_ctx_mgr {
> >  /**
> >   * struct hl_userptr - memory mapping chunk information
> >   * @vm_type: type of the VM.
> > - * @job_node: linked-list node for hanging the object on the Job's list.
> > + * @job_node: hashtable node for hanging the object on 
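
For readers unfamiliar with the API, the rhashtable pattern this patch
uses looks roughly like the following sketch; the struct layout here is
simplified and illustrative, the real keying lives in memory.c:

    #include <linux/rhashtable.h>

    struct userptr {
            u64 addr;                  /* lookup key: host virtual address */
            struct rhash_head node;    /* hashtable linkage */
    };

    static const struct rhashtable_params userptr_params = {
            .key_len     = sizeof(u64),
            .key_offset  = offsetof(struct userptr, addr),
            .head_offset = offsetof(struct userptr, node),
    };

    /* After rhashtable_init(&ht, &userptr_params):
     *   insert: rhashtable_insert_fast(&ht, &u->node, userptr_params);
     *   lookup: u = rhashtable_lookup_fast(&ht, &addr, userptr_params);
     * giving the expected O(1) lookup the commit message refers to,
     * versus O(n) for the list walk it replaces. */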

Re: [PATCH v2 1/8] drm: Disable the cursor plane on atomic contexts with virtualized drivers

2023-05-03 Thread Zack Rusin
On Wed, 2023-05-03 at 09:48 +0200, Javier Martinez Canillas wrote:
> Zack Rusin  writes:
> 
> > On Tue, 2023-05-02 at 11:32 +0200, Javier Martinez Canillas wrote:
> > > !! External Email
> > > 
> > > Daniel Vetter  writes:
> > > 
> > > > On Mon, Jul 11, 2022 at 11:32:39PM -0400, Zack Rusin wrote:
> > > > > From: Zack Rusin 
> > > > > 
> > > > > Cursor planes on virtualized drivers have special meaning and require
> > > > > that the clients handle them in specific ways, e.g. the cursor plane
> > > > > should react to the mouse movement the way a mouse cursor would be
> > > > > expected to and the client is required to set hotspot properties on it
> > > > > in order for the mouse events to be routed correctly.
> > > > > 
> > > > > This breaks the contract as specified by the "universal planes". Fix 
> > > > > it
> > > > > by disabling the cursor planes on virtualized drivers while adding
> > > > > a foundation on top of which it's possible to special case mouse 
> > > > > cursor
> > > > > planes for clients that want it.
> > > > > 
> > > > > Disabling the cursor planes makes some kms compositors which were 
> > > > > broken,
> > > > > e.g. Weston, fallback to software cursor which works fine or at least
> > > > > better than currently while having no effect on others, e.g. 
> > > > > gnome-shell
> > > > > or kwin, which put virtualized drivers on a deny-list when running in
> > > > > atomic context to make them fallback to legacy kms and avoid this 
> > > > > issue.
> > > > > 
> > > > > Signed-off-by: Zack Rusin 
> > > > > Fixes: 681e7ec73044 ("drm: Allow userspace to ask for universal plane 
> > > > > list
> > > > > (v2)")
> > > 
> > > [...]
> > > 
> > > > > diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
> > > > > index f6159acb8856..c4cd7fc350d9 100644
> > > > > --- a/include/drm/drm_drv.h
> > > > > +++ b/include/drm/drm_drv.h
> > > > > @@ -94,6 +94,16 @@ enum drm_driver_feature {
> > > > >   * synchronization of command submission.
> > > > >   */
> > > > >  DRIVER_SYNCOBJ_TIMELINE = BIT(6),
> > > > > +    /**
> > > > > + * @DRIVER_VIRTUAL:
> > > > > + *
> > > > > + * Driver is running on top of virtual hardware. The most 
> > > > > significant
> > > > > + * implication of this is a requirement of special handling of 
> > > > > the
> > > > > + * cursor plane (e.g. cursor plane has to actually track the 
> > > > > mouse
> > > > > + * cursor and the clients are required to set hotspot in order 
> > > > > for
> > > > > + * the cursor planes to work correctly).
> > > > > + */
> > > > > +    DRIVER_VIRTUAL  = BIT(7),
> > > > 
> > > > I think the naming here is unfortunate, because people will vonder why
> > > > e.g. vkms doesn't set this, and then add it, and confuse stuff 
> > > > completely.
> > > > 
> > > > Also it feels a bit wrong to put this onto the driver, when really it's 
> > > > a
> > > > cursor flag. I guess you can make it some kind of flag in the drm_plane
> > > > structure, or a new plane type, but putting it there instead of into the
> > > > "random pile of midlayer-mistake driver flags" would be a lot better.
> > > > 
> > > > Otherwise I think the series looks roughly how I'd expect it to look.
> > > > -Daniel
> > > > 
> > > 
> > > AFAICT this is the only remaining thing to be addressed for this series ?
> > 
> > No, there was more. tbh I haven't had the time to think about whether
> > the above makes sense to me, e.g. I'm not sure if having virtualized
> > drivers expose "support universal planes" and adding another plane
> > which is not universal (the only "universal" plane on them being the
> > default one) makes more sense than a flag that says "this driver
> > requires a cursor in the cursor plane". There's certainly a huge
> > difference in how userspace would be required to handle it and it's
> > way uglier with two different cursor planes. i.e. there's a lot of
> > ways in which this could be cleaner in the kernel but they all require
> > significant changes to userspace, that go way beyond "attach hotspot
> > info to this plane". I'd like to avoid approaches that mean running
> > with atomic kms requires completely separate paths for virtualized
> > drivers because no one will ever support and maintain it.
> > 
> > It's not a trivial thing because it's fundamentally hard to untangle
> > the fact the virtualized drivers have been advertising universal plane
> > support without ever supporting universal planes. Especially because
> > most new userspace in general checks for "universal planes" to expose
> > atomic kms paths.
> > 
> 
> After some discussion on #dri-devel, your approach makes sense and the
> only contention point is the name of the driver feature flag. The one
> you are using (DRIVER_VIRTUAL) seems to be too broad and generic (the fact
> that vkms is a virtual driver as well but won't set it is a good example).
> 
> Maybe 

Re: [PATCH v2 1/8] drm: Disable the cursor plane on atomic contexts with virtualized drivers

2023-05-03 Thread Zack Rusin
On Wed, 2023-05-03 at 10:54 +0300, Pekka Paalanen wrote:
> On Wed, 3 May 2023 03:35:29 +
> Zack Rusin  wrote:
> 
> > On Tue, 2023-05-02 at 11:32 +0200, Javier Martinez Canillas wrote:
> > > !! External Email
> > > 
> > > Daniel Vetter  writes:
> > >   
> > > > On Mon, Jul 11, 2022 at 11:32:39PM -0400, Zack Rusin wrote:  
> > > > > From: Zack Rusin 
> > > > > 
> > > > > Cursor planes on virtualized drivers have special meaning and require
> > > > > that the clients handle them in specific ways, e.g. the cursor plane
> > > > > should react to the mouse movement the way a mouse cursor would be
> > > > > expected to and the client is required to set hotspot properties on it
> > > > > in order for the mouse events to be routed correctly.
> > > > > 
> > > > > This breaks the contract as specified by the "universal planes". Fix 
> > > > > it
> > > > > by disabling the cursor planes on virtualized drivers while adding
> > > > > a foundation on top of which it's possible to special case mouse 
> > > > > cursor
> > > > > planes for clients that want it.
> > > > > 
> > > > > Disabling the cursor planes makes some kms compositors which were 
> > > > > broken,
> > > > > e.g. Weston, fallback to software cursor which works fine or at least
> > > > > better than currently while having no effect on others, e.g. 
> > > > > gnome-shell
> > > > > or kwin, which put virtualized drivers on a deny-list when running in
> > > > > atomic context to make them fallback to legacy kms and avoid this 
> > > > > issue.
> > > > > 
> > > > > Signed-off-by: Zack Rusin 
> > > > > Fixes: 681e7ec73044 ("drm: Allow userspace to ask for universal plane 
> > > > > list
> > > > > (v2)")  
> > > 
> > > [...]
> > >   
> > > > > diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
> > > > > index f6159acb8856..c4cd7fc350d9 100644
> > > > > --- a/include/drm/drm_drv.h
> > > > > +++ b/include/drm/drm_drv.h
> > > > > @@ -94,6 +94,16 @@ enum drm_driver_feature {
> > > > >   * synchronization of command submission.
> > > > >   */
> > > > >  DRIVER_SYNCOBJ_TIMELINE = BIT(6),
> > > > > +    /**
> > > > > + * @DRIVER_VIRTUAL:
> > > > > + *
> > > > > + * Driver is running on top of virtual hardware. The most 
> > > > > significant
> > > > > + * implication of this is a requirement of special handling of 
> > > > > the
> > > > > + * cursor plane (e.g. cursor plane has to actually track the 
> > > > > mouse
> > > > > + * cursor and the clients are required to set hotspot in order 
> > > > > for
> > > > > + * the cursor planes to work correctly).
> > > > > + */
> > > > > +    DRIVER_VIRTUAL  = BIT(7),  
> > > > 
> > > > I think the naming here is unfortunate, because people will vonder why
> > > > e.g. vkms doesn't set this, and then add it, and confuse stuff 
> > > > completely.
> > > > 
> > > > Also it feels a bit wrong to put this onto the driver, when really it's 
> > > > a
> > > > cursor flag. I guess you can make it some kind of flag in the drm_plane
> > > > structure, or a new plane type, but putting it there instead of into the
> > > > "random pile of midlayer-mistake driver flags" would be a lot better.
> > > > 
> > > > Otherwise I think the series looks roughly how I'd expect it to look.
> > > > -Daniel
> > > >   
> > > 
> > > AFAICT this is the only remaining thing to be addressed for this series ? 
> > >  
> > 
> > No, there was more. tbh I haven't had the time to think about whether
> > the above makes sense to me, e.g. I'm not sure if having virtualized
> > drivers expose "support universal planes" and adding another plane
> > which is not universal (the only "universal" plane on them being the
> > default one) makes more sense than a flag that says "this driver
> > requires a cursor in the cursor plane". There's certainly a huge
> > difference in how userspace would be required to handle it and it's
> > way uglier with two different cursor planes. i.e. there's a lot of
> > ways in which this could be cleaner in the kernel but they all require
> > significant changes to userspace, that go way beyond "attach hotspot
> > info to this plane".
> 
> > I'd like to avoid approaches that mean running with atomic kms
> > requires completely separate paths for virtualized drivers because no
> > one will ever support and maintain it.
> 
> Hi Zack,
> 
> you'd like to avoid that, but fundamentally that really is what has to
> happen in userspace for *nested* KMS drivers (VKMS is a virtual driver
> but not part of the interest group here) to reach optimality.
> 
> It really is a different path. I see no way around that. But if you
> accept that fact, then you could possibly gain a lot more benefits by
> asking userspace to handle nested KMS drivers differently. What those
> benefits are exactly I'm not sure, but I have a feeling there should be
> some, where the knowledge of running on a nested KMS driver allows for
> better decisions that 

Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread Marek Olšák
On Wed, May 3, 2023, 14:53 André Almeida  wrote:

> On 03/05/2023 14:08, Marek Olšák wrote:
> > GPU hangs are pretty common post-bringup. They are not common per user,
> > but if we gather all hangs from all users, we can have lots and lots of
> > them.
> >
> > GPU hangs are indeed not very debuggable. There are however some things
> > we can do:
> > - Identify the hanging IB by its VA (the kernel should know it)
>
> How can the kernel tell which VA range is being executed? I only found
> that information at mmCP_IB1_BASE_ regs, but as stated in this thread by
> Christian this is not reliable to be read.
>

The kernel receives the VA and the size via the CS ioctl. When user queues
are enabled, the kernel will no longer receive them.


> > - Read and parse the IB to detect memory corruption.
> > - Print active waves with shader disassembly if SQ isn't hung (often
> > it's not).
> >
> > Determining which packet the CP is stuck on is tricky. The CP has 2
> > engines (one frontend and one backend) that work on the same command
> > buffer. The frontend engine runs ahead, executes some packets and
> > forwards others to the backend engine. Only the frontend engine has the
> > command buffer VA somewhere. The backend engine only receives packets
> > from the frontend engine via a FIFO, so it might not be possible to tell
> > where it's stuck if it's stuck.
>
> Do they run asynchronously or does the front end wait for the
> back end to execute?
>

They run asynchronously and should run asynchronously for performance, but
they can be synchronized using a special packet (PFP_SYNC_ME).

Marek


> >
> > When the gfx pipeline hangs outside of shaders, making a scandump seems
> > to be the only way to have a chance at finding out what's going wrong,
> > and only AMD-internal versions of hw can be scanned.
> >
> > Marek
> >
> > On Wed, May 3, 2023 at 11:23 AM Christian König wrote:
> >
> > Am 03.05.23 um 17:08 schrieb Felix Kuehling:
> >  > Am 2023-05-03 um 03:59 schrieb Christian König:
> >  >> Am 02.05.23 um 20:41 schrieb Alex Deucher:
> >  >>> On Tue, May 2, 2023 at 11:22 AM Timur Kristóf
> >  >>> mailto:timur.kris...@gmail.com>>
> wrote:
> >   [SNIP]
> >   In my opinion, the correct solution to those problems
> would be
> >   if
> >   the kernel could give userspace the necessary information
> > about
> >   a
> >   GPU hang before a GPU reset.
> >  
> >  >>>   The fundamental problem here is that the kernel doesn't
> have
> >  >>> that
> >  >>> information either. We know which IB timed out and can
> >  >>> potentially do
> >  >>> a devcoredump when that happens, but that's it.
> >  >>
> >  >> Is it really not possible to know such a fundamental thing
> > as what
> >  >> the
> >  >> GPU was doing when it hung? How are we supposed to do any
> > kind of
> >  >> debugging without knowing that?
> >  >>
> >  >> Yes, that's indeed something at least I try to figure out for
> years
> >  >> as well.
> >  >>
> >  >> Basically there are two major problems:
> >  >> 1. When the ASIC is hung you can't talk to the firmware engines
> any
> >  >> more and most state is not exposed directly, but just through
> some
> >  >> fw/hw interface.
> >  >> Just take a look at how umr reads the shader state from the
> SQ.
> >  >> When that block is hung you can't do that any more and basically
> > have
> >  >> no chance at all to figure out why it's hung.
> >  >>
> >  >> Same for other engines, I remember once spending a week
> > figuring
> >  >> out why the UVD block is hung during suspend. Turned out to be a
> >  >> debugging nightmare because any time you touch any register of
> that
> >  >> block the whole system would hang.
> >  >>
> >  >> 2. There are tons of things going on in a pipeline fashion or
> even
> >  >> completely in parallel. For example the CP is just the beginning
> > of a
> >  >> rather long pipeline which at the end produces a bunch of pixels.
> >  >> In almost all cases I've seen you ran into a problem
> somewhere
> >  >> deep in the pipeline and only very rarely at the beginning.
> >  >>
> >  >>
> >  >> I wonder what AMD's Windows driver team is doing with this
> > problem,
> >  >> surely they must have better tools to deal with GPU hangs?
> >  > For better or worse, most teams internally rely on scan dumps
> via
> >  > JTAG
> >  > which sort of limits the usefulness outside of AMD, but also
> > gives
> >  > you
> >  > the exact state of the hardware when it's hung so the
> > hardware teams
> >  > prefer it.
> >  >
> >   How does this approach scale? It's 

Re: [PATCH 4/4] drm/msm/dpu: Enable compression for command mode

2023-05-03 Thread Jessica Zhang




On 5/3/2023 4:00 PM, Marijn Suijten wrote:

Hi Jessica,

On 2023-05-03 12:04:59, Jessica Zhang wrote:



On 5/3/2023 12:28 AM, Marijn Suijten wrote:

On 2023-05-02 18:19:15, Jessica Zhang wrote:

Add a dpu_hw_intf op to enable data compression.

Signed-off-by: Jessica Zhang 
---
   drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 4 ++++
   drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c          | 7 +++++++
   drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h          | 2 ++
   3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
index 74470d068622..4321a1aba17f 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c


Can we have INTF DCE on video-mode encoders as well?


Hi Marijn,

Currently, there's no way to validate DSC for video mode as I've only
made changes to support DSI for command mode. We are planning to post
changes to support DSC over DP, which will include changes for video mode.


Okay, but then mention so in the patch description (which is rather
short in this revision).


Acked.






   #define INTF_CFG2_DATABUS_WIDEN  BIT(0)
   #define INTF_CFG2_DATA_HCTL_EN   BIT(4)


These should probably be reindented to match the below... And the rest
of the defines use spaces instead of tabs.


Fair point, though I think fixing the whitespace for these 2 macros
specifically might be better in a more relevant series.


Yes, I have many patches to start cleaning these up, as well as all the
broken kerneldoc comments, but it's an uphill battle.  Not sure if I'll
get to it any time soon if at all.


With that being said, I'll change the spacing of the DATA_COMPRESS bit
to spaces instead of tabs.


Thanks, that seems to be the most common format.


+#define INTF_CFG2_DCE_DATA_COMPRESS    BIT(12)
   
   #define INTF_MISR_CTRL			0x180

   #define INTF_MISR_SIGNATURE  0x184


This does not seem to apply on top of:
https://lore.kernel.org/linux-arm-msm/20230411-dpu-intf-te-v4-10-27ce1a5ab...@somainline.org/


Seems like I'm missing some patches from that series on my working
branch. Will rebase on top of the full series for the v2.


Thanks, but do discuss with Abhinav/Dmitry which series will land first.


+static inline void dpu_hw_intf_enable_compression(struct dpu_hw_intf *ctx)


Why inline?  This is used as a pointer callback.


Acked, will remove the inline.




+{
+   DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, INTF_CFG2_DCE_DATA_COMPRESS);


dpu_hw_intf_setup_timing_engine() also programs INTF_CONFIG2.  Is it
double-buffered, or is that config **always** unused when DSI CMD mode
is used in conjunction with DSC/DCE?  Otherwise this should perhaps OR
the bitflag into the register, or write the whole thing at once in
dpu_hw_intf_setup_timing_engine()?


For command mode, INTF_CONFIG2 is unused aside from setting
DATA_COMPRESS for DSC.

Since setup_timing_engine() is only used for video mode, the
corresponding changes will be made in the DSC v1.2 for DP changes.


Ack, that makes sense.  However, is this a guarantee that nothing else
will write INTF_CONFIG2 in the future, or will we solve that problem
when it happens?  I'm afraid more config-bits get added to this register
in the future and might possibly race/overwrite each other.


That's a fair point. There's no guarantee that nothing else will set 
INTF_CONFIG2 for command mode in the future. I think it would be better 
to add a register read now instead of having to fix that issue in a 
future change.


Thanks,

Jessica Zhang



- Marijn
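
The read-modify-write Jessica describes would look roughly like this,
using the DPU_REG_READ/DPU_REG_WRITE helpers already used elsewhere in
dpu_hw_intf.c (a sketch, not the final patch):

    static void dpu_hw_intf_enable_compression(struct dpu_hw_intf *ctx)
    {
            u32 intf_cfg2 = DPU_REG_READ(&ctx->hw, INTF_CONFIG2);

            /* OR the bit in so other INTF_CONFIG2 fields survive. */
            intf_cfg2 |= INTF_CFG2_DCE_DATA_COMPRESS;

            DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, intf_cfg2);
    }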




Re: [PATCH v4 4/7] drm/msm/dpu: add PINGPONG_NONE to disconnect DSC from PINGPONG

2023-05-03 Thread Marijn Suijten
On 2023-05-03 13:10:36, Kuogee Hsieh wrote:
> During DSC setup, the crossbar mux need to be programmed to engage
> DSC to specified PINGPONG. Hence during tear down, the crossbar mux
> need to be reset to disengage DSC from PINGPONG. 0X0F is written to
> reset crossbar mux. It is not relevant to hw_pp->idx.  This patch add
> PINGPONG_NONE to serve as disable to reset crossbar mux.
> 
> Changes in v4:
> -- more details to commit text

As requested in v3, this doesn't adequately explain that all you're
doing is **removing `bool enable`** so that this function becomes
simpler to call in the disable scenario without coming up with a random
dpu_pingpong value that's irrelevant when enable=false.  How about the
following wording:

drm/msm/dpu: Introduce PINGPONG_NONE to disconnect DSC from PINGPONG

Disabling the crossbar mux between DSC and PINGPONG currently
requires a bogus enum dpu_pingpong value to be passed when calling
dsc_bind_pingpong_blk() with enable=false, even though the register
value written is independent of the current PINGPONG block.  Replace
that `bool enable` parameter with a new PINGPONG_NONE dpu_pingpong
flag that triggers the write of the "special" 0xF "crossbar
disabled" value to the register instead.

And don't forget to fix the log statement below.



>   DRM_DEBUG_KMS("%s dsc:%d %s pp:%d\n",
> - enable ? "Binding" : "Unbinding",
> + pp ? "Binding" : "Unbinding",
>   hw_dsc->idx - DSC_0,
> - enable ? "to" : "from",
> + pp ? "to" : "from",
>   pp - PINGPONG_0);

This wasn't adjusted, see v3 review.

- Marijn
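
With the suggested semantics, a tear-down call site collapses to roughly
the following (a sketch; the exact op name and signature follow the
series):

    /* Detach DSC from the crossbar; PINGPONG_NONE makes the callback
     * write the 0xF "crossbar disabled" mux value instead of requiring
     * a bogus pingpong index plus enable=false. */
    if (hw_dsc->ops.dsc_bind_pingpong_blk)
            hw_dsc->ops.dsc_bind_pingpong_blk(hw_dsc, PINGPONG_NONE);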


Re: [PATCH 3/4] drm/msm/dpu: Add has_data_compress to dpu_caps

2023-05-03 Thread Jessica Zhang




On 5/3/2023 4:03 PM, Marijn Suijten wrote:

Hi Jessica,

On 2023-05-03 12:03:40, Jessica Zhang wrote:



On 5/3/2023 12:07 AM, Marijn Suijten wrote:

On 2023-05-02 18:19:14, Jessica Zhang wrote:

Add data_compress feature to DPU HW catalog.

In DPU 7.x and later, there is a DATA_COMPRESS register that must be set
within the DPU INTF block for DSC to work.

As core_rev (and related macros) was removed from the dpu_kms struct, the
most straightforward way to indicate the presence of this register would be
to have a flag in dpu_caps.


This is a very generic name to have in the global dpu_caps for a very
specific register on the INTF block since DPU >= 7.0.0, and I doubt any
new catalog contributor will know how to fill this field.  After all,
DPU < 7.0.0 also has DCE but it is controlled via the PINGPONG block.

Instead, how about having it as a DPU_INTF_DATA_COMPRESS (or similar)
feature flag on the INTF block?  We do the same for other (register
related) features on the INTF block, and you did the same to disable DSC
callbacks on PP in [1].


(Note: I said "you" but meant Kuogee)


Hi Marijn,

Sounds good.



In fact it seems that the DSC/DCE (enablement) registers have been moved
from PINGPONG to INTF in DPU 7.0.0.  Can you clarify in the patch
message for v2 that this is the case, and do the same in the linked
PINGPONG patch?  Perhaps these patches should be part of the same series
as they do not seem DSI-specific.


Will make a note of the PP to INTF change in the commit message.


Thanks.


I would prefer to keep this patch in this series because it is needed
for DSI over command mode to work and the subsequent patch is
specifically for command mode.


That is fine, but do mention this in the commit message if it is
relevant here.  Otherwise only mention it as part of patch 4/4.


Acked.

Thanks,

Jessica Zhang



- Marijn
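
As a sketch of that direction (DPU_INTF_DATA_COMPRESS is Marijn's
proposed name, not yet in the catalog, and the mask shown is
illustrative):

    /* Catalog: advertise the register only on DPU >= 7.0.0 INTF blocks. */
    #define INTF_SC7280_MASK (INTF_SC7180_MASK | BIT(DPU_INTF_DATA_COMPRESS))

    /* dpu_hw_intf.c: hook the callback only when the bit is present. */
    if (cap & BIT(DPU_INTF_DATA_COMPRESS))
            ops->enable_compression = dpu_hw_intf_enable_compression;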


Re: [PATCH v4 2/7] drm/msm/dpu: add DPU_PINGPONG_DSC feature bit

2023-05-03 Thread Marijn Suijten
Hi Kuogee,

On 2023-05-03 13:10:34, Kuogee Hsieh wrote:
> Legacy DPU (DPU < 7.0.0) requires PP block to be involved during

Nit: I wouldn't call it "legacy" (that's not really relevant here), just

DPU < 7.0.0 requires the PINGPONG block ...

> DSC setting up. Since then, enable and start the DSC encoder engine

then -> since DPU 7.0.0

enabling* and starting* the DSC encoder engine

> had moved to INTF with helps of flush mechanism. This patch adds

s/had/has, or remove had altogether

"with the help of a/the"

This patch adds a (new)*, but you shouldn't write "this patch" at all:

https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes

> DPU_PINGPONG_DSC feature bit to indicate that both
> dpu_hw_pp_setup_dsc() and dpu_hw_pp_dsc_enable() pingpong ops
> functions are required to complete DSC datapath setup and start
> DSC engine.

... which should only be set on DPU < 7.0.0, but it doesn't seem like
"complete DSC datapath" really explains the goal of this patch (namely
disabling it on DPU >= 7.0.0, by only making it available on DPU <
7.0.0).

How about replacing this whole sentence, starting at "This patch", with:

Add a DPU_PINGPONG_DSC feature bit to restrict the availability of
dpu_hw_pp_setup_dsc() and dpu_hw_pp_dsc_{enable,disable}() on the
PINGPONG block to DPU < 7.0.0 hardware, as the registers are not
available [in the PINGPONG block] on DPU 7.0.0 and higher anymore.
Existing call-sites to these callbacks already skip calling into
them if the function pointer is NULL.

How does that sound to you?

> Changes in v4:
> -- add more details commit text
> 
> Reported-by: Marijn Suijten 
> Signed-off-by: Kuogee Hsieh 
> Reviewed-by: Dmitry Baryshkov 
> ---
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h  | 2 ++
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c | 9 ++---
>  2 files changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h 
> b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
> index 71584cd..5d210f3 100644
> --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
> +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
> @@ -144,6 +144,7 @@ enum {
>   * @DPU_PINGPONG_SPLIT  PP block supports split fifo
>   * @DPU_PINGPONG_SLAVE  PP block is a suitable slave for split fifo
>   * @DPU_PINGPONG_DITHER,Dither blocks
> + * @DPU_PINGPONG_DSC,   PP ops functions required for DSC

As said in v3, drop the comma.

>   * @DPU_PINGPONG_MAX
>   */
>  enum {
> @@ -152,6 +153,7 @@ enum {
>   DPU_PINGPONG_SPLIT,
>   DPU_PINGPONG_SLAVE,
>   DPU_PINGPONG_DITHER,
> + DPU_PINGPONG_DSC,
>   DPU_PINGPONG_MAX
>  };
>  
> diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c 
> b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c
> index 3822e06..f255a04 100644
> --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c
> +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c
> @@ -264,9 +264,12 @@ static void _setup_pingpong_ops(struct dpu_hw_pingpong 
> *c,
>   c->ops.get_autorefresh = dpu_hw_pp_get_autorefresh_config;
>   c->ops.poll_timeout_wr_ptr = dpu_hw_pp_poll_timeout_wr_ptr;
>   c->ops.get_line_count = dpu_hw_pp_get_line_count;
> - c->ops.setup_dsc = dpu_hw_pp_setup_dsc;
> - c->ops.enable_dsc = dpu_hw_pp_dsc_enable;
> - c->ops.disable_dsc = dpu_hw_pp_dsc_disable;
> +
> + if (features & BIT(DPU_PINGPONG_DSC)) {

To stick with the style of this function, this should use test_bit()
like below for DPU_PINGPONG_DITHER.

Unless maintainers agree that we should replace all current uses in DPU
with `x & BIT(..)`.
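
That is, the hunk above would read:

    if (test_bit(DPU_PINGPONG_DSC, &features)) {
            c->ops.setup_dsc = dpu_hw_pp_setup_dsc;
            c->ops.enable_dsc = dpu_hw_pp_dsc_enable;
            c->ops.disable_dsc = dpu_hw_pp_dsc_disable;
    }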

- Marijn

> + c->ops.setup_dsc = dpu_hw_pp_setup_dsc;
> + c->ops.enable_dsc = dpu_hw_pp_dsc_enable;
> + c->ops.disable_dsc = dpu_hw_pp_dsc_disable;
> + }
>  
>   if (test_bit(DPU_PINGPONG_DITHER, &features))
>   c->ops.setup_dither = dpu_hw_pp_setup_dither;
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 


Re: [PATCH 4/4] drm/msm/dpu: Enable compression for command mode

2023-05-03 Thread Jessica Zhang




On 5/3/2023 12:51 PM, Dmitry Baryshkov wrote:

On 03/05/2023 22:04, Jessica Zhang wrote:



On 5/3/2023 12:28 AM, Marijn Suijten wrote:

On 2023-05-02 18:19:15, Jessica Zhang wrote:

Add a dpu_hw_intf op to enable data compression.

Signed-off-by: Jessica Zhang 
---
  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 4 ++++
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c          | 7 +++++++
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h          | 2 ++
  3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c

index 74470d068622..4321a1aba17f 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c


Can we have INTF DCE on video-mode encoders as well?


Hi Marijn,

Currently, there's no way to validate DSC for video mode as I've only 
made changes to support DSI for command mode. We are planning to post 
changes to support DSC over DP, which will include changes for video 
mode.


If I remember correctly, HDK8350 panel should support DSC for both 
command and video modes.


Hi Dmitry,

Correct, however we are planning to submit the video mode changes with 
the DP DSC v1.2 changes.


My current panel driver/dt changes are for command mode, so we would 
have to spent time to also add video mode support. It would be faster to 
land the video mode changes with DP support as that's already a work in 
progress.









@@ -72,6 +72,10 @@ static void _dpu_encoder_phys_cmd_update_intf_cfg(
  phys_enc->hw_intf,
  true,
  phys_enc->hw_pp->idx);
+
+    if (phys_enc->dpu_kms->catalog->caps->has_data_compress &&


As per my suggestion on patch 3/4, drop the flag and check above and
only check if the function is NULL (below).


Acked.




+    phys_enc->hw_intf->ops.enable_compression)
+    phys_enc->hw_intf->ops.enable_compression(phys_enc->hw_intf);
  }
  static void dpu_encoder_phys_cmd_pp_tx_done_irq(void *arg, int irq_idx)
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c

index 671048a78801..4ce7ffdd7a05 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
@@ -64,10 +64,16 @@
  #define INTF_CFG2_DATABUS_WIDEN    BIT(0)
  #define INTF_CFG2_DATA_HCTL_EN    BIT(4)


These should probably be reindented to match the below... And the rest
of the defines use spaces instead of tabs.


Fair point, though I think fixing the whitespace for these 2 macros 
specifically might be better in a more relevant series.


With that being said, I'll change the spacing of the DATA_COMPRESS bit 
to spaces instead of tabs.





+#define INTF_CFG2_DCE_DATA_COMPRESS    BIT(12)
  #define INTF_MISR_CTRL    0x180
  #define INTF_MISR_SIGNATURE    0x184


This does not seem to apply on top of:
https://lore.kernel.org/linux-arm-msm/20230411-dpu-intf-te-v4-10-27ce1a5ab...@somainline.org/


Seems like I'm missing some patches from that series on my working 
branch. Will rebase on top of the full series for the v2.




+static inline void dpu_hw_intf_enable_compression(struct dpu_hw_intf *ctx)


Why inline?  This is used as a pointer callback.


Acked, will remove the inline.




+{
+    DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, INTF_CFG2_DCE_DATA_COMPRESS);


dpu_hw_intf_setup_timing_engine() also programs INTF_CONFIG2.  Is it
double-buffered, or is that config **always** unused when DSI CMD mode
is used in conjunction with DSC/DCE?  Otherwise this should perhaps OR
the bitflag into the register, or write the whole thing at once in
dpu_hw_intf_setup_timing_engine()?


For command mode, INTF_CONFIG2 is unused aside from setting 
DATA_COMPRESS for DSC.


Since setup_timing_engine() is only used for video mode, the 
corresponding changes will be made in the DSC v1.2 for DP changes.


So, for command mode panels is this the only bit that should be set in 
INTF_CFG2?


Yep, outside of the changes in this patch, INTF_CONFIG2 is only used in 
the video mode setup_timing_engine() method.


Thanks,

Jessica Zhang


--
With best wishes
Dmitry



[PATCH v11 1/2] MAINTAINERS: add maintainers for DRM LOONGSON driver

2023-05-03 Thread Sui Jingfeng
This patch adds myself as a maintainer of the DRM Loongson driver.

Signed-off-by: Sui Jingfeng 
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index e9a3bf32fe28..4aa2e587f061 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6920,6 +6920,13 @@ T:   git git://anongit.freedesktop.org/drm/drm-misc
 F: drivers/gpu/drm/lima/
 F: include/uapi/drm/lima_drm.h
 
+DRM DRIVERS FOR LOONGSON
+M: Sui Jingfeng 
+L: dri-devel@lists.freedesktop.org
+S: Supported
+T: git git://anongit.freedesktop.org/drm/drm-misc
+F: drivers/gpu/drm/loongson/
+
 DRM DRIVERS FOR MEDIATEK
 M: Chun-Kuang Hu 
 M: Philipp Zabel 
-- 
2.25.1



Re: [PATCH v4 0/7] add DSC 1.2 dpu supports

2023-05-03 Thread Marijn Suijten
On 2023-05-03 13:10:32, Kuogee Hsieh wrote:
> This series adds the DPU side changes to support DSC 1.2 encoder. This
> was validated with both DSI DSC 1.2 panel and DP DSC 1.2 monitor.
> The DSI and DP parts will be pushed later on top of this change.
> This series is rebased on [1], [2] and catalog fixes from [3].

I left a bunch of comments, suggestions and questions on a few patches
and the cover letter in v3, but some do not seem to have been
addressed/answered.  Can you take a look?

> Abhinav Kumar (2):
>   drm/msm/dpu: add dsc blocks for remaining chipsets in catalog
>   drm/msm/dpu: add DSC 1.2 hw blocks for relevant chipsets
> 
> Kuogee Hsieh (5):
>   drm/msm/dpu: add DPU_PINGPONG_DSC feature bit
>   drm/msm/dpu: add DPU_PINGPONG_DSC bits into PP_BLK and PP_BLK_TE
> marcos

Since I did not get to review this patch yet:

  macros*

But remember that, as per my comment in the v3 cover letter, it is
conflicting with the catalog changes in [3].

- Marijn

>   drm/msm/dpu: add PINGPONG_NONE to disconnect DSC from PINGPONG
>   drm/msm/dpu: add support for DSC encoder v1.2 engine
>   drm/msm/dpu: separate DSC flush update out of interface
> 
>  drivers/gpu/drm/msm/Makefile   |   1 +
>  .../drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h|  23 +-
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h |   8 +-
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h |  26 +-
>  .../drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h|  35 +-
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_0_sm8250.h |  26 +-
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_2_sc7180.h |   4 +-
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_3_sm6115.h |   2 +-
>  .../drm/msm/disp/dpu1/catalog/dpu_6_5_qcm2290.h|   2 +-
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h |  14 +
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h |   7 +
>  .../drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h   |  16 +
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h |  14 +
>  .../gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h |  14 +
>  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c|  16 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c |  33 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h |  34 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c |  22 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h |  10 +
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c |   7 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h |  15 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c | 385 
> +
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h|   3 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c|   9 +-
>  drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c |   7 +-
>  25 files changed, 650 insertions(+), 83 deletions(-)
>  create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c
> 
> -- 
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
> 


Re: [PATCH 3/4] drm/msm/dpu: Add has_data_compress to dpu_caps

2023-05-03 Thread Marijn Suijten
Hi Jessica,

On 2023-05-03 12:03:40, Jessica Zhang wrote:
> 
> 
> On 5/3/2023 12:07 AM, Marijn Suijten wrote:
> > On 2023-05-02 18:19:14, Jessica Zhang wrote:
> >> Add data_compress feature to DPU HW catalog.
> >>
> >> In DPU 7.x and later, there is a DATA_COMPRESS register that must be set
> >> within the DPU INTF block for DSC to work.
> >>
> >> As core_rev (and related macros) was removed from the dpu_kms struct, the
> >> most straightforward way to indicate the presence of this register would be
> >> to have a flag in dpu_caps.
> > 
> > This is a very generic name to have in the global dpu_caps for a very
> > specific register on the INTF block since DPU >= 7.0.0, and I doubt any
> > new catalog contributor will know how to fill this field.  After all,
> > DPU < 7.0.0 also has DCE but it is controlled via the PINGPONG block.
> > 
> > Instead, how about having it as a DPU_INTF_DATA_COMPRESS (or similar)
> > feature flag on the INTF block?  We do the same for other (register
> > related) features on the INTF block, and you did the same to disable DSC
> > callbacks on PP in [1].

(Note: I said "you" but meant Kuogee)
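
To illustrate, the flag-based gating suggested above might look roughly
like the sketch below (DPU_INTF_DATA_COMPRESS as the flag name and the
exact catalog wiring are assumptions, mirroring how other INTF features
are gated in _setup_intf_ops()):

	/* catalog: extend the DPU >= 7.0.0 INTF masks, e.g. */
	#define INTF_SC7280_MASK (INTF_SDM845_MASK | BIT(DPU_INTF_TE) | \
				  BIT(DPU_INTF_DATA_COMPRESS))

	/* dpu_hw_intf.c: only expose the op when the catalog opts in */
	if (cap & BIT(DPU_INTF_DATA_COMPRESS))
		ops->enable_compression = dpu_hw_intf_enable_compression;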

> Hi Marijn,
> 
> Sounds good.
> 
> > 
> > In fact it seems that the DSC/DCE (enablement) registers have been moved
> > from PINGPONG to INTF in DPU 7.0.0.  Can you clarify in the patch
> > message for v2 that this is the case, and do the same in the linked
> > PINGPONG patch?  Perhaps these patches should be part of the same series
> > as they do not seem DSI-specific.
> 
> Will make a note of the PP to INTF change in the commit message.

Thanks.

> I would prefer to keep this patch in this series because it is needed 
> for DSI over command mode to work and the subsequent patch is 
> specifically for command mode.

That is fine, but do mention this in the commit message if it is
relevant here.  Otherwise only mention it as part of patch 4/4.

- Marijn


[PATCH v5 2/5] drm/i915: use pat_index instead of cache_level

2023-05-03 Thread fei . yang
From: Fei Yang 

Currently the KMD is using enum i915_cache_level to set caching policy for
buffer objects. This is flaky because the PAT index which really controls
the caching behavior in PTE has far more levels than what's defined in the
enum. In addition, the PAT index is platform dependent, having to translate
between i915_cache_level and PAT index is not reliable, and makes the code
more complicated.

From UMD's perspective there is also a necessity to set caching policy for
performance fine tuning. It's much easier for the UMD to directly use PAT
index because the behavior of each PAT index is clearly defined in Bspec.
Having the abstracted i915_cache_level sitting in between would only cause
more ambiguity.

For these reasons this patch replaces i915_cache_level with PAT index. Also
note, the cache_level is not completely removed yet, because the KMD still
has the need of creating buffer objects with simple cache settings such as
cached, uncached, or writethrough. For such simple cases, using cache_level
would help simplify the code.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/display/intel_dpt.c  | 12 +--
 drivers/gpu/drm/i915/gem/i915_gem_domain.c| 45 ++
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 ++-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 51 +++-
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  4 +
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 25 +-
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  4 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 16 ++--
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
 .../drm/i915/gem/selftests/i915_gem_migrate.c |  2 +-
 .../drm/i915/gem/selftests/i915_gem_mman.c|  2 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  | 10 ++-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 71 
 drivers/gpu/drm/i915/gt/gen8_ppgtt.h  |  3 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  | 82 +--
 drivers/gpu/drm/i915/gt/intel_gtt.h   | 20 ++---
 drivers/gpu/drm/i915/gt/intel_migrate.c   | 47 ++-
 drivers/gpu/drm/i915/gt/intel_migrate.h   | 13 ++-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |  6 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c| 47 ++-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  8 +-
 drivers/gpu/drm/i915/gt/selftest_timeline.c   |  2 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|  4 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  | 10 ++-
 drivers/gpu/drm/i915/i915_debugfs.c   | 52 +---
 drivers/gpu/drm/i915/i915_gem.c   | 16 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c |  8 +-
 drivers/gpu/drm/i915/i915_vma.c   | 16 ++--
 drivers/gpu/drm/i915/i915_vma.h   |  2 +-
 drivers/gpu/drm/i915/i915_vma_types.h |  2 -
 drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 15 ++--
 .../drm/i915/selftests/intel_memory_region.c  |  4 +-
 drivers/gpu/drm/i915/selftests/mock_gtt.c |  8 +-
 36 files changed, 391 insertions(+), 240 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c 
b/drivers/gpu/drm/i915/display/intel_dpt.c
index c5eacfdba1a5..7c5fddb203ba 100644
--- a/drivers/gpu/drm/i915/display/intel_dpt.c
+++ b/drivers/gpu/drm/i915/display/intel_dpt.c
@@ -43,24 +43,24 @@ static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte)
 static void dpt_insert_page(struct i915_address_space *vm,
dma_addr_t addr,
u64 offset,
-   enum i915_cache_level level,
+   unsigned int pat_index,
u32 flags)
 {
struct i915_dpt *dpt = i915_vm_to_dpt(vm);
gen8_pte_t __iomem *base = dpt->iomem;
 
gen8_set_pte(base + offset / I915_GTT_PAGE_SIZE,
-vm->pte_encode(addr, level, flags));
+vm->pte_encode(addr, pat_index, flags));
 }
 
 static void dpt_insert_entries(struct i915_address_space *vm,
   struct i915_vma_resource *vma_res,
-  enum i915_cache_level level,
+  unsigned int pat_index,
   u32 flags)
 {
struct i915_dpt *dpt = i915_vm_to_dpt(vm);
gen8_pte_t __iomem *base = dpt->iomem;
-   const gen8_pte_t pte_encode = vm->pte_encode(0, level, flags);
+   const gen8_pte_t pte_encode = vm->pte_encode(0, pat_index, flags);
struct sgt_iter sgt_iter;
dma_addr_t addr;
int i;
@@ -83,7 +83,7 @@ static void dpt_clear_range(struct i915_address_space *vm,
 static void dpt_bind_vma(struct i915_address_space *vm,
 struct i915_vm_pt_stash *stash,
 

[PATCH v5 3/5] drm/i915: make sure correct pte encode is used

2023-05-03 Thread fei . yang
From: Fei Yang 

PTE encode is platform dependent. After replacing cache_level with
pat_index, the newly introduced mtl_pte_encode is actually generic
for all gen12 platforms, thus rename it to gen12_pte_encode and
apply it to all gen12 platforms.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index f2334a713c4e..d1e3d3b90e95 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -55,9 +55,9 @@ static u64 gen8_pte_encode(dma_addr_t addr,
return pte;
 }
 
-static u64 mtl_pte_encode(dma_addr_t addr,
- unsigned int pat_index,
- u32 flags)
+static u64 gen12_pte_encode(dma_addr_t addr,
+   unsigned int pat_index,
+   u32 flags)
 {
gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
 
@@ -995,8 +995,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt,
 */
ppgtt->vm.alloc_scratch_dma = alloc_pt_dma;
 
-   if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
-   ppgtt->vm.pte_encode = mtl_pte_encode;
+   if (GRAPHICS_VER(gt->i915) >= 12)
+   ppgtt->vm.pte_encode = gen12_pte_encode;
else
ppgtt->vm.pte_encode = gen8_pte_encode;
 
-- 
2.25.1



[PATCH v5 5/5] drm/i915: Allow user to set cache at BO creation

2023-05-03 Thread fei . yang
From: Fei Yang 

To comply with the design that buffer objects shall have immutable
cache setting throughout their life cycle, the {set, get}_caching ioctls
are no longer supported from MTL onward. With that change caching
policy can only be set at object creation time. The current code
applies a default (platform dependent) cache setting for all objects.
However this is not optimal for performance tuning. The patch extends
the existing gem_create uAPI to let user set PAT index for the object
at creation time.
The new extension is platform independent, so UMDs can switch to using
this extension for older platforms as well, while {set, get}_caching are
still supported on these legacy platforms for compatibility reasons.

Cc: Chris Wilson 
Cc: Matt Roper 
Cc: Andi Shyti 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gem/i915_gem_create.c | 36 ++
 drivers/gpu/drm/i915/gem/i915_gem_object.c |  6 
 include/uapi/drm/i915_drm.h| 36 ++
 tools/include/uapi/drm/i915_drm.h  | 36 ++
 4 files changed, 114 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_create.c 
b/drivers/gpu/drm/i915/gem/i915_gem_create.c
index bfe1dbda4cb7..644a936248ad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_create.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_create.c
@@ -245,6 +245,7 @@ struct create_ext {
unsigned int n_placements;
unsigned int placement_mask;
unsigned long flags;
+   unsigned int pat_index;
 };
 
 static void repr_placements(char *buf, size_t size,
@@ -394,11 +395,39 @@ static int ext_set_protected(struct i915_user_extension 
__user *base, void *data
return 0;
 }
 
+static int ext_set_pat(struct i915_user_extension __user *base, void *data)
+{
+   struct create_ext *ext_data = data;
+   struct drm_i915_private *i915 = ext_data->i915;
+   struct drm_i915_gem_create_ext_set_pat ext;
+   unsigned int max_pat_index;
+
+   BUILD_BUG_ON(sizeof(struct drm_i915_gem_create_ext_set_pat) !=
+offsetofend(struct drm_i915_gem_create_ext_set_pat, rsvd));
+
+   if (copy_from_user(&ext, base, sizeof(ext)))
+   return -EFAULT;
+
+   max_pat_index = INTEL_INFO(i915)->max_pat_index;
+
+   if (ext.pat_index > max_pat_index) {
+   drm_dbg(&i915->drm, "PAT index is invalid: %u\n",
+   ext.pat_index);
+   return -EINVAL;
+   }
+
+   ext_data->pat_index = ext.pat_index;
+
+   return 0;
+}
+
 static const i915_user_extension_fn create_extensions[] = {
[I915_GEM_CREATE_EXT_MEMORY_REGIONS] = ext_set_placements,
[I915_GEM_CREATE_EXT_PROTECTED_CONTENT] = ext_set_protected,
+   [I915_GEM_CREATE_EXT_SET_PAT] = ext_set_pat,
 };
 
+#define PAT_INDEX_NOT_SET  0xffff
 /**
  * i915_gem_create_ext_ioctl - Creates a new mm object and returns a handle to 
it.
  * @dev: drm device pointer
@@ -418,6 +447,7 @@ i915_gem_create_ext_ioctl(struct drm_device *dev, void 
*data,
if (args->flags & ~I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS)
return -EINVAL;
 
+   ext_data.pat_index = PAT_INDEX_NOT_SET;
ret = i915_user_extensions(u64_to_user_ptr(args->extensions),
   create_extensions,
   ARRAY_SIZE(create_extensions),
@@ -454,5 +484,11 @@ i915_gem_create_ext_ioctl(struct drm_device *dev, void 
*data,
if (IS_ERR(obj))
return PTR_ERR(obj);
 
+   if (ext_data.pat_index != PAT_INDEX_NOT_SET) {
+   i915_gem_object_set_pat_index(obj, ext_data.pat_index);
+   /* Mark pat_index is set by UMD */
+   obj->pat_set_by_user = true;
+   }
+
return i915_gem_publish(obj, file, &args->size, &args->handle);
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index 46a19b099ec8..97ac6fb37958 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -208,6 +208,12 @@ bool i915_gem_object_can_bypass_llc(struct 
drm_i915_gem_object *obj)
if (!(obj->flags & I915_BO_ALLOC_USER))
return false;
 
+   /*
+* Always flush cache for UMD objects at creation time.
+*/
+   if (obj->pat_set_by_user)
+   return true;
+
/*
 * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it
 * possible for userspace to bypass the GTT caching bits set by the
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index dba7c5a5b25e..03c5c314846e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3630,9 +3630,13 @@ struct drm_i915_gem_create_ext {
 *
 * For I915_GEM_CREATE_EXT_PROTECTED_CONTENT usage see
 * struct drm_i915_gem_create_ext_protected_content.
+*
+* For I915_GEM_CREATE_EXT_SET_PAT 
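
A minimal userspace sketch of the new extension, assuming the uAPI
lands as shown in this patch (BO size and pat_index value are arbitrary
examples; error handling elided):

	struct drm_i915_gem_create_ext_set_pat set_pat = {
		.base = { .name = I915_GEM_CREATE_EXT_SET_PAT },
		.pat_index = 3,	/* platform specific, must be <= max_pat_index */
	};
	struct drm_i915_gem_create_ext create = {
		.size = 4096,
		.extensions = (uintptr_t)&set_pat,
	};

	drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);
	/* create.handle now names a BO whose PAT index is fixed for life */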

[PATCH v5 4/5] drm/i915/mtl: end support for set caching ioctl

2023-05-03 Thread fei . yang
From: Fei Yang 

The design is to keep Buffer Object's caching policy immutable
throughout its life cycle. This patch ends the support for set caching ioctl
from MTL onward. While doing that we also set BO's to be 1-way coherent
at creation time because GPU is no longer automatically snooping CPU
cache. For userspace components needing to fine tune the caching policy
for BO's, a follow up patch will extend the GEM_CREATE uAPI to allow
them to specify caching mode at BO creation time.

Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
 drivers/gpu/drm/i915/gem/i915_gem_domain.c | 3 +++
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c  | 9 -
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_domain.c 
b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
index ae99b4be5918..53282c6d3873 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_domain.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_domain.c
@@ -337,6 +337,9 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void 
*data,
if (IS_DGFX(i915))
return -ENODEV;
 
+   if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))
+   return -EOPNOTSUPP;
+
switch (args->caching) {
case I915_CACHING_NONE:
level = I915_CACHE_NONE;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c 
b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 37d1efcd3ca6..cad4a6017f4b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -601,7 +601,14 @@ static int shmem_object_init(struct intel_memory_region 
*mem,
obj->write_domain = I915_GEM_DOMAIN_CPU;
obj->read_domains = I915_GEM_DOMAIN_CPU;
 
-   if (HAS_LLC(i915))
+   /*
+* MTL doesn't snoop CPU cache by default for GPU access (namely
+* 1-way coherency). However some UMD's are currently depending on
+* that. Make 1-way coherent the default setting for MTL. A follow
+* up patch will extend the GEM_CREATE uAPI to allow UMD's specify
+* caching mode at BO creation time
+*/
+   if (HAS_LLC(i915) || (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70)))
/* On some devices, we can have the GPU use the LLC (the CPU
 * cache) for about a 10% performance improvement
 * compared to uncached.  Graphics requests other than
-- 
2.25.1
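
A sketch of how a UMD might detect the ioctl going away at runtime (an
assumed flow, not part of the patch above; error handling elided):

	struct drm_i915_gem_caching arg = {
		.handle = handle,
		.caching = I915_CACHING_CACHED,
	};

	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_CACHING, &arg) &&
	    errno == EOPNOTSUPP) {
		/* caching is immutable on this platform: recreate the
		 * BO with the I915_GEM_CREATE_EXT_SET_PAT extension */
	}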



[PATCH v5 0/5] drm/i915: Allow user to set cache at BO creation

2023-05-03 Thread fei . yang
From: Fei Yang 

The first three patches in this series are taken from
https://patchwork.freedesktop.org/series/116868/
These patches are included here because the last patch
has dependency on the pat_index refactor.

This series is focusing on uAPI changes,
1. end support for set caching ioctl [PATCH 4/5]
2. add set_pat extension for gem_create [PATCH 5/5]

v2: drop one patch that was merged separately
341ad0e8e254 drm/i915/mtl: Add PTE encode function
v3: rebase on https://patchwork.freedesktop.org/series/117082/
v4: fix missing unlock introduced in v3, and
solve a rebase conflict
v5: replace obj->cache_level with pat_set_by_user,
fix i915_cache_level_str() for legacy platforms.

Fei Yang (5):
  drm/i915: preparation for using PAT index
  drm/i915: use pat_index instead of cache_level
  drm/i915: make sure correct pte encode is used
  drm/i915/mtl: end support for set caching ioctl
  drm/i915: Allow user to set cache at BO creation

 drivers/gpu/drm/i915/display/intel_dpt.c  | 12 +--
 drivers/gpu/drm/i915/gem/i915_gem_create.c| 36 +
 drivers/gpu/drm/i915/gem/i915_gem_domain.c| 48 ++-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 ++-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 66 +++-
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  8 ++
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 26 +-
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c |  9 ++-
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c  |  2 -
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  4 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 16 ++--
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
 .../drm/i915/gem/selftests/i915_gem_migrate.c |  2 +-
 .../drm/i915/gem/selftests/i915_gem_mman.c|  2 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  | 10 ++-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 73 +
 drivers/gpu/drm/i915/gt/gen8_ppgtt.h  |  3 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  | 76 +-
 drivers/gpu/drm/i915/gt/intel_gtt.h   | 20 +++--
 drivers/gpu/drm/i915/gt/intel_migrate.c   | 47 ++-
 drivers/gpu/drm/i915/gt/intel_migrate.h   | 13 ++-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |  6 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c| 47 +--
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  8 +-
 drivers/gpu/drm/i915/gt/selftest_timeline.c   |  2 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|  4 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  | 10 ++-
 drivers/gpu/drm/i915/i915_debugfs.c   | 52 +---
 drivers/gpu/drm/i915/i915_gem.c   | 16 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c |  8 +-
 drivers/gpu/drm/i915/i915_pci.c   | 79 ---
 drivers/gpu/drm/i915/i915_vma.c   | 16 ++--
 drivers/gpu/drm/i915/i915_vma.h   |  2 +-
 drivers/gpu/drm/i915/i915_vma_types.h |  2 -
 drivers/gpu/drm/i915/intel_device_info.h  |  5 ++
 drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 15 ++--
 .../drm/i915/selftests/intel_memory_region.c  |  4 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  9 +++
 drivers/gpu/drm/i915/selftests/mock_gtt.c |  8 +-
 include/uapi/drm/i915_drm.h   | 36 +
 tools/include/uapi/drm/i915_drm.h | 36 +
 44 files changed, 618 insertions(+), 244 deletions(-)

-- 
2.25.1



[PATCH v5 1/5] drm/i915: preparation for using PAT index

2023-05-03 Thread fei . yang
From: Fei Yang 

This patch is a preparation for replacing enum i915_cache_level with PAT
index. Caching policy for buffer objects is set through the PAT index in
PTE, the old i915_cache_level is not sufficient to represent all caching
modes supported by the hardware.

Preparing the transition by adding some platform dependent data structures
and helper functions to translate the cache_level to pat_index.

cachelevel_to_pat: a platform dependent array mapping cache_level to
   pat_index.

max_pat_index: the maximum PAT index recommended in hardware specification
   Needed for validating the PAT index passed in from user
   space.

i915_gem_get_pat_index: function to convert cache_level to PAT index.

obj_to_i915(obj): macro moved to header file for wider usage.

I915_MAX_CACHE_LEVEL: upper bound of i915_cache_level for the
  convenience of coding.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  9 +++
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  4 +
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  1 +
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c  |  2 -
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  6 ++
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  6 ++
 drivers/gpu/drm/i915/i915_pci.c   | 79 ---
 drivers/gpu/drm/i915/intel_device_info.h  |  5 ++
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  9 +++
 9 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index 4666bb82f312..8c70a0ec7d2f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -45,6 +45,15 @@ static struct kmem_cache *slab_objects;
 
 static const struct drm_gem_object_funcs i915_gem_object_funcs;
 
+unsigned int i915_gem_get_pat_index(struct drm_i915_private *i915,
+   enum i915_cache_level level)
+{
+   if (drm_WARN_ON(&i915->drm, level >= I915_MAX_CACHE_LEVEL))
+   return 0;
+
+   return INTEL_INFO(i915)->cachelevel_to_pat[level];
+}
+
 struct drm_i915_gem_object *i915_gem_object_alloc(void)
 {
struct drm_i915_gem_object *obj;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index 885ccde9dc3c..4c92e17b4337 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -20,6 +20,8 @@
 
 enum intel_region_id;
 
+#define obj_to_i915(obj__) to_i915((obj__)->base.dev)
+
 static inline bool i915_gem_object_size_2big(u64 size)
 {
struct drm_i915_gem_object *obj;
@@ -30,6 +32,8 @@ static inline bool i915_gem_object_size_2big(u64 size)
return false;
 }
 
+unsigned int i915_gem_get_pat_index(struct drm_i915_private *i915,
+   enum i915_cache_level level);
 void i915_gem_init__objects(struct drm_i915_private *i915);
 
 void i915_objects_module_exit(void);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 830c11431ee8..41b35abccf88 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -194,6 +194,7 @@ enum i915_cache_level {
 * engine.
 */
I915_CACHE_WT,
+   I915_MAX_CACHE_LEVEL,
 };
 
 enum i915_map_type {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c 
b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index b1672e054b21..214763942aa2 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -460,8 +460,6 @@ void i915_gem_shrinker_taints_mutex(struct drm_i915_private 
*i915,
fs_reclaim_release(GFP_KERNEL);
 }
 
-#define obj_to_i915(obj__) to_i915((obj__)->base.dev)
-
 /**
  * i915_gem_object_make_unshrinkable - Hide the object from the shrinker. By
  * default all object types that support shrinking(see IS_SHRINKABLE), will 
also
diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 22ec1566d2a7..bb6998d67133 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -78,6 +78,12 @@ static u64 mtl_pte_encode(dma_addr_t addr,
case I915_CACHE_WT:
pte |= GEN12_PPGTT_PTE_PAT0;
break;
+   default:
+   /* This should never happen. Added to deal with the compile
+* error due to the addition of I915_MAX_CACHE_LEVEL. Will
+* be removed by the pat_index patch.
+*/
+   break;
}
 
return pte;
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 20915edc8bd9..c8390d03fce2 100644
--- 
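
The i915_pci.c hunks are truncated above; for a sense of what the new
per-platform tables contain, a gen12 mapping might look roughly like
the sketch below (values inferred from the gen12 PAT listing elsewhere
in this digest, 0 = WB, 1 = WC, 2 = WT, 3 = UC; treat it as
illustrative, not the authoritative table):

	#define TGL_CACHELEVEL \
		.cachelevel_to_pat = { \
			[I915_CACHE_NONE]   = 3, \
			[I915_CACHE_LLC]    = 0, \
			[I915_CACHE_L3_LLC] = 0, \
			[I915_CACHE_WT]     = 2, \
		}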

Re: [PATCH 4/4] drm/msm/dpu: Enable compression for command mode

2023-05-03 Thread Marijn Suijten
Hi Jessica,

On 2023-05-03 12:04:59, Jessica Zhang wrote:
> 
> 
> On 5/3/2023 12:28 AM, Marijn Suijten wrote:
> > On 2023-05-02 18:19:15, Jessica Zhang wrote:
> >> Add a dpu_hw_intf op to enable data compression.
> >>
> >> Signed-off-by: Jessica Zhang 
> >> ---
> >>   drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 4 
> >>   drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c  | 7 +++
> >>   drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h  | 2 ++
> >>   3 files changed, 13 insertions(+)
> >>
> >> diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c 
> >> b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
> >> index 74470d068622..4321a1aba17f 100644
> >> --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
> >> +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
> > 
> > Can we have INTF DCE on video-mode encoders as well?
> 
> Hi Marijn,
> 
> Currently, there's no way to validate DSC for video mode as I've only 
> made changes to support DSI for command mode. We are planning to post 
> changes to support DSC over DP, which will include changes for video mode.

Okay, but then mention so in the patch description (which is rather
short in this revision).



> >>   #define INTF_CFG2_DATABUS_WIDEN  BIT(0)
> >>   #define INTF_CFG2_DATA_HCTL_EN   BIT(4)
> > 
> > These should probably be reindented to match the below... And the rest
> > of the defines use spaces instead of tabs.
> 
> Fair point, though I think fixing the whitespace for these 2 macros 
> specifically might be better in a more relevant series.

Yes, I have many patches to start cleaning these up, as well as all the
broken kerneldoc comments, but it's an uphill battle.  Not sure if I'll
get to it any time soon if at all.

> With that being said, I'll change the spacing of the DATA_COMPRESS bit 
> to spaces instead of tabs.

Thanks, that seems to be the most common format.

> >> +#define INTF_CFG2_DCE_DATA_COMPRESS   BIT(12)
> >>   
> >>   #define INTF_MISR_CTRL   0x180
> >>   #define INTF_MISR_SIGNATURE  0x184
> > 
> > This does not seem to apply on top of:
> > https://lore.kernel.org/linux-arm-msm/20230411-dpu-intf-te-v4-10-27ce1a5ab...@somainline.org/
> 
> Seems like I'm missing some patches from that series on my working 
> branch. Will rebase on top of the full series for the v2.

Thanks, but do discuss with Abhinav/Dmitry which series will land first.

> >> +static inline void dpu_hw_intf_enable_compression(struct dpu_hw_intf *ctx)
> > 
> > Why inline?  This is used as a pointer callback.
> 
> Acked, will remove the inline.
> 
> > 
> >> +{
> >> +  DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, INTF_CFG2_DCE_DATA_COMPRESS);
> > 
> > dpu_hw_intf_setup_timing_engine() also programs INTF_CONFIG2.  Is it
> > double-buffered, or is that config **always** unused when DSI CMD mode
> > is used in conjunction with DSC/DCE?  Otherwise this should perhaps OR
> > the bitflag into the register, or write the whole thing at once in
> > dpu_hw_intf_setup_timing_engine()?
> 
> For command mode, INTF_CONFIG2 is unused aside from setting 
> DATA_COMPRESS for DSC.
> 
> Since setup_timing_engine() is only used for video mode, the 
> corresponding changes will be made in the DSC v1.2 for DP changes.

Ack, that makes sense.  However, is this a guarantee that nothing else
will write INTF_CONFIG2 in the future, or will we solve that problem
when it happens?  I'm afraid more config-bits get added to this register
in the future and might possibly race/overwrite each other.
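
One way to future-proof it would be a read-modify-write, e.g. (a
sketch only, reusing the register helpers this patch already uses):

	static void dpu_hw_intf_enable_compression(struct dpu_hw_intf *ctx)
	{
		u32 intf_cfg2 = DPU_REG_READ(&ctx->hw, INTF_CONFIG2);

		intf_cfg2 |= INTF_CFG2_DCE_DATA_COMPRESS;
		DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, intf_cfg2);
	}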

- Marijn




[PATCH v5 2/3] drm/i915: use pat_index instead of cache_level

2023-05-03 Thread fei . yang
From: Fei Yang 

Currently the KMD is using enum i915_cache_level to set caching policy for
buffer objects. This is flaky because the PAT index which really controls
the caching behavior in PTE has far more levels than what's defined in the
enum. In addition, the PAT index is platform dependent, having to translate
between i915_cache_level and PAT index is not reliable, and makes the code
more complicated.

From UMD's perspective there is also a necessity to set caching policy for
performance fine tuning. It's much easier for the UMD to directly use PAT
index because the behavior of each PAT index is clearly defined in Bspec.
Having the abstracted i915_cache_level sitting in between would only cause
more ambiguity.

For these reasons this patch replaces i915_cache_level with PAT index. Also
note, the cache_level is not completely removed yet, because the KMD still
has the need of creating buffer objects with simple cache settings such as
cached, uncached, or writethrough. For such simple cases, using cache_level
would help simplify the code.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/display/intel_dpt.c  | 12 +--
 drivers/gpu/drm/i915/gem/i915_gem_domain.c| 45 ++
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 ++-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 51 +++-
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  4 +
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 25 +-
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  4 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 16 ++--
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
 .../drm/i915/gem/selftests/i915_gem_migrate.c |  2 +-
 .../drm/i915/gem/selftests/i915_gem_mman.c|  2 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  | 10 ++-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 71 
 drivers/gpu/drm/i915/gt/gen8_ppgtt.h  |  3 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  | 82 +--
 drivers/gpu/drm/i915/gt/intel_gtt.h   | 20 ++---
 drivers/gpu/drm/i915/gt/intel_migrate.c   | 47 ++-
 drivers/gpu/drm/i915/gt/intel_migrate.h   | 13 ++-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |  6 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c| 47 ++-
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  8 +-
 drivers/gpu/drm/i915/gt/selftest_timeline.c   |  2 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|  4 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  | 10 ++-
 drivers/gpu/drm/i915/i915_debugfs.c   | 52 +---
 drivers/gpu/drm/i915/i915_gem.c   | 16 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c |  8 +-
 drivers/gpu/drm/i915/i915_vma.c   | 16 ++--
 drivers/gpu/drm/i915/i915_vma.h   |  2 +-
 drivers/gpu/drm/i915/i915_vma_types.h |  2 -
 drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 15 ++--
 .../drm/i915/selftests/intel_memory_region.c  |  4 +-
 drivers/gpu/drm/i915/selftests/mock_gtt.c |  8 +-
 36 files changed, 391 insertions(+), 240 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c 
b/drivers/gpu/drm/i915/display/intel_dpt.c
index c5eacfdba1a5..7c5fddb203ba 100644
--- a/drivers/gpu/drm/i915/display/intel_dpt.c
+++ b/drivers/gpu/drm/i915/display/intel_dpt.c
@@ -43,24 +43,24 @@ static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte)
 static void dpt_insert_page(struct i915_address_space *vm,
dma_addr_t addr,
u64 offset,
-   enum i915_cache_level level,
+   unsigned int pat_index,
u32 flags)
 {
struct i915_dpt *dpt = i915_vm_to_dpt(vm);
gen8_pte_t __iomem *base = dpt->iomem;
 
gen8_set_pte(base + offset / I915_GTT_PAGE_SIZE,
-vm->pte_encode(addr, level, flags));
+vm->pte_encode(addr, pat_index, flags));
 }
 
 static void dpt_insert_entries(struct i915_address_space *vm,
   struct i915_vma_resource *vma_res,
-  enum i915_cache_level level,
+  unsigned int pat_index,
   u32 flags)
 {
struct i915_dpt *dpt = i915_vm_to_dpt(vm);
gen8_pte_t __iomem *base = dpt->iomem;
-   const gen8_pte_t pte_encode = vm->pte_encode(0, level, flags);
+   const gen8_pte_t pte_encode = vm->pte_encode(0, pat_index, flags);
struct sgt_iter sgt_iter;
dma_addr_t addr;
int i;
@@ -83,7 +83,7 @@ static void dpt_clear_range(struct i915_address_space *vm,
 static void dpt_bind_vma(struct i915_address_space *vm,
 struct i915_vm_pt_stash *stash,
 

[PATCH v5 0/3] drm/i915: use pat_index instead of cache_level

2023-05-03 Thread fei . yang
From: Fei Yang 

This patch set was posted at
https://patchwork.freedesktop.org/series/116868/
Change title since the PTE patch was merged separately.

These patches are extracted from series
https://patchwork.freedesktop.org/series/115980/

This series refactors the cache policy programming so that the PTE
encode functions can be unified across all GEN12 platforms. This
refactor is also important in implementing the design which allows
userspace to directly set cache policy for each Buffer Object.

v2: drop one patch that was merged separately
341ad0e8e254 drm/i915/mtl: Add PTE encode function
v3: disable {get, set}_caching ioctl
v4: fix missing unlock introduced in v3, and
solve a rebase conflict
v5: replace obj->cache_level with pat_set_by_user,
fix i915_cache_level_str() for legacy platforms.

Fei Yang (3):
  drm/i915: preparation for using PAT index
  drm/i915: use pat_index instead of cache_level
  drm/i915: make sure correct pte encode is used

 drivers/gpu/drm/i915/display/intel_dpt.c  | 12 +--
 drivers/gpu/drm/i915/gem/i915_gem_domain.c| 45 ++-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 ++-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  3 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c| 60 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  8 ++
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 26 +-
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c  |  2 -
 drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  4 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 16 ++--
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
 .../drm/i915/gem/selftests/i915_gem_migrate.c |  2 +-
 .../drm/i915/gem/selftests/i915_gem_mman.c|  2 +-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  | 10 ++-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 73 +
 drivers/gpu/drm/i915/gt/gen8_ppgtt.h  |  3 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  | 76 +-
 drivers/gpu/drm/i915/gt/intel_gtt.h   | 20 +++--
 drivers/gpu/drm/i915/gt/intel_migrate.c   | 47 ++-
 drivers/gpu/drm/i915/gt/intel_migrate.h   | 13 ++-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |  6 +-
 drivers/gpu/drm/i915/gt/selftest_migrate.c| 47 +--
 drivers/gpu/drm/i915/gt/selftest_reset.c  |  8 +-
 drivers/gpu/drm/i915/gt/selftest_timeline.c   |  2 +-
 drivers/gpu/drm/i915/gt/selftest_tlb.c|  4 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  | 10 ++-
 drivers/gpu/drm/i915/i915_debugfs.c   | 52 +---
 drivers/gpu/drm/i915/i915_gem.c   | 16 +++-
 drivers/gpu/drm/i915/i915_gpu_error.c |  8 +-
 drivers/gpu/drm/i915/i915_pci.c   | 79 ---
 drivers/gpu/drm/i915/i915_vma.c   | 16 ++--
 drivers/gpu/drm/i915/i915_vma.h   |  2 +-
 drivers/gpu/drm/i915/i915_vma_types.h |  2 -
 drivers/gpu/drm/i915/intel_device_info.h  |  5 ++
 drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +-
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 15 ++--
 .../drm/i915/selftests/intel_memory_region.c  |  4 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  9 +++
 drivers/gpu/drm/i915/selftests/mock_gtt.c |  8 +-
 40 files changed, 493 insertions(+), 243 deletions(-)

-- 
2.25.1



[PATCH v5 1/3] drm/i915: preparation for using PAT index

2023-05-03 Thread fei . yang
From: Fei Yang 

This patch is a preparation for replacing enum i915_cache_level with PAT
index. Caching policy for buffer objects is set through the PAT index in
PTE, the old i915_cache_level is not sufficient to represent all caching
modes supported by the hardware.

Preparing the transition by adding some platform dependent data structures
and helper functions to translate the cache_level to pat_index.

cachelevel_to_pat: a platform dependent array mapping cache_level to
   pat_index.

max_pat_index: the maximum PAT index recommended in hardware specification
   Needed for validating the PAT index passed in from user
   space.

i915_gem_get_pat_index: function to convert cache_level to PAT index.

obj_to_i915(obj): macro moved to header file for wider usage.

I915_MAX_CACHE_LEVEL: upper bound of i915_cache_level for the
  convenience of coding.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
Reviewed-by: Andrzej Hajda 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  9 +++
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  4 +
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  1 +
 drivers/gpu/drm/i915/gem/i915_gem_shrinker.c  |  2 -
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  6 ++
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  6 ++
 drivers/gpu/drm/i915/i915_pci.c   | 79 ---
 drivers/gpu/drm/i915/intel_device_info.h  |  5 ++
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  9 +++
 9 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index 4666bb82f312..8c70a0ec7d2f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -45,6 +45,15 @@ static struct kmem_cache *slab_objects;
 
 static const struct drm_gem_object_funcs i915_gem_object_funcs;
 
+unsigned int i915_gem_get_pat_index(struct drm_i915_private *i915,
+   enum i915_cache_level level)
+{
+   if (drm_WARN_ON(&i915->drm, level >= I915_MAX_CACHE_LEVEL))
+   return 0;
+
+   return INTEL_INFO(i915)->cachelevel_to_pat[level];
+}
+
 struct drm_i915_gem_object *i915_gem_object_alloc(void)
 {
struct drm_i915_gem_object *obj;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index 885ccde9dc3c..4c92e17b4337 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -20,6 +20,8 @@
 
 enum intel_region_id;
 
+#define obj_to_i915(obj__) to_i915((obj__)->base.dev)
+
 static inline bool i915_gem_object_size_2big(u64 size)
 {
struct drm_i915_gem_object *obj;
@@ -30,6 +32,8 @@ static inline bool i915_gem_object_size_2big(u64 size)
return false;
 }
 
+unsigned int i915_gem_get_pat_index(struct drm_i915_private *i915,
+   enum i915_cache_level level);
 void i915_gem_init__objects(struct drm_i915_private *i915);
 
 void i915_objects_module_exit(void);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
index 830c11431ee8..41b35abccf88 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -194,6 +194,7 @@ enum i915_cache_level {
 * engine.
 */
I915_CACHE_WT,
+   I915_MAX_CACHE_LEVEL,
 };
 
 enum i915_map_type {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c 
b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index b1672e054b21..214763942aa2 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -460,8 +460,6 @@ void i915_gem_shrinker_taints_mutex(struct drm_i915_private 
*i915,
fs_reclaim_release(GFP_KERNEL);
 }
 
-#define obj_to_i915(obj__) to_i915((obj__)->base.dev)
-
 /**
  * i915_gem_object_make_unshrinkable - Hide the object from the shrinker. By
  * default all object types that support shrinking(see IS_SHRINKABLE), will 
also
diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 22ec1566d2a7..bb6998d67133 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -78,6 +78,12 @@ static u64 mtl_pte_encode(dma_addr_t addr,
case I915_CACHE_WT:
pte |= GEN12_PPGTT_PTE_PAT0;
break;
+   default:
+   /* This should never happen. Added to deal with the compile
+* error due to the addition of I915_MAX_CACHE_LEVEL. Will
+* be removed by the pat_index patch.
+*/
+   break;
}
 
return pte;
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index 20915edc8bd9..c8390d03fce2 100644
--- 

[PATCH v5 3/3] drm/i915: make sure correct pte encode is used

2023-05-03 Thread fei . yang
From: Fei Yang 

PTE encode is platform dependent. After replacing cache_level with
pat_index, the newly introduced mtl_pte_encode is actually generic
for all gen12 platforms, thus rename it to gen12_pte_encode and
apply it to all gen12 platforms.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
---
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index f2334a713c4e..d1e3d3b90e95 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -55,9 +55,9 @@ static u64 gen8_pte_encode(dma_addr_t addr,
return pte;
 }
 
-static u64 mtl_pte_encode(dma_addr_t addr,
- unsigned int pat_index,
- u32 flags)
+static u64 gen12_pte_encode(dma_addr_t addr,
+   unsigned int pat_index,
+   u32 flags)
 {
gen8_pte_t pte = addr | GEN8_PAGE_PRESENT | GEN8_PAGE_RW;
 
@@ -995,8 +995,8 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt,
 */
ppgtt->vm.alloc_scratch_dma = alloc_pt_dma;
 
-   if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70))
-   ppgtt->vm.pte_encode = mtl_pte_encode;
+   if (GRAPHICS_VER(gt->i915) >= 12)
+   ppgtt->vm.pte_encode = gen12_pte_encode;
else
ppgtt->vm.pte_encode = gen8_pte_encode;
 
-- 
2.25.1



Re: [PATCH 0/7] drm/msm/dpu: simplify DPU encoder init

2023-05-03 Thread Abhinav Kumar




On 4/30/2023 4:57 PM, Dmitry Baryshkov wrote:

Rework dpu_encoder initialization code, simplifying calling sequences
and separating common init parts.


Please mention that your series was made on top of 
https://patchwork.freedesktop.org/series/116530/.


Figured it out when I tried to apply it to my branch to test.

Validated writeback just in case with this, hence please use

Tested-by: Abhinav Kumar  # sc7280



Dmitry Baryshkov (7):
   drm/msm/dpu: merge dpu_encoder_init() and dpu_encoder_setup()
   drm/msm/dpu: drop dpu_encoder_early_unregister
   drm/msm/dpu: separate common function to init physical encoder
   drm/msm/dpu: drop duplicated intf/wb indices from encoder structs
   drm/msm/dpu: inline dpu_encoder_get_wb()
   drm/msm/dpu: call dpu_rm_get_intf() from dpu_encoder_get_intf()
   drm/msm/dpu: drop dpu_encoder_phys_ops.atomic_mode_set

  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c   | 190 --
  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.h   |  14 +-
  .../gpu/drm/msm/disp/dpu1/dpu_encoder_phys.h  |  20 +-
  .../drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c  |  55 ++---
  .../drm/msm/disp/dpu1/dpu_encoder_phys_vid.c  |  35 +---
  .../drm/msm/disp/dpu1/dpu_encoder_phys_wb.c   |  38 +---
  drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c   |  87 +++-
  7 files changed, 155 insertions(+), 284 deletions(-)



[PATCH] drm/msm/dpu: add writeback support for sc7280

2023-05-03 Thread Abhinav Kumar
Add writeback support for sc7280. This was validated with the
kms_writeback test case in IGT.

Signed-off-by: Abhinav Kumar 
---
 drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
index 6b2c7eae71d9..b4cf445b74bf 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
@@ -31,6 +31,7 @@ static const struct dpu_mdp_cfg sc7280_mdp[] = {
.clk_ctrls[DPU_CLK_CTRL_DMA0] = { .reg_off = 0x2ac, .bit_off = 8 },
.clk_ctrls[DPU_CLK_CTRL_DMA1] = { .reg_off = 0x2b4, .bit_off = 8 },
.clk_ctrls[DPU_CLK_CTRL_DMA2] = { .reg_off = 0x2c4, .bit_off = 8 },
+   .clk_ctrls[DPU_CLK_CTRL_WB2] = { .reg_off = 0x3b8, .bit_off = 24 },
},
 };
 
@@ -93,6 +94,11 @@ static const struct dpu_pingpong_cfg sc7280_pp[] = {
PP_BLK_DITHER("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, 
-1),
 };
 
+static const struct dpu_wb_cfg sc7280_wb[] = {
+   WB_BLK("wb_2", WB_2, 0x65000, WB_SM8250_MASK, DPU_CLK_CTRL_WB2, 6,
+   VBIF_RT, MDP_SSPP_TOP0_INTR, 4096, 4),
+};
+
 static const struct dpu_intf_cfg sc7280_intf[] = {
INTF_BLK("intf_0", INTF_0, 0x34000, 0x280, INTF_DP, 
MSM_DP_CONTROLLER_0, 24, INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 24, 25),
INTF_BLK("intf_1", INTF_1, 0x35000, 0x2c4, INTF_DSI, 0, 24, 
INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 26, 27),
@@ -142,6 +148,8 @@ const struct dpu_mdss_cfg dpu_sc7280_cfg = {
.mixer = sc7280_lm,
.pingpong_count = ARRAY_SIZE(sc7280_pp),
.pingpong = sc7280_pp,
+   .wb_count = ARRAY_SIZE(sc7280_wb),
+   .wb = sc7280_wb,
.intf_count = ARRAY_SIZE(sc7280_intf),
.intf = sc7280_intf,
.vbif_count = ARRAY_SIZE(sdm845_vbif),
-- 
2.40.1



RE: [Intel-gfx] [PATCH v4 2/3] drm/i915: use pat_index instead of cache_level

2023-05-03 Thread Yang, Fei
[...]

>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
>> b/drivers/gpu/drm/i915/gem/i915_gem_object.c
>> index 8c70a0ec7d2f..27c948350b5b 100644
>> --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
>> @@ -54,6 +54,25 @@ unsigned int i915_gem_get_pat_index(struct 
>> drm_i915_private *i915,
>>  return INTEL_INFO(i915)->cachelevel_to_pat[level];
>>   }
>>
>> +bool i915_gem_object_has_cache_level(const struct drm_i915_gem_object *obj,
>> + enum i915_cache_level lvl)
>> +{
>> +/*
>> + * cache_level == I915_CACHE_INVAL indicates the UMD's have set the
>> + * caching policy through pat_index, in which case the KMD should
>> + * leave the coherency to be managed by user space, simply return
>> + * true here.
>> + */
>> +if (obj->cache_level == I915_CACHE_INVAL)
>> +return true;
>> +
>> +/*
>> + * Otherwise the pat_index should have been converted from cache_level
>> + * so that the following comparison is valid.
>> + */
>> +return obj->pat_index == i915_gem_get_pat_index(obj_to_i915(obj), lvl);
>> +}
>> +
>>   struct drm_i915_gem_object *i915_gem_object_alloc(void)
>>   {
>>  struct drm_i915_gem_object *obj;
>> @@ -133,7 +152,7 @@ void i915_gem_object_set_cache_coherency(struct 
>> drm_i915_gem_object *obj,
>>   {
>>  struct drm_i915_private *i915 = to_i915(obj->base.dev);
>>
>> -obj->cache_level = cache_level;
>> +obj->pat_index = i915_gem_get_pat_index(i915, cache_level);
>
> obj->cache_level is only ever set to "invalid" from the set pat
> extension? Doesn't that make it a boolean so there is no need for three
> bits to hold the enum, just the "pat has been externally set" bit really?

Will update.

>>
>>  if (cache_level != I915_CACHE_NONE)
>>  obj->cache_coherent = (I915_BO_CACHE_COHERENT_FOR_READ |

[...]

>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c 
>> b/drivers/gpu/drm/i915/i915_debugfs.c
>> index 41389a32e998..9a4922da3a71 100644
>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> @@ -139,21 +139,56 @@ static const char *stringify_vma_type(const struct 
>> i915_vma *vma)
>>  return "ppgtt";
>>   }
>>
>> -static const char *i915_cache_level_str(struct drm_i915_private *i915, int 
>> type)
>> -{
>> -switch (type) {
>> -case I915_CACHE_NONE: return " uncached";
>> -case I915_CACHE_LLC: return HAS_LLC(i915) ? " LLC" : " snooped";
>> -case I915_CACHE_L3_LLC: return " L3+LLC";
>> -case I915_CACHE_WT: return " WT";
>> -default: return "";
>> +static const char *i915_cache_level_str(struct drm_i915_gem_object *obj)
>> +{
>> +struct drm_i915_private *i915 = obj_to_i915(obj);
>> +
>> +if (IS_METEORLAKE(i915)) {
>> +switch (obj->pat_index) {
>> +case 0: return " WB";
>> +case 1: return " WT";
>> +case 2: return " UC";
>> +case 3: return " WB (1-Way Coh)";
>> +case 4: return " WB (2-Way Coh)";
>> +default: return " not defined";
>> +}
>> +} else if (IS_PONTEVECCHIO(i915)) {
>> +switch (obj->pat_index) {
>> +case 0: return " UC";
>> +case 1: return " WC";
>> +case 2: return " WT";
>> +case 3: return " WB";
>> +case 4: return " WT (CLOS1)";
>> +case 5: return " WB (CLOS1)";
>> +case 6: return " WT (CLOS2)";
>> +case 7: return " WT (CLOS2)";
>> +default: return " not defined";
>> +}
>> +} else if (GRAPHICS_VER(i915) >= 12) {
>> +switch (obj->pat_index) {
>> +case 0: return " WB";
>> +case 1: return " WC";
>> +case 2: return " WT";
>> +case 3: return " UC";
>> +default: return " not defined";
>> +}
>> +} else {
>> +if (i915_gem_object_has_cache_level(obj, I915_CACHE_NONE))
>> +return " uncached";
>
> This will print uncached for all legacy platforms if set pat extension
> has been used, regardless of the index set.

Will update. Should just use obj->pat_index here.

> Are we okay with that? I find it questionable and would say no. It
> diverges from >= 12 and so is confusing.
>
>> +else if (i915_gem_object_has_cache_level(obj, I915_CACHE_LLC))
>> +return HAS_LLC(i915) ? " LLC" : " snooped";
>> +else if (i915_gem_object_has_cache_level(obj, 
>> I915_CACHE_L3_LLC))
>> +return " L3+LLC";
>> +else if (i915_gem_object_has_cache_level(obj, I915_CACHE_WT))
>> +return " WT";
>> +else
>> +return " not defined";
>
> Another thing is why use different names for caching modes between
> "legacy" and the rest?

For new platforms the string matches bspec. For legacy platforms I think 

Re: [PATCH 01/11] drm/dp_mst: Fix fractional DSC bpp handling

2023-05-03 Thread Lyude Paul
Reviewed-by: Lyude Paul 

Thanks!

On Tue, 2023-05-02 at 17:38 +0300, Ville Syrjala wrote:
> From: Ville Syrjälä 
> 
> The current code does '(bpp << 4) / 16' in the MST PBN
> calculation, but that is just the same as 'bpp' so the
> DSC codepath achieves absolutely nothing. Fix it up so that
> the fractional part of the bpp value is actually used instead
> of truncated away. 64*1006 has enough zero lsbs that we can
> just shift that down in the dividend and thus still manage
> to stick to a 32bit divisor.
> 
> And while touching this, let's just make the whole thing more
> straightforward by making the passed in bpp value .4 binary
> fixed point always, instead of having to pass in different
> things based on whether DSC is enabled or not.
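
A quick worked example of the new fixed-point path (illustrative
numbers, not from the patch): a 148500 kHz dotclock at 24 bpp is now
passed in as bpp_x16 = 24 << 4 = 384, giving

	PBN = DIV_ROUND_UP(148500 * 384 * (64 * 1006 >> 4),
			   1000 * 8 * 54 * 1000)
	    = DIV_ROUND_UP(229464576000, 432000000)
	    = 532

which matches what the old integer-bpp formula produced for non-DSC
modes.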
> 
> Cc: Manasi Navare 
> Cc: Lyude Paul 
> Cc: Harry Wentland 
> Cc: David Francis 
> Cc: Mikita Lipski 
> Cc: Alex Deucher 
> Fixes: dc48529fb14e ("drm/dp_mst: Add PBN calculation for DSC modes")
> Signed-off-by: Ville Syrjälä 
> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  2 +-
>  .../display/amdgpu_dm/amdgpu_dm_mst_types.c   |  2 +-
>  drivers/gpu/drm/display/drm_dp_mst_topology.c | 20 +--
>  drivers/gpu/drm/i915/display/intel_dp_mst.c   |  5 ++---
>  drivers/gpu/drm/nouveau/dispnv50/disp.c   |  3 +--
>  .../gpu/drm/tests/drm_dp_mst_helper_test.c|  2 +-
>  include/drm/display/drm_dp_mst_helper.h   |  2 +-
>  7 files changed, 12 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 6cacb76f389e..7d58f08a5444 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -6763,7 +6763,7 @@ static int dm_encoder_helper_atomic_check(struct 
> drm_encoder *encoder,
>   max_bpc);
>   bpp = convert_dc_color_depth_into_bpc(color_depth) * 3;
>   clock = adjusted_mode->clock;
> - dm_new_connector_state->pbn = drm_dp_calc_pbn_mode(clock, bpp, 
> false);
> + dm_new_connector_state->pbn = drm_dp_calc_pbn_mode(clock, bpp 
> << 4);
>   }
>  
>   dm_new_connector_state->vcpi_slots =
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
> index 994ba426ca66..eb4b666e50e8 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c
> @@ -1515,7 +1515,7 @@ enum dc_status dm_dp_mst_is_port_support_mode(
>   } else {
>   /* check if mode could be supported within full_pbn */
>   bpp = 
> convert_dc_color_depth_into_bpc(stream->timing.display_color_depth) * 3;
> - pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, 
> bpp, false);
> + pbn = drm_dp_calc_pbn_mode(stream->timing.pix_clk_100hz / 10, 
> bpp << 4);
>  
>   if (pbn > aconnector->mst_output_port->full_pbn)
>   return DC_FAIL_BANDWIDTH_VALIDATE;
> diff --git a/drivers/gpu/drm/display/drm_dp_mst_topology.c 
> b/drivers/gpu/drm/display/drm_dp_mst_topology.c
> index 38dab76ae69e..cd4c4f22c903 100644
> --- a/drivers/gpu/drm/display/drm_dp_mst_topology.c
> +++ b/drivers/gpu/drm/display/drm_dp_mst_topology.c
> @@ -4619,13 +4619,12 @@ EXPORT_SYMBOL(drm_dp_check_act_status);
>  
>  /**
>   * drm_dp_calc_pbn_mode() - Calculate the PBN for a mode.
> - * @clock: dot clock for the mode
> - * @bpp: bpp for the mode.
> - * @dsc: DSC mode. If true, bpp has units of 1/16 of a bit per pixel
> + * @clock: dot clock
> + * @bpp: bpp as .4 binary fixed point
>   *
>   * This uses the formula in the spec to calculate the PBN value for a mode.
>   */
> -int drm_dp_calc_pbn_mode(int clock, int bpp, bool dsc)
> +int drm_dp_calc_pbn_mode(int clock, int bpp)
>  {
>   /*
>* margin 5300ppm + 300ppm ~ 0.6% as per spec, factor is 1.006
> @@ -4636,18 +4635,9 @@ int drm_dp_calc_pbn_mode(int clock, int bpp, bool dsc)
>* peak_kbps *= (1006/1000)
>* peak_kbps *= (64/54)
>* peak_kbps *= 8convert to bytes
> -  *
> -  * If the bpp is in units of 1/16, further divide by 16. Put this
> -  * factor in the numerator rather than the denominator to avoid
> -  * integer overflow
>*/
> -
> - if (dsc)
> - return DIV_ROUND_UP_ULL(mul_u32_u32(clock * (bpp / 16), 64 * 
> 1006),
> - 8 * 54 * 1000 * 1000);
> -
> - return DIV_ROUND_UP_ULL(mul_u32_u32(clock * bpp, 64 * 1006),
> - 8 * 54 * 1000 * 1000);
> + return DIV_ROUND_UP_ULL(mul_u32_u32(clock * bpp, 64 * 1006 >> 4),
> + 1000 * 8 * 54 * 1000);
>  }
>  EXPORT_SYMBOL(drm_dp_calc_pbn_mode);
>  
> diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c 
> 

Re: [PATCH v6 06/15] drm/msm/a6xx: Introduce GMU wrapper support

2023-05-03 Thread Akhil P Oommen
On Tue, May 02, 2023 at 11:40:26AM +0200, Konrad Dybcio wrote:
> 
> 
> On 2.05.2023 09:49, Akhil P Oommen wrote:
> > On Sat, Apr 01, 2023 at 01:54:43PM +0200, Konrad Dybcio wrote:
> >> Some (particularly SMD_RPM, a.k.a non-RPMh) SoCs implement A6XX GPUs
> >> but don't implement the associated GMUs. This is due to the fact that
> >> the GMU directly pokes at RPMh. Sadly, this means we have to take care
> >> of enabling & scaling power rails, clocks and bandwidth ourselves.
> >>
> >> Reuse existing Adreno-common code and modify the deeply-GMU-infused
> >> A6XX code to facilitate these GPUs. This involves if-ing out lots
> >> of GMU callbacks and introducing a new type of GMU - GMU wrapper (it's
> >> the actual name that Qualcomm uses in their downstream kernels).
> >>
> >> This is essentially a register region which is convenient to model
> >> as a device. We'll use it for managing the GDSCs. The register
> >> layout matches the actual GMU_CX/GX regions on the "real GMU" devices
> >> and lets us reuse quite a bit of gmu_read/write/rmw calls.
> > << I sent a reply to this patch earlier, but not sure where it went.
> > Still figuring out Mutt... >>
> Answered it here:
> 
> https://lore.kernel.org/linux-arm-msm/4d3000c1-c3f9-0bfd-3eb3-23393f9a8...@linaro.org/

Thanks. Will check and respond there if needed.

> 
> I don't think I see any new comments in this "reply revision" (heh), so please
> check that one out.
> 
> > 
> > Only convenience I found is that we can reuse gmu register ops in a few
> > places (< 10 I think). If we just model this as another gpu memory
> > region, I think it will help to keep gmu vs gmu-wrapper/no-gmu
> > architecture code with clean separation. Also, it looks like we need to
> > keep a dummy gmu platform device in the devicetree with the current
> > approach. That doesn't sound right.
> That's correct, but.. if we switch away from that, VDD_GX/VDD_CX will
> need additional, gmuwrapper-configuration specific code anyway, as
> OPP & genpd will no longer make use of the default behavior which
> only gets triggered if there's a single power-domains=<> entry, afaicu.
Can you please tell me which specific *default behaviour* you mean here?
I am curious to know what I am overlooking here. We can always get a
cxpd/gxpd device and vote for the gdscs directly from the driver.
Anything related to OPP?
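
Something along these lines is what I have in mind (a sketch; the
domain names and error handling are assumptions):

	struct device *cxpd, *gxpd;

	cxpd = dev_pm_domain_attach_by_name(&pdev->dev, "cx");
	gxpd = dev_pm_domain_attach_by_name(&pdev->dev, "gx");

	/* vote the GDSCs up around GPU activity */
	pm_runtime_get_sync(cxpd);
	pm_runtime_get_sync(gxpd);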

-Akhil
> 
> If nothing else, this is a very convenient way to model a part of the
> GPU (as that's essentially what GMU_CX is, to my understanding) and
> the bindings people didn't shoot me in the head for proposing this, so
> I assume it'd be cool to pursue this..
> 
> Konrad
> >>
> >> Signed-off-by: Konrad Dybcio 
> >> ---
> >>  drivers/gpu/drm/msm/adreno/a6xx_gmu.c   |  72 +++-
> >>  drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 255 
> >> +---
> >>  drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |   1 +
> >>  drivers/gpu/drm/msm/adreno/a6xx_gpu_state.c |  14 +-
> >>  drivers/gpu/drm/msm/adreno/adreno_gpu.c |   8 +-
> >>  drivers/gpu/drm/msm/adreno/adreno_gpu.h |   6 +
> >>  6 files changed, 318 insertions(+), 38 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c 
> >> b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> >> index 87babbb2a19f..b1acdb027205 100644
> >> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> >> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
> >> @@ -1469,6 +1469,7 @@ static int a6xx_gmu_get_irq(struct a6xx_gmu *gmu, 
> >> struct platform_device *pdev,
> >>  
> >>  void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu)
> >>  {
> >> +  struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
> >>	struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
> >>struct platform_device *pdev = to_platform_device(gmu->dev);
> >>  
> >> @@ -1494,10 +1495,12 @@ void a6xx_gmu_remove(struct a6xx_gpu *a6xx_gpu)
> >>gmu->mmio = NULL;
> >>gmu->rscc = NULL;
> >>  
> >> -  a6xx_gmu_memory_free(gmu);
> >> +  if (!adreno_has_gmu_wrapper(adreno_gpu)) {
> >> +  a6xx_gmu_memory_free(gmu);
> >>  
> >> -  free_irq(gmu->gmu_irq, gmu);
> >> -  free_irq(gmu->hfi_irq, gmu);
> >> +  free_irq(gmu->gmu_irq, gmu);
> >> +  free_irq(gmu->hfi_irq, gmu);
> >> +  }
> >>  
> >>/* Drop reference taken in of_find_device_by_node */
> >>put_device(gmu->dev);
> >> @@ -1516,6 +1519,69 @@ static int cxpd_notifier_cb(struct notifier_block 
> >> *nb,
> >>return 0;
> >>  }
> >>  
> >> +int a6xx_gmu_wrapper_init(struct a6xx_gpu *a6xx_gpu, struct device_node 
> >> *node)
> >> +{
> >> +  struct platform_device *pdev = of_find_device_by_node(node);
> >> +  struct a6xx_gmu *gmu = &a6xx_gpu->gmu;
> >> +  int ret;
> >> +
> >> +  if (!pdev)
> >> +  return -ENODEV;
> >> +
> >> +  gmu->dev = &pdev->dev;
> >> +
> >> +  of_dma_configure(gmu->dev, node, true);
> > why setup dma for a device that is not actually present?
> >> +
> >> +  pm_runtime_enable(gmu->dev);
> >> +
> >> +  /* Mark legacy for manual SPTPRAC control */
> >> +  gmu->legacy = true;
> >> +
> >> +  /* Map the GMU 

[PATCH v4 5/7] drm/msm/dpu: add support for DSC encoder v1.2 engine

2023-05-03 Thread Kuogee Hsieh
Add support for DSC 1.2 by providing the necessary hooks to program
the DPU DSC 1.2 encoder.

Changes in v3:
-- fixed kernel test robot report that "__iomem *off" is declared but not
   used in dpu_hw_dsc_config_1_2()
-- unrolling thresh loops

Changes in v4:
-- delete DPU_DSC_HW_REV_1_1
-- delete off and used real register name directly

Reported-by: kernel test robot 
Signed-off-by: Kuogee Hsieh 
---
 drivers/gpu/drm/msm/Makefile   |   1 +
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h |  32 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h |  14 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c | 385 +
 drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c |   7 +-
 5 files changed, 435 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c
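
For orientation before the diff: the new entry point can be expected to
mirror the allocation pattern of the existing dpu_hw_dsc_init() and differ
mainly in the ops table it installs. A minimal sketch (the
_setup_dsc_ops_1_2() helper name is illustrative, not taken from this
patch):

	struct dpu_hw_dsc *dpu_hw_dsc_init_1_2(const struct dpu_dsc_cfg *cfg,
					       void __iomem *addr)
	{
		struct dpu_hw_dsc *c;

		c = kzalloc(sizeof(*c), GFP_KERNEL);
		if (!c)
			return ERR_PTR(-ENOMEM);

		c->hw.blk_addr = addr + cfg->base;
		c->idx = cfg->id;
		c->caps = cfg;
		/* illustrative: install the v1.2 ops table */
		_setup_dsc_ops_1_2(&c->ops, c->caps->features);

		return c;
	}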

diff --git a/drivers/gpu/drm/msm/Makefile b/drivers/gpu/drm/msm/Makefile
index b814fc8..b9af5e4 100644
--- a/drivers/gpu/drm/msm/Makefile
+++ b/drivers/gpu/drm/msm/Makefile
@@ -65,6 +65,7 @@ msm-$(CONFIG_DRM_MSM_DPU) += \
disp/dpu1/dpu_hw_catalog.o \
disp/dpu1/dpu_hw_ctl.o \
disp/dpu1/dpu_hw_dsc.o \
+   disp/dpu1/dpu_hw_dsc_1_2.o \
disp/dpu1/dpu_hw_interrupts.o \
disp/dpu1/dpu_hw_intf.o \
disp/dpu1/dpu_hw_lm.o \
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
index 5d210f3..ec11e62 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (c) 2022. Qualcomm Innovation Center, Inc. All rights reserved.
+ * Copyright (c) 2022-2023, Qualcomm Innovation Center, Inc. All rights 
reserved.
  * Copyright (c) 2015-2018, 2020 The Linux Foundation. All rights reserved.
  */
 
@@ -243,12 +243,18 @@ enum {
 };
 
 /**
- * DSC features
+ * DSC sub-blocks/features
  * @DPU_DSC_OUTPUT_CTRL   Configure which PINGPONG block gets
  *the pixel output from this DSC.
+ * @DPU_DSC_HW_REV_1_2DSC block supports dsc 1.1 and 1.2
+ * @DPU_DSC_NATIVE_422_EN Supports native422 and native420 encoding
+ * @DPU_DSC_MAX
  */
 enum {
DPU_DSC_OUTPUT_CTRL = 0x1,
+   DPU_DSC_HW_REV_1_2,
+   DPU_DSC_NATIVE_422_EN,
+   DPU_DSC_MAX
 };
 
 /**
@@ -313,6 +319,14 @@ struct dpu_pp_blk {
 };
 
 /**
+ * struct dpu_dsc_blk - DSC Encoder sub-blk information
+ * @info:   HW register and features supported by this sub-blk
+ */
+struct dpu_dsc_blk {
+   DPU_HW_SUBBLK_INFO;
+};
+
+/**
  * enum dpu_qos_lut_usage - define QoS LUT use cases
  */
 enum dpu_qos_lut_usage {
@@ -461,6 +475,17 @@ struct dpu_pingpong_sub_blks {
 };
 
 /**
+ * struct dpu_dsc_sub_blks - DSC sub-blks
+ * @enc: DSC encoder sub block
+ * @ctl: DSC controller sub block
+ *
+ */
+struct dpu_dsc_sub_blks {
+   struct dpu_dsc_blk enc;
+   struct dpu_dsc_blk ctl;
+};
+
+/**
  * dpu_clk_ctrl_type - Defines top level clock control signals
  */
 enum dpu_clk_ctrl_type {
@@ -614,10 +639,13 @@ struct dpu_merge_3d_cfg  {
  * struct dpu_dsc_cfg - information of DSC blocks
  * @id enum identifying this block
  * @base   register offset of this block
+ * @len:   length of hardware block
  * @features   bit mask identifying sub-blocks/features
+ * @sblk   sub-blocks information
  */
 struct dpu_dsc_cfg {
DPU_HW_BLK_INFO;
+   const struct dpu_dsc_sub_blks *sblk;
 };
 
 /**
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
index 138080a..bdff74d 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
@@ -1,5 +1,8 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
-/* Copyright (c) 2020-2022, Linaro Limited */
+/*
+ * Copyright (c) 2020-2022, Linaro Limited
+ * Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved
+ */
 
 #ifndef _DPU_HW_DSC_H
 #define _DPU_HW_DSC_H
@@ -69,6 +72,15 @@ struct dpu_hw_dsc *dpu_hw_dsc_init(const struct dpu_dsc_cfg 
*cfg,
void __iomem *addr);
 
 /**
+ * dpu_hw_dsc_init_1_2 - initializes the v1.2 DSC hw driver block
+ * @cfg:  DSC catalog entry for which driver object is required
+ * @addr: Mapped register io address of MDP
+ * Returns: Error code or allocated dpu_hw_dsc context
+ */
+struct dpu_hw_dsc *dpu_hw_dsc_init_1_2(const struct dpu_dsc_cfg *cfg,
+   void __iomem *addr);
+
+/**
  * dpu_hw_dsc_destroy - destroys dsc driver context
  * @dsc:   Pointer to dsc driver context returned by dpu_hw_dsc_init
  */
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c
new file mode 100644
index ..0c77c85
--- /dev/null
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2020-2021, The Linux 

[PATCH v4 7/7] drm/msm/dpu: add DSC 1.2 hw blocks for relevant chipsets

2023-05-03 Thread Kuogee Hsieh
From: Abhinav Kumar 

Add DSC 1.2 hardware blocks to the catalog with the necessary sub-block and
feature flag information.  Each display compression engine (DCE) contains
dual hard-slice DSC encoders, so both share the same base address but each
has its own sub-block address.
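
As a worked example of the addressing scheme: with dce_0 at base 0x80000,
the two hard-slice encoders resolve to the shared base plus their own enc
sub-block offsets (the 0x100/0x200 offsets below are assumed values for
illustration, not taken from this patch):

	DSC_0 enc registers: 0x80000 + 0x100 = 0x80100  (dce_0, dsc_sblk_0)
	DSC_1 enc registers: 0x80000 + 0x200 = 0x80200  (dce_0, dsc_sblk_1)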

changes in v4:
-- delete DPU_DSC_HW_REV_1_1
-- re arrange sc8280xp_dsc[]

Signed-off-by: Abhinav Kumar 
Signed-off-by: Kuogee Hsieh 
Reviewed-by: Dmitry Baryshkov 
---
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h | 14 
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h |  7 ++
 .../drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h   | 16 ++
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h | 14 
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h | 14 
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 25 +-
 6 files changed, 89 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
index 4f6a965..f98c2a5 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
@@ -153,6 +153,18 @@ static const struct dpu_merge_3d_cfg sm8350_merge_3d[] = {
MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x5),
 };
 
+/*
+ * NOTE: Each display compression engine (DCE) contains dual hard
+ * slice DSC encoders so both share same base address but with
+ * its own different sub block address.
+ */
+static const struct dpu_dsc_cfg sm8350_dsc[] = {
+   DSC_BLK_1_2("dce_0", DSC_0, 0x8, 0x100, 0, dsc_sblk_0),
+   DSC_BLK_1_2("dce_0", DSC_1, 0x8, 0x100, 0, dsc_sblk_1),
+   DSC_BLK_1_2("dce_1", DSC_2, 0x81000, 0x100, BIT(DPU_DSC_NATIVE_422_EN), 
dsc_sblk_0),
+   DSC_BLK_1_2("dce_1", DSC_3, 0x81000, 0x100, BIT(DPU_DSC_NATIVE_422_EN), 
dsc_sblk_1),
+};
+
 static const struct dpu_intf_cfg sm8350_intf[] = {
INTF_BLK("intf_0", INTF_0, 0x34000, 0x280, INTF_DP, 
MSM_DP_CONTROLLER_0, 24, INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 24, 25),
INTF_BLK("intf_1", INTF_1, 0x35000, 0x2c4, INTF_DSI, 0, 24, 
INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 26, 27),
@@ -205,6 +217,8 @@ const struct dpu_mdss_cfg dpu_sm8350_cfg = {
.dspp = sm8350_dspp,
.pingpong_count = ARRAY_SIZE(sm8350_pp),
.pingpong = sm8350_pp,
+   .dsc = sm8350_dsc,
+   .dsc_count = ARRAY_SIZE(sm8350_dsc),
.merge_3d_count = ARRAY_SIZE(sm8350_merge_3d),
.merge_3d = sm8350_merge_3d,
.intf_count = ARRAY_SIZE(sm8350_intf),
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
index 6b2c7ea..3fd0498a 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
@@ -93,6 +93,11 @@ static const struct dpu_pingpong_cfg sc7280_pp[] = {
PP_BLK_DITHER("pingpong_3", PINGPONG_3, 0x6c000, 0, sc7280_pp_sblk, -1, 
-1),
 };
 
+/* NOTE: sc7280 only has one dsc hard slice encoder */
+static const struct dpu_dsc_cfg sc7280_dsc[] = {
+   DSC_BLK_1_2("dce_0", DSC_0, 0x8, 0x100, BIT(DPU_DSC_NATIVE_422_EN), 
dsc_sblk_0),
+};
+
 static const struct dpu_intf_cfg sc7280_intf[] = {
INTF_BLK("intf_0", INTF_0, 0x34000, 0x280, INTF_DP, 
MSM_DP_CONTROLLER_0, 24, INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 24, 25),
INTF_BLK("intf_1", INTF_1, 0x35000, 0x2c4, INTF_DSI, 0, 24, 
INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 26, 27),
@@ -142,6 +147,8 @@ const struct dpu_mdss_cfg dpu_sc7280_cfg = {
.mixer = sc7280_lm,
.pingpong_count = ARRAY_SIZE(sc7280_pp),
.pingpong = sc7280_pp,
+   .dsc_count = ARRAY_SIZE(sc7280_dsc),
+   .dsc = sc7280_dsc,
.intf_count = ARRAY_SIZE(sc7280_intf),
.intf = sc7280_intf,
.vbif_count = ARRAY_SIZE(sdm845_vbif),
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
index 706d0f1..78ece02 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
@@ -141,6 +141,20 @@ static const struct dpu_merge_3d_cfg sc8280xp_merge_3d[] = 
{
MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x5),
 };
 
+/*
+ * NOTE: Each display compression engine (DCE) contains dual hard
+ * slice DSC encoders so both share same base address but with
+ * its own different sub block address.
+ */
+static const struct dpu_dsc_cfg sc8280xp_dsc[] = {
+   DSC_BLK_1_2("dce_0", DSC_0, 0x8, 0x100, 0, dsc_sblk_0), 
+   DSC_BLK_1_2("dce_0", DSC_1, 0x8, 0x100, 0, dsc_sblk_1), 
+   DSC_BLK_1_2("dce_1", DSC_2, 0x81000, 0x100, BIT(DPU_DSC_NATIVE_422_EN), 
dsc_sblk_0), 
+   DSC_BLK_1_2("dce_1", DSC_3, 0x81000, 0x100, BIT(DPU_DSC_NATIVE_422_EN), 
dsc_sblk_1), 
+   DSC_BLK_1_2("dce_2", DSC_4, 0x82000, 0x100, 0, dsc_sblk_0), 
+   DSC_BLK_1_2("dce_2", 

[PATCH v4 4/7] drm/msm/dpu: add PINGPONG_NONE to disconnect DSC from PINGPONG

2023-05-03 Thread Kuogee Hsieh
During DSC setup, the crossbar mux needs to be programmed to engage the
DSC with the specified PINGPONG. Hence, during teardown, the crossbar mux
needs to be reset to disengage the DSC from the PINGPONG. 0x0F is written
to reset the crossbar mux; this value is not related to hw_pp->idx. This
patch adds PINGPONG_NONE to serve as the disable value that resets the
crossbar mux.
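
Put together, the bind helper then behaves as below (the mux_cfg
initialization follows the commit text; the rest matches the hunks in this
patch):

	u32 mux_cfg = 0xF;	/* crossbar reset value */

	if (pp != PINGPONG_NONE)
		mux_cfg = (pp - PINGPONG_0) & 0x7;

	DPU_REG_WRITE(c, dsc_ctl_offset, mux_cfg);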

Changes in v4:
-- more details to commit text

Signed-off-by: Kuogee Hsieh 
Reviewed-by: Dmitry Baryshkov 
---
 drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 2 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c  | 7 +++
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h  | 1 -
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h | 3 ++-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
index 1dc5dbe..d9ad334 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
@@ -1839,7 +1839,7 @@ static void dpu_encoder_dsc_pipe_cfg(struct dpu_hw_dsc 
*hw_dsc,
hw_pp->ops.setup_dsc(hw_pp);
 
if (hw_dsc->ops.dsc_bind_pingpong_blk)
-   hw_dsc->ops.dsc_bind_pingpong_blk(hw_dsc, true, hw_pp->idx);
+   hw_dsc->ops.dsc_bind_pingpong_blk(hw_dsc, hw_pp->idx);
 
if (hw_pp->ops.enable_dsc)
hw_pp->ops.enable_dsc(hw_pp);
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c
index 4a6bbcc..3e68d47 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c
@@ -157,7 +157,6 @@ static void dpu_hw_dsc_config_thresh(struct dpu_hw_dsc 
*hw_dsc,
 
 static void dpu_hw_dsc_bind_pingpong_blk(
struct dpu_hw_dsc *hw_dsc,
-   bool enable,
const enum dpu_pingpong pp)
 {
	struct dpu_hw_blk_reg_map *c = &hw_dsc->hw;
@@ -166,13 +165,13 @@ static void dpu_hw_dsc_bind_pingpong_blk(
 
dsc_ctl_offset = DSC_CTL(hw_dsc->idx);
 
-   if (enable)
+   if (pp)
mux_cfg = (pp - PINGPONG_0) & 0x7;
 
DRM_DEBUG_KMS("%s dsc:%d %s pp:%d\n",
-   enable ? "Binding" : "Unbinding",
+   pp ? "Binding" : "Unbinding",
hw_dsc->idx - DSC_0,
-   enable ? "to" : "from",
+   pp ? "to" : "from",
pp - PINGPONG_0);
 
DPU_REG_WRITE(c, dsc_ctl_offset, mux_cfg);
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
index 287ec5f..138080a 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
@@ -44,7 +44,6 @@ struct dpu_hw_dsc_ops {
  struct drm_dsc_config *dsc);
 
void (*dsc_bind_pingpong_blk)(struct dpu_hw_dsc *hw_dsc,
- bool enable,
  enum dpu_pingpong pp);
 };
 
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h
index 2d9192a..56826a9 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h
@@ -191,7 +191,8 @@ enum dpu_dsc {
 };
 
 enum dpu_pingpong {
-   PINGPONG_0 = 1,
+   PINGPONG_NONE,
+   PINGPONG_0,
PINGPONG_1,
PINGPONG_2,
PINGPONG_3,
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v4 6/7] drm/msm/dpu: separate DSC flush update out of interface

2023-05-03 Thread Kuogee Hsieh
The current DSC flush update is piggybacked inside dpu_hw_ctl_intf_cfg_v1().
This patch separates the DSC flush from dpu_hw_ctl_intf_cfg_v1() by
adding dpu_hw_ctl_update_pending_flush_dsc_v1(), which handles both the
per-DSC-engine and the top-level DSC flush bits at the same time, making
this consistent with where flush programming happens for the other DPU
sub-blocks.
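
Stitched together from the hunks below, the resulting flush flow is: the
per-engine bits accumulate in a dedicated mask, and a single DSC_IDX bit in
the top-level mask tells the trigger path to write CTL_DSC_FLUSH before
CTL_FLUSH:

	/* update_pending_flush_dsc_v1(): mark one DSC engine dirty */
	ctx->pending_dsc_flush_mask |= BIT(dsc_num - DSC_0);
	ctx->pending_flush_mask |= BIT(DSC_IDX);

	/* trigger_flush_v1(): write the sub-block register, then the main one */
	if (ctx->pending_flush_mask & BIT(DSC_IDX))
		DPU_REG_WRITE(&ctx->hw, CTL_DSC_FLUSH,
			      ctx->pending_dsc_flush_mask);
	DPU_REG_WRITE(&ctx->hw, CTL_FLUSH, ctx->pending_flush_mask);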

Signed-off-by: Kuogee Hsieh 
Reviewed-by: Dmitry Baryshkov 
---
 drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 14 --
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c  | 22 --
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h  | 10 ++
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
index d9ad334..71db23e 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
@@ -1823,12 +1823,18 @@ dpu_encoder_dsc_initial_line_calc(struct drm_dsc_config 
*dsc,
return DIV_ROUND_UP(total_pixels, dsc->slice_width);
 }
 
-static void dpu_encoder_dsc_pipe_cfg(struct dpu_hw_dsc *hw_dsc,
+static void dpu_encoder_dsc_pipe_cfg(struct dpu_encoder_virt *dpu_enc,
+struct dpu_hw_dsc *hw_dsc,
 struct dpu_hw_pingpong *hw_pp,
 struct drm_dsc_config *dsc,
 u32 common_mode,
 u32 initial_lines)
 {
+   struct dpu_encoder_phys *cur_master = dpu_enc->cur_master;
+   struct dpu_hw_ctl *ctl;
+
+   ctl = cur_master->hw_ctl;
+
if (hw_dsc->ops.dsc_config)
hw_dsc->ops.dsc_config(hw_dsc, dsc, common_mode, initial_lines);
 
@@ -1843,6 +1849,9 @@ static void dpu_encoder_dsc_pipe_cfg(struct dpu_hw_dsc 
*hw_dsc,
 
if (hw_pp->ops.enable_dsc)
hw_pp->ops.enable_dsc(hw_pp);
+
+   if (ctl->ops.update_pending_flush_dsc)
+   ctl->ops.update_pending_flush_dsc(ctl, hw_dsc->idx);
 }
 
 static void dpu_encoder_prep_dsc(struct dpu_encoder_virt *dpu_enc,
@@ -1887,7 +1896,8 @@ static void dpu_encoder_prep_dsc(struct dpu_encoder_virt 
*dpu_enc,
initial_lines = dpu_encoder_dsc_initial_line_calc(dsc, enc_ip_w);
 
for (i = 0; i < MAX_CHANNELS_PER_ENC; i++)
-   dpu_encoder_dsc_pipe_cfg(hw_dsc[i], hw_pp[i], dsc, 
dsc_common_mode, initial_lines);
+   dpu_encoder_dsc_pipe_cfg(dpu_enc, hw_dsc[i], hw_pp[i], dsc,
+   dsc_common_mode, initial_lines);
 }
 
 void dpu_encoder_prepare_for_kickoff(struct drm_encoder *drm_enc)
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c
index 4f7cfa9..832a6a7 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c
@@ -139,6 +139,11 @@ static inline void dpu_hw_ctl_trigger_flush_v1(struct 
dpu_hw_ctl *ctx)
CTL_DSPP_n_FLUSH(dspp - DSPP_0),
ctx->pending_dspp_flush_mask[dspp - DSPP_0]);
}
+
+   if (ctx->pending_flush_mask & BIT(DSC_IDX))
+   DPU_REG_WRITE(&ctx->hw, CTL_DSC_FLUSH,
+   ctx->pending_dsc_flush_mask);
+
   DPU_REG_WRITE(&ctx->hw, CTL_FLUSH, ctx->pending_flush_mask);
 }
 
@@ -285,6 +290,13 @@ static void 
dpu_hw_ctl_update_pending_flush_merge_3d_v1(struct dpu_hw_ctl *ctx,
ctx->pending_flush_mask |= BIT(MERGE_3D_IDX);
 }
 
+static void dpu_hw_ctl_update_pending_flush_dsc_v1(struct dpu_hw_ctl *ctx,
+   enum dpu_dsc dsc_num)
+{
+   ctx->pending_dsc_flush_mask |= BIT(dsc_num - DSC_0);
+   ctx->pending_flush_mask |= BIT(DSC_IDX);
+}
+
 static void dpu_hw_ctl_update_pending_flush_dspp(struct dpu_hw_ctl *ctx,
enum dpu_dspp dspp, u32 dspp_sub_blk)
 {
@@ -502,9 +514,6 @@ static void dpu_hw_ctl_intf_cfg_v1(struct dpu_hw_ctl *ctx,
if ((test_bit(DPU_CTL_VM_CFG, &ctx->caps->features)))
mode_sel = CTL_DEFAULT_GROUP_ID  << 28;
 
-   if (cfg->dsc)
-   DPU_REG_WRITE(&ctx->hw, CTL_DSC_FLUSH, cfg->dsc);
-
if (cfg->intf_mode_sel == DPU_CTL_MODE_SEL_CMD)
mode_sel |= BIT(17);
 
@@ -524,10 +533,8 @@ static void dpu_hw_ctl_intf_cfg_v1(struct dpu_hw_ctl *ctx,
if (cfg->merge_3d)
DPU_REG_WRITE(c, CTL_MERGE_3D_ACTIVE,
  BIT(cfg->merge_3d - MERGE_3D_0));
-   if (cfg->dsc) {
-   DPU_REG_WRITE(&ctx->hw, CTL_FLUSH, DSC_IDX);
+   if (cfg->dsc)
DPU_REG_WRITE(c, CTL_DSC_ACTIVE, cfg->dsc);
-   }
 }
 
 static void dpu_hw_ctl_intf_cfg(struct dpu_hw_ctl *ctx,
@@ -630,6 +637,9 @@ static void _setup_ctl_ops(struct dpu_hw_ctl_ops *ops,
ops->update_pending_flush_merge_3d =
dpu_hw_ctl_update_pending_flush_merge_3d_v1;
ops->update_pending_flush_wb = 
dpu_hw_ctl_update_pending_flush_wb_v1;
+

[PATCH v4 3/7] drm/msm/dpu: add DPU_PINGPONG_DSC bits into PP_BLK and PP_BLK_TE macros

2023-05-03 Thread Kuogee Hsieh
On legacy chipsets, the DPU_PINGPONG_DSC bit is required to be set to
indicate that the pingpong ops functions are needed to complete the DSC
data path setup when the chipset has a DSC hardware block present. This
patch adds the DPU_PINGPONG_DSC bit to both the PP_BLK and PP_BLK_TE
macros where a DSC hardware block is present.

Signed-off-by: Kuogee Hsieh 
---
 .../drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h| 16 ++---
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h |  8 +++
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h | 26 ++
 .../drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h| 24 ++--
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_0_sm8250.h | 26 ++
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_2_sc7180.h |  4 ++--
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_3_sm6115.h |  2 +-
 .../drm/msm/disp/dpu1/catalog/dpu_6_5_qcm2290.h|  2 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c |  8 +++
 9 files changed, 56 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
index 17f821c..dc2ad1f 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
@@ -112,17 +112,17 @@ static const struct dpu_lm_cfg msm8998_lm[] = {
 };
 
 static const struct dpu_pingpong_cfg msm8998_pp[] = {
-   PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, sdm845_pp_sblk_te,
-   DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
+   PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, BIT(DPU_PINGPONG_DSC), 0,
+   sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-   PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, sdm845_pp_sblk_te,
-   DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
+   PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, BIT(DPU_PINGPONG_DSC), 0,
+   sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-   PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
-   DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
+   PP_BLK("pingpong_2", PINGPONG_2, 0x71000, BIT(DPU_PINGPONG_DSC), 0,
+   sdm845_pp_sblk, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-   PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
-   DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
+   PP_BLK("pingpong_3", PINGPONG_3, 0x71800, BIT(DPU_PINGPONG_DSC), 0,
+   sdm845_pp_sblk, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),
 };
 
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
index ceca741..bd9 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
@@ -110,16 +110,16 @@ static const struct dpu_lm_cfg sdm845_lm[] = {
 };
 
 static const struct dpu_pingpong_cfg sdm845_pp[] = {
-   PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, sdm845_pp_sblk_te,
+   PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, BIT(DPU_PINGPONG_DSC), 0, 
sdm845_pp_sblk_te,
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-   PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, sdm845_pp_sblk_te,
+   PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, BIT(DPU_PINGPONG_DSC), 0, 
sdm845_pp_sblk_te,
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-   PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+   PP_BLK("pingpong_2", PINGPONG_2, 0x71000, BIT(DPU_PINGPONG_DSC), 0, 
sdm845_pp_sblk,
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-   PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+   PP_BLK("pingpong_3", PINGPONG_3, 0x71800, BIT(DPU_PINGPONG_DSC), 0, 
sdm845_pp_sblk,
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),
 };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h
index 42b0e58..3a7dffa 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h
@@ -128,24 +128,22 @@ static const struct dpu_dspp_cfg sm8150_dspp[] = {
 };
 
 static const struct dpu_pingpong_cfg sm8150_pp[] = {
-   PP_BLK("pingpong_0", PINGPONG_0, 0x7, MERGE_3D_0, sdm845_pp_sblk,
-   DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
+   PP_BLK("pingpong_0", PINGPONG_0, 

[PATCH v4 2/7] drm/msm/dpu: add DPU_PINGPONG_DSC feature bit

2023-05-03 Thread Kuogee Hsieh
Legacy DPU (DPU < 7.0.0) requires the PP block to be involved during
DSC setup. Since DPU 7.0.0, enabling and starting the DSC encoder engine
has moved to the INTF with the help of the flush mechanism. This patch
adds the DPU_PINGPONG_DSC feature bit to indicate that both the
dpu_hw_pp_setup_dsc() and dpu_hw_pp_dsc_enable() pingpong ops
functions are required to complete the DSC datapath setup and start the
DSC engine.
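
With the ops now gated on the feature bit (see the dpu_hw_pingpong.c hunk
below), callers must test the pointer before calling, as the encoder code
already does:

	if (hw_pp->ops.setup_dsc)
		hw_pp->ops.setup_dsc(hw_pp);

	if (hw_pp->ops.enable_dsc)
		hw_pp->ops.enable_dsc(hw_pp);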

Changes in v4:
-- add more details to commit text

Reported-by: Marijn Suijten 
Signed-off-by: Kuogee Hsieh 
Reviewed-by: Dmitry Baryshkov 
---
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h  | 2 ++
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c | 9 ++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
index 71584cd..5d210f3 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h
@@ -144,6 +144,7 @@ enum {
  * @DPU_PINGPONG_SPLIT  PP block supports split fifo
  * @DPU_PINGPONG_SLAVE  PP block is a suitable slave for split fifo
  * @DPU_PINGPONG_DITHER,Dither blocks
+ * @DPU_PINGPONG_DSC,   PP ops functions required for DSC
  * @DPU_PINGPONG_MAX
  */
 enum {
@@ -152,6 +153,7 @@ enum {
DPU_PINGPONG_SPLIT,
DPU_PINGPONG_SLAVE,
DPU_PINGPONG_DITHER,
+   DPU_PINGPONG_DSC,
DPU_PINGPONG_MAX
 };
 
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c
index 3822e06..f255a04 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c
@@ -264,9 +264,12 @@ static void _setup_pingpong_ops(struct dpu_hw_pingpong *c,
c->ops.get_autorefresh = dpu_hw_pp_get_autorefresh_config;
c->ops.poll_timeout_wr_ptr = dpu_hw_pp_poll_timeout_wr_ptr;
c->ops.get_line_count = dpu_hw_pp_get_line_count;
-   c->ops.setup_dsc = dpu_hw_pp_setup_dsc;
-   c->ops.enable_dsc = dpu_hw_pp_dsc_enable;
-   c->ops.disable_dsc = dpu_hw_pp_dsc_disable;
+
+   if (features & BIT(DPU_PINGPONG_DSC)) {
+   c->ops.setup_dsc = dpu_hw_pp_setup_dsc;
+   c->ops.enable_dsc = dpu_hw_pp_dsc_enable;
+   c->ops.disable_dsc = dpu_hw_pp_dsc_disable;
+   }
 
	if (test_bit(DPU_PINGPONG_DITHER, &features))
c->ops.setup_dither = dpu_hw_pp_setup_dither;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v4 1/7] drm/msm/dpu: add dsc blocks for remaining chipsets in catalog

2023-05-03 Thread Kuogee Hsieh
From: Abhinav Kumar 

There are some platforms that have DSC blocks which are not declared in
the catalog. For completeness, this patch adds the DSC blocks for the
platforms that missed them.

Signed-off-by: Abhinav Kumar 
Reviewed-by: Dmitry Baryshkov 
---
 drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h |  7 +++
 drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h | 11 +++
 2 files changed, 18 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
index 2b3ae84..17f821c 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
@@ -126,6 +126,11 @@ static const struct dpu_pingpong_cfg msm8998_pp[] = {
DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),
 };
 
+static const struct dpu_dsc_cfg msm8998_dsc[] = {
+   DSC_BLK("dsc_0", DSC_0, 0x8, 0),
+   DSC_BLK("dsc_1", DSC_1, 0x80400, 0),
+};
+
 static const struct dpu_dspp_cfg msm8998_dspp[] = {
DSPP_BLK("dspp_0", DSPP_0, 0x54000, DSPP_MSM8998_MASK,
		 &msm8998_dspp_sblk),
@@ -191,6 +196,8 @@ const struct dpu_mdss_cfg dpu_msm8998_cfg = {
.dspp = msm8998_dspp,
.pingpong_count = ARRAY_SIZE(msm8998_pp),
.pingpong = msm8998_pp,
+   .dsc_count = ARRAY_SIZE(msm8998_dsc),
+   .dsc = msm8998_dsc,
.intf_count = ARRAY_SIZE(msm8998_intf),
.intf = msm8998_intf,
.vbif_count = ARRAY_SIZE(msm8998_vbif),
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h
index e3bdfe7..5bb9882 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h
@@ -142,6 +142,15 @@ static const struct dpu_merge_3d_cfg sc8180x_merge_3d[] = {
MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x83200),
 };
 
+static const struct dpu_dsc_cfg sc8180x_dsc[] = {
+   DSC_BLK("dsc_0", DSC_0, 0x8, BIT(DPU_DSC_OUTPUT_CTRL)),
+   DSC_BLK("dsc_1", DSC_1, 0x80400, BIT(DPU_DSC_OUTPUT_CTRL)),
+   DSC_BLK("dsc_2", DSC_2, 0x80800, BIT(DPU_DSC_OUTPUT_CTRL)),
+   DSC_BLK("dsc_3", DSC_3, 0x80c00, BIT(DPU_DSC_OUTPUT_CTRL)),
+   DSC_BLK("dsc_4", DSC_4, 0x81000, BIT(DPU_DSC_OUTPUT_CTRL)),
+   DSC_BLK("dsc_5", DSC_5, 0x81400, BIT(DPU_DSC_OUTPUT_CTRL)),
+};
+
 static const struct dpu_intf_cfg sc8180x_intf[] = {
INTF_BLK("intf_0", INTF_0, 0x6a000, 0x280, INTF_DP, 
MSM_DP_CONTROLLER_0, 24, INTF_SC7180_MASK, MDP_SSPP_TOP0_INTR, 24, 25),
INTF_BLK("intf_1", INTF_1, 0x6a800, 0x2bc, INTF_DSI, 0, 24, 
INTF_SC7180_MASK, MDP_SSPP_TOP0_INTR, 26, 27),
@@ -192,6 +201,8 @@ const struct dpu_mdss_cfg dpu_sc8180x_cfg = {
.mixer = sc8180x_lm,
.pingpong_count = ARRAY_SIZE(sc8180x_pp),
.pingpong = sc8180x_pp,
+   .dsc_count = ARRAY_SIZE(sc8180x_dsc),
+   .dsc = sc8180x_dsc,
.merge_3d_count = ARRAY_SIZE(sc8180x_merge_3d),
.merge_3d = sc8180x_merge_3d,
.intf_count = ARRAY_SIZE(sc8180x_intf),
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v4 0/7] add DSC 1.2 dpu supports

2023-05-03 Thread Kuogee Hsieh
This series adds the DPU side changes to support the DSC 1.2 encoder. This
was validated with both a DSI DSC 1.2 panel and a DP DSC 1.2 monitor.
The DSI and DP parts will be pushed later on top of this change.
This series is rebased on [1], [2] and the catalog fixes from [3].

Abhinav Kumar (2):
  drm/msm/dpu: add dsc blocks for remaining chipsets in catalog
  drm/msm/dpu: add DSC 1.2 hw blocks for relevant chipsets

Kuogee Hsieh (5):
  drm/msm/dpu: add DPU_PINGPONG_DSC feature bit
  drm/msm/dpu: add DPU_PINGPONG_DSC bits into PP_BLK and PP_BLK_TE
marcos
  drm/msm/dpu: add PINGPONG_NONE to disconnect DSC from PINGPONG
  drm/msm/dpu: add support for DSC encoder v1.2 engine
  drm/msm/dpu: separate DSC flush update out of interface

 drivers/gpu/drm/msm/Makefile   |   1 +
 .../drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h|  23 +-
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h |   8 +-
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h |  26 +-
 .../drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h|  35 +-
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_0_sm8250.h |  26 +-
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_2_sc7180.h |   4 +-
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_3_sm6115.h |   2 +-
 .../drm/msm/disp/dpu1/catalog/dpu_6_5_qcm2290.h|   2 +-
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h |  14 +
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h |   7 +
 .../drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h   |  16 +
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h |  14 +
 .../gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h |  14 +
 drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c|  16 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c |  33 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h |  34 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.c |  22 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_ctl.h |  10 +
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c |   7 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h |  15 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c | 385 +
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h|   3 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_pingpong.c|   9 +-
 drivers/gpu/drm/msm/disp/dpu1/dpu_rm.c |   7 +-
 25 files changed, 650 insertions(+), 83 deletions(-)
 create mode 100644 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc_1_2.c

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



Re: [PATCH v3 3/7] drm/msm/dpu: add DPU_PINGPONG_DSC bits into PP_BLK and PP_BLK_TE macros

2023-05-03 Thread Kuogee Hsieh



On 5/3/2023 11:55 AM, Dmitry Baryshkov wrote:

On 03/05/2023 20:45, Kuogee Hsieh wrote:


On 5/2/2023 3:42 PM, Dmitry Baryshkov wrote:

On 03/05/2023 00:02, Kuogee Hsieh wrote:
On legacy chipsets, the DPU_PINGPONG_DSC bit is required to be set to
indicate that the pingpong ops functions are needed to complete the DSC
data path setup when the chipset has a DSC hardware block present. This
patch adds the DPU_PINGPONG_DSC bit to both the PP_BLK and PP_BLK_TE
macros where a DSC hardware block is present.

Signed-off-by: Kuogee Hsieh 
---
  .../drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h    | 12 +-
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h |  8 +++
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h | 26 
++
  .../drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h    | 24 
++--
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_0_sm8250.h | 26 
++

  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_2_sc7180.h |  4 ++--
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_3_sm6115.h |  2 +-
  .../drm/msm/disp/dpu1/catalog/dpu_6_5_qcm2290.h    |  2 +-
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c |  8 +++
  9 files changed, 54 insertions(+), 58 deletions(-)

diff --git 
a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h

index 17f821c..b7cd746 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
@@ -112,16 +112,16 @@ static const struct dpu_lm_cfg msm8998_lm[] = {
  };
    static const struct dpu_pingpong_cfg msm8998_pp[] = {
-    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, 
sdm845_pp_sblk_te,

-    DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
+    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 
BIT(DPU_PINGPONG_DSC), 0,

+    sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, 
sdm845_pp_sblk_te,

-    DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
+    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 
BIT(DPU_PINGPONG_DSC), 0,

+    sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, 0, sdm845_pp_sblk,
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, 0, sdm845_pp_sblk,
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),


Just to doublecheck: why don't we have DPU_PINGPONG_DSC for PP_3/_4? 
We do have them on sdm845. Is it because we should not use DSC with 
those PINGPONG blocks?



I think it only has two DSPPs connected to the pp blocks


So, can they be connected to PP3/4 or not?


No, my previous reply was not correct.

Originally I thought pp_3/_4 were for writeback,

but that is not correct; the 2 DSPPs can connect to pp_3/_4 as well.

I will add DPU_PINGPONG_DSC to pp_3/_4.





  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h

index ceca741..bd9 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
@@ -110,16 +110,16 @@ static const struct dpu_lm_cfg sdm845_lm[] = {
  };
    static const struct dpu_pingpong_cfg sdm845_pp[] = {
-    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, 
sdm845_pp_sblk_te,
+    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk_te,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, 
sdm845_pp_sblk_te,
+    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk_te,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),



[skipped the rest, looks good to me]





Re: [PATCH 4/4] drm/msm/dpu: Enable compression for command mode

2023-05-03 Thread Dmitry Baryshkov

On 03/05/2023 22:04, Jessica Zhang wrote:



On 5/3/2023 12:28 AM, Marijn Suijten wrote:

On 2023-05-02 18:19:15, Jessica Zhang wrote:

Add a dpu_hw_intf op to enable data compression.

Signed-off-by: Jessica Zhang 
---
  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 4 
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c  | 7 +++
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h  | 2 ++
  3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c

index 74470d068622..4321a1aba17f 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c


Can we have INTF DCE on video-mode encoders as well?


Hi Marijn,

Currently, there's no way to validate DSC for video mode as I've only 
made changes to support DSI for command mode. We are planning to post 
changes to support DSC over DP, which will include changes for video mode.


If I remember correctly, HDK8350 panel should support DSC for both 
command and video modes.







@@ -72,6 +72,10 @@ static void _dpu_encoder_phys_cmd_update_intf_cfg(
  phys_enc->hw_intf,
  true,
  phys_enc->hw_pp->idx);
+
+    if (phys_enc->dpu_kms->catalog->caps->has_data_compress &&


As per my suggestion on patch 3/4, drop the flag and check above and
only check if the function is NULL (below).


Acked.




+    phys_enc->hw_intf->ops.enable_compression)
+    phys_enc->hw_intf->ops.enable_compression(phys_enc->hw_intf);
  }
  static void dpu_encoder_phys_cmd_pp_tx_done_irq(void *arg, int 
irq_idx)
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c

index 671048a78801..4ce7ffdd7a05 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
@@ -64,10 +64,16 @@
  #define INTF_CFG2_DATABUS_WIDEN    BIT(0)
  #define INTF_CFG2_DATA_HCTL_EN    BIT(4)


These should probably be reindented to match the below... And the rest
of the defines use spaces instead of tabs.


Fair point, though I think fixing the whitespace for these 2 macros 
specifically might be better in a more relevant series.


With that being said, I'll change the spacing of the DATA_COMPRESS bit 
to spaces instead of tabs.





+#define INTF_CFG2_DCE_DATA_COMPRESS    BIT(12)
  #define INTF_MISR_CTRL    0x180
  #define INTF_MISR_SIGNATURE    0x184


This does not seem to apply on top of:
https://lore.kernel.org/linux-arm-msm/20230411-dpu-intf-te-v4-10-27ce1a5ab...@somainline.org/


Seems like I'm missing some patches from that series on my working 
branch. Will rebase on top of the full series for the v2.




+static inline void dpu_hw_intf_enable_compression(struct dpu_hw_intf 
*ctx)


Why inline?  This is used as a pointer callback.


Acked, will remove the inline.




+{
+    DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, INTF_CFG2_DCE_DATA_COMPRESS);


dpu_hw_intf_setup_timing_engine() also programs INTF_CONFIG2.  Is it
double-buffered, or is that config **always** unused when DSI CMD mode
is used in conjunction with DSC/DCE?  Otherwise this should perhaps OR
the bitflag into the register, or write the whole thing at once in
dpu_hw_intf_setup_timing_engine()?


For command mode, INTF_CONFIG2 is unused aside from setting 
DATA_COMPRESS for DSC.


Since setup_timing_engine() is only used for video mode, the 
corresponding changes will be made in the DSC v1.2 for DP changes.


So, for command mode panels is this the only bit that should be set in 
INTF_CFG2?

--
With best wishes
Dmitry



Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread André Almeida

On 03/05/2023 14:43, Timur Kristóf wrote:

Hi Felix,

On Wed, 2023-05-03 at 11:08 -0400, Felix Kuehling wrote:

That's the worst-case scenario where you're debugging HW or FW
issues.
Those should be pretty rare post-bringup. But are there hangs caused
by
user mode driver or application bugs that are easier to debug and
probably don't even require a GPU reset?


There are many GPU hangs that gamers experience while playing. We have
dozens of open bug reports against RADV about GPU hangs on various GPU
generations. These usually fall into two categories:

1. When the hang always happens at the same point in a game. These are
painful to debug but manageable.
2. "Random" hangs that happen to users over the course of playing a
game for several hours. It is absolute hell to try to even reproduce
let alone diagnose these issues, and this is what we would like to
improve.

For these hard-to-diagnose problems, it is already a challenge to
determine whether the problem is in the kernel (e.g. setting wrong voltages
/ frequencies) or in userspace (e.g. missing some synchronization); it can
even be a game bug that we need to work around.


For example most VM faults can
be handled without hanging the GPU. Similarly, a shader in an endless
loop should not require a full GPU reset.


This is actually not the case; AFAIK, André's test case was an app that
had an infinite loop in a shader.



This is the test app if anyone wants to try it out: 
https://github.com/andrealmeid/vulkan-triangle-v1. Just compile and run.


The kernel calls amdgpu_ring_soft_recovery() when I run my example, but 
I'm not sure what a soft recovery means here and if it's a full GPU 
reset or not.


But if we can at least trust the CP registers to dump information for
soft resets, it would be an improvement over the current state, I think.




It's more complicated for graphics because of the more complex
pipeline
and the lack of CWSR. But it should still be possible to do some
debugging without JTAG if the problem is in SW and not HW or FW. It's
probably worth improving that debugability without getting hung-up on
the worst case.


I agree, and we welcome any constructive suggestion to improve the
situation. It seems like our idea doesn't work if the kernel can't give
us the information we need.

How do we move forward?

Best regards,
Timur



Re: [PATCH v3 6/6] fbdev: Rename fb_mem*() helpers

2023-05-03 Thread Sam Ravnborg
Hi Thomas,

On Wed, May 03, 2023 at 10:15:46AM +0200, Thomas Zimmermann wrote:
> Hi
> 
> Am 02.05.23 um 22:08 schrieb Sam Ravnborg:
> > Hi Thomas.
> > 
> > On Tue, May 02, 2023 at 03:02:23PM +0200, Thomas Zimmermann wrote:
> > > Update the names of the fb_mem*() helpers to be consistent with their
> > > regular counterparts. Hence, fb_memset() now becomes fb_memset_io(),
> > > fb_memcpy_fromfb() now becomes fb_memcpy_fromio() and fb_memcpy_tofb()
> > > becomes fb_memcpy_toio(). No functional changes.
> > > 
> > > Signed-off-by: Thomas Zimmermann 
> > > ---
> > ...
> > > -#ifndef fb_memcpy_fromfb
> > > -static inline void fb_memcpy_fromfb(void *to, const volatile void 
> > > __iomem *from, size_t n)
> > > +#ifndef fb_memcpy_fromio
> > > +static inline void fb_memcpy_fromio(void *to, const volatile void 
> > > __iomem *from, size_t n)
> > >   {
> > >   memcpy_fromio(to, from, n);
> > >   }
> > > -#define fb_memcpy_fromfb fb_memcpy_fromfb
> > > +#define fb_memcpy_fromio fb_memcpy_fromio
> > >   #endif
> > > -#ifndef fb_memcpy_tofb
> > > -static inline void fb_memcpy_tofb(volatile void __iomem *to, const void 
> > > *from, size_t n)
> > > +#ifndef fb_memcpy_toio
> > > +static inline void fb_memcpy_toio(volatile void __iomem *to, const void 
> > > *from, size_t n)
> > >   {
> > >   memcpy_toio(to, from, n);
> > >   }
> > > -#define fb_memcpy_tofb fb_memcpy_tofb
> > > +#define fb_memcpy_toio fb_memcpy_toio
> > >   #endif
> > >   #ifndef fb_memset
> > > -static inline void fb_memset(volatile void __iomem *addr, int c, size_t 
> > > n)
> > > +static inline void fb_memset_io(volatile void __iomem *addr, int c, 
> > > size_t n)
> > >   {
> > >   memset_io(addr, c, n);
> > >   }
> > > -#define fb_memset fb_memset
> > > +#define fb_memset fb_memset_io
> > 
> > The static inline wrappers do not provide any value, and could be 
> > replaced by
> > direct calls to memcpy_fromio(), memcpy_toio(), memset_io().
> > 
> > If you decide to keep the wrappers I will not hold you back, so the
> > patch has my:
> > Reviewed-by: Sam Ravnborg 
> > 
> > But I prefer the direct calls without the wrappers
> 
> At first I was also skeptical if those fb_mem*() wrappers are needed. But
> Arnd mentioned that there are subtle differences between the current code
> and Linux' mem*_io() functions. Keeping the wrappers might be needed.
Saw the dialog, and agree that keeping current behaviour is the way to
go for now even if this is more code and wrappers.

Sam


Re: [PATCH 4/4] drm/msm/dpu: Enable compression for command mode

2023-05-03 Thread Jessica Zhang




On 5/3/2023 12:28 AM, Marijn Suijten wrote:

On 2023-05-02 18:19:15, Jessica Zhang wrote:

Add a dpu_hw_intf op to enable data compression.

Signed-off-by: Jessica Zhang 
---
  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c | 4 
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c  | 7 +++
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h  | 2 ++
  3 files changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
index 74470d068622..4321a1aba17f 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_cmd.c


Can we have INTF DCE on video-mode encoders as well?


Hi Marijn,

Currently, there's no way to validate DSC for video mode as I've only 
made changes to support DSI for command mode. We are planning to post 
changes to support DSC over DP, which will include changes for video mode.





@@ -72,6 +72,10 @@ static void _dpu_encoder_phys_cmd_update_intf_cfg(
phys_enc->hw_intf,
true,
phys_enc->hw_pp->idx);
+
+   if (phys_enc->dpu_kms->catalog->caps->has_data_compress &&


As per my suggestion on patch 3/4, drop the flag and check above and
only check if the function is NULL (below).


Acked.




+   phys_enc->hw_intf->ops.enable_compression)
+   phys_enc->hw_intf->ops.enable_compression(phys_enc->hw_intf);
  }
  
  static void dpu_encoder_phys_cmd_pp_tx_done_irq(void *arg, int irq_idx)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
index 671048a78801..4ce7ffdd7a05 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.c
@@ -64,10 +64,16 @@
  
  #define INTF_CFG2_DATABUS_WIDEN	BIT(0)

  #define INTF_CFG2_DATA_HCTL_ENBIT(4)


These should probably be reindented to match the below... And the rest
of the defines use spaces instead of tabs.


Fair point, though I think fixing the whitespace for these 2 macros 
specifically might be better in a more relevant series.


With that being said, I'll change the spacing of the DATA_COMPRESS bit 
to spaces instead of tabs.





+#define INTF_CFG2_DCE_DATA_COMPRESSBIT(12)
  
  #define INTF_MISR_CTRL			0x180

  #define INTF_MISR_SIGNATURE   0x184


This does not seem to apply on top of:
https://lore.kernel.org/linux-arm-msm/20230411-dpu-intf-te-v4-10-27ce1a5ab...@somainline.org/


Seems like I'm missing some patches from that series on my working 
branch. Will rebase on top of the full series for the v2.




  
+static inline void dpu_hw_intf_enable_compression(struct dpu_hw_intf *ctx)


Why inline?  This is used as a pointer callback.


Acked, will remove the inline.




+{
+   DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, INTF_CFG2_DCE_DATA_COMPRESS);


dpu_hw_intf_setup_timing_engine() also programs INTF_CONFIG2.  Is it
double-buffered, or is that config **always** unused when DSI CMD mode
is used in conjunction with DSC/DCE?  Otherwise this should perhaps OR
the bitflag into the register, or write the whole thing at once in
dpu_hw_intf_setup_timing_engine()?


For command mode, INTF_CONFIG2 is unused aside from setting 
DATA_COMPRESS for DSC.


Since setup_timing_engine() is only used for video mode, the 
corresponding changes will be made in the DSC v1.2 for DP changes.
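
For completeness, the OR-variant suggested above would look roughly like
this (a sketch; it assumes INTF_CONFIG2 may carry other live bits that
should be preserved):

	u32 intf_cfg2 = DPU_REG_READ(&ctx->hw, INTF_CONFIG2);

	intf_cfg2 |= INTF_CFG2_DCE_DATA_COMPRESS;
	DPU_REG_WRITE(&ctx->hw, INTF_CONFIG2, intf_cfg2);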





+}
+
  static void dpu_hw_intf_setup_timing_engine(struct dpu_hw_intf *ctx,
const struct intf_timing_params *p,
const struct dpu_format *fmt)
@@ -325,6 +331,7 @@ static void _setup_intf_ops(struct dpu_hw_intf_ops *ops,
ops->bind_pingpong_blk = dpu_hw_intf_bind_pingpong_blk;
ops->setup_misr = dpu_hw_intf_setup_misr;
ops->collect_misr = dpu_hw_intf_collect_misr;
+   ops->enable_compression = dpu_hw_intf_enable_compression;


And per the same suggestion on patch 3/4, this is then wrapped in:

 if (cap & BIT(DPU_INTF_DATA_COMPRESS))

(or similary named) flag check.


Acked.

Thanks,

Jessica Zhang




  }


This also doesn't seem to apply on top of the INTF TE [1] support
series, even though it depends on DSC 1.2 DPU support(s?) [2] which
mentions it was rebase(d) on top of that.

[1]: https://patchwork.freedesktop.org/series/112332/
[2]: https://patchwork.freedesktop.org/series/116789/

- Marijn

  
  struct dpu_hw_intf *dpu_hw_intf_init(const struct dpu_intf_cfg *cfg,

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h
index 102c4f0e812b..99528c735368 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_intf.h
@@ -60,6 +60,7 @@ struct intf_status {
   * feed pixels to this interface
   * @setup_misr: enable/disable MISR
   * @collect_misr: read MISR signature
+ * @enable_compression: 

Re: [PATCH 3/4] drm/msm/dpu: Add has_data_compress to dpu_caps

2023-05-03 Thread Jessica Zhang




On 5/3/2023 12:07 AM, Marijn Suijten wrote:

On 2023-05-02 18:19:14, Jessica Zhang wrote:

Add data_compress feature to DPU HW catalog.

In DPU 7.x and later, there is a DATA_COMPRESS register that must be set
within the DPU INTF block for DSC to work.

As core_rev (and related macros) was removed from the dpu_kms struct, the
most straightforward way to indicate the presence of this register would be
to have a flag in dpu_caps.


This is a very generic name to have in the global dpu_caps for a very
specific register on the INTF block since DPU >= 7.0.0, and I doubt any
new catalog contributor will know how to fill this field.  After all,
DPU < 7.0.0 also has DCE but it is controlled via the PINGPONG block.

Instead, how about having it as a DPU_INTF_DATA_COMPRESS (or similar)
feature flag on the INTF block?  We do the same for other (register
related) features on the INTF block, and you did the same to disable DSC
callbacks on PP in [1].
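
A sketch of the gating this suggestion implies (flag name as proposed
above, not yet in the code):

	if (cap & BIT(DPU_INTF_DATA_COMPRESS))
		ops->enable_compression = dpu_hw_intf_enable_compression;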


Hi Marijn,

Sounds good.



In fact it seems that the DSC/DCE (enablement) registers have been moved
from PINGPONG to INTF in DPU 7.0.0.  Can you clarify in the patch
message for v2 that this is the case, and do the same in the linked
PINGPONG patch?  Perhaps these patches should be part of the same series
as they do not seem DSI-specific.


Will make a note of the PP to INTF change in the commit message.

I would prefer to keep this patch in this series because it is needed
for DSI over command mode to work and the subsequent patch is
specifically for command mode.


Thanks,

Jessica Zhang



[1]: 
https://lore.kernel.org/linux-arm-msm/1683061382-32651-3-git-send-email-quic_khs...@quicinc.com/

- Marijn


Signed-off-by: Jessica Zhang 
---
  drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h   | 1 +
  drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h   | 1 +
  drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h | 1 +
  drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h   | 1 +
  drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h   | 1 +
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.h   | 2 ++
  6 files changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
index f98c2a5b0e87..4160a35ff20f 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
@@ -15,6 +15,7 @@ static const struct dpu_caps sm8350_dpu_caps = {
.has_dim_layer = true,
.has_idle_pc = true,
.has_3d_merge = true,
+   .has_data_compress = true,
.max_linewidth = 4096,
.pixel_ram_size = DEFAULT_PIXEL_RAM_SIZE,
  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
index 3fd0498ab420..23230841a0d1 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
@@ -13,6 +13,7 @@ static const struct dpu_caps sc7280_dpu_caps = {
.qseed_type = DPU_SSPP_SCALER_QSEED4,
.has_dim_layer = true,
.has_idle_pc = true,
+   .has_data_compress = true,
.max_linewidth = 2400,
.pixel_ram_size = DEFAULT_PIXEL_RAM_SIZE,
  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
index ce583eb14b06..c990406e4bca 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
@@ -15,6 +15,7 @@ static const struct dpu_caps sc8280xp_dpu_caps = {
.has_dim_layer = true,
.has_idle_pc = true,
.has_3d_merge = true,
+   .has_data_compress = true,
.max_linewidth = 5120,
.pixel_ram_size = DEFAULT_PIXEL_RAM_SIZE,
  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h
index 3950e7b946a5..7094640e2fbf 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h
@@ -15,6 +15,7 @@ static const struct dpu_caps sm8450_dpu_caps = {
.has_dim_layer = true,
.has_idle_pc = true,
.has_3d_merge = true,
+   .has_data_compress = true,
.max_linewidth = 5120,
.pixel_ram_size = DEFAULT_PIXEL_RAM_SIZE,
  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h
index 1b3f5424aea8..970049559e02 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h
@@ -15,6 +15,7 @@ static const struct dpu_caps sm8550_dpu_caps = {
.has_dim_layer = true,
.has_idle_pc = true,
.has_3d_merge = true,
+   .has_data_compress = true,
.max_linewidth = 5120,
.pixel_ram_size = DEFAULT_PIXEL_RAM_SIZE,
  };

Re: [PATCH v3 5/6] fbdev: Move framebuffer I/O helpers into <asm/fb.h>

2023-05-03 Thread Sam Ravnborg
Hi Thomas,

> > But I am missing something somewhere as I cannot see how this builds.
> > asm-generic now provides the fb_read/fb_write helpers.
> > But for example sparc has an architecture-specific fb.h so it will not
> > use the asm-generic variant. So I wonder how sparc get hold of the
> > asm-generic fb.h file?
> 
> All architectures' <asm/fb.h> files include <asm-generic/fb.h>, so that they
> all get the interfaces which they don't define themselves. For Sparc, this
> is at [1].
> 
> Best regards
> Thomas
> 
> 
> [1]
> https://cgit.freedesktop.org/drm/drm-tip/tree/arch/sparc/include/asm/fb.h#n19
> 
> > 
> > Maybe it is obvious, but I miss it.

OK, it was obvious and I missed it.
I looked at the mainline kernel, and not the drm-tip variant.
Sorry for the noise.

Sam


Re: [PATCH v3 3/7] drm/msm/dpu: add DPU_PINGPONG_DSC bits into PP_BLK and PP_BLK_TE macros

2023-05-03 Thread Dmitry Baryshkov

On 03/05/2023 20:45, Kuogee Hsieh wrote:


On 5/2/2023 3:42 PM, Dmitry Baryshkov wrote:

On 03/05/2023 00:02, Kuogee Hsieh wrote:

On legacy chipsets, the DPU_PINGPONG_DSC bit is required to be set to
indicate that the pingpong ops functions are needed to complete the DSC
data path setup when the chipset has a DSC hardware block present. This
patch adds the DPU_PINGPONG_DSC bit to both the PP_BLK and PP_BLK_TE
macros where a DSC hardware block is present.

Signed-off-by: Kuogee Hsieh 
---
  .../drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h    | 12 +-
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h |  8 +++
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h | 26 
++
  .../drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h    | 24 
++--
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_0_sm8250.h | 26 
++

  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_2_sc7180.h |  4 ++--
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_3_sm6115.h |  2 +-
  .../drm/msm/disp/dpu1/catalog/dpu_6_5_qcm2290.h    |  2 +-
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c |  8 +++
  9 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h

index 17f821c..b7cd746 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
@@ -112,16 +112,16 @@ static const struct dpu_lm_cfg msm8998_lm[] = {
  };
    static const struct dpu_pingpong_cfg msm8998_pp[] = {
-    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, sdm845_pp_sblk_te,
-    DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
+    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 
BIT(DPU_PINGPONG_DSC), 0,

+    sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, sdm845_pp_sblk_te,
-    DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
+    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 
BIT(DPU_PINGPONG_DSC), 0,

+    sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, 0, sdm845_pp_sblk,
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, 0, sdm845_pp_sblk,
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),


Just to doublecheck: why don't we have DPU_PINGPONG_DSC for PP_3/_4? 
We do have them on sdm845. Is it because we should not use DSC with 
those PINGPONG blocks?



I think it only has two DSPP connected to pp blocks


So, can they be connected to PP3/4 or not?


  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h

index ceca741..bd9 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
@@ -110,16 +110,16 @@ static const struct dpu_lm_cfg sdm845_lm[] = {
  };
    static const struct dpu_pingpong_cfg sdm845_pp[] = {
-    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, sdm845_pp_sblk_te,
+    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk_te,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, sdm845_pp_sblk_te,
+    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk_te,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, BIT(DPU_PINGPONG_DSC), 
0, sdm845_pp_sblk,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, BIT(DPU_PINGPONG_DSC), 
0, sdm845_pp_sblk,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),



[skipped the rest, looks good to me]



--
With best wishes
Dmitry



Re: [PATCH v3 7/7] drm/msm/dpu: add DSC 1.2 hw blocks for relevant chipsets

2023-05-03 Thread Abhinav Kumar




On 5/2/2023 2:42 PM, Dmitry Baryshkov wrote:

On 03/05/2023 00:03, Kuogee Hsieh wrote:

From: Abhinav Kumar 

Add DSC 1.2 hardware blocks to the catalog with the necessary sub-block and
feature flag information.  Each display compression engine (DCE) contains
two hard-slice DSC encoders, so both share the same base address but each
has its own sub-block address.

Signed-off-by: Abhinav Kumar 
Signed-off-by: Kuogee Hsieh 


Reviewed-by: Dmitry Baryshkov 

Minor question below.


---
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h | 14 +++
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h |  7 ++
  .../drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h   | 16 +
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_8_1_sm8450.h | 14 +++
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_9_0_sm8550.h | 14 +++
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 27 
--

  6 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h

index 4f6a965..f98c2a5 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_0_sm8350.h
@@ -153,6 +153,18 @@ static const struct dpu_merge_3d_cfg 
sm8350_merge_3d[] = {

  MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x5),
  };
+/*
+ * NOTE: Each display compression engine (DCE) contains dual hard
+ * slice DSC encoders so both share same base address but with
+ * its own different sub block address.
+ */
+static const struct dpu_dsc_cfg sm8350_dsc[] = {
+    DSC_BLK_1_2("dce_0", DSC_0, 0x8, 0x100, 0, dsc_sblk_0),
+    DSC_BLK_1_2("dce_0", DSC_1, 0x8, 0x100, 0, dsc_sblk_1),
+    DSC_BLK_1_2("dce_1", DSC_2, 0x81000, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_0),
+    DSC_BLK_1_2("dce_1", DSC_3, 0x81000, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_1),

+};
+
  static const struct dpu_intf_cfg sm8350_intf[] = {
  INTF_BLK("intf_0", INTF_0, 0x34000, 0x280, INTF_DP, 
MSM_DP_CONTROLLER_0, 24, INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 24, 25),
  INTF_BLK("intf_1", INTF_1, 0x35000, 0x2c4, INTF_DSI, 0, 24, 
INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 26, 27),

@@ -205,6 +217,8 @@ const struct dpu_mdss_cfg dpu_sm8350_cfg = {
  .dspp = sm8350_dspp,
  .pingpong_count = ARRAY_SIZE(sm8350_pp),
  .pingpong = sm8350_pp,
+    .dsc = sm8350_dsc,
+    .dsc_count = ARRAY_SIZE(sm8350_dsc),
  .merge_3d_count = ARRAY_SIZE(sm8350_merge_3d),
  .merge_3d = sm8350_merge_3d,
  .intf_count = ARRAY_SIZE(sm8350_intf),
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h

index 6b2c7ea..3fd0498a 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_7_2_sc7280.h
@@ -93,6 +93,11 @@ static const struct dpu_pingpong_cfg sc7280_pp[] = {
  PP_BLK_DITHER("pingpong_3", PINGPONG_3, 0x6c000, 0, 
sc7280_pp_sblk, -1, -1),

  };
+/* NOTE: sc7280 only has one dsc hard slice encoder */
+static const struct dpu_dsc_cfg sc7280_dsc[] = {
+    DSC_BLK_1_2("dce_0", DSC_0, 0x8, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_0),

+};
+
  static const struct dpu_intf_cfg sc7280_intf[] = {
  INTF_BLK("intf_0", INTF_0, 0x34000, 0x280, INTF_DP, 
MSM_DP_CONTROLLER_0, 24, INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 24, 25),
  INTF_BLK("intf_1", INTF_1, 0x35000, 0x2c4, INTF_DSI, 0, 24, 
INTF_SC7280_MASK, MDP_SSPP_TOP0_INTR, 26, 27),

@@ -142,6 +147,8 @@ const struct dpu_mdss_cfg dpu_sc7280_cfg = {
  .mixer = sc7280_lm,
  .pingpong_count = ARRAY_SIZE(sc7280_pp),
  .pingpong = sc7280_pp,
+    .dsc_count = ARRAY_SIZE(sc7280_dsc),
+    .dsc = sc7280_dsc,
  .intf_count = ARRAY_SIZE(sc7280_intf),
  .intf = sc7280_intf,
  .vbif_count = ARRAY_SIZE(sdm845_vbif),
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h

index 706d0f1..ce583eb 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_8_0_sc8280xp.h
@@ -141,6 +141,20 @@ static const struct dpu_merge_3d_cfg 
sc8280xp_merge_3d[] = {

  MERGE_3D_BLK("merge_3d_2", MERGE_3D_2, 0x5),
  };
+/*
+ * NOTE: Each display compression engine (DCE) contains dual hard
+ * slice DSC encoders so both share same base address but with
+ * its own different sub block address.
+ */
+static const struct dpu_dsc_cfg sc8280xp_dsc[] = {
+    DSC_BLK_1_2("dce_0", DSC_0, 0x8, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_0),
+    DSC_BLK_1_2("dce_0", DSC_1, 0x8, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_1),
+    DSC_BLK_1_2("dce_1", DSC_2, 0x81000, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_0),
+    DSC_BLK_1_2("dce_1", DSC_3, 0x81000, 0x100, 
BIT(DPU_DSC_NATIVE_422_EN), dsc_sblk_1),

+    DSC_BLK_1_2("dce_2", DSC_4, 0x82000, 0x100, 0, dsc_sblk_0),

Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread André Almeida

On 03/05/2023 14:08, Marek Olšák wrote:
GPU hangs are pretty common post-bringup. They are not common per user, 
but if we gather all hangs from all users, we can have lots and lots of 
them.


GPU hangs are indeed not very debuggable. There are however some things 
we can do:

- Identify the hanging IB by its VA (the kernel should know it)


How can the kernel tell which VA range is being executed? I only found 
that information in the mmCP_IB1_BASE_ regs, but as stated in this thread by 
Christian, these are not reliable to read.



- Read and parse the IB to detect memory corruption.
- Print active waves with shader disassembly if SQ isn't hung (often 
it's not).


Determining which packet the CP is stuck on is tricky. The CP has 2 
engines (one frontend and one backend) that work on the same command 
buffer. The frontend engine runs ahead, executes some packets and 
forwards others to the backend engine. Only the frontend engine has the 
command buffer VA somewhere. The backend engine only receives packets 
from the frontend engine via a FIFO, so it might not be possible to tell 
where it's stuck if it's stuck.


Do they run asynchronously at the same time, or does the frontend wait for 
the backend to execute?




When the gfx pipeline hangs outside of shaders, making a scandump seems 
to be the only way to have a chance at finding out what's going wrong, 
and only AMD-internal versions of hw can be scanned.


Marek

On Wed, May 3, 2023 at 11:23 AM Christian König 
> wrote:


Am 03.05.23 um 17:08 schrieb Felix Kuehling:
 > Am 2023-05-03 um 03:59 schrieb Christian König:
 >> Am 02.05.23 um 20:41 schrieb Alex Deucher:
 >>> On Tue, May 2, 2023 at 11:22 AM Timur Kristóf
 >>> mailto:timur.kris...@gmail.com>> wrote:
  [SNIP]
  In my opinion, the correct solution to those problems would be
  if
  the kernel could give userspace the necessary information
about
  a
  GPU hang before a GPU reset.
 
 >>>   The fundamental problem here is that the kernel doesn't have
 >>> that
 >>> information either. We know which IB timed out and can
 >>> potentially do
 >>> a devcoredump when that happens, but that's it.
 >>
 >> Is it really not possible to know such a fundamental thing
as what
 >> the
 >> GPU was doing when it hung? How are we supposed to do any
kind of
 >> debugging without knowing that?
 >>
 >> Yes, that's indeed something at least I try to figure out for years
 >> as well.
 >>
 >> Basically there are two major problems:
 >> 1. When the ASIC is hung you can't talk to the firmware engines any
 >> more and most state is not exposed directly, but just through some
 >> fw/hw interface.
 >>     Just take a look at how umr reads the shader state from the SQ.
 >> When that block is hung you can't do that any more and basically
have
 >> no chance at all to figure out why it's hung.
 >>
 >>     Same for other engines, I remember once spending a week
figuring
 >> out why the UVD block is hung during suspend. Turned out to be a
 >> debugging nightmare because any time you touch any register of that
 >> block the whole system would hang.
 >>
 >> 2. There are tons of things going on in a pipeline fashion or even
 >> completely in parallel. For example the CP is just the beginning
of a
 >> rather long pipeline which at the end produces a bunch of pixels.
 >>     In almost all cases I've seen you ran into a problem somewhere
 >> deep in the pipeline and only very rarely at the beginning.
 >>
 >>
 >> I wonder what AMD's Windows driver team is doing with this
problem,
 >> surely they must have better tools to deal with GPU hangs?
 > For better or worse, most teams internally rely on scan dumps via
 > JTAG
 > which sort of limits the usefulness outside of AMD, but also
gives
 > you
 > the exact state of the hardware when it's hung so the
hardware teams
 > prefer it.
 >
  How does this approach scale? It's not something we can ask
users to
  do, and even if all of us in the radv team had a JTAG device, we
  wouldn't be able to play every game that users experience
random hangs
  with.
 >>> It doesn't scale or lend itself particularly well to external
 >>> development, but that's the current state of affairs.
 >>
 >> The usual approach seems to be to reproduce a problem in a lab and
 >> have a JTAG attached to give the hw guys a scan dump and they can
 >> then tell you why something didn't work as expected.
 >
 > That's the worst-case scenario where you're debugging HW or FW
issues.
 > Those should be pretty rare post-bringup. But 

Re: [PATCH 1/4] Input/ARM: ads7846: Get pendown IRQ from descriptors

2023-05-03 Thread Dmitry Torokhov
On Sun, Apr 30, 2023 at 11:22:16AM +0200, Linus Walleij wrote:
> The ADS7846 has some limited support for using GPIO descriptors,
> let's convert it over completely and fix all users to provide
> GPIOs in descriptor tables.
> 
> The Nokia 770 now has dynamic allocation of IRQ numbers, so this
> needs to be fixed for it to work.
> 
> Fixes: 92bf78b33b0b ("gpio: omap: use dynamic allocation of base")
> Signed-off-by: Linus Walleij 
> ---
>  arch/arm/mach-omap1/board-nokia770.c | 12 +++-
>  arch/arm/mach-pxa/spitz.c| 11 ++-
>  arch/mips/alchemy/devboards/db1000.c | 11 ++-
>  drivers/input/touchscreen/ads7846.c  | 32 
>  include/linux/spi/ads7846.h  |  2 --
>  5 files changed, 39 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/arm/mach-omap1/board-nokia770.c 
> b/arch/arm/mach-omap1/board-nokia770.c
> index a501a473ffd6..eb7652670447 100644
> --- a/arch/arm/mach-omap1/board-nokia770.c
> +++ b/arch/arm/mach-omap1/board-nokia770.c
> @@ -118,7 +118,16 @@ static struct ads7846_platform_data 
> nokia770_ads7846_platform_data __initdata =
>   .debounce_max   = 10,
>   .debounce_tol   = 3,
>   .debounce_rep   = 1,
> - .gpio_pendown   = ADS7846_PENDOWN_GPIO,
> +};
> +
> +static struct gpiod_lookup_table nokia770_ads7846_gpio_table = {
> + /* SPI bus 2, device with chip select 0 */
> + .dev_id = "spi2.0",
> + .table = {
> + GPIO_LOOKUP("gpio-0-15", ADS7846_PENDOWN_GPIO,
> + "pendown", GPIO_ACTIVE_HIGH),
> + { }
> + },
>  };

I would like to eventually get rid of GPIO_LOOKUP in favor of
PROPERTY_ENTRY_GPIO. Can we try something like the draft below (just
typed, not even compiled):

diff --git a/arch/arm/mach-omap1/board-nokia770.c 
b/arch/arm/mach-omap1/board-nokia770.c
index a501a473ffd6..34b8e392b917 100644
--- a/arch/arm/mach-omap1/board-nokia770.c
+++ b/arch/arm/mach-omap1/board-nokia770.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -35,6 +36,24 @@
 #include "clock.h"
 #include "mmc.h"
 
+static const struct software_node nokia770_mpuio_gpiochip_node = {
+   .name = "mpuio",
+};
+
+static const struct software_node nokia770_gpiochip1_node = {
+   .name = "gpio-0-15",
+};
+
+static const struct software_node nokia770_gpiochip2_node = {
+   .name = "gpio-16-31",
+};
+
+static const struct software_node nokia770_gpiochip_nodes[] = {
+   &nokia770_mpuio_gpiochip_node,
+   &nokia770_gpiochip1_node,
+   &nokia770_gpiochip2_node,
+};
+
 #define ADS7846_PENDOWN_GPIO   15
 
 static const unsigned int nokia770_keymap[] = {
@@ -102,6 +121,17 @@ static const struct omap_lcd_config nokia770_lcd_config 
__initconst = {
.ctrl_name  = "hwa742",
 };
 
+static const struct property_entry nokia770_mipid_props[] = {
+   PROPERTY_ENTRY_GPIO("reset-gpios", _gpiochip1_node,
+   13, GPIO_ACTIVE_LOW),
+   { }
+};
+
+static const struct software_node nokia770_mipid_swnode = {
+   .name = "lcd_mipid",
+   .properties = nokia770_mipid_props,
+};
+
 static void __init mipid_dev_init(void)
 {
nokia770_mipid_platform_data.nreset_gpio = 13;
@@ -110,15 +140,22 @@ static void __init mipid_dev_init(void)
omapfb_set_lcd_config(&nokia770_lcd_config);
 }
 
-static struct ads7846_platform_data nokia770_ads7846_platform_data __initdata 
= {
-   .x_max  = 0x0fff,
-   .y_max  = 0x0fff,
-   .x_plate_ohms   = 180,
-   .pressure_max   = 255,
-   .debounce_max   = 10,
-   .debounce_tol   = 3,
-   .debounce_rep   = 1,
-   .gpio_pendown   = ADS7846_PENDOWN_GPIO,
+static const struct property_entry nokia770_ads7846_props[] = {
+   PROPERTY_ENTRY_U32("touchscreen-size-x", 4096),
+   PROPERTY_ENTRY_U32("touchscreen-size-y", 4096),
+   PROPERTY_ENTRY_U32("touchscreen-max-pressure", 256),
+   PROPERTY_ENTRY_U32("touchscreen-average-samples", 10),
+   PROPERTY_ENTRY_U16("ti,x-plate-ohms", 180),
+   PROPERTY_ENTRY_U16("ti,debounce-tol", 3),
+   PROPERTY_ENTRY_U16("ti,debounce-rep", 1),
+   PROPERTY_ENTRY_GPIO("pendown-gpios", _gpiochip1_node,
+   ADS7846_PENDOWN_GPIO, GPIO_ACTIVE_HIGH),
+   { }
+};
+
+static const struct software_node nokia770_ads7846_swnode = {
+   .name = "ads7846",
+   .properties = nokia770_ads7846_props,
 };
 
 static struct spi_board_info nokia770_spi_board_info[] __initdata = {
@@ -128,13 +165,14 @@ static struct spi_board_info nokia770_spi_board_info[] 
__initdata = {
.chip_select= 3,
.max_speed_hz   = 12000000,
.platform_data  = &nokia770_mipid_platform_data,
+   .swnode = &nokia770_mipid_swnode,
},
[1] = {
.modalias   = "ads7846",
.bus_num= 2,
.chip_select= 0,
.max_speed_hz   = 2500000,
-   .platform_data  = 

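To complement the draft above, here is a hedged sketch of the consumer side:
once the software node is attached to the SPI device, the driver reads the
properties back through the generic fwnode helpers. The function and its
error handling are illustrative; only device_property_read_u32() and
devm_gpiod_get() are the real kernel APIs.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/gpio/consumer.h>
#include <linux/property.h>

/* Hypothetical probe fragment for a device carrying the swnode above. */
static int example_consume_swnode(struct device *dev)
{
	struct gpio_desc *pendown;
	u32 size_x;

	/* PROPERTY_ENTRY_U32("touchscreen-size-x", 4096) reads back
	 * exactly like a DT property would. */
	if (device_property_read_u32(dev, "touchscreen-size-x", &size_x))
		size_x = 0;

	/* Resolves the "pendown-gpios" entry created by
	 * PROPERTY_ENTRY_GPIO() in the draft. */
	pendown = devm_gpiod_get(dev, "pendown", GPIOD_IN);
	if (IS_ERR(pendown))
		return PTR_ERR(pendown);

	return 0;
}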
Re: drm/amdgpu: fix an amdgpu_irq_put() issue in gmc_v9_0_hw_fini()

2023-05-03 Thread Limonciello, Mario



On 5/2/2023 11:51 AM, Hamza Mahfooz wrote:

As mentioned in commit 9128e6babf10 ("drm/amdgpu: fix
amdgpu_irq_put call trace in gmc_v10_0_hw_fini") and commit c094b8923bdd
("drm/amdgpu: fix amdgpu_irq_put call trace in gmc_v11_0_hw_fini"), it
is meaningless to call amdgpu_irq_put() for gmc.ecc_irq, so remove it
from gmc_v9_0_hw_fini().

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2522
Fixes: 3029c855d79f ("drm/amdgpu: Fix desktop freezed after gpu-reset")
Signed-off-by: Hamza Mahfooz 


Reviewed-by: Mario Limonciello 


---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 290804a06e05..6ae5cee9b64b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1999,7 +1999,6 @@ static int gmc_v9_0_hw_fini(void *handle)
if (adev->mmhub.funcs->update_power_gating)
adev->mmhub.funcs->update_power_gating(adev, false);
  
-	amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);

	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
  
  	return 0;


Re: [PATCH v2 17/19] fbdev: Validate info->screen_{base,buffer} in fb_ops implementations

2023-05-03 Thread Thomas Zimmermann

Hi

Am 03.05.23 um 17:02 schrieb Geert Uytterhoeven:

Hi Thomas,

On Wed, May 3, 2023 at 4:30 PM Thomas Zimmermann  wrote:

Am 03.05.23 um 11:51 schrieb Geert Uytterhoeven:

On Fri, Apr 28, 2023 at 2:26 PM Thomas Zimmermann  wrote:

Push the test for info->screen_base from fb_read() and fb_write() into
the implementations of struct fb_ops.{fb_read,fb_write}. In cases where
the driver operates on info->screen_buffer, test this field instead.

While bothi fields, screen_base and screen_buffer, are stored in the


both


same location, they refer to different address spaces. For correctness,
we want to test each field in exactly the code that uses it.


Not a direct comment for this patch: and later the union can be split
in two separate fields, to protect against misuse?


No idea. Currently we have sparse that warns about mismatching address
spaces if the fields are mixed up. That's good enough, as far as I'm concerned.


The potential issue that is still present is that an fbdev driver uses
fb_info.screen_base, and configures the use of drawing ops that use
fb_info.screen_buffer (or vice-versa), which will happily use the wrong
type of pointer.  Sparse doesn't protect against that.


Right. From a quick grep, I've found quite a few cases where cfb_ functions 
operate on non-__iomem memory. I'm sure that the opposite with sys_ 
functions exists as well. Fixing this will be a good follow-up patchset. 
Thanks for the suggestion.


Best regards
Thomas



Gr{oetje,eeting}s,

 Geert



--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)



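As a standalone illustration of the hazard discussed above (a sketch, not the
actual fb_info layout): the two fields alias the same storage but belong to
different address spaces, so the helpers a driver wires up must match the
field it actually populated. Sparse catches swapping the fields in code like
this, but not a driver that maps I/O memory and then registers sys_* ops, or
vice versa.

#include <linux/fb.h>
#include <linux/io.h>
#include <linux/string.h>

/* __iomem path: what the cfb_*-style helpers assume. */
static void example_clear_io(struct fb_info *info)
{
	memset_io(info->screen_base, 0, info->screen_size);
}

/* System-memory path: what the sys_*-style helpers assume. */
static void example_clear_sys(struct fb_info *info)
{
	memset(info->screen_buffer, 0, info->screen_size);
}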

Re: [Intel-gfx] [PATCH v2 3/4] drm/i915/guc: Capture list naming clean up

2023-05-03 Thread Teres Alexis, Alan Previn
LGTM:
Reviewed-by: Alan Previn 

On Fri, 2023-04-28 at 11:56 -0700, john.c.harri...@intel.com wrote:
> From: John Harrison 
> 
> Don't use 'xe_lp*' prefixes for register lists that are common with
> Gen8.
> 
> Don't add Xe only GSC registers to pre-Xe devices that don't
> even have a GSC engine.
> 
> Fix Xe_LP name.
> 
> Don't use GEN9 as a prefix for register lists that contain all GEN8
> registers.
> 
> Rename the 'default_' register list prefix to 'gen8_' as that is the
> more accurate name.
alan:snip


Re: [PATCH v3 4/7] drm/msm/dpu: add PINGPONG_NONE to disconnect DSC from PINGPONG

2023-05-03 Thread Kuogee Hsieh



On 5/3/2023 1:03 AM, Marijn Suijten wrote:

On 2023-05-02 14:02:59, Kuogee Hsieh wrote:

During DSC setup, the crossbar mux needs to be programmed to engage the
DSC block with the specified PINGPONG. Hence during teardown, the crossbar
mux needs to be reset to disengage the DSC block from the PINGPONG. This
patch adds PINGPONG_NONE to serve as the disable value for resetting the
crossbar mux.

This patch doesn't *just add* PINGPONG_NONE to reset the crossbar; that
functionality was already available thanks to a `bool enable` function
parameter.  Instead it should explain why you think PINGPONG_NONE is
more convenient than passing a bool that warrants this replacement.
(Hint: I think because you don't have a hw_pp->idx available in the
  teardown path, and/or its value is not relevant for the disable case
  anyway.)

In addition I don't see this series use PINGPONG_NONE anywhere yet: will
that be added in the DSC 1.2 series for DP (to support hotplug)?


PINGPONG_NONE will be used to tear down the DSC data path later, in the DP 
DSC patch series.


Currently it is not used because DSI does not do teardown.




Signed-off-by: Kuogee Hsieh 
---
  drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 2 +-
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c  | 7 +++
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h  | 1 -
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h | 3 ++-
  4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
index 1dc5dbe..d9ad334 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c
@@ -1839,7 +1839,7 @@ static void dpu_encoder_dsc_pipe_cfg(struct dpu_hw_dsc 
*hw_dsc,
hw_pp->ops.setup_dsc(hw_pp);
  
  	if (hw_dsc->ops.dsc_bind_pingpong_blk)

-   hw_dsc->ops.dsc_bind_pingpong_blk(hw_dsc, true, hw_pp->idx);
+   hw_dsc->ops.dsc_bind_pingpong_blk(hw_dsc, hw_pp->idx);
  
  	if (hw_pp->ops.enable_dsc)

hw_pp->ops.enable_dsc(hw_pp);
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c
index 4a6bbcc..3e68d47 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.c
@@ -157,7 +157,6 @@ static void dpu_hw_dsc_config_thresh(struct dpu_hw_dsc 
*hw_dsc,
  
  static void dpu_hw_dsc_bind_pingpong_blk(

struct dpu_hw_dsc *hw_dsc,
-   bool enable,
const enum dpu_pingpong pp)
  {
struct dpu_hw_blk_reg_map *c = &hw_dsc->hw;
@@ -166,13 +165,13 @@ static void dpu_hw_dsc_bind_pingpong_blk(
  
  	dsc_ctl_offset = DSC_CTL(hw_dsc->idx);
  
-	if (enable)

+   if (pp)
mux_cfg = (pp - PINGPONG_0) & 0x7;
  
  	DRM_DEBUG_KMS("%s dsc:%d %s pp:%d\n",

-   enable ? "Binding" : "Unbinding",
+   pp ? "Binding" : "Unbinding",
hw_dsc->idx - DSC_0,
-   enable ? "to" : "from",
+   pp ? "to" : "from",
pp - PINGPONG_0);

PINGPONG_NONE - PINGPONG_0 = -1, so this whole debug log likely needs to
be rewritten for the disable case as we don't know what PINGPONG it is
being unbound from.  How about:

if (pp)
DRM_DEBUG_KMS("Binding dsc:%d to pp:%d\n",
hw_dsc->idx - DSC_0,
pp - PINGPONG_0);
else
DRM_DEBUG_KMS("Unbinding dsc:%d from any pp\n",
hw_dsc->idx - DSC_0);

- Marijn

  
  	DPU_REG_WRITE(c, dsc_ctl_offset, mux_cfg);

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
index 287ec5f..138080a 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_dsc.h
@@ -44,7 +44,6 @@ struct dpu_hw_dsc_ops {
  struct drm_dsc_config *dsc);
  
  	void (*dsc_bind_pingpong_blk)(struct dpu_hw_dsc *hw_dsc,

- bool enable,
  enum dpu_pingpong pp);
  };
  
diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h

index 2d9192a..56826a9 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_mdss.h
@@ -191,7 +191,8 @@ enum dpu_dsc {
  };
  
  enum dpu_pingpong {

-   PINGPONG_0 = 1,
+   PINGPONG_NONE,
+   PINGPONG_0,
PINGPONG_1,
PINGPONG_2,
PINGPONG_3,
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



Re: [PATCH v3 3/7] drm/msm/dpu: add DPU_PINGPONG_DSC bits into PP_BLK and PP_BLK_TE macros

2023-05-03 Thread Kuogee Hsieh



On 5/2/2023 3:42 PM, Dmitry Baryshkov wrote:

On 03/05/2023 00:02, Kuogee Hsieh wrote:

On legacy chipsets, the DPU_PINGPONG_DSC bit must be set to indicate that
the pingpong ops functions are required to complete the DSC data path
setup when the chipset has a DSC hardware block present. This patch adds
the DPU_PINGPONG_DSC bit to both the PP_BLK and PP_BLK_TE macros when a
DSC hardware block is present.

Signed-off-by: Kuogee Hsieh 
---
  .../drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h    | 12 +-
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h |  8 +++
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_5_0_sm8150.h | 26 
++
  .../drm/msm/disp/dpu1/catalog/dpu_5_1_sc8180x.h    | 24 
++--
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_0_sm8250.h | 26 
++

  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_2_sc7180.h |  4 ++--
  .../gpu/drm/msm/disp/dpu1/catalog/dpu_6_3_sm6115.h |  2 +-
  .../drm/msm/disp/dpu1/catalog/dpu_6_5_qcm2290.h    |  2 +-
  drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c |  8 +++
  9 files changed, 54 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h

index 17f821c..b7cd746 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_0_msm8998.h
@@ -112,16 +112,16 @@ static const struct dpu_lm_cfg msm8998_lm[] = {
  };
    static const struct dpu_pingpong_cfg msm8998_pp[] = {
-    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, sdm845_pp_sblk_te,
-    DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
+    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 
BIT(DPU_PINGPONG_DSC), 0,

+    sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, sdm845_pp_sblk_te,
-    DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
+    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 
BIT(DPU_PINGPONG_DSC), 0,

+    sdm845_pp_sblk_te, DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, 0, sdm845_pp_sblk,
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, 0, sdm845_pp_sblk,
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),


Just to doublecheck: why don't we have DPU_PINGPONG_DSC for PP_3/_4? 
We do have them on sdm845. Is it because we should not use DSC with 
those PINGPONG blocks?



I think it only has two DSPP connected to pp blocks

  };
diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h 
b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h

index ceca741..bd9 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
+++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_4_0_sdm845.h
@@ -110,16 +110,16 @@ static const struct dpu_lm_cfg sdm845_lm[] = {
  };
    static const struct dpu_pingpong_cfg sdm845_pp[] = {
-    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 0, sdm845_pp_sblk_te,
+    PP_BLK_TE("pingpong_0", PINGPONG_0, 0x7, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk_te,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 8),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 12)),
-    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 0, sdm845_pp_sblk_te,
+    PP_BLK_TE("pingpong_1", PINGPONG_1, 0x70800, 
BIT(DPU_PINGPONG_DSC), 0, sdm845_pp_sblk_te,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 9),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 13)),
-    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_2", PINGPONG_2, 0x71000, BIT(DPU_PINGPONG_DSC), 
0, sdm845_pp_sblk,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 10),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 14)),
-    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, 0, sdm845_pp_sblk,
+    PP_BLK("pingpong_3", PINGPONG_3, 0x71800, BIT(DPU_PINGPONG_DSC), 
0, sdm845_pp_sblk,

  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 11),
  DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 15)),



[skipped the rest, looks good to me]



Re: [PATCH 2/4] drm/msm/dsi: Fix compressed word count calculation

2023-05-03 Thread Jessica Zhang




On 5/3/2023 1:26 AM, Dmitry Baryshkov wrote:

On 03/05/2023 04:19, Jessica Zhang wrote:

Currently, word count is calculated using slice_count. This is incorrect
as downstream uses slice per packet, which is different from
slice_count.

Slice count represents the number of soft slices per interface, and its
value will not always match that of slice per packet. For example, it is
possible to have cases where there are multiple soft slices per interface
but the panel specifies only one slice per packet.

Thus, use the default value of one slice per packet and remove slice_count
from the word count calculation.

Fixes: bc6b6ff8135c ("drm/msm/dsi: Use DSC slice(s) packet size to 
compute word count")

Signed-off-by: Jessica Zhang 
---
  drivers/gpu/drm/msm/dsi/dsi_host.c | 9 -
  1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c 
b/drivers/gpu/drm/msm/dsi/dsi_host.c

index 35c69dbe5f6f..b0d448ffb078 100644
--- a/drivers/gpu/drm/msm/dsi/dsi_host.c
+++ b/drivers/gpu/drm/msm/dsi/dsi_host.c
@@ -996,7 +996,14 @@ static void dsi_timing_setup(struct msm_dsi_host 
*msm_host, bool is_bonded_dsi)

  if (!msm_host->dsc)
  wc = hdisplay * dsi_get_bpp(msm_host->format) / 8 + 1;
  else
-    wc = msm_host->dsc->slice_chunk_size * 
msm_host->dsc->slice_count + 1;

+    /*
+ * When DSC is enabled, WC = slice_chunk_size * 
slice_per_packet + 1.
+ * Currently, the driver only supports default value of 
slice_per_packet = 1

+ *
+ * TODO: Expand drm_panel struct to hold slice_per_packet 
info
+ *   and adjust DSC math to account for 
slice_per_packet.


slice_per_packet is not a part of the standard DSC, so I'm not sure how 
that can be implemented. And definitely we should not care about the 
drm_panel here. It should be either a part of drm_dsc_config, or 
mipi_dsi_device.


Hi Dmitry,

IIRC slice per packet is given by the panel specs with the default value 
being 1 if no value is specified, so it might be better to have it as 
part of mipi_dsi_device.


Will update the TODO comment accordingly.

Thanks,

Jessica Zhang




+ */
+    wc = msm_host->dsc->slice_chunk_size + 1;
  dsi_write(msm_host, REG_DSI_CMD_MDP_STREAM0_CTRL,
  DSI_CMD_MDP_STREAM0_CTRL_WORD_COUNT(wc) |



--
With best wishes
Dmitry

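A worked example of the two formulas, with made-up panel numbers rather than
anything from the patch: a 1080-pixel-wide mode split into two 540-pixel soft
slices, compressed to 8 bpp, gives slice_chunk_size = 540 * 8 / 8 = 540 bytes
per slice line.

#include <stdio.h>

int main(void)
{
	int slice_chunk_size = 540; /* bytes per compressed slice line */
	int slice_count = 2;        /* soft slices per interface */

	/* old: assumed slice_per_packet == slice_count */
	printf("old wc = %d\n", slice_chunk_size * slice_count + 1); /* 1081 */
	/* new: one slice per packet, the only mode supported so far */
	printf("new wc = %d\n", slice_chunk_size + 1);               /* 541 */
	return 0;
}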


Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread Marek Olšák
WRITE_DATA with ENGINE=PFP will execute the packet on the frontend engine,
while ENGINE=ME will execute the packet on the backend engine.

Marek

On Wed, May 3, 2023 at 1:08 PM Marek Olšák  wrote:

> GPU hangs are pretty common post-bringup. They are not common per user,
> but if we gather all hangs from all users, we can have lots and lots of
> them.
>
> GPU hangs are indeed not very debuggable. There are however some things we
> can do:
> - Identify the hanging IB by its VA (the kernel should know it)
> - Read and parse the IB to detect memory corruption.
> - Print active waves with shader disassembly if SQ isn't hung (often it's
> not).
>
> Determining which packet the CP is stuck on is tricky. The CP has 2
> engines (one frontend and one backend) that work on the same command
> buffer. The frontend engine runs ahead, executes some packets and forwards
> others to the backend engine. Only the frontend engine has the command
> buffer VA somewhere. The backend engine only receives packets from the
> frontend engine via a FIFO, so it might not be possible to tell where it's
> stuck if it's stuck.
>
> When the gfx pipeline hangs outside of shaders, making a scandump seems to
> be the only way to have a chance at finding out what's going wrong, and
> only AMD-internal versions of hw can be scanned.
>
> Marek
>
> On Wed, May 3, 2023 at 11:23 AM Christian König <
> ckoenig.leichtzumer...@gmail.com> wrote:
>
>> Am 03.05.23 um 17:08 schrieb Felix Kuehling:
>> > Am 2023-05-03 um 03:59 schrieb Christian König:
>> >> Am 02.05.23 um 20:41 schrieb Alex Deucher:
>> >>> On Tue, May 2, 2023 at 11:22 AM Timur Kristóf
>> >>>  wrote:
>>  [SNIP]
>>  In my opinion, the correct solution to those problems would be
>>  if
>>  the kernel could give userspace the necessary information about
>>  a
>>  GPU hang before a GPU reset.
>> 
>> >>>   The fundamental problem here is that the kernel doesn't have
>> >>> that
>> >>> information either. We know which IB timed out and can
>> >>> potentially do
>> >>> a devcoredump when that happens, but that's it.
>> >>
>> >> Is it really not possible to know such a fundamental thing as what
>> >> the
>> >> GPU was doing when it hung? How are we supposed to do any kind of
>> >> debugging without knowing that?
>> >>
>> >> Yes, that's indeed something at least I try to figure out for years
>> >> as well.
>> >>
>> >> Basically there are two major problems:
>> >> 1. When the ASIC is hung you can't talk to the firmware engines any
>> >> more and most state is not exposed directly, but just through some
>> >> fw/hw interface.
>> >> Just take a look at how umr reads the shader state from the SQ.
>> >> When that block is hung you can't do that any more and basically have
>> >> no chance at all to figure out why it's hung.
>> >>
>> >> Same for other engines, I remember once spending a week figuring
>> >> out why the UVD block is hung during suspend. Turned out to be a
>> >> debugging nightmare because any time you touch any register of that
>> >> block the whole system would hang.
>> >>
>> >> 2. There are tons of things going on in a pipeline fashion or even
>> >> completely in parallel. For example the CP is just the beginning of a
>> >> rather long pipeline which at the end produces a bunch of pixels.
>> >> In almost all cases I've seen you ran into a problem somewhere
>> >> deep in the pipeline and only very rarely at the beginning.
>> >>
>> >>
>> >> I wonder what AMD's Windows driver team is doing with this problem,
>> >> surely they must have better tools to deal with GPU hangs?
>> > For better or worse, most teams internally rely on scan dumps via
>> > JTAG
>> > which sort of limits the usefulness outside of AMD, but also gives
>> > you
>> > the exact state of the hardware when it's hung so the hardware teams
>> > prefer it.
>> >
>>  How does this approach scale? It's not something we can ask users to
>>  do, and even if all of us in the radv team had a JTAG device, we
>>  wouldn't be able to play every game that users experience random
>> hangs
>>  with.
>> >>> It doesn't scale or lend itself particularly well to external
>> >>> development, but that's the current state of affairs.
>> >>
>> >> The usual approach seems to be to reproduce a problem in a lab and
>> >> have a JTAG attached to give the hw guys a scan dump and they can
 >> >> then tell you why something didn't work as expected.
>> >
>> > That's the worst-case scenario where you're debugging HW or FW issues.
>> > Those should be pretty rare post-bringup. But are there hangs caused
>> > by user mode driver or application bugs that are easier to debug and
>> > probably don't even require a GPU reset? For example most VM faults
>> > can be handled without hanging the GPU. Similarly, a shader in an
>> > endless loop should not require a full GPU reset. In the KFD compute
>> > case, 

Re: [PATCH 1/4] drm/msm/dsi: Adjust pclk rate for compression

2023-05-03 Thread Jessica Zhang




On 5/3/2023 1:33 AM, Dmitry Baryshkov wrote:

On 03/05/2023 04:19, Jessica Zhang wrote:

Divide the pclk rate by the compression ratio when DSC is enabled

Signed-off-by: Jessica Zhang 
---
  drivers/gpu/drm/msm/dsi/dsi_host.c | 14 ++
  1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/dsi/dsi_host.c 
b/drivers/gpu/drm/msm/dsi/dsi_host.c

index 43a5ec33eee8..35c69dbe5f6f 100644
--- a/drivers/gpu/drm/msm/dsi/dsi_host.c
+++ b/drivers/gpu/drm/msm/dsi/dsi_host.c
@@ -561,7 +561,8 @@ void dsi_link_clk_disable_v2(struct msm_dsi_host 
*msm_host)

  clk_disable_unprepare(msm_host->byte_clk);
  }
-static unsigned long dsi_get_pclk_rate(const struct drm_display_mode 
*mode, bool is_bonded_dsi)
+static unsigned long dsi_get_pclk_rate(const struct drm_display_mode 
*mode,

+    struct drm_dsc_config *dsc, bool is_bonded_dsi)
  {
  unsigned long pclk_rate;
@@ -576,6 +577,11 @@ static unsigned long dsi_get_pclk_rate(const 
struct drm_display_mode *mode, bool

  if (is_bonded_dsi)
  pclk_rate /= 2;
+    /* If DSC is enabled, divide pclk by compression ratio */
+    if (dsc)
+    pclk_rate = DIV_ROUND_UP(pclk_rate,
+    dsc->bits_per_component * 3 / msm_dsc_get_bpp_int(dsc));
+


Don't we lose precision here?
Would DIV_ROUND_UP(pclk_rate * bpp, dsc->bpc * 3) be better?


Hi Dmitry,

Acked.

Thanks,

Jessica Zhang




  return pclk_rate;
  }
@@ -585,7 +591,7 @@ unsigned long dsi_byte_clk_get_rate(struct 
mipi_dsi_host *host, bool is_bonded_d

  struct msm_dsi_host *msm_host = to_msm_dsi_host(host);
  u8 lanes = msm_host->lanes;
  u32 bpp = dsi_get_bpp(msm_host->format);
-    unsigned long pclk_rate = dsi_get_pclk_rate(mode, is_bonded_dsi);
+    unsigned long pclk_rate = dsi_get_pclk_rate(mode, msm_host->dsc, 
is_bonded_dsi);

  u64 pclk_bpp = (u64)pclk_rate * bpp;
  if (lanes == 0) {
@@ -604,7 +610,7 @@ unsigned long dsi_byte_clk_get_rate(struct 
mipi_dsi_host *host, bool is_bonded_d
  static void dsi_calc_pclk(struct msm_dsi_host *msm_host, bool 
is_bonded_dsi)

  {
-    msm_host->pixel_clk_rate = dsi_get_pclk_rate(msm_host->mode, 
is_bonded_dsi);
+    msm_host->pixel_clk_rate = dsi_get_pclk_rate(msm_host->mode, 
msm_host->dsc, is_bonded_dsi);
  msm_host->byte_clk_rate = dsi_byte_clk_get_rate(&msm_host->base, 
is_bonded_dsi,

  msm_host->mode);
@@ -634,7 +640,7 @@ int dsi_calc_clk_rate_v2(struct msm_dsi_host 
*msm_host, bool is_bonded_dsi)

  dsi_calc_pclk(msm_host, is_bonded_dsi);
-    pclk_bpp = (u64)dsi_get_pclk_rate(msm_host->mode, is_bonded_dsi) 
* bpp;
+    pclk_bpp = (u64)dsi_get_pclk_rate(msm_host->mode, msm_host->dsc, 
is_bonded_dsi) * bpp;

  do_div(pclk_bpp, 8);
  msm_host->src_clk_rate = pclk_bpp;



--
With best wishes
Dmitry

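To put concrete (made-up) numbers on the precision point above: with
pclk_rate = 148500 kHz, 8 bits per component and a compressed target of
10 bpp, the uncompressed rate is 24 bpp and the true compression ratio is
2.4, which integer division collapses to 2. A standalone sketch:

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long pclk_rate = 148500; /* kHz, illustrative */
	unsigned long bpc = 8;            /* bits per component */
	unsigned long bpp = 10;           /* compressed target bpp */

	/* ratio 24/10 truncates to 2 before the division */
	unsigned long lossy = DIV_ROUND_UP(pclk_rate, bpc * 3 / bpp);
	/* multiplying first keeps full precision */
	unsigned long exact = DIV_ROUND_UP(pclk_rate * bpp, bpc * 3);

	printf("lossy=%lu exact=%lu\n", lossy, exact); /* 74250 vs 61875 */
	return 0;
}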


Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread Marek Olšák
GPU hangs are pretty common post-bringup. They are not common per user, but
if we gather all hangs from all users, we can have lots and lots of them.

GPU hangs are indeed not very debuggable. There are however some things we
can do:
- Identify the hanging IB by its VA (the kernel should know it)
- Read and parse the IB to detect memory corruption.
- Print active waves with shader disassembly if SQ isn't hung (often it's
not).

Determining which packet the CP is stuck on is tricky. The CP has 2 engines
(one frontend and one backend) that work on the same command buffer. The
frontend engine runs ahead, executes some packets and forwards others to
the backend engine. Only the frontend engine has the command buffer VA
somewhere. The backend engine only receives packets from the frontend
engine via a FIFO, so it might not be possible to tell where it's stuck if
it's stuck.

When the gfx pipeline hangs outside of shaders, making a scandump seems to
be the only way to have a chance at finding out what's going wrong, and
only AMD-internal versions of hw can be scanned.

Marek

On Wed, May 3, 2023 at 11:23 AM Christian König <
ckoenig.leichtzumer...@gmail.com> wrote:

> Am 03.05.23 um 17:08 schrieb Felix Kuehling:
> > Am 2023-05-03 um 03:59 schrieb Christian König:
> >> Am 02.05.23 um 20:41 schrieb Alex Deucher:
> >>> On Tue, May 2, 2023 at 11:22 AM Timur Kristóf
> >>>  wrote:
>  [SNIP]
>  In my opinion, the correct solution to those problems would be
>  if
>  the kernel could give userspace the necessary information about
>  a
>  GPU hang before a GPU reset.
> 
> >>>   The fundamental problem here is that the kernel doesn't have
> >>> that
> >>> information either. We know which IB timed out and can
> >>> potentially do
> >>> a devcoredump when that happens, but that's it.
> >>
> >> Is it really not possible to know such a fundamental thing as what
> >> the
> >> GPU was doing when it hung? How are we supposed to do any kind of
> >> debugging without knowing that?
> >>
> >> Yes, that's indeed something at least I try to figure out for years
> >> as well.
> >>
> >> Basically there are two major problems:
> >> 1. When the ASIC is hung you can't talk to the firmware engines any
> >> more and most state is not exposed directly, but just through some
> >> fw/hw interface.
> >> Just take a look at how umr reads the shader state from the SQ.
> >> When that block is hung you can't do that any more and basically have
> >> no chance at all to figure out why it's hung.
> >>
> >> Same for other engines, I remember once spending a week figuring
> >> out why the UVD block is hung during suspend. Turned out to be a
> >> debugging nightmare because any time you touch any register of that
> >> block the whole system would hang.
> >>
> >> 2. There are tons of things going on in a pipeline fashion or even
> >> completely in parallel. For example the CP is just the beginning of a
> >> rather long pipeline which at the end produces a bunch of pixels.
> >> In almost all cases I've seen you ran into a problem somewhere
> >> deep in the pipeline and only very rarely at the beginning.
> >>
> >>
> >> I wonder what AMD's Windows driver team is doing with this problem,
> >> surely they must have better tools to deal with GPU hangs?
> > For better or worse, most teams internally rely on scan dumps via
> > JTAG
> > which sort of limits the usefulness outside of AMD, but also gives
> > you
> > the exact state of the hardware when it's hung so the hardware teams
> > prefer it.
> >
>  How does this approach scale? It's not something we can ask users to
>  do, and even if all of us in the radv team had a JTAG device, we
>  wouldn't be able to play every game that users experience random hangs
>  with.
> >>> It doesn't scale or lend itself particularly well to external
> >>> development, but that's the current state of affairs.
> >>
> >> The usual approach seems to be to reproduce a problem in a lab and
> >> have a JTAG attached to give the hw guys a scan dump and they can
> >> then tell you why something didn't work as expected.
> >
> > That's the worst-case scenario where you're debugging HW or FW issues.
> > Those should be pretty rare post-bringup. But are there hangs caused
> > by user mode driver or application bugs that are easier to debug and
> > probably don't even require a GPU reset? For example most VM faults
> > can be handled without hanging the GPU. Similarly, a shader in an
> > endless loop should not require a full GPU reset. In the KFD compute
> > case, that's still preemptible and the offending process can be killed
> > with Ctrl-C or debugged with rocm-gdb.
>
> We also have infinite loop in shader abort for gfx and page faults are
> pretty rare with OpenGL (a bit more often with Vulkan) and can be
> handled gracefully on modern hw (they just spam the logs).
>
> The 

Re: [PATCH v6 0/3] Add sync object UAPI support to VirtIO-GPU driver

2023-05-03 Thread Gurchetan Singh
On Mon, May 1, 2023 at 8:38 AM Dmitry Osipenko <
dmitry.osipe...@collabora.com> wrote:

> On 4/16/23 14:52, Dmitry Osipenko wrote:
> > We have multiple Vulkan context types that are awaiting for the addition
> > of the sync object DRM UAPI support to the VirtIO-GPU kernel driver:
> >
> >  1. Venus context
> >  2. Native contexts (virtio-freedreno, virtio-intel, virtio-amdgpu)
> >
> > Mesa core supports DRM sync object UAPI, providing Vulkan drivers with a
> > generic fencing implementation that we want to utilize.
> >
> > This patch adds initial sync objects support. It creates fundament for a
> > further fencing improvements. Later on we will want to extend the
> VirtIO-GPU
> > fencing API with passing fence IDs to host for waiting, it will be a new
> > additional VirtIO-GPU IOCTL and more. Today we have several VirtIO-GPU
> context
> > drivers in works that require VirtIO-GPU to support sync objects UAPI.
> >
> > The patch is heavily inspired by the sync object UAPI implementation of
> the
> > MSM driver.
>
> Gerd, do you have any objections to merging this series?
>
> We have AMDGPU [1] and Intel [2] native context WIP drivers depending on
> the sync object support. It is the only part missing from kernel today
> that is wanted by the native context drivers. Otherwise, there are few
> other things in Qemu and virglrenderer left to sort out.
>
> [1] https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21658
> [2]
> https://gitlab.freedesktop.org/digetx/mesa/-/commits/native-context-iris


I'm not saying this change isn't good, just that it's probably possible to
implement the native contexts (even up to VK1.2) without it.  But this
patch series may be the most ergonomic way to do it, given how Mesa is
designed.  But you probably want one of Mesa MRs reviewed first before
merging (I added a comment on the amdgpu change) and that is a requirement
[a].

[a] "The userspace side must be fully reviewed and tested to the standards
of that user space project. For e.g. mesa this means piglit testcases and
review on the mailing list. This is again to ensure that the new interface
actually gets the job done." -- from the requirements


>
>
> --
> Best regards,
> Dmitry
>
>


Re: [PATCH V2 1/6] drm: bridge: samsung-dsim: fix blanking packet size calculation

2023-05-03 Thread Jagan Teki
On Mon, Apr 24, 2023 at 3:17 PM Adam Ford  wrote:
>
> On Mon, Apr 24, 2023 at 4:03 AM Jagan Teki  wrote:
> >
> > On Sun, Apr 23, 2023 at 5:42 PM Adam Ford  wrote:
> > >
> > > From: Lucas Stach 
> > >
> > > Scale the blanking packet sizes to match the ratio between HS clock
> > > and DPI interface clock. The controller seems to do internal scaling
> > > to the number of active lanes, so we don't take those into account.
> > >
> > > Signed-off-by: Lucas Stach 
> > > Signed-off-by: Adam Ford 
> > > ---
> > >  drivers/gpu/drm/bridge/samsung-dsim.c | 18 +++---
> > >  1 file changed, 15 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c 
> > > b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > index e0a402a85787..2be3b58624c3 100644
> > > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > > @@ -874,17 +874,29 @@ static void samsung_dsim_set_display_mode(struct 
> > > samsung_dsim *dsi)
> > > u32 reg;
> > >
> > > if (dsi->mode_flags & MIPI_DSI_MODE_VIDEO) {
> > > +   int byte_clk_khz = dsi->burst_clk_rate / 1000 / 8;
> > > +   int hfp = (m->hsync_start - m->hdisplay) * byte_clk_khz / 
> > > m->clock;
> >
> > I do not quite understand why it depends on burst_clk_rate, could you
> > please explain? Does it depend on bpp, something like this:
> >
> > mipi_dsi_pixel_format_to_bpp(format) / 8
>
> The pixel clock is currently set to the burst clock rate.  Dividing
> the clock by 1000 gets the pixel clock in KHz, and dividing by 8
> converts bits to bytes.
> Later in the series, I change the clock from the burst clock to the
> cached value returned from samsung_dsim_set_pll.

Okay.

>
> >
> > > +   int hbp = (m->htotal - m->hsync_end) * byte_clk_khz / 
> > > m->clock;
> > > +   int hsa = (m->hsync_end - m->hsync_start) * byte_clk_khz 
> > > / m->clock;
> > > +
> > > +   /* remove packet overhead when possible */
> > > +   hfp = max(hfp - 6, 0);
> > > +   hbp = max(hbp - 6, 0);
> > > +   hsa = max(hsa - 6, 0);
> >
> > Does the blanking packet overhead of 6 here mean 4 bytes + payload +
> > 2 bytes of packet format? And does this packet overhead depend on the
> > respective porches, i.e. do hfp, hbp and hsa have different packet
> > overheads?
>
> Lucas might be able to explain this better.  However, it does match
> the values of the downstream NXP kernel, and I tried playing with
> these values manually, and 6 appeared to be the only number that
> seemed to work for me too.  I abandoned my approach for Lucas'
> implementation, because it seemed more clear than mine.
> Maybe Lucas can chime in, since this is really his patch.

Lucas, any inputs?

Jagan.

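For readers following the ratio math in this thread, a worked example with
made-up numbers (not taken from the patch): an 891 Mbps per-lane HS rate
gives byte_clk_khz = 891000000 / 1000 / 8 = 111375, so against a 148500 kHz
pixel clock the scaling ratio is exactly 0.75.

/* Illustrative sketch of the hfp scaling above. */
static int example_hfp_bytes(void)
{
	int hsync_start = 1920 + 88, hdisplay = 1920; /* 88 px front porch */
	int byte_clk_khz = 891000000 / 1000 / 8;      /* 111375 */
	int clock = 148500;                           /* pixel clock, kHz */

	/* 88 * 111375 / 148500 = 66 byte clocks */
	int hfp = (hsync_start - hdisplay) * byte_clk_khz / clock;

	/* subtract the 6-byte packet overhead, clamped at zero: 60 */
	return hfp > 6 ? hfp - 6 : 0;
}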

[PATCH v2 2/2] drm/bridge: ti-sn65dsi83: Fix enable/disable flow to meet spec

2023-05-03 Thread Frieder Schrempf
From: Frieder Schrempf 

The datasheet describes the following initialization flow including
minimum delay times between each step:

1. DSI data lanes need to be in LP-11 and the clock lane in HS mode
2. toggle EN signal
3. initialize registers
4. enable PLL
5. soft reset
6. enable DSI stream
7. check error status register

To meet this requirement we need to make sure the host bridge's
pre_enable() is called first by using the pre_enable_prev_first
flag.

Furthermore we need to split enable() into pre_enable() which covers
steps 2-5 from above and enable() which covers step 7 and is called
after the host bridge's enable().

Signed-off-by: Frieder Schrempf 
---
Changes for v2:
* Drop RFC
---
 drivers/gpu/drm/bridge/ti-sn65dsi83.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi83.c 
b/drivers/gpu/drm/bridge/ti-sn65dsi83.c
index 75286c9afbb9..a82f10b8109f 100644
--- a/drivers/gpu/drm/bridge/ti-sn65dsi83.c
+++ b/drivers/gpu/drm/bridge/ti-sn65dsi83.c
@@ -321,8 +321,8 @@ static u8 sn65dsi83_get_dsi_div(struct sn65dsi83 *ctx)
return dsi_div - 1;
 }
 
-static void sn65dsi83_atomic_enable(struct drm_bridge *bridge,
-   struct drm_bridge_state *old_bridge_state)
+static void sn65dsi83_atomic_pre_enable(struct drm_bridge *bridge,
+   struct drm_bridge_state 
*old_bridge_state)
 {
struct sn65dsi83 *ctx = bridge_to_sn65dsi83(bridge);
struct drm_atomic_state *state = old_bridge_state->base.state;
@@ -484,11 +484,22 @@ static void sn65dsi83_atomic_enable(struct drm_bridge 
*bridge,
/* Trigger reset after CSR register update. */
regmap_write(ctx->regmap, REG_RC_RESET, REG_RC_RESET_SOFT_RESET);
 
+   /* Wait for 10ms after soft reset as specified in datasheet */
+   usleep_range(10000, 12000);
+}
+
+static void sn65dsi83_atomic_enable(struct drm_bridge *bridge,
+   struct drm_bridge_state *old_bridge_state)
+{
+   struct sn65dsi83 *ctx = bridge_to_sn65dsi83(bridge);
+   unsigned int pval;
+
/* Clear all errors that got asserted during initialization. */
regmap_read(ctx->regmap, REG_IRQ_STAT, &pval);
regmap_write(ctx->regmap, REG_IRQ_STAT, pval);
 
-   usleep_range(10000, 12000);
+   /* Wait for 1ms and check for errors in status register */
+   usleep_range(1000, 1100);
regmap_read(ctx->regmap, REG_IRQ_STAT, &pval);
if (pval)
dev_err(ctx->dev, "Unexpected link status 0x%02x\n", pval);
@@ -555,6 +566,7 @@ static const struct drm_bridge_funcs sn65dsi83_funcs = {
.attach = sn65dsi83_attach,
.detach = sn65dsi83_detach,
.atomic_enable  = sn65dsi83_atomic_enable,
+   .atomic_pre_enable  = sn65dsi83_atomic_pre_enable,
.atomic_disable = sn65dsi83_atomic_disable,
.mode_valid = sn65dsi83_mode_valid,
 
@@ -697,6 +709,7 @@ static int sn65dsi83_probe(struct i2c_client *client)
 
ctx->bridge.funcs = &sn65dsi83_funcs;
ctx->bridge.of_node = dev->of_node;
+   ctx->bridge.pre_enable_prev_first = true;
drm_bridge_add(&ctx->bridge);
 
ret = sn65dsi83_host_attach(ctx);
-- 
2.40.0



[PATCH v2 1/2] drm: bridge: samsung-dsim: Fix i.MX8M enable flow to meet spec

2023-05-03 Thread Frieder Schrempf
From: Frieder Schrempf 

According to the documentation [1] the proper enable flow is:

1. Enable DSI link and keep data lanes in LP-11 (stop state)
2. Disable stop state to bring data lanes into HS mode

Currently we do this all at once within enable(), which doesn't
allow to meet the requirements of some downstream bridges.

To fix this we now enable the DSI in pre_enable() and force it
into stop state using the FORCE_STOP_STATE bit in the ESCMODE
register until enable() is called where we reset the bit.

We currently do this only for i.MX8M as Exynos uses a different
init flow where samsung_dsim_init() is called from
samsung_dsim_host_transfer().

[1] https://docs.kernel.org/gpu/drm-kms-helpers.html#mipi-dsi-bridge-operation

Signed-off-by: Frieder Schrempf 
---
Changes for v2:
* Drop RFC
---
 drivers/gpu/drm/bridge/samsung-dsim.c | 25 +++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c 
b/drivers/gpu/drm/bridge/samsung-dsim.c
index e0a402a85787..9775779721d9 100644
--- a/drivers/gpu/drm/bridge/samsung-dsim.c
+++ b/drivers/gpu/drm/bridge/samsung-dsim.c
@@ -859,6 +859,10 @@ static int samsung_dsim_init_link(struct samsung_dsim *dsi)
reg = samsung_dsim_read(dsi, DSIM_ESCMODE_REG);
reg &= ~DSIM_STOP_STATE_CNT_MASK;
reg |= DSIM_STOP_STATE_CNT(driver_data->reg_values[STOP_STATE_CNT]);
+
+   if (!samsung_dsim_hw_is_exynos(dsi->plat_data->hw_type))
+   reg |= DSIM_FORCE_STOP_STATE;
+
samsung_dsim_write(dsi, DSIM_ESCMODE_REG, reg);
 
reg = DSIM_BTA_TIMEOUT(0xff) | DSIM_LPDR_TIMEOUT(0xffff);
@@ -1340,6 +1344,9 @@ static void samsung_dsim_atomic_pre_enable(struct 
drm_bridge *bridge,
ret = samsung_dsim_init(dsi);
if (ret)
return;
+
+   samsung_dsim_set_display_mode(dsi);
+   samsung_dsim_set_display_enable(dsi, true);
}
 }
 
@@ -1347,9 +1354,16 @@ static void samsung_dsim_atomic_enable(struct drm_bridge 
*bridge,
   struct drm_bridge_state 
*old_bridge_state)
 {
struct samsung_dsim *dsi = bridge_to_dsi(bridge);
+   u32 reg;
 
-   samsung_dsim_set_display_mode(dsi);
-   samsung_dsim_set_display_enable(dsi, true);
+   if (samsung_dsim_hw_is_exynos(dsi->plat_data->hw_type)) {
+   samsung_dsim_set_display_mode(dsi);
+   samsung_dsim_set_display_enable(dsi, true);
+   } else {
+   reg = samsung_dsim_read(dsi, DSIM_ESCMODE_REG);
+   reg &= ~DSIM_FORCE_STOP_STATE;
+   samsung_dsim_write(dsi, DSIM_ESCMODE_REG, reg);
+   }
 
dsi->state |= DSIM_STATE_VIDOUT_AVAILABLE;
 }
@@ -1358,10 +1372,17 @@ static void samsung_dsim_atomic_disable(struct 
drm_bridge *bridge,
struct drm_bridge_state 
*old_bridge_state)
 {
struct samsung_dsim *dsi = bridge_to_dsi(bridge);
+   u32 reg;
 
if (!(dsi->state & DSIM_STATE_ENABLED))
return;
 
+   if (!samsung_dsim_hw_is_exynos(dsi->plat_data->hw_type)) {
+   reg = samsung_dsim_read(dsi, DSIM_ESCMODE_REG);
+   reg |= DSIM_FORCE_STOP_STATE;
+   samsung_dsim_write(dsi, DSIM_ESCMODE_REG, reg);
+   }
+
dsi->state &= ~DSIM_STATE_VIDOUT_AVAILABLE;
 }
 
-- 
2.40.0



[PATCH v2 0/2] Init flow fixes for Samsung DSIM and TI SN65DSI84

2023-05-03 Thread Frieder Schrempf
From: Frieder Schrempf 

This patchset contains a proposal to fix the initialization flow for
the display pipeline used on our i.MX8MM Kontron boards:

  i.MX8MM LCDIF -> i.MX8MM DSIM -> TI SN65DSI84 -> 7" LVDS Panel

Without these changes the display works most of the time, but fails
to come up occasionally when booting or doing on/off cycling tests
with:

  echo 0 > /sys/devices/platform/soc@0/32c00000.bus/32e00000.lcdif/graphics/fb0/blank
  echo 1 > /sys/devices/platform/soc@0/32c00000.bus/32e00000.lcdif/graphics/fb0/blank

All the changes intend to follow the documentation provided here:
https://docs.kernel.org/gpu/drm-kms-helpers.html#mipi-dsi-bridge-operation

Changes for v2:
* Drop RFC
* Drop non-working Exynos cleanup patch 3/3

Frieder Schrempf (2):
  drm: bridge: samsung-dsim: Fix i.MX8M enable flow to meet spec
  drm/bridge: ti-sn65dsi83: Fix enable/disable flow to meet spec

 drivers/gpu/drm/bridge/samsung-dsim.c | 25 +++--
 drivers/gpu/drm/bridge/ti-sn65dsi83.c | 19 ---
 2 files changed, 39 insertions(+), 5 deletions(-)

-- 
2.40.0

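Taken together, the two patches aim at the bring-up order below; this is a
summary sketch derived from the commit messages above, not verified against
the driver source.

/*
 * samsung_dsim pre_enable: enable the DSI link with the data lanes
 *                          forced into stop state (clock lane HS,
 *                          data lanes LP-11)
 * sn65dsi83 pre_enable:    toggle EN, program registers, enable PLL,
 *                          soft reset, wait 10 ms
 * samsung_dsim enable:     clear DSIM_FORCE_STOP_STATE, start HS video
 * sn65dsi83 enable:        wait 1 ms, read REG_IRQ_STAT, report errors
 *
 * (sn65dsi83 sets pre_enable_prev_first so the host bridge's
 * pre_enable runs before its own.)
 */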


Re: [PATCH V3 7/7] drm: bridge: samsung-dsim: Let blanking calculation work in non-burst mode

2023-05-03 Thread Adam Ford
On Wed, May 3, 2023 at 10:52 AM Frieder Schrempf
 wrote:
>
> On 02.05.23 03:07, Adam Ford wrote:
> > The blanking calculation currently uses burst_clk_rate for calculating
> > the settings. Since it's possible to use this in non-burst mode, it's
> > possible that where won't be burst_clk_rate.  Instead, cache the
>
> "possible that burst_clk_rate is 0"
>
> > clock rate configured from of samsung_dsim_set_pll and use it instead.
> >
> > Signed-off-by: Adam Ford 
> > Tested-by: Chen-Yu Tsai 
>
> Maybe this patch should be squashed into patch 6/7 as otherwise
> burst_clk_rate could be 0 here causing bisection issues?

I thought about squashing them and I went back and forth on that.
Since there are some other minor edits in this series, I can push a V4
with these squashed.

>
> Apart from that:
>
> Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.
>

Thank you for testing this series.

> Tested-by: Frieder Schrempf 
> Reviewed-by: Frieder Schrempf 
>

adam
> > ---
> >  drivers/gpu/drm/bridge/samsung-dsim.c | 4 +++-
> >  include/drm/bridge/samsung-dsim.h | 1 +
> >  2 files changed, 4 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c 
> > b/drivers/gpu/drm/bridge/samsung-dsim.c
> > index 53099461cdc2..1dc913db2cb3 100644
> > --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> > +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> > @@ -652,6 +652,8 @@ static unsigned long samsung_dsim_set_pll(struct 
> > samsung_dsim *dsi,
> >   reg = samsung_dsim_read(dsi, DSIM_STATUS_REG);
> >   } while ((reg & DSIM_PLL_STABLE) == 0);
> >
> > + dsi->hs_clock = fout;
> > +
> >   return fout;
> >  }
> >
> > @@ -960,7 +962,7 @@ static void samsung_dsim_set_display_mode(struct 
> > samsung_dsim *dsi)
> >   u32 reg;
> >
> >   if (dsi->mode_flags & MIPI_DSI_MODE_VIDEO) {
> > - int byte_clk_khz = dsi->burst_clk_rate / 1000 / 8;
> > + int byte_clk_khz = dsi->hs_clock / 1000 / 8;>   int 
> > hfp = (m->hsync_start - m->hdisplay) * byte_clk_khz / m->clock;
> >   int hbp = (m->htotal - m->hsync_end) * byte_clk_khz / 
> > m->clock;
> >   int hsa = (m->hsync_end - m->hsync_start) * byte_clk_khz / 
> > m->clock;
> > diff --git a/include/drm/bridge/samsung-dsim.h 
> > b/include/drm/bridge/samsung-dsim.h
> > index 76ea8a1720cc..14176e6e9040 100644
> > --- a/include/drm/bridge/samsung-dsim.h
> > +++ b/include/drm/bridge/samsung-dsim.h
> > @@ -94,6 +94,7 @@ struct samsung_dsim {
> >
> >   u32 pll_clk_rate;
> >   u32 burst_clk_rate;
> > + u32 hs_clock;
> >   u32 esc_clk_rate;
> >   u32 lanes;
> >   u32 mode_flags;


Re: [PATCH V3 7/7] drm: bridge: samsung-dsim: Let blanking calculation work in non-burst mode

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> The blanking calculation currently uses burst_clk_rate for calculating
> the settings. Since it's possible to use this in non-burst mode, it's
> possible that where won't be burst_clk_rate.  Instead, cache the

"possible that burst_clk_rate is 0"

> clock rate configured from of samsung_dsim_set_pll and use it instead.
> 
> Signed-off-by: Adam Ford 
> Tested-by: Chen-Yu Tsai 

Maybe this patch should be squashed into patch 6/7 as otherwise
burst_clk_rate could be 0 here causing bisection issues?

Apart from that:

Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.

Tested-by: Frieder Schrempf 
Reviewed-by: Frieder Schrempf 

> ---
>  drivers/gpu/drm/bridge/samsung-dsim.c | 4 +++-
>  include/drm/bridge/samsung-dsim.h | 1 +
>  2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c 
> b/drivers/gpu/drm/bridge/samsung-dsim.c
> index 53099461cdc2..1dc913db2cb3 100644
> --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> @@ -652,6 +652,8 @@ static unsigned long samsung_dsim_set_pll(struct 
> samsung_dsim *dsi,
>   reg = samsung_dsim_read(dsi, DSIM_STATUS_REG);
>   } while ((reg & DSIM_PLL_STABLE) == 0);
>  
> + dsi->hs_clock = fout;
> +
>   return fout;
>  }
>  
> @@ -960,7 +962,7 @@ static void samsung_dsim_set_display_mode(struct 
> samsung_dsim *dsi)
>   u32 reg;
>  
>   if (dsi->mode_flags & MIPI_DSI_MODE_VIDEO) {
> - int byte_clk_khz = dsi->burst_clk_rate / 1000 / 8;
> + int byte_clk_khz = dsi->hs_clock / 1000 / 8;
>   int hfp = (m->hsync_start - m->hdisplay) * byte_clk_khz / m->clock;
>   int hbp = (m->htotal - m->hsync_end) * byte_clk_khz / m->clock;
>   int hsa = (m->hsync_end - m->hsync_start) * byte_clk_khz / 
> m->clock;
> diff --git a/include/drm/bridge/samsung-dsim.h 
> b/include/drm/bridge/samsung-dsim.h
> index 76ea8a1720cc..14176e6e9040 100644
> --- a/include/drm/bridge/samsung-dsim.h
> +++ b/include/drm/bridge/samsung-dsim.h
> @@ -94,6 +94,7 @@ struct samsung_dsim {
>  
>   u32 pll_clk_rate;
>   u32 burst_clk_rate;
> + u32 hs_clock;
>   u32 esc_clk_rate;
>   u32 lanes;
>   u32 mode_flags;


Re: [PATCH V3 6/7] drm: bridge: samsung-dsim: Support non-burst mode

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> The high-speed clock is hard-coded to the burst-clock
> frequency specified in the device tree.  However, when
> using devices like certain bridge chips without burst mode
> and varying resolutions and refresh rates, it may be
> necessary to set the high-speed clock dynamically based
> on the desired pixel clock for the connected device.
> 
> This also removes the need to set a clock speed from
> the device tree for non-burst mode operation, since the
> pixel clock rate is the rate requested from the attached
> device like an HDMI bridge chip.  This should have no
> impact for people using burst-mode and setting the burst
> clock rate is still required for those users.
> 
> Signed-off-by: Adam Ford 
> Tested-by: Chen-Yu Tsai 

Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.

Tested-by: Frieder Schrempf 
Reviewed-by: Frieder Schrempf 
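
For readers new to the series, the selection logic the commit message describes boils down to something like the following sketch. The helper name and the exact non-burst formula are assumptions for illustration, not the literal patch; the real change lives in the samsung_dsim_set_pll() call path.

	/* Sketch: pick the HS clock source as described above. */
	static unsigned long dsim_pick_hs_clock(struct samsung_dsim *dsi,
						const struct drm_display_mode *m,
						int bpp)
	{
		/* Burst mode keeps the DT-provided burst clock. */
		if (dsi->mode_flags & MIPI_DSI_MODE_VIDEO_BURST)
			return dsi->burst_clk_rate;

		/* Non-burst: HS bit clock = pixel clock (kHz) * 1000 * bpp / lanes. */
		return DIV_ROUND_UP((unsigned long)m->clock * 1000 * bpp,
				    dsi->lanes);
	}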


Re: [PATCH V3 5/7] drm: bridge: samsung-dsim: Dynamically configure DPHY timing

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> The DPHY timings are currently hard coded. Since the input
> clock can be variable, the phy timings need to be variable
> too.  Add an additional variable to the driver data to enable
> this feature to prevent breaking boards that don't support it.
> 
> The phy_mipi_dphy_get_default_config function configures the
> DPHY timings in pico-seconds, and a small macro converts those
> timings into clock cycles based on the pixel clock rate.
> 
> Signed-off-by: Adam Ford 
> Tested-by: Chen-Yu Tsai 

A few nitpicks below, otherwise:

Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.

Tested-by: Frieder Schrempf 
Reviewed-by: Frieder Schrempf 

> ---
>  drivers/gpu/drm/bridge/samsung-dsim.c | 79 +++
>  include/drm/bridge/samsung-dsim.h |  1 +
>  2 files changed, 70 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/gpu/drm/bridge/samsung-dsim.c 
> b/drivers/gpu/drm/bridge/samsung-dsim.c
> index 2dc02a9e37c0..99642230a54a 100644
> --- a/drivers/gpu/drm/bridge/samsung-dsim.c
> +++ b/drivers/gpu/drm/bridge/samsung-dsim.c
> @@ -18,9 +18,7 @@
>  #include 
>  #include 
>  #include 
> -
>  #include 
> -

Unrelated blank lines removed above!?

>  #include 
>  #include 
>  #include 
> @@ -218,6 +216,8 @@
>  
>  #define OLD_SCLK_MIPI_CLK_NAME   "pll_clk"
>  
> +#define PS_TO_CYCLE(PS, MHz) DIV64_U64_ROUND_CLOSEST(((PS) * (MHz)), 1000000000000ULL)

Should macro arguments PS and MHz better be all lower-case?
Also, MHz is actually in Hz, right? So it should be renamed.

> +
>  static const char *const clk_names[5] = {
>   "bus_clk",
>   "sclk_mipi",
> @@ -487,6 +487,7 @@ static const struct samsung_dsim_driver_data 
> imx8mm_dsi_driver_data = {
>   .m_min = 64,
>   .m_max = 1023,
>   .min_freq = 1050,
> + .dynamic_dphy = 1,
>  };
>  
>  static const struct samsung_dsim_driver_data *
> @@ -698,13 +699,50 @@ static void samsung_dsim_set_phy_ctrl(struct 
> samsung_dsim *dsi)
>   const struct samsung_dsim_driver_data *driver_data = dsi->driver_data;
>   const unsigned int *reg_values = driver_data->reg_values;
>   u32 reg;
> + struct drm_display_mode *m = >mode;
> + int bpp = mipi_dsi_pixel_format_to_bpp(dsi->format);
> + struct phy_configure_opts_mipi_dphy cfg;
> + int clk_prepare, lpx, clk_zero, clk_post, clk_trail;
> + int hs_exit, hs_prepare, hs_zero, hs_trail;
> + unsigned long long clock_in_hz = m->clock * 1000;
>  
>   if (driver_data->has_freqband)
>   return;
>  
> + /* The dynamic_phy has the ability to adjust PHY Timing settings */
> + if (driver_data->dynamic_dphy) {
> + phy_mipi_dphy_get_default_config(clock_in_hz, bpp, dsi->lanes, 
> );
> +
> + /*
> +  * TODO:
> +  * The tech reference manual for i.MX8M Mini/Nano/Plus
> +  * doesn't state what the definition of the PHYTIMING
> +  * bits are beyond their address and bit position.
> +  * After reviewing NXP's downstream code, it appears
> +  * that the various PHYTIMING registers take the number
> +  * of cycles and use various dividers on them.  This
> +  * calculation does not result in an exact match to the
> +  * downstream code, but it is very close, and it appears
> +  * to sync at a variety of resolutions. If someone
> +  * can get a more accurate mathematical equation needed
> +  * for these registers, this should be updated.
> +  */
> +
> + lpx = PS_TO_CYCLE(cfg.lpx, clock_in_hz);
> + hs_exit = PS_TO_CYCLE(cfg.hs_exit, clock_in_hz);
> + clk_prepare = PS_TO_CYCLE(cfg.clk_prepare, clock_in_hz);
> + clk_zero = PS_TO_CYCLE(cfg.clk_zero, clock_in_hz);
> + clk_post = PS_TO_CYCLE(cfg.clk_post, clock_in_hz);
> + clk_trail = PS_TO_CYCLE(cfg.clk_trail, clock_in_hz);
> + hs_prepare = PS_TO_CYCLE(cfg.hs_prepare, clock_in_hz);
> + hs_zero = PS_TO_CYCLE(cfg.hs_zero, clock_in_hz);
> + hs_trail = PS_TO_CYCLE(cfg.hs_trail, clock_in_hz);
> + }
> +
>   /* B D-PHY: D-PHY Master & Slave Analog Block control */
>   reg = reg_values[PHYCTRL_ULPS_EXIT] | reg_values[PHYCTRL_VREG_LP] |
>   reg_values[PHYCTRL_SLEW_UP];
> +
>   samsung_dsim_write(dsi, DSIM_PHYCTRL_REG, reg);
>  
>   /*
> @@ -712,7 +750,11 @@ static void samsung_dsim_set_phy_ctrl(struct 
> samsung_dsim *dsi)
>* T HS-EXIT: Time that the transmitter drives LP-11 following a HS
>*  burst
>*/
> - reg = reg_values[PHYTIMING_LPX] | reg_values[PHYTIMING_HS_EXIT];
> + if (driver_data->dynamic_dphy)
> + reg  = DSIM_PHYTIMING_LPX(lpx) | 
> DSIM_PHYTIMING_HS_EXIT(hs_exit);
> + else
> + reg = reg_values[PHYTIMING_LPX] | reg_values[PHYTIMING_HS_EXIT];
> +
>   
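
As a worked example of the conversion above (assuming the macro's divisor is 10^12, i.e. picoseconds per second, and that the second argument is really a rate in Hz, as the review comment notes):

	/* Sketch only, not the driver macro itself. */
	static inline u64 ps_to_cycle(u64 ps, u64 hz)
	{
		return DIV64_U64_ROUND_CLOSEST(ps * hz, 1000000000000ULL);
	}

	/* e.g. cfg.lpx = 53000 ps at a 1 GHz HS clock:
	 * 53000 * 1000000000 / 10^12 = 53 cycles. */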

Re: [Intel-gfx] [RFC PATCH 2/4] drm/cgroup: Add memory accounting to DRM cgroup

2023-05-03 Thread Maarten Lankhorst



On 2023-05-03 17:31, Tvrtko Ursulin wrote:


On 03/05/2023 09:34, Maarten Lankhorst wrote:

Based roughly on the rdma and misc cgroup controllers, with a lot of
the accounting code borrowed from rdma.

The interface is simple:
- populate drmcgroup_device->regions[..] name and size for each active
   region.
- Call drm(m)cg_register_device()
- Use drmcg_try_charge to check if you can allocate a chunk of memory,
   use drmcg_uncharge when freeing it. This may return an error code,
   or -EAGAIN when the cgroup limit is reached.

The ttm code transforms -EAGAIN back to -ENOSPC since it has specific
logic for -ENOSPC, and returning -EAGAIN to userspace causes drmIoctl
to restart infinitely.

This API allows you to limit stuff with cgroups.
You can see the supported cards in /sys/fs/cgroup/drm.capacity
You need to echo +drm to cgroup.subtree_control, and then you can
partition memory.

In each cgroup subdir:
drm.max shows the current limits of the cgroup.
drm.current the current amount of allocated memory used by this cgroup.
drm.events shows the number of times the max memory limit was reached.


Events is not in the patch?


Oops, correct.

I removed it since it added more complexity, and didn't seem granular 
enough to be useful.


I removed it from the documentation, but not the commit message it seems. :)




Signed-off-by: Maarten Lankhorst 
---
  Documentation/admin-guide/cgroup-v2.rst |  46 ++
  Documentation/gpu/drm-compute.rst   |  54 +++
  include/linux/cgroup_drm.h  |  81 
  kernel/cgroup/drm.c | 539 +++-
  4 files changed, 699 insertions(+), 21 deletions(-)
  create mode 100644 Documentation/gpu/drm-compute.rst

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index f67c0829350b..b858d99cb2ef 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2374,6 +2374,52 @@ RDMA Interface Files
    mlx4_0 hca_handle=1 hca_object=20
    ocrdma1 hca_handle=1 hca_object=23
+DRM
+---
+
+The "drm" controller regulates the distribution and accounting of
+DRM resources.
+
+DRM Interface Files
+~~~~~~~~~~~~~~~~~~~
+
+  drm.max
+    A readwrite nested-keyed file that exists for all the cgroups
+    except root that describes current configured resource limit
+    for a DRM device.
+
+    Lines are keyed by device name and are not ordered.
+    Each line contains space separated resource name and its configured
+    limit that can be distributed.
+
+    The following nested keys are defined.
+
+  ==========  ==================================================
+  region.*    Maximum number of bytes allocatable in this region
+  ==========  ==================================================
+
+    An example for xe follows::
+
+  0000:03:00.0 region.vram0=1073741824 region.stolen=max
+
+  drm.capacity
+    A read-only file that describes maximum region capacity.
+    It only exists on the root cgroup. Not all memory can be
+    allocated by cgroups, as the kernel reserves some for
+    internal use.
+
+    An example for xe follows::
+
+  0000:03:00.0 region.vram0=8514437120 region.stolen=67108864
+
+  drm.current
+    A read-only file that describes current resource usage.
+    It exists for all the cgroup except root.
+
+    An example for xe follows::
+
+  0000:03:00.0 region.vram0=12550144 region.stolen=8650752
+
  HugeTLB
  ---
diff --git a/Documentation/gpu/drm-compute.rst b/Documentation/gpu/drm-compute.rst
new file mode 100644
index ..116270976ef7
--- /dev/null
+++ b/Documentation/gpu/drm-compute.rst
@@ -0,0 +1,54 @@
+==================================
+Long running workloads and compute
+==================================
+
+Long running workloads (compute) are workloads that will not complete in 10
+seconds. (The time a user will wait before reaching for the power button.)
+This means that other techniques need to be used to manage those workloads
+that cannot use fences.
+
+Some hardware may schedule compute jobs, and have no way to pre-empt them, or
+have their memory swapped out from them. Or they simply want their workload
+not to be preempted or swapped out at all.
+
+This means that it differs from what is described in driver-api/dma-buf.rst.
+
+As with normal compute jobs, dma-fence may not be used at all. In this case,
+not even to force preemption. The driver is simply forced to unmap a BO
+from the long compute job's address space on unbind immediately, not even
+waiting for the workload to complete. Effectively this terminates the workload
+when there is no hardware support to recover.
+
+Since this is undesirable, there need to be mitigations to prevent a workload
+from being terminated. There are several possible approaches, all with their
+advantages and drawbacks.
+
+The first approach you will likely try is to pin all buffers used by compute.
+This 

Re: [Intel-gfx] [RFC PATCH 2/4] drm/cgroup: Add memory accounting to DRM cgroup

2023-05-03 Thread Tvrtko Ursulin



On 03/05/2023 09:34, Maarten Lankhorst wrote:

Based roughly on the rdma and misc cgroup controllers, with a lot of
the accounting code borrowed from rdma.

The interface is simple:
- populate drmcgroup_device->regions[..] name and size for each active
   region.
- Call drm(m)cg_register_device()
- Use drmcg_try_charge to check if you can allocate a chunk of memory,
   use drmcg_uncharge when freeing it. This may return an error code,
   or -EAGAIN when the cgroup limit is reached.

The ttm code transforms -EAGAIN back to -ENOSPC since it has specific
logic for -ENOSPC, and returning -EAGAIN to userspace causes drmIoctl
to restart infinitely.

This API allows you to limit stuff with cgroups.
You can see the supported cards in /sys/fs/cgroup/drm.capacity
You need to echo +drm to cgroup.subtree_control, and then you can
partition memory.

In each cgroup subdir:
drm.max shows the current limits of the cgroup.
drm.current the current amount of allocated memory used by this cgroup.
drm.events shows the number of times the max memory limit was reached.


Events is not in the patch?


Signed-off-by: Maarten Lankhorst 
---
  Documentation/admin-guide/cgroup-v2.rst |  46 ++
  Documentation/gpu/drm-compute.rst   |  54 +++
  include/linux/cgroup_drm.h  |  81 
  kernel/cgroup/drm.c | 539 +++-
  4 files changed, 699 insertions(+), 21 deletions(-)
  create mode 100644 Documentation/gpu/drm-compute.rst

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index f67c0829350b..b858d99cb2ef 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2374,6 +2374,52 @@ RDMA Interface Files
  mlx4_0 hca_handle=1 hca_object=20
  ocrdma1 hca_handle=1 hca_object=23
  
+DRM
+---
+
+The "drm" controller regulates the distribution and accounting of
+DRM resources.
+
+DRM Interface Files
+~~~~~~~~~~~~~~~~~~~
+
+  drm.max
+   A readwrite nested-keyed file that exists for all the cgroups
+   except root that describes current configured resource limit
+   for a DRM device.
+
+   Lines are keyed by device name and are not ordered.
+   Each line contains space separated resource name and its configured
+   limit that can be distributed.
+
+   The following nested keys are defined.
+
+ ==========  ==================================================
+ region.*    Maximum number of bytes allocatable in this region
+ ==========  ==================================================
+
+   An example for xe follows::
+
+ 0000:03:00.0 region.vram0=1073741824 region.stolen=max
+
+  drm.capacity
+   A read-only file that describes maximum region capacity.
+   It only exists on the root cgroup. Not all memory can be
+   allocated by cgroups, as the kernel reserves some for
+   internal use.
+
+   An example for xe follows::
+
+ 0000:03:00.0 region.vram0=8514437120 region.stolen=67108864
+
+  drm.current
+   A read-only file that describes current resource usage.
+   It exists for all the cgroup except root.
+
+   An example for xe follows::
+
+ 0000:03:00.0 region.vram0=12550144 region.stolen=8650752
+
  HugeTLB
  ---
  
diff --git a/Documentation/gpu/drm-compute.rst b/Documentation/gpu/drm-compute.rst
new file mode 100644
index ..116270976ef7
--- /dev/null
+++ b/Documentation/gpu/drm-compute.rst
@@ -0,0 +1,54 @@
+==================================
+Long running workloads and compute
+==================================
+
+Long running workloads (compute) are workloads that will not complete in 10
+seconds. (The time a user will wait before reaching for the power button.)
+This means that other techniques need to be used to manage those workloads
+that cannot use fences.
+
+Some hardware may schedule compute jobs, and have no way to pre-empt them, or
+have their memory swapped out from them. Or they simply want their workload
+not to be preempted or swapped out at all.
+
+This means that it differs from what is described in driver-api/dma-buf.rst.
+
+As with normal compute jobs, dma-fence may not be used at all. In this case,
+not even to force preemption. The driver is simply forced to unmap a BO
+from the long compute job's address space on unbind immediately, not even
+waiting for the workload to complete. Effectively this terminates the workload
+when there is no hardware support to recover.
+
+Since this is undesirable, there need to be mitigations to prevent a workload
+from being terminated. There are several possible approaches, all with their
+advantages and drawbacks.
+
+The first approach you will likely try is to pin all buffers used by compute.
+This guarantees that the job will run uninterrupted, but also allows a
+denial of service attack by pinning as much memory as possible, hogging
+all the GPU memory, and possibly a 
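
For illustration, the driver-side charging flow described in the commit message looks roughly like the sketch below. The exact signatures of drmcg_try_charge()/drmcg_uncharge() and the variable names are assumptions based on the text, not the literal patch.

	/* Hypothetical buffer-object allocation path in a driver. */
	ret = drmcg_try_charge(&cg, &drmcg_dev, region_id, size);
	if (ret == -EAGAIN)
		ret = -ENOSPC;	/* the TTM translation mentioned above */
	if (ret)
		return ret;

	/* ... allocate from the region; later, when freeing: */
	drmcg_uncharge(cg, &drmcg_dev, region_id, size);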

Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread Christian König

Am 03.05.23 um 17:08 schrieb Felix Kuehling:

Am 2023-05-03 um 03:59 schrieb Christian König:

Am 02.05.23 um 20:41 schrieb Alex Deucher:
On Tue, May 2, 2023 at 11:22 AM Timur Kristóf 
 wrote:

[SNIP]

In my opinion, the correct solution to those problems would be
if
the kernel could give userspace the necessary information about
a
GPU hang before a GPU reset.


  The fundamental problem here is that the kernel doesn't have
that
information either. We know which IB timed out and can
potentially do
a devcoredump when that happens, but that's it.


Is it really not possible to know such a fundamental thing as what
the
GPU was doing when it hung? How are we supposed to do any kind of
debugging without knowing that?


Yes, that's indeed something at least I have been trying to figure out
for years as well.


Basically there are two major problems:
1. When the ASIC is hung you can't talk to the firmware engines any 
more and most state is not exposed directly, but just through some 
fw/hw interface.
    Just take a look at how umr reads the shader state from the SQ. 
When that block is hung you can't do that any more and basically have 
no chance at all to figure out why it's hung.


    Same for other engines, I remember once spending a week figuring 
out why the UVD block is hung during suspend. Turned out to be a 
debugging nightmare because any time you touch any register of that 
block the whole system would hang.


2. There are tons of things going on in a pipeline fashion or even 
completely in parallel. For example the CP is just the beginning of a 
rather long pipeline which at the end produces a bunch of pixels.
    In almost all cases I've seen you ran into a problem somewhere 
deep in the pipeline and only very rarely at the beginning.




I wonder what AMD's Windows driver team is doing with this problem,
surely they must have better tools to deal with GPU hangs?

For better or worse, most teams internally rely on scan dumps via
JTAG
which sort of limits the usefulness outside of AMD, but also gives
you
the exact state of the hardware when it's hung so the hardware teams
prefer it.


How does this approach scale? It's not something we can ask users to
do, and even if all of us in the radv team had a JTAG device, we
wouldn't be able to play every game that users experience random hangs
with.

It doesn't scale or lend itself particularly well to external
development, but that's the current state of affairs.


The usual approach seems to be to reproduce a problem in a lab and 
have a JTAG attached to give the hw guys a scan dump and they can 
then tell you why something didn't work as expected.


That's the worst-case scenario where you're debugging HW or FW issues. 
Those should be pretty rare post-bringup. But are there hangs caused 
by user mode driver or application bugs that are easier to debug and 
probably don't even require a GPU reset? For example most VM faults 
can be handled without hanging the GPU. Similarly, a shader in an 
endless loop should not require a full GPU reset. In the KFD compute 
case, that's still preemptible and the offending process can be killed 
with Ctrl-C or debugged with rocm-gdb.


We also have infinite loop in shader abort for gfx and page faults are 
pretty rare with OpenGL (a bit more often with Vulkan) and can be 
handled gracefully on modern hw (they just spam the logs).


The majority of the problems is unfortunately that we really get hard 
hangs because of some hw issues. That can be caused by unlucky timing, 
power management or doing things in an order the hw doesn't expect.


Regards,
Christian.



It's more complicated for graphics because of the more complex 
pipeline and the lack of CWSR. But it should still be possible to do 
some debugging without JTAG if the problem is in SW and not HW or FW. 
It's probably worth improving that debugability without getting 
hung-up on the worst case.


Maybe user mode graphics queues will offer a better way of recovering 
from these kinds of bugs, if the graphics pipeline can be unstuck 
without a GPU reset, just by killing the offending user mode queue.


Regards,
  Felix




And yes that absolutely doesn't scale.

Christian.



Alex






Re: [PATCH V3 4/7] drm: bridge: samsung-dsim: Select GENERIC_PHY_MIPI_DPHY

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> In order to support variable DPHY timings, it's necessary
> to enable GENERIC_PHY_MIPI_DPHY so phy_mipi_dphy_get_default_config
> can be used to determine the nominal values for a given resolution
> and refresh rate.
> 
> Signed-off-by: Adam Ford 

This fixes the build error which existed in v2!

Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.

Tested-by: Frieder Schrempf 
Reviewed-by: Frieder Schrempf 


Re: [PATCH V3 3/7] drm: bridge: samsung-dsim: Fetch pll-clock-frequency automatically

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> Make the pll-clock-frequency optional.  If it's present, use it
> to maintain backwards compatibility with existing hardware.  If it
> is absent, read clock rate of "sclk_mipi" to determine the rate.
> 
> Signed-off-by: Adam Ford 
> Tested-by: Chen-Yu Tsai 

Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.

Tested-by: Frieder Schrempf 
Reviewed-by: Frieder Schrempf 



Re: [PATCH V3 2/7] drm: bridge: samsung-dsim: Fix PMS Calculator on imx8m[mnp]

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> According to Table 13-45 of the i.MX8M Mini Reference Manual, the min
> and max values for M and the frequency range for the VCO_out
> calculator were incorrect.  This information was contradicted in other
> parts of the mini, nano and plus manuals.  After reaching out to my
> NXP Rep, when confronting him about discrepancies in the Nano manual,
> he responded with:
>  "Yes it is definitely wrong, the one that is part
>   of the NOTE in MIPI_DPHY_M_PLLPMS register table against PMS_P,
>   PMS_M and PMS_S is not correct. I will report this to Doc team,
>   the one customer should be take into account is the Table 13-40
>   DPHY PLL Parameters and the Note above."
> 
> These updated values also match what is used in the NXP downstream
> kernel.
> 
> To fix this, make new variables to hold the min and max values of m
> and the minimum value of VCO_out, and update the PMS calculator to
> use these new variables instead of using hard-coded values to keep
> the backwards compatibility with other parts using this driver.
> 
> Fixes: 4d562c70c4dc ("drm: bridge: samsung-dsim: Add i.MX8M Mini/Nano 
> support")
> Signed-off-by: Adam Ford 
> Reviewed-by: Lucas Stach 
> Tested-by: Chen-Yu Tsai 

Tested on Kontron BL i.MX8MM with SN65DSI84 and ADV7535 bridges.

Tested-by: Frieder Schrempf 
Reviewed-by: Frieder Schrempf 
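
To make the fix concrete: the new variables end up as per-SoC limits in the driver data (the .m_min/.m_max/.min_freq initializers are visible in the 5/7 diff), and the PMS search is bounded by them. A minimal sketch, assuming min_freq is in MHz; the loop body is illustrative only:

	/* Per-SoC PLL limits, consulted by the PMS calculator. */
	for (m = driver_data->m_min; m <= driver_data->m_max; m++) {
		u64 fvco = div64_u64((u64)fin * m, p);	/* VCO_out candidate */

		if (fvco < (u64)driver_data->min_freq * HZ_PER_MHZ)
			continue;	/* below the documented VCO_out minimum */
		/* ... evaluate P/M/S candidates against the target fout ... */
	}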


Re: [PATCH V3 1/7] drm: bridge: samsung-dsim: fix blanking packet size calculation

2023-05-03 Thread Frieder Schrempf
On 02.05.23 03:07, Adam Ford wrote:
> From: Lucas Stach 
> 
> Scale the blanking packet sizes to match the ratio between HS clock
> and DPI interface clock. The controller seems to do internal scaling
> to the number of active lanes, so we don't take those into account.
> 
> Signed-off-by: Lucas Stach 
> Signed-off-by: Adam Ford 
> Tested-by: Chen-Yu Tsai 

Tested-by: Frieder Schrempf 
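
For anyone skimming the series, the scaling this patch introduces is the ratio the 7/7 diff later makes explicit, with the HS byte clock and the DPI pixel clock both in kHz; this is the series' own code, repeated here as a summary:

	int byte_clk_khz = dsi->hs_clock / 1000 / 8;	/* HS byte clock */
	int hfp = (m->hsync_start - m->hdisplay) * byte_clk_khz / m->clock;
	int hbp = (m->htotal - m->hsync_end) * byte_clk_khz / m->clock;
	int hsa = (m->hsync_end - m->hsync_start) * byte_clk_khz / m->clock;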


Re: [RFC PATCH 0/1] Add AMDGPU_INFO_GUILTY_APP ioctl

2023-05-03 Thread Felix Kuehling

Am 2023-05-03 um 03:59 schrieb Christian König:

Am 02.05.23 um 20:41 schrieb Alex Deucher:
On Tue, May 2, 2023 at 11:22 AM Timur Kristóf 
 wrote:

[SNIP]

In my opinion, the correct solution to those problems would be
if
the kernel could give userspace the necessary information about
a
GPU hang before a GPU reset.


  The fundamental problem here is that the kernel doesn't have
that
information either. We know which IB timed out and can
potentially do
a devcoredump when that happens, but that's it.


Is it really not possible to know such a fundamental thing as what
the
GPU was doing when it hung? How are we supposed to do any kind of
debugging without knowing that?


Yes, that's indeed something at least I have been trying to figure out
for years as well.


Basically there are two major problems:
1. When the ASIC is hung you can't talk to the firmware engines any 
more and most state is not exposed directly, but just through some 
fw/hw interface.
    Just take a look at how umr reads the shader state from the SQ. 
When that block is hung you can't do that any more and basically have 
no chance at all to figure out why it's hung.


    Same for other engines, I remember once spending a week figuring 
out why the UVD block is hung during suspend. Turned out to be a 
debugging nightmare because any time you touch any register of that 
block the whole system would hang.


2. There are tons of things going on in a pipeline fashion or even 
completely in parallel. For example the CP is just the beginning of a 
rather long pipeline which at the end produces a bunch of pixels.
    In almost all cases I've seen you ran into a problem somewhere 
deep in the pipeline and only very rarely at the beginning.




I wonder what AMD's Windows driver team is doing with this problem,
surely they must have better tools to deal with GPU hangs?

For better or worse, most teams internally rely on scan dumps via
JTAG
which sort of limits the usefulness outside of AMD, but also gives
you
the exact state of the hardware when it's hung so the hardware teams
prefer it.


How does this approach scale? It's not something we can ask users to
do, and even if all of us in the radv team had a JTAG device, we
wouldn't be able to play every game that users experience random hangs
with.

It doesn't scale or lend itself particularly well to external
development, but that's the current state of affairs.


The usual approach seems to be to reproduce a problem in a lab and 
have a JTAG attached to give the hw guys a scan dump and they can then 
tell you why something didn't work as expected.


That's the worst-case scenario where you're debugging HW or FW issues. 
Those should be pretty rare post-bringup. But are there hangs caused by 
user mode driver or application bugs that are easier to debug and 
probably don't even require a GPU reset? For example most VM faults can 
be handled without hanging the GPU. Similarly, a shader in an endless 
loop should not require a full GPU reset. In the KFD compute case, 
that's still preemptible and the offending process can be killed with 
Ctrl-C or debugged with rocm-gdb.


It's more complicated for graphics because of the more complex pipeline 
and the lack of CWSR. But it should still be possible to do some 
debugging without JTAG if the problem is in SW and not HW or FW. It's 
probably worth improving that debuggability without getting hung-up on 
the worst case.


Maybe user mode graphics queues will offer a better way of recovering 
from these kinds of bugs, if the graphics pipeline can be unstuck 
without a GPU reset, just by killing the offending user mode queue.


Regards,
  Felix




And yes that absolutely doesn't scale.

Christian.



Alex




Re: [PATCH v3 5/6] fbdev: Move framebuffer I/O helpers into

2023-05-03 Thread Arnd Bergmann
On Wed, May 3, 2023, at 16:55, Thomas Zimmermann wrote:
> Am 02.05.23 um 22:06 schrieb Arnd Bergmann:

>> It's probably safe to deal with all the above by either adding
>> architecture specific overrides to the current version, or
>> by doing the semantic changes before the move to asm/fb.h, but
>> one way or the other I'd prefer this to be separate from the
>> consolidation patch that should not have any changes in behavior.
>
> I think I'll add architecture overrides that contain the current code, 
> even if they contain some force-casting wrt __iomem. If anyone wants to 
> fix the issues, they can then address them easily.

Ok, sounds good,

 Arnd
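
For reference, an architecture override under the agreed approach would follow the usual asm-generic pattern, roughly as below; the helper body is illustrative only, standing in for whatever arch-specific behaviour needs preserving:

	/* arch/<arch>/include/asm/fb.h -- sketch */
	#define fb_memset fb_memset
	static inline void fb_memset(volatile void __iomem *dst, int c, size_t n)
	{
		memset_io(dst, c, n);	/* keep today's arch behaviour here */
	}

	#include <asm-generic/fb.h>	/* generic defaults for the rest */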


Re: [PATCH v2 17/19] fbdev: Validate info->screen_{base,buffer} in fb_ops implementations

2023-05-03 Thread Geert Uytterhoeven
Hi Thomas,

On Wed, May 3, 2023 at 4:30 PM Thomas Zimmermann  wrote:
> Am 03.05.23 um 11:51 schrieb Geert Uytterhoeven:
> > On Fri, Apr 28, 2023 at 2:26 PM Thomas Zimmermann  
> > wrote:
> >> Push the test for info->screen_base from fb_read() and fb_write() into
> >> the implementations of struct fb_ops.{fb_read,fb_write}. In cases where
> >> the driver operates on info->screen_buffer, test this field instead.
> >>
> >> While bothi fields, screen_base and screen_buffer, are stored in the
> >
> > both
> >
> >> same location, they refer to different address spaces. For correctness,
> >> we want to test each field in exactly the code that uses it.
> >
> > Not a direct comment for this patch: and later the union can be split
> > in two separate fields, to protect against misuse?
>
> No idea. Currently we have sparse that warns about mismatching address
> spaces if the fields are mixed up. That's good enough, as far as I'm concerned.

The potential issue that is still present is that an fbdev driver uses
fb_info.screen_base, and configures the use of drawing ops that use
fb_info.screen_buffer (or vice-versa), which will happily use the wrong
type of pointer.  Sparse doesn't protect against that.

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
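
A sketch of the per-implementation validation being discussed, assuming an I/O-memory backed driver (a system-memory driver would test screen_buffer instead); the shared helper name is an assumption, not the literal patch:

	static ssize_t drv_fb_read(struct fb_info *info, char __user *buf,
				   size_t count, loff_t *ppos)
	{
		if (!info->screen_base)		/* I/O-memory backed driver */
			return -ENODEV;

		return fb_io_read(info, buf, count, ppos);
	}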


Re: drm/sched: Replacement for drm_sched_resubmit_jobs() is deprecated

2023-05-03 Thread Christian König

Am 03.05.23 um 15:10 schrieb Lucas Stach:

Am Mittwoch, dem 03.05.2023 um 13:40 +0200 schrieb Christian König:

Hi Lucas,

Am 03.05.23 um 12:28 schrieb Lucas Stach:

Hi Christian,

Am Mittwoch, dem 03.05.2023 um 10:47 +0200 schrieb Christian König:

Adding Luben as well.

Am 03.05.23 um 10:16 schrieb Boris Brezillon:

[SNIP]

To sum-up, we shouldn't call drm_sched_{start,stop,resubmit_jobs}().

After the discussion I had with Matthew yesterday on IRC, I
realized there was no clear agreement on this. Matthew uses those 3
helpers in the Xe driver right now, and given he intends to use a
multi-threaded wq for its 1:1 schedulers run queue, there's no way he
can get away without calling drm_sched_{start,stop}().
drm_sched_resubmit_jobs() can be open-coded in each driver, but I'm
wondering if it wouldn't be preferable to add a ::resubmit_job() method
or extend the ::run_job() one to support the resubmit semantics, which,
AFAIU, is just about enforcing the job done fence (the one returned by
::run_job()) doesn't transition from a signaled to an unsignaled state.

But probably more important than providing a generic helper, we should
document the resubmit semantics (AKA, what should and/or shouldn't be
done with pending jobs when a recovery happens). Because forbidding
people to use a generic helper function doesn't give any guarantee that
they'll do the right thing when coding their own logic, unless we give
clues about what's considered right/wrong, and the current state of the
doc is pretty unclear in this regard.

I should probably talk about the history of the re-submit feature a bit
more.

Basically AMD came up with re-submission as a cheap way of increasing
the reliability of GPU resets. Problem is that it turned into an
absolutely nightmare. We tried for the last 5 years or so to get that
stable and it's still crashing.

The first and most major problem is that the kernel doesn't even have the
information whether re-submitting jobs is possible or not. For example a job
which has already been pushed to the hw could have grabbed a binary
semaphore and re-submitting it will just wait forever for the semaphore
to be released.


I can follow this argument, but concluding that job resubmission is
impossible is punishing simple GPUs. On Vivante GPUs we have exactly
one job running at a time and all dependencies are visible to the
scheduler, as we don't have/use any hardware synchronization mechanism,
so all synchronization is piped through kernel visible fences.

It's reasonably easy for the etnaviv driver to find the guilty job to
skip but resubmit all other jobs in the current hardware queue. I'm not
really fond of having to make all applications deal with innocent
context resets, while we can solve this via resubmission on simple HW.

I know that more complex hardware and use-cases might still require the
kernel driver for this HW to give up and shoot all contexts active at
the time of the GPU reset, but that's the price you pay for the
hardware being more capable. I don't see why we should also pay that
price on really simple HW.

You can still re-create the hw state inside your driver to continue work
from some point if you know that this will work.

As I wrote below the scheduler component can even provide help with
that in the form of providing all the unsignaled hw or scheduler fences
for example.

But what we absolutely should *not* do is to have this re-submission
feature, because that requires re-using the dma_fence objects. In other
words this dance with detaching the scheduler fence from the hw fence
and attaching a new one is what absolutely doesn't work.


The second problem is that the dma_fence semantics never allow the state
of a fence to transition from signaled back to unsignaled. This
means that you can't re-use the hw fence and need to allocate a new one,
but since memory allocation is forbidden inside a reset handler as well
(YES we need to better document that part) you actually need to keep a
bunch of hw fences pre-allocated around to make this work. Amdgpu chose
to illegally re-use the hw fence instead which only works with quite
extreme hacks.


I'm with Boris here. Could you please explain when a fence would be
already signaled in a GPU reset scenario and would need to go back to
unsignaled, so we are on the same page here?

Take a look at how this re-submission feature of the scheduler works.
The approach is basically that you detach the hw fence from the
scheduler fence and then attach a new one.


Right, but this shouldn't be a problem, as long as the old fence isn't
signaled yet, right? It becomes a problem when the GPU reset and fence
signaling are racing each other, due to insufficient hardware/software
state synchronization.


Exactly that.


I'm sure that the necessary synchronization can be hard to get right,
but it's not the act of switching one unsignaled fence to a new one or
reusing the old unsignaled fence that's causing problems, but the
complications of making sure that 
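
For readers following along, the "dance" under discussion is roughly this pattern from drm_sched_resubmit_jobs(), paraphrased as a sketch rather than quoted exactly:

	/* On reset: drop the old hw fence and attach a fresh one to the
	 * scheduler fence. run_job() may need to allocate a fence here,
	 * i.e. inside the reset path, which is the core objection. */
	dma_fence_put(s_job->s_fence->parent);
	fence = sched->ops->run_job(s_job);
	s_job->s_fence->parent = fence ? dma_fence_get(fence) : NULL;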

Re: [PATCH v3 5/6] fbdev: Move framebuffer I/O helpers into

2023-05-03 Thread Thomas Zimmermann

Hi

Am 02.05.23 um 22:06 schrieb Arnd Bergmann:

On Tue, May 2, 2023, at 15:02, Thomas Zimmermann wrote:

Implement framebuffer I/O helpers, such as fb_read*() and fb_write*(),
in the architecture's  header file or the generic one.

The common case has been the use of regular I/O functions, such as
__raw_readb() or memset_io(). A few architectures used plain system-
memory reads and writes. Sparc used helpers for its SBus.

The architectures that used special cases provide the same code in
their __raw_*() I/O helpers. So the patch replaces this code with the
__raw_*() functions and moves it to  for all
architectures.

v3:
* implement all architectures with generic helpers
* support reordering and native byte order (Geert, Arnd)


This looks good for the read/write helpers, but I'm a little
worried about the memset and memcpy functions, since they do
change behavior on some architectures:

- on sparc64, fb_mem{set,cpy} uses ASI_PHYS_BYPASS_EC_E (like __raw_readb)
   while mem{set_,cpy_from,cpy_to} uses ASI_PHYS_BYPASS_EC_E_L (like readb)
   I don't know the effect of that, but it seems intentional

- on loongarch and csky, the _io variants avoid unaligned access,
   while the normal memcpy/memset is probably broken, so your
   patch is a bugfix

- on ia64, the _io variants use bytewise access and avoid any longer
   loads and stores, so your patch probably makes things slower.

It's probably safe to deal with all the above by either adding
architecture specific overrides to the current version, or
by doing the semantic changes before the move to asm/fb.h, but
one way or the other I'd prefer this to be separate from the
consolidation patch that should not have any changes in behavior.


I think I'll add architecture overrides that contain the current code, 
even if they contain some force-casting wrt __iomem. If anyone wants to 
fix the issues, they can then address them easily.


Best regards
Thomas



  Arnd


--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)


OpenPGP_signature
Description: OpenPGP digital signature


Re: [PATCH] drm/udl: delete dead code

2023-05-03 Thread Thomas Zimmermann

Merged, thanks!

Am 02.05.23 um 14:59 schrieb Dan Carpenter:

The "unode" pointer cannot be NULL here and checking for it causes
Smatch warnings:

drivers/gpu/drm/udl/udl_main.c:259 udl_get_urb_locked()
warn: can 'unode' even be NULL?

Fortunately, it's just harmless dead code which can be removed.  It's
left over from commit c5c354a3a472 ("drm/udl: Fix inconsistent urbs.count
value during udl_free_urb_list()").

Reported-by: kernel test robot 
Signed-off-by: Dan Carpenter 
---
  drivers/gpu/drm/udl/udl_main.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/udl/udl_main.c b/drivers/gpu/drm/udl/udl_main.c
index 061cb88c08a2..3ebe2ce55dfd 100644
--- a/drivers/gpu/drm/udl/udl_main.c
+++ b/drivers/gpu/drm/udl/udl_main.c
@@ -255,7 +255,7 @@ static struct urb *udl_get_urb_locked(struct udl_device 
*udl, long timeout)
list_del_init(>entry);
udl->urbs.available--;
  
-	return unode ? unode->urb : NULL;
+   return unode->urb;
  }
  
  #define GET_URB_TIMEOUT	HZ


--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)


OpenPGP_signature
Description: OpenPGP digital signature


Re: [PATCH v2 17/19] fbdev: Validate info->screen_{base,buffer} in fb_ops implementations

2023-05-03 Thread Thomas Zimmermann

Hi

Am 03.05.23 um 11:51 schrieb Geert Uytterhoeven:

On Fri, Apr 28, 2023 at 2:26 PM Thomas Zimmermann  wrote:

Push the test for info->screen_base from fb_read() and fb_write() into
the implementations of struct fb_ops.{fb_read,fb_write}. In cases where
the driver operates on info->screen_buffer, test this field instead.

While bothi fields, screen_base and screen_buffer, are stored in the


both


same location, they refer to different address spaces. For correctness,
we want to test each field in exactly the code that uses it.


Not a direct comment for this patch: and later the union can be split
in two separate fields, to protect against misuse?


No idea. Currently we have sparse that warns about mismatching address 
spaces if the fields are mixed up. That's good enough, as far as I'm concerned.


Best regards
Thomas



Gr{oetje,eeting}s,

 Geert


--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
 -- Linus Torvalds


--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Frankenstrasse 146, 90461 Nuernberg, Germany
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman
HRB 36809 (AG Nuernberg)


OpenPGP_signature
Description: OpenPGP digital signature


[PATCH v3] drm/i915: avoid flush_scheduled_work() usage

2023-05-03 Thread Tetsuo Handa
Like commit c4f135d643823a86 ("workqueue: Wrap flush_workqueue() using a
macro") says, flush_scheduled_work() is dangerous and will be forbidden.

i915 became the last flush_scheduled_work() user, but developers cannot
find time to audit which work items this flush_scheduled_work() needs
to wait for.

Therefore, for now let's start with blind/mechanical conversion within
the whole drivers/gpu/drm/i915/ directory, based on an assumption that
i915 does not need to wait for work items outside of this directory.

Link: https://lkml.kernel.org/r/87sfeita1p@intel.com
Signed-off-by: Tetsuo Handa 
Cc: Tvrtko Ursulin 
Cc: Jani Nikula 
Cc: Ville Syrjälä 
---
Changes in v3:
  Refreshed using drm-tip.git, for commit 40053823baad ("drm/i915/display:
  move modeset probe/remove functions to intel_display_driver.c") moved
  flush_scheduled_work() from intel_display.c to intel_display_driver.c .

  Please check the comment from Daniel Vetter at
  https://lkml.kernel.org/r/ZDuntOkUeh0Eve8a@phenom.ffwll.local .

Changes in v2:
  Add missing alloc_workqueue() failure check.

 drivers/gpu/drm/i915/display/intel_display.c   |  2 +-
 .../drm/i915/display/intel_display_driver.c|  2 +-
 drivers/gpu/drm/i915/display/intel_dmc.c   |  2 +-
 drivers/gpu/drm/i915/display/intel_dp.c|  2 +-
 .../drm/i915/display/intel_dp_link_training.c  |  2 +-
 drivers/gpu/drm/i915/display/intel_drrs.c  |  2 +-
 drivers/gpu/drm/i915/display/intel_fbc.c   |  2 +-
 drivers/gpu/drm/i915/display/intel_fbdev.c |  2 +-
 drivers/gpu/drm/i915/display/intel_hdcp.c  | 18 +-
 drivers/gpu/drm/i915/display/intel_hotplug.c   | 12 ++--
 drivers/gpu/drm/i915/display/intel_opregion.c  |  2 +-
 drivers/gpu/drm/i915/display/intel_pps.c   |  2 +-
 drivers/gpu/drm/i915/display/intel_psr.c   |  6 +++---
 .../drm/i915/gt/intel_execlists_submission.c   |  4 ++--
 drivers/gpu/drm/i915/gt/intel_gt_buffer_pool.c |  8 
 drivers/gpu/drm/i915/gt/intel_gt_irq.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_gt_requests.c| 10 +-
 drivers/gpu/drm/i915/gt/intel_reset.c  |  2 +-
 drivers/gpu/drm/i915/gt/intel_rps.c| 14 +++---
 drivers/gpu/drm/i915/gt/selftest_engine_cs.c   |  2 +-
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_module.c |  7 +++
 drivers/gpu/drm/i915/i915_request.c|  2 +-
 drivers/gpu/drm/i915/intel_wakeref.c   |  4 +++-
 drivers/gpu/drm/i915/selftests/i915_sw_fence.c |  4 +++-
 25 files changed, 64 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_display.c 
b/drivers/gpu/drm/i915/display/intel_display.c
index 3d3483e6f836..3436f95a081c 100644
--- a/drivers/gpu/drm/i915/display/intel_display.c
+++ b/drivers/gpu/drm/i915/display/intel_display.c
@@ -7122,7 +7122,7 @@ intel_atomic_commit_ready(struct i915_sw_fence *fence,

_i915(state->base.dev)->display.atomic_helper;
 
if (llist_add(>freed, >free_list))
-   schedule_work(>free_work);
+   queue_work(i915_wq, >free_work);
break;
}
}
diff --git a/drivers/gpu/drm/i915/display/intel_display_driver.c 
b/drivers/gpu/drm/i915/display/intel_display_driver.c
index 60ce10fc7205..a20a9cfaab0e 100644
--- a/drivers/gpu/drm/i915/display/intel_display_driver.c
+++ b/drivers/gpu/drm/i915/display/intel_display_driver.c
@@ -435,7 +435,7 @@ void intel_display_driver_remove_noirq(struct 
drm_i915_private *i915)
intel_unregister_dsm_handler();
 
/* flush any delayed tasks or pending work */
-   flush_scheduled_work();
+   flush_workqueue(i915_wq);
 
intel_hdcp_component_fini(i915);
 
diff --git a/drivers/gpu/drm/i915/display/intel_dmc.c 
b/drivers/gpu/drm/i915/display/intel_dmc.c
index 8a88de67ff0a..57d015006784 100644
--- a/drivers/gpu/drm/i915/display/intel_dmc.c
+++ b/drivers/gpu/drm/i915/display/intel_dmc.c
@@ -1057,7 +1057,7 @@ void intel_dmc_init(struct drm_i915_private *i915)
i915->display.dmc.dmc = dmc;
 
drm_dbg_kms(>drm, "Loading %s\n", dmc->fw_path);
-   schedule_work(>work);
+   queue_work(i915_wq, >work);
 
return;
 
diff --git a/drivers/gpu/drm/i915/display/intel_dp.c 
b/drivers/gpu/drm/i915/display/intel_dp.c
index 4361c1ac65c3..b4f0f4472835 100644
--- a/drivers/gpu/drm/i915/display/intel_dp.c
+++ b/drivers/gpu/drm/i915/display/intel_dp.c
@@ -5158,7 +5158,7 @@ static void intel_dp_oob_hotplug_event(struct 
drm_connector *connector)
spin_lock_irq(>irq_lock);
i915->display.hotplug.event_bits |= BIT(encoder->hpd_pin);
spin_unlock_irq(>irq_lock);
-   queue_delayed_work(system_wq, >display.hotplug.hotplug_work, 0);
+   queue_delayed_work(i915_wq, >display.hotplug.hotplug_work, 0);
 }
 
 static const struct drm_connector_funcs intel_dp_connector_funcs = {
diff 
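
The mechanical pattern of the conversion, summarized from the hunks above; the allocation sketch assumes the usual alloc_workqueue() idiom for the driver-private i915_wq the patch introduces:

	/* Module init (sketch): */
	i915_wq = alloc_workqueue("i915", 0, 0);
	if (!i915_wq)
		return -ENOMEM;

	/* Throughout the driver: */
	queue_work(i915_wq, &work);	/* was: schedule_work(&work) */
	flush_workqueue(i915_wq);	/* was: flush_scheduled_work() */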

Re: [Intel-gfx] [PATCH v4 2/3] drm/i915: use pat_index instead of cache_level

2023-05-03 Thread Tvrtko Ursulin



On 02/05/2023 05:11, fei.y...@intel.com wrote:

From: Fei Yang 

Currently the KMD is using enum i915_cache_level to set caching policy for
buffer objects. This is flaky because the PAT index which really controls
the caching behavior in PTE has far more levels than what's defined in the
enum. In addition, the PAT index is platform dependent, having to translate
between i915_cache_level and PAT index is not reliable, and makes the code
more complicated.


From UMD's perspective there is also a necessity to set caching policy for
performance fine tuning. It's much easier for the UMD to directly use PAT
index because the behavior of each PAT index is clearly defined in Bspec.
Having the abstracted i915_cache_level sitting in between would only cause
more ambiguity.

For these reasons this patch replaces i915_cache_level with PAT index. Also
note, the cache_level is not completely removed yet, because the KMD still
needs to create buffer objects with simple cache settings such as
cached, uncached, or writethrough. For such simple cases, using cache_level
would help simplify the code.

Cc: Chris Wilson 
Cc: Matt Roper 
Signed-off-by: Fei Yang 
Reviewed-by: Andi Shyti 
---
  drivers/gpu/drm/i915/display/intel_dpt.c  | 12 +--
  drivers/gpu/drm/i915/gem/i915_gem_domain.c| 47 +++
  .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 10 ++-
  drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  3 +-
  drivers/gpu/drm/i915/gem/i915_gem_object.c| 52 +++-
  drivers/gpu/drm/i915/gem/i915_gem_object.h|  4 +
  .../gpu/drm/i915/gem/i915_gem_object_types.h  | 25 +-
  drivers/gpu/drm/i915/gem/i915_gem_stolen.c|  4 +-
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  | 16 ++--
  .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
  .../drm/i915/gem/selftests/i915_gem_migrate.c |  2 +-
  .../drm/i915/gem/selftests/i915_gem_mman.c|  2 +-
  drivers/gpu/drm/i915/gt/gen6_ppgtt.c  | 10 ++-
  drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 71 
  drivers/gpu/drm/i915/gt/gen8_ppgtt.h  |  3 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c  | 82 +--
  drivers/gpu/drm/i915/gt/intel_gtt.h   | 20 ++---
  drivers/gpu/drm/i915/gt/intel_migrate.c   | 47 ++-
  drivers/gpu/drm/i915/gt/intel_migrate.h   | 13 ++-
  drivers/gpu/drm/i915/gt/intel_ppgtt.c |  6 +-
  drivers/gpu/drm/i915/gt/selftest_migrate.c| 47 ++-
  drivers/gpu/drm/i915/gt/selftest_reset.c  |  8 +-
  drivers/gpu/drm/i915/gt/selftest_timeline.c   |  2 +-
  drivers/gpu/drm/i915/gt/selftest_tlb.c|  4 +-
  drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  | 10 ++-
  drivers/gpu/drm/i915/i915_debugfs.c   | 55 ++---
  drivers/gpu/drm/i915/i915_gem.c   | 16 +++-
  drivers/gpu/drm/i915/i915_gpu_error.c |  8 +-
  drivers/gpu/drm/i915/i915_vma.c   | 16 ++--
  drivers/gpu/drm/i915/i915_vma.h   |  2 +-
  drivers/gpu/drm/i915/i915_vma_types.h |  2 -
  drivers/gpu/drm/i915/selftests/i915_gem.c |  5 +-
  .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
  drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 15 ++--
  .../drm/i915/selftests/intel_memory_region.c  |  4 +-
  drivers/gpu/drm/i915/selftests/mock_gtt.c |  8 +-
  36 files changed, 398 insertions(+), 239 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c 
b/drivers/gpu/drm/i915/display/intel_dpt.c
index c5eacfdba1a5..7c5fddb203ba 100644
--- a/drivers/gpu/drm/i915/display/intel_dpt.c
+++ b/drivers/gpu/drm/i915/display/intel_dpt.c
@@ -43,24 +43,24 @@ static void gen8_set_pte(void __iomem *addr, gen8_pte_t pte)
  static void dpt_insert_page(struct i915_address_space *vm,
dma_addr_t addr,
u64 offset,
-   enum i915_cache_level level,
+   unsigned int pat_index,
u32 flags)
  {
struct i915_dpt *dpt = i915_vm_to_dpt(vm);
gen8_pte_t __iomem *base = dpt->iomem;
  
  	gen8_set_pte(base + offset / I915_GTT_PAGE_SIZE,

-vm->pte_encode(addr, level, flags));
+vm->pte_encode(addr, pat_index, flags));
  }
  
  static void dpt_insert_entries(struct i915_address_space *vm,

   struct i915_vma_resource *vma_res,
-  enum i915_cache_level level,
+  unsigned int pat_index,
   u32 flags)
  {
struct i915_dpt *dpt = i915_vm_to_dpt(vm);
gen8_pte_t __iomem *base = dpt->iomem;
-   const gen8_pte_t pte_encode = vm->pte_encode(0, level, flags);
+   const gen8_pte_t pte_encode = vm->pte_encode(0, pat_index, flags);
struct sgt_iter sgt_iter;
dma_addr_t addr;
int i;
@@ -83,7 +83,7 @@ static void dpt_clear_range(struct i915_address_space *vm,
  static void dpt_bind_vma(struct 
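
The shape of the conversion, distilled from the hunks above. The translation helper for the remaining simple cache_level call sites is named per the commit message's description; treat it as an assumption rather than a quote from the diff:

	/* before: callers pass the abstract cache level */
	pte = vm->pte_encode(addr, I915_CACHE_LLC, flags);

	/* after: callers pass a platform-specific PAT index */
	pte = vm->pte_encode(addr,
			     i915_gem_get_pat_index(i915, I915_CACHE_LLC),
			     flags);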

Re: drm/sched: Replacement for drm_sched_resubmit_jobs() is deprecated

2023-05-03 Thread Lucas Stach
Am Mittwoch, dem 03.05.2023 um 13:40 +0200 schrieb Christian König:
> Hi Lucas,
> 
> Am 03.05.23 um 12:28 schrieb Lucas Stach:
> > Hi Christian,
> > 
> > Am Mittwoch, dem 03.05.2023 um 10:47 +0200 schrieb Christian König:
> > > Adding Luben as well.
> > > 
> > > Am 03.05.23 um 10:16 schrieb Boris Brezillon:
> > > > [SNIP]
> > > > > To sum-up, we shouldn't call drm_sched_{start,stop,resubmit_jobs}().
> > > > After the discussion I had with Matthew yesterday on IRC, I
> > > > realized there was no clear agreement on this. Matthew uses those 3
> > > > helpers in the Xe driver right now, and given he intends to use a
> > > > multi-threaded wq for its 1:1 schedulers run queue, there's no way he
> > > > can get away without calling drm_sched_{start,stop}().
> > > > drm_sched_resubmit_jobs() can be open-coded in each driver, but I'm
> > > > wondering if it wouldn't be preferable to add a ::resubmit_job() method
> > > > or extend the ::run_job() one to support the resubmit semantics, which,
> > > > AFAIU, is just about enforcing that the job done fence (the one returned by
> > > > ::run_job()) doesn't transition from a signaled to an unsignaled state.
> > > > 
> > > > But probably more important than providing a generic helper, we should
> > > > document the resubmit semantics (AKA, what should and/or shouldn't be
> > > > done with pending jobs when a recovery happens). Because forbidding
> > > > people to use a generic helper function doesn't give any guarantee that
> > > > they'll do the right thing when coding their own logic, unless we give
> > > > clues about what's considered right/wrong, and the current state of the
> > > > doc is pretty unclear in this regard.
> > > I should probably talk about the history of the re-submit feature a bit
> > > more.
> > > 
> > > Basically AMD came up with re-submission as a cheap way of increasing
> > > the reliability of GPU resets. The problem is that it turned into an
> > > absolute nightmare. We tried for the last 5 years or so to get that
> > > stable and it's still crashing.
> > > 
> > > The first and most major problem is that the kernel doesn't even have the
> > > information whether re-submitting jobs is possible or not. For example a job
> > > which has already been pushed to the hw could have grabbed a binary
> > > semaphore and re-submitting it will just wait forever for the semaphore
> > > to be released.
> > > 
> > I can follow this argument, but concluding that job resubmission is
> > impossible is punishing simple GPUs. On Vivante GPUs we have exactly
> > one job running at a time and all dependencies are visible to the
> > scheduler, as we don't have/use any hardware synchronization mechanism,
> > so all synchronization is piped through kernel visible fences.
> > 
> > It's reasonably easy for the etnaviv driver to find the guilty job to
> > skip but resubmit all other jobs in the current hardware queue. I'm not
> > really fond of having to make all applications deal with innocent
> > context resets, while we can solve this via resubmission on simple HW.
> > 
> > I know that more complex hardware and use-cases might still require the
> > kernel driver for this HW to give up and shoot all contexts active at
> > the time of the GPU reset, but that's the price you pay for the
> > hardware being more capable. I don't see why we should also pay that
> > price on really simple HW.
> 
> You can still re-create the hw state inside your driver to continue work 
> from some point if you know that this will work.
> 
> As I wrote below the scheduler component can even provide help with
> that in the form of providing all the unsignaled hw or scheduler fences 
> for example.
> 
> But what we absolutely should *not* do is to have this re-submission 
> feature, because that requires re-using the dma_fence objects. In other 
> words this dance with detaching the scheduler fence from the hw fence 
> and attaching a new one is what absolutely doesn't work.
> 
> > > The second problem is that the dma_fence semantics never allow the state
> > > of a fence to transition from signaled back to unsignaled. This
> > > means that you can't re-use the hw fence and need to allocate a new one,
> > > but since memory allocation is forbidden inside a reset handler as well
> > > (YES we need to better document that part) you actually need to keep a
> > > bunch of hw fences pre-allocated around to make this work. Amdgpu chose
> > > to illegally re-use the hw fence instead which only works with quite
> > > extreme hacks.
> > > 
> > I'm with Boris here. Could you please explain when a fence would be
> > already signaled in a GPU reset scenario and would need to go back to
> > unsignaled, so we are on the same page here?
> 
> Take a look at how this re-submission feature of the scheduler works. 
> The approach is basically that you detach the hw fence from the 
> scheduler fence and then attach a new one.
> 
Right, but this shouldn't be a problem, as long as the old fence isn't
signaled yet, right? It 

Re: [PATCH 3/4] ARM/mmc: Convert old mmci-omap to GPIO descriptors

2023-05-03 Thread Linus Walleij
On Tue, May 2, 2023 at 4:26 PM Ulf Hansson  wrote:
> On Sun, 30 Apr 2023 at 11:22, Linus Walleij  wrote:

> > Fixes: 92bf78b33b0b ("gpio: omap: use dynamic allocation of base")
> > Signed-off-by: Linus Walleij 
>
> This looks like it's best funneled through the soc maintainer's tree(s), 
> right?
>
> Acked-by: Ulf Hansson 

Thanks, yeah the plan is to wait and see if I get some testing from the
OMAP1/2/3 guys and then collect the lot and put it on a branch to SoC,
unless Tony wants the job :D

Yours,
Linus Walleij


Re: [PATCH 02/11] drm/i915/mst: Remove broken MST DSC support

2023-05-03 Thread Lisovskiy, Stanislav
On Wed, May 03, 2023 at 02:07:04PM +0300, Ville Syrjälä wrote:
> On Wed, May 03, 2023 at 10:36:42AM +0300, Lisovskiy, Stanislav wrote:
> > On Tue, May 02, 2023 at 05:38:57PM +0300, Ville Syrjala wrote:
> > > From: Ville Syrjälä 
> > > 
> > > The MST DSC code has a myriad of issues:
> > > - Platform checks are wrong (MST+DSC is TGL+ only IIRC)
> > > - Return values of .mode_valid_ctx() are wrong
> > > - .mode_valid_ctx() assumes bigjoiner might be used, but the rest
> > >   of the code doesn't agree
> > > - compressed bpp calculations don't make sense
> > > - FEC handling needs to consider the entire link as opposed to just
> > >   the single stream. Currently FEC would only get enabled if the
> > >   first enabled stream is compressed. Also I'm not seeing anything
> > >   that would account for the FEC overhead in any bandwidth calculations
> > > - PPS SDP is only handled for the first stream via the dig_port
> > >   hooks, other streams will not be transmitting any PPS SDPs
> > > - PPS SDP readout is missing (also missing for SST!)
> > > - VDSC readout is missing (also missing for SST!)
> > > 
> > > The FEC issue is really the big one since we have no way currently
> > > to apply such link wide configuration constraints. Changing that is
> > > going to require a much bigger rework of the higher level modeset
> > > .compute_config() logic. We will also need such a rework to properly
> > > distribute the available bandwidth across all the streams on the
> > > same link (which is a must to e.g. enable deep color).
> > 
> > Also all the things you mentioned are subject to discussion; for example,
> > I see that FEC overhead is actually accounted for in the bpp
> > calculation.
> 
> AFAICS FEC is only accounted for in the data M/N calculations,
> assuming that particular stream happened to be compressed. I'm
> not sure if that actually matters since at least the link M/N
> are not even used by the MST sink. I suppose the data M/N might
> still be used for something though. For any uncompressed stream
> on the same link the data M/N values will be calculated
> incorrectly without FEC.
> 
> And as mentioned, the FEC bandwidth overhead doesn't seem to
> be accounted anywhere so no guarantee that we won't try to
> oversubscribe the link.
> 
> And FEC will only be enabled if the first stream to be enabled
> is compressed, otherwise we will enable the link without FEC
> and still try to cram other compressed streams through it
> (albeit without the PPS SDP so who knows what will happen)
> and that is illegal.
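
As a back-of-the-envelope illustration (not i915 code; the parity factor
below is an assumption of this sketch), the point is that FEC has to be
charged once against the whole link before any per-stream bandwidth is
handed out:

	/* Hypothetical helper: available payload bandwidth for the link.
	 * Assumes 8b/10b coding (one data byte per link symbol clock per
	 * lane); FEC_PARITY_FACTOR stands in for the DP 1.4 FEC parity
	 * overhead and is an assumption of this sketch. */
	#define FEC_PARITY_FACTOR	972261	/* out of 1000000 */

	static long long link_payload_bw_kbytes(int link_clock_khz,
						int lane_count, bool fec)
	{
		long long bw = (long long)link_clock_khz * lane_count;

		if (fec)	/* charged once per link, not per stream */
			bw = bw * FEC_PARITY_FACTOR / 1000000;

		return bw;
	}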
> 
> > We usually improve things by gradually fixing them, because if we acted
> > the same way towards all the wrong code in the driver, we could end up
> > removing the whole of i915.
> 
> We usually don't merge code that has this many obvious and/or
> fundamental issues.

Well, this is an arguable and subjective judgement. Fact is that, so far,
we have had more MST hubs working with that code than without, and no
regressions or anything like that. Moreover, we usually merge code after
code review; those patches in particular spent lots of time in review,
where you could have commented as well.

Regarding merging code with fundamental issues: just recently you admitted
yourself that the bigjoiner issue we had, for instance, was partly caused
by your code, because we no longer copy the pll state to the slave crtc.
I would say that words like "obvious" and "fundamental" issues can be
applied to many things; however, I thought that we always fix things in a
constructive, not a destructive/negative way.
Should I also call all code completely broken and remove it once we
discover some flaws there? Oh, we had many regressions where I could say
the same.

And once again, I'm completely okay if you introduce better functionality
instead, AND I know you have some valid points there, but right now we are
just removing everything completely without providing anything better.

But okay, I've mentioned what I think about this, and from my side this is
a nak. And once the people those patches helped pop up on gitlab, asking
why their MST hubs stopped working, I will just refer them here.

> 
> Now, most of the issues I listed above are probably fixable
> in a way that could be backported to stable kernels, but
> unfortunately the FEC issue is not one of those. That one
> will likely need massive amounts of work all over the driver
> modeset code, making a backport impossible.
> 
> > So from my side I would nack it, at least until you have code which
> > handles all of this better - I have no doubt you probably have some
> > ideas in mind, so let's at least be constructive and propose something
> > better first. This code doesn't cause any regressions, but still
> > provides "some" support for DP MST DSC, to say the least, and even if
> > it were removed, if some of those users come to me, I would probably
> > just point them to this mail discussion every time.
> 
> It seems very likely that it 

[PATCH v2 09/11] drm/mediatek: gamma: Add support for 12-bit LUT and MT8195

2023-05-03 Thread AngeloGioacchino Del Regno
Add support for 12-bit gamma lookup tables and introduce the first
user for it: MT8195.
While at it, also reorder the variables in mtk_gamma_set_common()
and rename `lut_base` to `lut0_base` to improve readability.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 63 ++-
 1 file changed, 49 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index 58eeebae568a..d5da3e84cd53 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -27,12 +27,20 @@
 #define DISP_GAMMA_SIZE_VSIZE  GENMASK(12, 0)
 #define DISP_GAMMA_BANK0x0100
 #define DISP_GAMMA_BANK_BANK   GENMASK(1, 0)
+#define DISP_GAMMA_BANK_DATA_MODE  BIT(2)
 #define DISP_GAMMA_LUT 0x0700
+#define DISP_GAMMA_LUT10x0b00
 
+/* For 10 bit LUT layout, R/G/B are in the same register */
 #define DISP_GAMMA_LUT_10BIT_R GENMASK(29, 20)
 #define DISP_GAMMA_LUT_10BIT_G GENMASK(19, 10)
 #define DISP_GAMMA_LUT_10BIT_B GENMASK(9, 0)
 
+/* For 12 bit LUT layout, R/G are in LUT, B is in LUT1 */
+#define DISP_GAMMA_LUT_12BIT_R GENMASK(11, 0)
+#define DISP_GAMMA_LUT_12BIT_G GENMASK(23, 12)
+#define DISP_GAMMA_LUT_12BIT_B GENMASK(11, 0)
+
 #define LUT_10BIT_MASK 0x03ff
 #define LUT_BITS_DEFAULT   10
#define LUT_SIZE_DEFAULT   512 /* for setting gamma lut from AAL */
@@ -83,14 +91,15 @@ unsigned int mtk_gamma_get_lut_size(struct device *dev)
 void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
 {
struct mtk_disp_gamma *gamma = dev_get_drvdata(dev);
-   unsigned int i;
+   void __iomem *lut0_base = regs + DISP_GAMMA_LUT;
+   void __iomem *lut1_base = regs + DISP_GAMMA_LUT1;
+   u32 cfg_val, data_mode, lbank_val, word[2];
+   int cur_bank, num_lut_banks;
+   u16 lut_bank_size, lut_size;
struct drm_color_lut *lut;
-   void __iomem *lut_base;
+   unsigned int i;
bool lut_diff;
-   u16 lut_bank_size, lut_size;
u8 lut_bits;
-   u32 cfg_val, lbank_val, word;
-   int cur_bank, num_lut_banks;
 
/* If there's no gamma lut there's nothing to do here. */
if (!state->gamma_lut)
@@ -114,19 +123,22 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
num_lut_banks = 1;
 
cfg_val = readl(regs + DISP_GAMMA_CFG);
-   lut_base = regs + DISP_GAMMA_LUT;
lut = (struct drm_color_lut *)state->gamma_lut->data;
 
+   /* Switch to 12 bits data mode if supported */
+   data_mode = FIELD_PREP(DISP_GAMMA_BANK_DATA_MODE, !!(lut_bits == 12));
+
for (cur_bank = 0; cur_bank < num_lut_banks; cur_bank++) {
 
/* Switch gamma bank and set data mode before writing LUT */
if (lut_bank_size) {
lbank_val = FIELD_PREP(DISP_GAMMA_BANK_BANK, cur_bank);
+   lbank_val |= data_mode;
writel(lbank_val, regs + DISP_GAMMA_BANK);
}
 
for (i = 0; i < lut_size; i++) {
-   int n = (cur_bank * (bank_size - 1)) + i;
+   int n = (cur_bank * (lut_bank_size - 1)) + i;
struct drm_color_lut diff, hwlut;
 
hwlut.red = drm_color_lut_extract(lut[n].red, lut_bits);
@@ -134,9 +146,15 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
hwlut.blue = drm_color_lut_extract(lut[n].blue, lut_bits);
 
if (!lut_diff || (i % 2 == 0)) {
-                       word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, hwlut.red);
-                       word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, hwlut.green);
-                       word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, hwlut.blue);
+                       if (lut_bits == 12) {
+                               word[0] = FIELD_PREP(DISP_GAMMA_LUT_12BIT_R, hwlut.red);
+                               word[0] |= FIELD_PREP(DISP_GAMMA_LUT_12BIT_G, hwlut.green);
+                               word[1] = FIELD_PREP(DISP_GAMMA_LUT_12BIT_B, hwlut.blue);
+                       } else {
+                               word[0] = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, hwlut.red);
+                               word[0] |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, hwlut.green);
+                               word[0] |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, hwlut.blue);
+                       }
} 
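
For clarity, the 12-bit path above splits each LUT entry across the two
LUT register ranges; a minimal sketch of the intended write-out, reusing
this patch's defines (the writel() pair is an assumption here, since the
hunk above is cut off before the actual write):

	/* R and G share a word in DISP_GAMMA_LUT, B goes to DISP_GAMMA_LUT1 */
	word[0] = FIELD_PREP(DISP_GAMMA_LUT_12BIT_R, hwlut.red) |
		  FIELD_PREP(DISP_GAMMA_LUT_12BIT_G, hwlut.green);
	word[1] = FIELD_PREP(DISP_GAMMA_LUT_12BIT_B, hwlut.blue);

	writel(word[0], lut0_base + i * 4);	/* assumed write-out */
	writel(word[1], lut1_base + i * 4);	/* assumed write-out */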

[PATCH v2 11/11] drm/mediatek: gamma: Program gamma LUT type for descending or rising

2023-05-03 Thread AngeloGioacchino Del Regno
All of the SoCs that don't have dithering control in the gamma IP
have got a GAMMA_LUT_TYPE bit that tells the IP whether the LUT is
"descending" (bit set) or "rising" (bit cleared): make sure to set
it correctly after programming the LUT.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index 44f397f88d0a..1cf3c89175a3 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -23,6 +23,7 @@
 #define GAMMA_RELAY_MODE   BIT(0)
 #define GAMMA_LUT_EN   BIT(1)
 #define GAMMA_DITHERINGBIT(2)
+#define GAMMA_LUT_TYPE BIT(2)
 #define DISP_GAMMA_SIZE0x0030
 #define DISP_GAMMA_SIZE_HSIZE  GENMASK(28, 16)
 #define DISP_GAMMA_SIZE_VSIZE  GENMASK(12, 0)
@@ -89,6 +90,16 @@ unsigned int mtk_gamma_get_lut_size(struct device *dev)
return lut_size;
 }
 
+static bool mtk_gamma_lut_is_descending(struct drm_color_lut *lut, u32 lut_size)
+{
+   u64 first, last;
+
+   first = lut[0].red + lut[0].green + lut[0].blue;
+   last = lut[lut_size - 1].red + lut[lut_size - 1].green + lut[lut_size - 1].blue;
+
+   return !!(first > last);
+}
+
 void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
 {
struct mtk_disp_gamma *gamma = dev_get_drvdata(dev);
@@ -182,6 +193,14 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
}
}
 
+   if (gamma && gamma->data && !gamma->data->has_dither) {
+   /* Descending or Rising LUT */
+   if (mtk_gamma_lut_is_descending(lut, lut_size))
+   cfg_val |= FIELD_PREP(GAMMA_LUT_TYPE, 1);
+   else
+   cfg_val &= ~GAMMA_LUT_TYPE;
+   }
+
/* Enable the gamma table */
cfg_val |= FIELD_PREP(GAMMA_LUT_EN, 1);
 
-- 
2.40.1
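
As a usage note, it is an inverted (degamma-style) table that trips the
descending case; an illustrative example against the helper added above:

	/* Example only: a 3-entry inverted LUT. The first entry sums to
	 * 3 * 0xffff and the last to 0, so mtk_gamma_lut_is_descending()
	 * returns true and GAMMA_LUT_TYPE gets set. */
	struct drm_color_lut inverted_lut[] = {
		{ .red = 0xffff, .green = 0xffff, .blue = 0xffff },
		{ .red = 0x7fff, .green = 0x7fff, .blue = 0x7fff },
		{ .red = 0x0000, .green = 0x0000, .blue = 0x0000 },
	};

	bool descending = mtk_gamma_lut_is_descending(inverted_lut,
						      ARRAY_SIZE(inverted_lut));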



[PATCH v2 08/11] drm/mediatek: gamma: Support multi-bank gamma LUT

2023-05-03 Thread AngeloGioacchino Del Regno
Newer Gamma IPs have got multiple LUT banks: support specifying the
size of the LUT banks and handle bank-switching before programming
the LUT in mtk_gamma_set_common() in preparation for adding support
for MT8195 and newer SoCs.

Suggested-by: Jason-JH.Lin 
[Angelo: Refactored original commit]
Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 76 +++
 1 file changed, 49 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index a655373d568d..58eeebae568a 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -25,6 +25,8 @@
 #define DISP_GAMMA_SIZE0x0030
 #define DISP_GAMMA_SIZE_HSIZE  GENMASK(28, 16)
 #define DISP_GAMMA_SIZE_VSIZE  GENMASK(12, 0)
+#define DISP_GAMMA_BANK0x0100
+#define DISP_GAMMA_BANK_BANK   GENMASK(1, 0)
 #define DISP_GAMMA_LUT 0x0700
 
 #define DISP_GAMMA_LUT_10BIT_R GENMASK(29, 20)
@@ -38,6 +40,7 @@
 struct mtk_disp_gamma_data {
bool has_dither;
bool lut_diff;
+   u16 lut_bank_size;
u16 lut_size;
u8 lut_bits;
 };
@@ -84,9 +87,10 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
struct drm_color_lut *lut;
void __iomem *lut_base;
bool lut_diff;
-   u16 lut_size;
+   u16 lut_bank_size, lut_size;
u8 lut_bits;
-   u32 cfg_val, word;
+   u32 cfg_val, lbank_val, word;
+   int cur_bank, num_lut_banks;
 
/* If there's no gamma lut there's nothing to do here. */
if (!state->gamma_lut)
@@ -94,43 +98,61 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
 
if (gamma && gamma->data) {
lut_diff = gamma->data->lut_diff;
+   lut_bank_size = gamma->data->lut_bank_size;
lut_bits = gamma->data->lut_bits;
lut_size = gamma->data->lut_size;
} else {
lut_diff = false;
+   lut_bank_size = 0;
lut_bits = LUT_BITS_DEFAULT;
lut_size = LUT_SIZE_DEFAULT;
}
 
+   if (lut_bank_size)
+   num_lut_banks = lut_size / lut_bank_size;
+   else
+   num_lut_banks = 1;
+
cfg_val = readl(regs + DISP_GAMMA_CFG);
lut_base = regs + DISP_GAMMA_LUT;
lut = (struct drm_color_lut *)state->gamma_lut->data;
-   for (i = 0; i < lut_size; i++) {
-   struct drm_color_lut diff, hwlut;
-
-   hwlut.red = drm_color_lut_extract(lut[i].red, lut_bits);
-   hwlut.green = drm_color_lut_extract(lut[i].green, lut_bits);
-   hwlut.blue = drm_color_lut_extract(lut[i].blue, lut_bits);
-
-   if (!lut_diff || (i % 2 == 0)) {
-   word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, hwlut.red);
-   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, hwlut.green);
-   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, hwlut.blue);
-   } else {
-   diff.red = lut[i].red - lut[i - 1].red;
-   diff.red = drm_color_lut_extract(diff.red, lut_bits);
-
-   diff.green = lut[i].green - lut[i - 1].green;
-   diff.green = drm_color_lut_extract(diff.green, lut_bits);
-
-   diff.blue = lut[i].blue - lut[i - 1].blue;
-   diff.blue = drm_color_lut_extract(diff.blue, lut_bits);
-
-   word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, diff.red);
-   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, diff.green);
-   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, diff.blue);
+
+   for (cur_bank = 0; cur_bank < num_lut_banks; cur_bank++) {
+
+   /* Switch gamma bank and set data mode before writing LUT */
+   if (lut_bank_size) {
+   lbank_val = FIELD_PREP(DISP_GAMMA_BANK_BANK, cur_bank);
+   writel(lbank_val, regs + DISP_GAMMA_BANK);
+   }
+
+   for (i = 0; i < lut_size; i++) {
+   int n = (cur_bank * (bank_size - 1)) + i;
+   struct drm_color_lut diff, hwlut;
+
+   hwlut.red = drm_color_lut_extract(lut[n].red, lut_bits);
+   hwlut.green = drm_color_lut_extract(lut[n].green, lut_bits);
+   hwlut.blue = drm_color_lut_extract(lut[n].blue, lut_bits);
+
+   if (!lut_diff || (i % 2 == 0)) {
+   word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, hwlut.red);
+   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, hwlut.green);
+ 

[PATCH v2 10/11] drm/mediatek: gamma: Make sure relay mode is disabled

2023-05-03 Thread AngeloGioacchino Del Regno
Disable relay mode at the end of LUT programming to make sure that the
processed image goes through.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index d5da3e84cd53..44f397f88d0a 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -20,6 +20,7 @@
 #define DISP_GAMMA_EN  0x
 #define GAMMA_EN   BIT(0)
 #define DISP_GAMMA_CFG 0x0020
+#define GAMMA_RELAY_MODE   BIT(0)
 #define GAMMA_LUT_EN   BIT(1)
 #define GAMMA_DITHERINGBIT(2)
 #define DISP_GAMMA_SIZE0x0030
@@ -184,6 +185,9 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
/* Enable the gamma table */
cfg_val |= FIELD_PREP(GAMMA_LUT_EN, 1);
 
+   /* Disable RELAY mode to pass the processed image */
+   cfg_val &= ~GAMMA_RELAY_MODE;
+
writel(cfg_val, regs + DISP_GAMMA_CFG);
 }
 
-- 
2.40.1



[PATCH v2 07/11] drm/mediatek: gamma: Support specifying number of bits per LUT component

2023-05-03 Thread AngeloGioacchino Del Regno
New SoCs like MT8195 may not only support bigger lookup tables, but
also have a different register layout to support higher precision:
support specifying the number of `lut_bits` for each SoC and use it
in mtk_gamma_set_common() to perform the right calculation.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index 1436e2c860cb..a655373d568d 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -39,6 +39,7 @@ struct mtk_disp_gamma_data {
bool has_dither;
bool lut_diff;
u16 lut_size;
+   u8 lut_bits;
 };
 
 /*
@@ -84,6 +85,7 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
void __iomem *lut_base;
bool lut_diff;
u16 lut_size;
+   u8 lut_bits;
u32 cfg_val, word;
 
/* If there's no gamma lut there's nothing to do here. */
@@ -92,9 +94,11 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
 
if (gamma && gamma->data) {
lut_diff = gamma->data->lut_diff;
+   lut_bits = gamma->data->lut_bits;
lut_size = gamma->data->lut_size;
} else {
lut_diff = false;
+   lut_bits = LUT_BITS_DEFAULT;
lut_size = LUT_SIZE_DEFAULT;
}
 
@@ -104,9 +108,9 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
for (i = 0; i < lut_size; i++) {
struct drm_color_lut diff, hwlut;
 
-   hwlut.red = drm_color_lut_extract(lut[i].red, LUT_BITS_DEFAULT);
-   hwlut.green = drm_color_lut_extract(lut[i].green, LUT_BITS_DEFAULT);
-   hwlut.blue = drm_color_lut_extract(lut[i].blue, LUT_BITS_DEFAULT);
+   hwlut.red = drm_color_lut_extract(lut[i].red, lut_bits);
+   hwlut.green = drm_color_lut_extract(lut[i].green, lut_bits);
+   hwlut.blue = drm_color_lut_extract(lut[i].blue, lut_bits);
 
if (!lut_diff || (i % 2 == 0)) {
word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, hwlut.red);
@@ -114,13 +118,13 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, hwlut.blue);
} else {
diff.red = lut[i].red - lut[i - 1].red;
-   diff.red = drm_color_lut_extract(diff.red, LUT_BITS_DEFAULT);
+   diff.red = drm_color_lut_extract(diff.red, lut_bits);
 
diff.green = lut[i].green - lut[i - 1].green;
-   diff.green = drm_color_lut_extract(diff.green, LUT_BITS_DEFAULT);
+   diff.green = drm_color_lut_extract(diff.green, lut_bits);
 
diff.blue = lut[i].blue - lut[i - 1].blue;
-   diff.blue = drm_color_lut_extract(diff.blue, LUT_BITS_DEFAULT);
+   diff.blue = drm_color_lut_extract(diff.blue, lut_bits);
 
word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, diff.red);
word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, diff.green);
@@ -237,10 +241,12 @@ static int mtk_disp_gamma_remove(struct platform_device *pdev)
 
 static const struct mtk_disp_gamma_data mt8173_gamma_driver_data = {
.has_dither = true,
+   .lut_bits = 10,
.lut_size = 512,
 };
 
 static const struct mtk_disp_gamma_data mt8183_gamma_driver_data = {
+   .lut_bits = 10,
.lut_diff = true,
.lut_size = 512,
 };
-- 
2.40.1



[PATCH v2 06/11] drm/mediatek: gamma: Use bitfield macros

2023-05-03 Thread AngeloGioacchino Del Regno
Make the code more robust and improve readability by using bitfield
macros instead of open coding bit operations.
While at it, also add a definition for LUT_BITS_DEFAULT.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 41 ++-
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index 97b34963ef73..1436e2c860cb 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2021 MediaTek Inc.
  */
 
+#include <linux/bitfield.h>
 #include 
 #include 
 #include 
@@ -22,9 +23,16 @@
 #define GAMMA_LUT_EN   BIT(1)
 #define GAMMA_DITHERINGBIT(2)
 #define DISP_GAMMA_SIZE0x0030
+#define DISP_GAMMA_SIZE_HSIZE  GENMASK(28, 16)
+#define DISP_GAMMA_SIZE_VSIZE  GENMASK(12, 0)
 #define DISP_GAMMA_LUT 0x0700
 
+#define DISP_GAMMA_LUT_10BIT_R GENMASK(29, 20)
+#define DISP_GAMMA_LUT_10BIT_G GENMASK(19, 10)
+#define DISP_GAMMA_LUT_10BIT_B GENMASK(9, 0)
+
 #define LUT_10BIT_MASK 0x03ff
+#define LUT_BITS_DEFAULT   10
#define LUT_SIZE_DEFAULT   512 /* for setting gamma lut from AAL */
 
 struct mtk_disp_gamma_data {
@@ -96,33 +104,33 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
for (i = 0; i < lut_size; i++) {
struct drm_color_lut diff, hwlut;
 
-   hwlut.red = drm_color_lut_extract(lut[i].red, 10);
-   hwlut.green = drm_color_lut_extract(lut[i].green, 10);
-   hwlut.blue = drm_color_lut_extract(lut[i].blue, 10);
+   hwlut.red = drm_color_lut_extract(lut[i].red, LUT_BITS_DEFAULT);
+   hwlut.green = drm_color_lut_extract(lut[i].green, LUT_BITS_DEFAULT);
+   hwlut.blue = drm_color_lut_extract(lut[i].blue, LUT_BITS_DEFAULT);
 
if (!lut_diff || (i % 2 == 0)) {
-   word = (hwlut.red << 20) +
-  (hwlut.green << 10) +
-  hwlut.blue;
+   word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, hwlut.red);
+   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, hwlut.green);
+   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, hwlut.blue);
} else {
diff.red = lut[i].red - lut[i - 1].red;
-   diff.red = drm_color_lut_extract(diff.red, 10);
+   diff.red = drm_color_lut_extract(diff.red, LUT_BITS_DEFAULT);
 
diff.green = lut[i].green - lut[i - 1].green;
-   diff.green = drm_color_lut_extract(diff.green, 10);
+   diff.green = drm_color_lut_extract(diff.green, LUT_BITS_DEFAULT);
 
diff.blue = lut[i].blue - lut[i - 1].blue;
-   diff.blue = drm_color_lut_extract(diff.blue, 10);
+   diff.blue = drm_color_lut_extract(diff.blue, LUT_BITS_DEFAULT);
 
-   word = (diff.red << 20) +
-  (diff.green << 10) +
-  diff.blue;
+   word = FIELD_PREP(DISP_GAMMA_LUT_10BIT_R, diff.red);
+   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_G, diff.green);
+   word |= FIELD_PREP(DISP_GAMMA_LUT_10BIT_B, diff.blue);
}
writel(word, (lut_base + i * 4));
}
 
/* Enable the gamma table */
-   cfg_val = cfg_val | GAMMA_LUT_EN;
+   cfg_val |= FIELD_PREP(GAMMA_LUT_EN, 1);
 
writel(cfg_val, regs + DISP_GAMMA_CFG);
 }
@@ -139,9 +147,12 @@ void mtk_gamma_config(struct device *dev, unsigned int w,
  unsigned int bpc, struct cmdq_pkt *cmdq_pkt)
 {
struct mtk_disp_gamma *gamma = dev_get_drvdata(dev);
+   u32 sz;
+
+   sz = FIELD_PREP(DISP_GAMMA_SIZE_HSIZE, w);
+   sz |= FIELD_PREP(DISP_GAMMA_SIZE_VSIZE, h);
 
-   mtk_ddp_write(cmdq_pkt, h << 16 | w, &gamma->cmdq_reg, gamma->regs,
- DISP_GAMMA_SIZE);
+   mtk_ddp_write(cmdq_pkt, sz, &gamma->cmdq_reg, gamma->regs, DISP_GAMMA_SIZE);
if (gamma->data && gamma->data->has_dither)
mtk_dither_set_common(gamma->regs, &gamma->cmdq_reg, bpc,
  DISP_GAMMA_CFG, GAMMA_DITHERING, cmdq_pkt);
-- 
2.40.1
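
As background on why the macros are more robust: FIELD_PREP() shifts a
value into the field described by a GENMASK() and masks it, and for
compile-time constant values it additionally fails the build if the value
overflows the field, where an open-coded shift-and-mask silently truncates
out-of-range values. A minimal equivalence sketch (DEMO_LUT_10BIT_R is a
stand-in name for this example):

	#define DEMO_LUT_10BIT_R	GENMASK(29, 20)

	/* These two produce the same word for any 'red' value: */
	u32 a = FIELD_PREP(DEMO_LUT_10BIT_R, red);	/* <linux/bitfield.h> */
	u32 b = (red << 20) & DEMO_LUT_10BIT_R;		/* open-coded */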



[PATCH v2 04/11] drm/mediatek: gamma: Improve and simplify HW LUT calculation

2023-05-03 Thread AngeloGioacchino Del Regno
Use drm_color_lut_extract() to avoid open-coding the bits reduction
calculations for each color channel and use a struct drm_color_lut
to temporarily store the information instead of an array of u32.

Also, slightly improve the precision of the HW LUT calculation in the
LUT DIFF case by performing the subtractions on the 16-bit values and
doing the 10-bit conversion later.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 30 +++
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index d194d9bc2e2b..89a1640c2e8f 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -77,7 +77,6 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
bool lut_diff;
u16 lut_size;
u32 word;
-   u32 diff[3] = {0};
 
/* If there's no gamma lut there's nothing to do here. */
if (!state->gamma_lut)
@@ -97,18 +96,29 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
lut_base = regs + DISP_GAMMA_LUT;
lut = (struct drm_color_lut *)state->gamma_lut->data;
for (i = 0; i < lut_size; i++) {
+   struct drm_color_lut diff, hwlut;
+
+   hwlut.red = drm_color_lut_extract(lut[i].red, 10);
+   hwlut.green = drm_color_lut_extract(lut[i].green, 10);
+   hwlut.blue = drm_color_lut_extract(lut[i].blue, 10);
+
if (!lut_diff || (i % 2 == 0)) {
-   word = (((lut[i].red >> 6) & LUT_10BIT_MASK) << 20) +
-   (((lut[i].green >> 6) & LUT_10BIT_MASK) << 10) +
-   ((lut[i].blue >> 6) & LUT_10BIT_MASK);
+   word = (hwlut.red << 20) +
+  (hwlut.green << 10) +
+  hwlut.blue;
} else {
-   diff[0] = (lut[i].red >> 6) - (lut[i - 1].red >> 6);
-   diff[1] = (lut[i].green >> 6) - (lut[i - 1].green >> 6);
-   diff[2] = (lut[i].blue >> 6) - (lut[i - 1].blue >> 6);
+   diff.red = lut[i].red - lut[i - 1].red;
+   diff.red = drm_color_lut_extract(diff.red, 10);
+
+   diff.green = lut[i].green - lut[i - 1].green;
+   diff.green = drm_color_lut_extract(diff.green, 10);
+
+   diff.blue = lut[i].blue - lut[i - 1].blue;
+   diff.blue = drm_color_lut_extract(diff.blue, 10);
 
-   word = ((diff[0] & LUT_10BIT_MASK) << 20) +
-   ((diff[1] & LUT_10BIT_MASK) << 10) +
-   (diff[2] & LUT_10BIT_MASK);
+   word = (diff.red << 20) +
+  (diff.green << 10) +
+  diff.blue;
}
writel(word, (lut_base + i * 4));
}
-- 
2.40.1
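
For reference, a sketch of what drm_color_lut_extract() does (illustration
only; the static inline in include/drm/drm_color_mgmt.h is authoritative):
it scales a 16-bit LUT component down to the requested precision, rounding
to nearest rather than truncating, which is where the precision gain in
this patch comes from.

	static u32 lut_extract_sketch(u32 user_input, u32 bits)
	{
		u32 max = 0xffff >> (16 - bits);

		if (bits < 16) {
			/* round to nearest instead of truncating */
			user_input += 1u << (16 - bits - 1);
			user_input >>= 16 - bits;
		}

		return min(user_input, max);
	}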



[PATCH v2 05/11] drm/mediatek: gamma: Enable the Gamma LUT table only after programming

2023-05-03 Thread AngeloGioacchino Del Regno
Move the DISP_GAMMA_CFG write that enables the Gamma LUT to after
programming the actual table, to avoid potential visual glitches during
table modification.

Signed-off-by: AngeloGioacchino Del Regno 

---
 drivers/gpu/drm/mediatek/mtk_disp_gamma.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
index 89a1640c2e8f..97b34963ef73 100644
--- a/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
+++ b/drivers/gpu/drm/mediatek/mtk_disp_gamma.c
@@ -71,12 +71,12 @@ unsigned int mtk_gamma_get_lut_size(struct device *dev)
 void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
 {
struct mtk_disp_gamma *gamma = dev_get_drvdata(dev);
-   unsigned int i, reg;
+   unsigned int i;
struct drm_color_lut *lut;
void __iomem *lut_base;
bool lut_diff;
u16 lut_size;
-   u32 word;
+   u32 cfg_val, word;
 
/* If there's no gamma lut there's nothing to do here. */
if (!state->gamma_lut)
@@ -90,9 +90,7 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
lut_size = LUT_SIZE_DEFAULT;
}
 
-   reg = readl(regs + DISP_GAMMA_CFG);
-   reg = reg | GAMMA_LUT_EN;
-   writel(reg, regs + DISP_GAMMA_CFG);
+   cfg_val = readl(regs + DISP_GAMMA_CFG);
lut_base = regs + DISP_GAMMA_LUT;
lut = (struct drm_color_lut *)state->gamma_lut->data;
for (i = 0; i < lut_size; i++) {
@@ -122,6 +120,11 @@ void mtk_gamma_set_common(struct device *dev, void __iomem *regs, struct drm_crtc_state *state)
}
writel(word, (lut_base + i * 4));
}
+
+   /* Enable the gamma table */
+   cfg_val = cfg_val | GAMMA_LUT_EN;
+
+   writel(cfg_val, regs + DISP_GAMMA_CFG);
 }
 
 void mtk_gamma_set(struct device *dev, struct drm_crtc_state *state)
-- 
2.40.1


