Re: [PATCH v3 17/19] drm/xe/device: implement transient flush

2024-05-03 Thread Lucas De Marchi

On Tue, Apr 30, 2024 at 10:28:48AM GMT, Radhakrishna Sripada wrote:

From: Nirmoy Das 

Display surfaces can be tagged as transient by mapping them using one of
the various L3:XD PAT index modes on Xe2. The expectation is that the KMD
needs to request a transient data flush at the start of the flip sequence
to ensure all transient data in the L3 cache is flushed to memory. Add a
routine for this which we can then call from the display code.

v2: rebase(RK)

Signed-off-by: Nirmoy Das 
Co-developed-by: Matthew Auld 
Signed-off-by: Matthew Auld 
Signed-off-by: Balasubramani Vivekanandan 
Reviewed-by: Matt Roper 
Signed-off-by: Radhakrishna Sripada 



Acked-by: Lucas De Marchi 

for merging this through drm-intel-next.

Lucas De Marchi


---
drivers/gpu/drm/xe/regs/xe_gt_regs.h |  3 ++
drivers/gpu/drm/xe/xe_device.c   | 49 
drivers/gpu/drm/xe/xe_device.h   |  1 +
3 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/xe/regs/xe_gt_regs.h b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
index 83847f2da72a..b4f1a3264e8c 100644
--- a/drivers/gpu/drm/xe/regs/xe_gt_regs.h
+++ b/drivers/gpu/drm/xe/regs/xe_gt_regs.h
@@ -334,6 +334,9 @@

#define XE2LPM_L3SQCREG5		XE_REG_MCR(0xb658)

+#define XE2_TDF_CTRL   XE_REG(0xb418)
+#define   TRANSIENT_FLUSH_REQUEST  REG_BIT(0)
+
#define XEHP_MERT_MOD_CTRL  XE_REG_MCR(0xcf28)
#define RENDER_MOD_CTRL XE_REG_MCR(0xcf2c)
#define COMP_MOD_CTRL   XE_REG_MCR(0xcf30)
diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
index b61f8356e23e..05c28314b748 100644
--- a/drivers/gpu/drm/xe/xe_device.c
+++ b/drivers/gpu/drm/xe/xe_device.c
@@ -719,6 +719,55 @@ void xe_device_wmb(struct xe_device *xe)
xe_mmio_write32(gt, SOFTWARE_FLAGS_SPR33, 0);
}

+/**
+ * xe_device_td_flush() - Flush transient L3 cache entries
+ * @xe: The device
+ *
+ * The display engine has direct access to memory and is never coherent with
+ * L3/L4 caches (or CPU caches); however, the KMD is responsible for explicitly
+ * flushing transient L3 GPU cache entries prior to the flip sequence to ensure
+ * scanout can happen from such a surface without seeing corruption.
+ *
+ * Display surfaces can be tagged as transient by mapping them using one of the
+ * various L3:XD PAT index modes on Xe2.
+ *
+ * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is
+ * flushed at the end of each submission via PIPE_CONTROL for compute/render,
+ * since SA Media is not coherent with L3 and we want to support
+ * render-vs-media use cases. For other engines like copy/blt the HW internally
+ * forces uncached behaviour, hence we can skip the TDF on such platforms.
+ */
+void xe_device_td_flush(struct xe_device *xe)
+{
+   struct xe_gt *gt;
+   u8 id;
+
+   if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
+   return;
+
+   for_each_gt(gt, xe, id) {
+   if (xe_gt_is_media_type(gt))
+   continue;
+
+   if (xe_force_wake_get(gt_to_fw(gt), XE_FW_GT))
+   return;
+
+   xe_mmio_write32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);
+   /*
+    * FIXME: We can likely do better here with our choice of
+    * timeout. Currently we just assume the worst case, i.e. 150us,
+    * which is believed to be sufficient on current platforms even if
+    * all cache entries are transient and need to be flushed.
+    */
+   if (xe_mmio_wait32(gt, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST, 0,
+  150, NULL, false))
+   xe_gt_err_once(gt, "TD flush timeout\n");
+
+   xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
+   }
+}
+
u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
{
return xe_device_has_flat_ccs(xe) ?
diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
index 82317580f4bf..f2a78b6a9bff 100644
--- a/drivers/gpu/drm/xe/xe_device.h
+++ b/drivers/gpu/drm/xe/xe_device.h
@@ -173,5 +173,6 @@ static inline bool xe_device_wedged(struct xe_device *xe)
}

void xe_device_declare_wedged(struct xe_device *xe);
+void xe_device_td_flush(struct xe_device *xe);

#endif
--
2.34.1
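
[Editorial illustration] As a usage sketch (not part of this patch), here is roughly how a
display-side caller might invoke the new helper at the start of a flip sequence. The
function name and hook point below are made up for illustration; only xe_device_td_flush()
comes from this series:

/*
 * Illustrative sketch only: the display-side call is not part of this patch,
 * and "example_begin_flip" is a hypothetical name.
 */
#include "xe_device.h"

static void example_begin_flip(struct xe_device *xe)
{
	/*
	 * Scanout surfaces may be mapped with a transient (L3:XD) PAT index,
	 * so flush transient L3 entries to memory before the display engine
	 * reads the framebuffer. xe_device_td_flush() returns early where the
	 * flush does not apply (non-dgfx or pre-Xe2), so no extra platform
	 * check is needed here.
	 */
	xe_device_td_flush(xe);

	/* ... continue with the rest of the flip programming ... */
}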

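[Editorial illustration] Similarly, a hypothetical userspace-side sketch of how a scanout
buffer could be tagged as transient by binding it with an L3:XD PAT index through the xe
VM_BIND uAPI. The pat_index value is a placeholder (the real transient indexes depend on
the platform's Xe2 PAT table); error handling and sync objects are omitted:

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/xe_drm.h>

/* Hypothetical helper: bind @bo_handle at @gpu_addr with a transient PAT index. */
static int example_bind_transient(int fd, uint32_t vm_id, uint32_t bo_handle,
				  uint64_t gpu_addr, uint64_t size,
				  uint16_t transient_pat_index)
{
	struct drm_xe_vm_bind bind;

	memset(&bind, 0, sizeof(bind));
	bind.vm_id = vm_id;
	bind.num_binds = 1;
	bind.bind.obj = bo_handle;
	bind.bind.obj_offset = 0;
	bind.bind.range = size;
	bind.bind.addr = gpu_addr;
	bind.bind.op = DRM_XE_VM_BIND_OP_MAP;
	/* Placeholder: one of the Xe2 L3:XD (transient) PAT indexes. */
	bind.bind.pat_index = transient_pat_index;

	return drmIoctl(fd, DRM_IOCTL_XE_VM_BIND, &bind);
}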

