[PATCH v7 0/5] Update mdp clk to max supported value to support higher refresh rates

2022-03-21 Thread Vinod Polimera
Drop the assigned clock rate property and vote on the mdp clock to
max frequency during bind/probe sequence.

Changes in v2:
- Remove assigned-clock-rate property and set mdp clk during
resume sequence.
- Add fixes tag.

Changes in v3:
- Remove extra line after fixes tag.(Stephen Boyd)
- Add similar changes for sc7180, sdm845 which uses opp table for
voting mdp clk.(Stephen Boyd)
- Drop patch: "drm/msm/disp/dpu1: set mdp clk to the maximum
frequency in opp table"

Changes in v4:
- Add similar change for sm8250.(Dmitry)

Changes in v5:
- Add change to set mdp clk to max frequency in opp table
during mdp probe/bind.

Changes in v6:
- Remove change log in dt patch.
- Fix the opp reference leak by adding dev_pm_opp_put(). (Dmitry)

Changes in v7:
- Update commit message and fix tag. (Stephen/Doug)

Vinod Polimera (5):
  drm/msm/disp/dpu1: set mdp clk to the maximum frequency in opp table
during probe
  arm64: dts: qcom: sc7280: remove assigned-clock-rate property for mdp
clk
  arm64: dts: qcom: sc7180: remove assigned-clock-rate property for mdp
clk
  arm64: dts: qcom: sdm845: remove assigned-clock-rate property for mdp
clk
  arm64: dts: qcom: sm8250: remove assigned-clock-rate property for mdp
clk

 arch/arm64/boot/dts/qcom/sc7180.dtsi| 9 ++---
 arch/arm64/boot/dts/qcom/sc7280.dtsi| 9 ++---
 arch/arm64/boot/dts/qcom/sdm845.dtsi| 9 ++---
 arch/arm64/boot/dts/qcom/sm8250.dtsi| 9 ++---
 drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 8 
 5 files changed, 16 insertions(+), 28 deletions(-)

-- 
2.7.4



[PATCH v7 5/5] arm64: dts: qcom: sm8250: remove assigned-clock-rate property for mdp clk

2022-03-21 Thread Vinod Polimera
Drop the assigned clock rate property and vote on the mdp clock as per
calculated value during the usecase.

This patch is dependent on the patch ("drm/msm/disp/dpu1: set mdp clk
to the maximum frequency in opp table during probe") [1].

[1] 
https://lore.kernel.org/r/1647269217-14064-2-git-send-email-quic_vpoli...@quicinc.com/

Signed-off-by: Vinod Polimera 
Reviewed-by: Stephen Boyd 
Reviewed-by: Douglas Anderson 
---
 arch/arm64/boot/dts/qcom/sm8250.dtsi | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sm8250.dtsi 
b/arch/arm64/boot/dts/qcom/sm8250.dtsi
index fdaf303..2105eb7 100644
--- a/arch/arm64/boot/dts/qcom/sm8250.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8250.dtsi
@@ -3164,9 +3164,6 @@
 < DISP_CC_MDSS_MDP_CLK>;
clock-names = "iface", "bus", "nrt_bus", "core";
 
-   assigned-clocks = < DISP_CC_MDSS_MDP_CLK>;
-   assigned-clock-rates = <46000>;
-
interrupts = ;
interrupt-controller;
#interrupt-cells = <1>;
@@ -3191,10 +3188,8 @@
 < DISP_CC_MDSS_VSYNC_CLK>;
clock-names = "iface", "bus", "core", "vsync";
 
-   assigned-clocks = < 
DISP_CC_MDSS_MDP_CLK>,
- < 
DISP_CC_MDSS_VSYNC_CLK>;
-   assigned-clock-rates = <46000>,
-  <1920>;
+   assigned-clocks = < 
DISP_CC_MDSS_VSYNC_CLK>;
+   assigned-clock-rates = <1920>;
 
operating-points-v2 = <_opp_table>;
power-domains = < SM8250_MMCX>;
-- 
2.7.4



[PATCH v7 1/5] drm/msm/disp/dpu1: set mdp clk to the maximum frequency in opp table during probe

2022-03-21 Thread Vinod Polimera
Set the mdp clock to the max clock rate from the opp table during the
probe/bind sequence so that the rails are not left in an undetermined
state. Since we do not know what rate the bootloader has set, it is best
to vote for the max frequency. There could be a firmware display programmed
in the bootloader, and we want to hand it over to the kernel without
underflowing. The clock will be scaled down later when the framework sends
an update.

Fixes: 25fdd5933e4c ("drm/msm: Add SDM845 DPU support")
Signed-off-by: Vinod Polimera 
Reviewed-by: Dmitry Baryshkov 
Reviewed-by: Douglas Anderson 
---
 drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c 
b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
index e29796c..9c346ce 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_kms.c
@@ -1202,7 +1202,9 @@ static int dpu_bind(struct device *dev, struct device 
*master, void *data)
struct platform_device *pdev = to_platform_device(dev);
struct drm_device *ddev = priv->dev;
struct dpu_kms *dpu_kms;
+   struct dev_pm_opp *opp;
int ret = 0;
+   unsigned long max_freq = ULONG_MAX;
 
dpu_kms = devm_kzalloc(>dev, sizeof(*dpu_kms), GFP_KERNEL);
if (!dpu_kms)
@@ -1225,6 +1227,12 @@ static int dpu_bind(struct device *dev, struct device 
*master, void *data)
}
dpu_kms->num_clocks = ret;
 
+   opp = dev_pm_opp_find_freq_floor(dev, _freq);
+   if (!IS_ERR(opp))
+   dev_pm_opp_put(opp);
+
+   dev_pm_opp_set_rate(dev, max_freq);
+
platform_set_drvdata(pdev, dpu_kms);
 
ret = msm_kms_init(_kms->base, _funcs);
-- 
2.7.4



[PATCH v7 2/5] arm64: dts: qcom: sc7280: remove assigned-clock-rate property for mdp clk

2022-03-21 Thread Vinod Polimera
Drop the assigned clock rate property and vote on the mdp clock as per
calculated value during the usecase.

This patch is dependent on the patch ("drm/msm/disp/dpu1: set mdp clk
to the maximum frequency in opp table during probe") [1].

[1] 
https://lore.kernel.org/r/1647269217-14064-2-git-send-email-quic_vpoli...@quicinc.com/

Signed-off-by: Vinod Polimera 
Reviewed-by: Stephen Boyd 
Reviewed-by: Douglas Anderson 
---
 arch/arm64/boot/dts/qcom/sc7280.dtsi | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi 
b/arch/arm64/boot/dts/qcom/sc7280.dtsi
index c07765d..a3c768c 100644
--- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
@@ -3086,9 +3086,6 @@
  "ahb",
  "core";
 
-   assigned-clocks = < DISP_CC_MDSS_MDP_CLK>;
-   assigned-clock-rates = <3>;
-
interrupts = ;
interrupt-controller;
#interrupt-cells = <1>;
@@ -3122,11 +3119,9 @@
  "lut",
  "core",
  "vsync";
-   assigned-clocks = < 
DISP_CC_MDSS_MDP_CLK>,
-   < 
DISP_CC_MDSS_VSYNC_CLK>,
+   assigned-clocks = < 
DISP_CC_MDSS_VSYNC_CLK>,
< DISP_CC_MDSS_AHB_CLK>;
-   assigned-clock-rates = <3>,
-   <1920>,
+   assigned-clock-rates = <1920>,
<1920>;
operating-points-v2 = <_opp_table>;
power-domains = < SC7280_CX>;
-- 
2.7.4



[PATCH v7 3/5] arm64: dts: qcom: sc7180: remove assigned-clock-rate property for mdp clk

2022-03-21 Thread Vinod Polimera
Drop the assigned clock rate property and vote on the mdp clock as per
calculated value during the usecase.

This patch is dependent on the patch ("drm/msm/disp/dpu1: set mdp clk
to the maximum frequency in opp table during probe") [1].

[1] 
https://lore.kernel.org/r/1647269217-14064-2-git-send-email-quic_vpoli...@quicinc.com/

Signed-off-by: Vinod Polimera 
Reviewed-by: Stephen Boyd 
Reviewed-by: Douglas Anderson 
---
 arch/arm64/boot/dts/qcom/sc7180.dtsi | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sc7180.dtsi 
b/arch/arm64/boot/dts/qcom/sc7180.dtsi
index e1c46b8..eaab746 100644
--- a/arch/arm64/boot/dts/qcom/sc7180.dtsi
+++ b/arch/arm64/boot/dts/qcom/sc7180.dtsi
@@ -2900,9 +2900,6 @@
 < DISP_CC_MDSS_MDP_CLK>;
clock-names = "iface", "ahb", "core";
 
-   assigned-clocks = < DISP_CC_MDSS_MDP_CLK>;
-   assigned-clock-rates = <3>;
-
interrupts = ;
interrupt-controller;
#interrupt-cells = <1>;
@@ -2932,12 +2929,10 @@
 < DISP_CC_MDSS_VSYNC_CLK>;
clock-names = "bus", "iface", "rot", "lut", 
"core",
  "vsync";
-   assigned-clocks = < 
DISP_CC_MDSS_MDP_CLK>,
- < 
DISP_CC_MDSS_VSYNC_CLK>,
+   assigned-clocks = < 
DISP_CC_MDSS_VSYNC_CLK>,
  < 
DISP_CC_MDSS_ROT_CLK>,
  < 
DISP_CC_MDSS_AHB_CLK>;
-   assigned-clock-rates = <3>,
-  <1920>,
+   assigned-clock-rates = <1920>,
   <1920>,
   <1920>;
operating-points-v2 = <_opp_table>;
-- 
2.7.4



[PATCH v7 4/5] arm64: dts: qcom: sdm845: remove assigned-clock-rate property for mdp clk

2022-03-21 Thread Vinod Polimera
Drop the assigned clock rate property and vote on the mdp clock as per
calculated value during the usecase.

This patch is dependent on the patch ("drm/msm/disp/dpu1: set mdp clk
to the maximum frequency in opp table during probe") [1].

[1] 
https://lore.kernel.org/r/1647269217-14064-2-git-send-email-quic_vpoli...@quicinc.com/

Signed-off-by: Vinod Polimera 
Reviewed-by: Stephen Boyd 
Reviewed-by: Douglas Anderson 
---
 arch/arm64/boot/dts/qcom/sdm845.dtsi | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi 
b/arch/arm64/boot/dts/qcom/sdm845.dtsi
index 41f4e46..c0771d2 100644
--- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
@@ -4240,9 +4240,6 @@
 < DISP_CC_MDSS_MDP_CLK>;
clock-names = "iface", "core";
 
-   assigned-clocks = < DISP_CC_MDSS_MDP_CLK>;
-   assigned-clock-rates = <3>;
-
interrupts = ;
interrupt-controller;
#interrupt-cells = <1>;
@@ -4273,10 +4270,8 @@
 < DISP_CC_MDSS_VSYNC_CLK>;
clock-names = "gcc-bus", "iface", "bus", 
"core", "vsync";
 
-   assigned-clocks = < 
DISP_CC_MDSS_MDP_CLK>,
- < 
DISP_CC_MDSS_VSYNC_CLK>;
-   assigned-clock-rates = <3>,
-  <1920>;
+   assigned-clocks = < 
DISP_CC_MDSS_VSYNC_CLK>;
+   assigned-clock-rates = <1920>;
operating-points-v2 = <_opp_table>;
power-domains = < SDM845_CX>;
 
-- 
2.7.4



Re: [PATCH v11 5/7] dt-bindings: display: Add Loongson display controller

2022-03-21 Thread Sui Jingfeng



On 2022/3/22 07:20, Rob Herring wrote:

On Tue, Mar 22, 2022 at 12:29:14AM +0800, Sui Jingfeng wrote:

From: suijingfeng 


Needs a commit message.


Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>

Same person? Don't need both emails.


Yes, suijingf...@loongson.cn is my company's email, but it cannot be
used to send patches to dri-devel.

When I send patches from that address, they do not show up on
patchwork.

The emails are either blocked or rejected by Loongson's mail server.
That address can receive emails from you and other people, but not from
dri-devel, so I have to use my personal email (15330273...@189.cn) to
send patches.



---
  .../loongson/loongson,display-controller.yaml | 230 ++
  1 file changed, 230 insertions(+)
  create mode 100644 
Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml

diff --git 
a/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
 
b/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
new file mode 100644
index ..7be63346289e
--- /dev/null
+++ 
b/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
@@ -0,0 +1,230 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: 
http://devicetree.org/schemas/display/loongson/loongson,display-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson LS7A1000/LS2K1000/LS2K0500 Display Controller Device Tree 
Bindings
+
+maintainers:
+  - Sui Jingfeng 
+
+description: |+
+
+  Loongson display controllers are simple which require scanout buffers
+  to be physically contiguous. LS2K1000/LS2K0500 is a SOC, only system
+  memory is available. LS7A1000/LS7A2000 is bridge chip which is equipped
+  with a dedicated video RAM which is 64MB or more, precise size can be
+  read from the PCI BAR 2 of the GPU device(0x0014:0x7A15) in the bridge
+  chip.
+
+  LSDC has two display pipes, each way has a DVO interface which provide
+  RGB888 signals, vertical & horizontal synchronisations, data enable and
+  the pixel clock. LSDC has two CRTC, each CRTC is able to scanout from
+  1920x1080 resolution at 60Hz. Each CRTC has two FB address registers.
+
+  For LS7A1000, there are 4 dedicated GPIOs whose control register is
+  located at the DC register space. They are used to emulate two way i2c,
+  One for DVO0, another for DVO1.
+
+  LS2K1000 and LS2K0500 SoC grab i2c adapter from other module, either
+  general purpose GPIO emulated i2c or hardware i2c in the SoC.
+
+  LSDC's display pipeline have several components as below description,
+
+  The display controller in LS7A1000:
+ ___ _
+|---|   | |
+|  CRTC0 --> | DVO0 > Encoder0 ---> Connector0 ---> | Monitor |
+|  _   _ ---|^ ^|_|
+| | | | |---|| |
+| |_| |_|| i2c0 <+-+
+|---|
+|   DC IN LS7A1000  |
+|  _   _ ---|
+| | | | || i2c1 <+-+
+| |_| |_|---|| | _
+|---|| || |
+|  CRTC1 --> | DVO1 > Encoder1 ---> Connector1 ---> |  Panel  |
+|---|   |_|
+|___|
+
+  Simple usage of LS7A1000 with LS3A4000 CPU:
+
++--++---+
+| DDR4 ||  +---+|
++--+|  | PCIe Root complex |   LS7A1000 |
+   || MC0   |  +--++-+++|
+  +--+  HT 3.0  | || || |
+  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
+  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
+  +--+  | ++  +-+--+-++-+   +--+
+   || MC1   +---|--|+
++--+|  |
+| DDR4 |  +---+   DVO0  |  |  DVO1   +--+
++--+   VGA <--|ADV7125|<+  +>|TFP410|--> DVI/HDMI
+  +---+  +--+
+
+  The display controller in LS2K1000/LS2K0500:
+ ___ _
+|---|   | |
+|  CRTC0 --> | DVO0 > Encoder0 ---> Connector0 ---> | Monitor |
+|  _   _ ---|^  ^   |_|
+| | | | |   ||  |
+| |_| |_|   | +--+  |
+|   <>| i2c0 |<-+
+|   DC IN LS2K1000  | 

[PATCH v5 3/4] drm/vc4: change vc4 driver to use drm_writeback_connector_init_with_encoder()

2022-03-21 Thread Abhinav Kumar
vc4 driver currently embeds the drm_encoder into struct vc4_txp
and later on uses container_of to retrieve the vc4_txp from
the drm_encoder.

Make vc4 driver use the new API so that the embedded encoder model
can be retained in the driver and there is no change in
functionality.

changes in v5:
- reorder this change to come before in the series
  to avoid incorrect sequence in subsequent changes
- continue using struct drm_encoder instead of
  struct drm_encoder * and switch it in next change

Signed-off-by: Abhinav Kumar 
---
 drivers/gpu/drm/vc4/vc4_txp.c | 30 +++---
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c
index 3447eb6..5490162 100644
--- a/drivers/gpu/drm/vc4/vc4_txp.c
+++ b/drivers/gpu/drm/vc4/vc4_txp.c
@@ -368,6 +368,10 @@ static const struct drm_encoder_helper_funcs 
vc4_txp_encoder_helper_funcs = {
.disable = vc4_txp_encoder_disable,
 };
 
+static const struct drm_encoder_funcs vc4_txp_encoder_funcs = {
+   .destroy = drm_encoder_cleanup,
+};
+
 static int vc4_txp_enable_vblank(struct drm_crtc *crtc)
 {
return 0;
@@ -467,6 +471,7 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
struct vc4_txp *txp;
struct drm_crtc *crtc;
struct drm_encoder *encoder;
+   struct drm_writeback_connector *wb_conn;
int ret, irq;
 
irq = platform_get_irq(pdev, 0);
@@ -492,16 +497,27 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
txp->regset.regs = txp_regs;
txp->regset.nregs = ARRAY_SIZE(txp_regs);
 
-   drm_connector_helper_add(>connector.base,
-_txp_connector_helper_funcs);
-   ret = drm_writeback_connector_init(drm, >connector,
-  _txp_connector_funcs,
-  _txp_encoder_helper_funcs,
-  drm_fmts, ARRAY_SIZE(drm_fmts),
-  0);
+   wb_conn = >connector;
+
+   drm_encoder_helper_add(_conn->encoder, 
_txp_encoder_helper_funcs);
+
+   ret = drm_encoder_init(drm, _conn->encoder,
+   _txp_encoder_funcs,
+   DRM_MODE_ENCODER_VIRTUAL, NULL);
+
if (ret)
return ret;
 
+   drm_connector_helper_add(_conn->base,
+_txp_connector_helper_funcs);
+
+   ret = drm_writeback_connector_init_with_encoder(drm, wb_conn,
+   _txp_connector_funcs, drm_fmts, 
ARRAY_SIZE(drm_fmts));
+   if (ret) {
+   drm_encoder_cleanup(_conn->encoder);
+   return ret;
+   }
+
ret = vc4_crtc_init(drm, vc4_crtc,
_txp_crtc_funcs, _txp_crtc_helper_funcs);
if (ret)
-- 
2.7.4



[PATCH v5 4/4] drm: allow real encoder to be passed for drm_writeback_connector

2022-03-21 Thread Abhinav Kumar
For some vendor driver implementations, display hardware can
be shared between the encoder used for writeback and the physical
display.

In addition resources such as clocks and interrupts can
also be shared between writeback and the real encoder.

To accommodate such vendor drivers and hardware, allow
real encoder to be passed for drm_writeback_connector.

changes in v5:
- re-order the change to come last in the series
- rework necessary changes as part of the re-order

Co-developed-by: Kandpal Suraj 
Signed-off-by: Abhinav Kumar 
---
 drivers/gpu/drm/drm_writeback.c | 12 +++-
 drivers/gpu/drm/vc4/vc4_txp.c   | 13 -
 include/drm/drm_writeback.h | 18 --
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
index abe78b9..d0672f5 100644
--- a/drivers/gpu/drm/drm_writeback.c
+++ b/drivers/gpu/drm/drm_writeback.c
@@ -178,7 +178,7 @@ static int drm_writeback_connector_setup(struct drm_device 
*dev,
goto connector_fail;
 
ret = drm_connector_attach_encoder(connector,
-   _connector->encoder);
+   wb_connector->encoder);
if (ret)
goto attach_fail;
 
@@ -241,11 +241,13 @@ int drm_writeback_connector_init(struct drm_device *dev,
 {
int ret = 0;
 
-   drm_encoder_helper_add(_connector->encoder, enc_helper_funcs);
+   wb_connector->encoder = _connector->internal_encoder;
 
-   wb_connector->encoder.possible_crtcs = possible_crtcs;
+   drm_encoder_helper_add(wb_connector->encoder, enc_helper_funcs);
 
-   ret = drm_encoder_init(dev, _connector->encoder,
+   wb_connector->encoder->possible_crtcs = possible_crtcs;
+
+   ret = drm_encoder_init(dev, wb_connector->encoder,
   _writeback_encoder_funcs,
   DRM_MODE_ENCODER_VIRTUAL, NULL);
if (ret)
@@ -255,7 +257,7 @@ int drm_writeback_connector_init(struct drm_device *dev,
n_formats);
 
if (ret)
-   drm_encoder_cleanup(_connector->encoder);
+   drm_encoder_cleanup(wb_connector->encoder);
 
return ret;
 }
diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c
index 5490162..3d24ef5 100644
--- a/drivers/gpu/drm/vc4/vc4_txp.c
+++ b/drivers/gpu/drm/vc4/vc4_txp.c
@@ -151,6 +151,8 @@ struct vc4_txp {
 
struct platform_device *pdev;
 
+   struct drm_encoder drm_enc;
+
struct drm_writeback_connector connector;
 
void __iomem *regs;
@@ -159,7 +161,7 @@ struct vc4_txp {
 
 static inline struct vc4_txp *encoder_to_vc4_txp(struct drm_encoder *encoder)
 {
-   return container_of(encoder, struct vc4_txp, connector.encoder);
+   return container_of(encoder, struct vc4_txp, drm_enc);
 }
 
 static inline struct vc4_txp *connector_to_vc4_txp(struct drm_connector *conn)
@@ -498,10 +500,11 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
txp->regset.nregs = ARRAY_SIZE(txp_regs);
 
wb_conn = >connector;
+   wb_conn->encoder = >drm_enc;
 
-   drm_encoder_helper_add(_conn->encoder, 
_txp_encoder_helper_funcs);
+   drm_encoder_helper_add(wb_conn->encoder, _txp_encoder_helper_funcs);
 
-   ret = drm_encoder_init(drm, _conn->encoder,
+   ret = drm_encoder_init(drm, wb_conn->encoder,
_txp_encoder_funcs,
DRM_MODE_ENCODER_VIRTUAL, NULL);
 
@@ -514,7 +517,7 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
ret = drm_writeback_connector_init_with_encoder(drm, wb_conn,
_txp_connector_funcs, drm_fmts, 
ARRAY_SIZE(drm_fmts));
if (ret) {
-   drm_encoder_cleanup(_conn->encoder);
+   drm_encoder_cleanup(wb_conn->encoder);
return ret;
}
 
@@ -523,7 +526,7 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
if (ret)
return ret;
 
-   encoder = >connector.encoder;
+   encoder = txp->connector.encoder;
encoder->possible_crtcs = drm_crtc_mask(crtc);
 
ret = devm_request_irq(dev, irq, vc4_txp_interrupt, 0,
diff --git a/include/drm/drm_writeback.h b/include/drm/drm_writeback.h
index 0093bab..ed35c3d 100644
--- a/include/drm/drm_writeback.h
+++ b/include/drm/drm_writeback.h
@@ -25,15 +25,29 @@ struct drm_writeback_connector {
struct drm_connector base;
 
/**
-* @encoder: Internal encoder used by the connector to fulfill
+* @encoder: handle to drm_encoder used by the connector to fulfill
 * the DRM framework requirements. The users of the
 * @drm_writeback_connector control the behaviour of the @encoder
 * by passing the @enc_funcs parameter to 

[PATCH v5 2/4] drm: introduce drm_writeback_connector_init_with_encoder() API

2022-03-21 Thread Abhinav Kumar
For vendor drivers which pass an already allocated and
initialized encoder, especially for cases where the encoder
hardware is shared or the writeback encoder shares resources
with the rest of the display pipeline, introduce a new API,
drm_writeback_connector_init_with_encoder(), which expects
an initialized encoder as a parameter and only sets up the
writeback connector.

changes in v5:
- reorder this change to come before in the series
  to avoid incorrect functionality in subsequent changes
- continue using struct drm_encoder instead of
  struct drm_encoder * and switch it in next change

Signed-off-by: Abhinav Kumar 
---
 drivers/gpu/drm/drm_writeback.c | 143 
 include/drm/drm_writeback.h |   5 ++
 2 files changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
index dc2ef12..abe78b9 100644
--- a/drivers/gpu/drm/drm_writeback.c
+++ b/drivers/gpu/drm/drm_writeback.c
@@ -149,37 +149,15 @@ static const struct drm_encoder_funcs 
drm_writeback_encoder_funcs = {
.destroy = drm_encoder_cleanup,
 };
 
-/**
- * drm_writeback_connector_init - Initialize a writeback connector and its 
properties
- * @dev: DRM device
- * @wb_connector: Writeback connector to initialize
- * @con_funcs: Connector funcs vtable
- * @enc_helper_funcs: Encoder helper funcs vtable to be used by the internal 
encoder
- * @formats: Array of supported pixel formats for the writeback engine
- * @n_formats: Length of the formats array
- * @possible_crtcs: possible crtcs for the internal writeback encoder
- *
- * This function creates the writeback-connector-specific properties if they
- * have not been already created, initializes the connector as
- * type DRM_MODE_CONNECTOR_WRITEBACK, and correctly initializes the property
- * values. It will also create an internal encoder associated with the
- * drm_writeback_connector and set it to use the @enc_helper_funcs vtable for
- * the encoder helper.
- *
- * Drivers should always use this function instead of drm_connector_init() to
- * set up writeback connectors.
- *
- * Returns: 0 on success, or a negative error code
- */
-int drm_writeback_connector_init(struct drm_device *dev,
-struct drm_writeback_connector *wb_connector,
-const struct drm_connector_funcs *con_funcs,
-const struct drm_encoder_helper_funcs 
*enc_helper_funcs,
-const u32 *formats, int n_formats, uint32_t 
possible_crtcs)
+static int drm_writeback_connector_setup(struct drm_device *dev,
+   struct drm_writeback_connector *wb_connector,
+   const struct drm_connector_funcs *con_funcs, const u32 *formats,
+   int n_formats)
 {
struct drm_property_blob *blob;
-   struct drm_connector *connector = _connector->base;
struct drm_mode_config *config = >mode_config;
+   struct drm_connector *connector = _connector->base;
+
int ret = create_writeback_properties(dev);
 
if (ret != 0)
@@ -187,18 +165,10 @@ int drm_writeback_connector_init(struct drm_device *dev,
 
blob = drm_property_create_blob(dev, n_formats * sizeof(*formats),
formats);
-   if (IS_ERR(blob))
-   return PTR_ERR(blob);
-
-   drm_encoder_helper_add(_connector->encoder, enc_helper_funcs);
-
-   wb_connector->encoder.possible_crtcs = possible_crtcs;
-
-   ret = drm_encoder_init(dev, _connector->encoder,
-  _writeback_encoder_funcs,
-  DRM_MODE_ENCODER_VIRTUAL, NULL);
-   if (ret)
-   goto fail;
+   if (IS_ERR(blob)) {
+   ret = PTR_ERR(blob);
+   return ret;
+   }
 
connector->interlace_allowed = 0;
 
@@ -237,13 +207,102 @@ int drm_writeback_connector_init(struct drm_device *dev,
 attach_fail:
drm_connector_cleanup(connector);
 connector_fail:
-   drm_encoder_cleanup(_connector->encoder);
-fail:
drm_property_blob_put(blob);
return ret;
 }
+
+/**
+ * drm_writeback_connector_init - Initialize a writeback connector and its 
properties
+ * @dev: DRM device
+ * @wb_connector: Writeback connector to initialize
+ * @con_funcs: Connector funcs vtable
+ * @enc_helper_funcs: Encoder helper funcs vtable to be used by the internal 
encoder
+ * @formats: Array of supported pixel formats for the writeback engine
+ * @n_formats: Length of the formats array
+ * @possible_crtcs: possible crtcs for the internal writeback encoder
+ *
+ * This function creates the writeback-connector-specific properties if they
+ * have not been already created, initializes the connector as
+ * type DRM_MODE_CONNECTOR_WRITEBACK, and correctly initializes the property
+ * values. It will also create an internal encoder associated with the
+ * 

[PATCH v5 1/4] drm: allow passing possible_crtcs to drm_writeback_connector_init()

2022-03-21 Thread Abhinav Kumar
Clients of drm_writeback_connector_init() initialize the
possible_crtcs and then invoke the call to this API.

To simplify things, allow passing possible_crtcs as a parameter
to drm_writeback_connector_init() and make changes to the
other drm drivers to make them compatible with this change.

changes in v5:
 - None

Signed-off-by: Abhinav Kumar 
Acked-by: Liviu Dudau 
---
 drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c | 3 +--
 drivers/gpu/drm/arm/malidp_mw.c  | 4 ++--
 drivers/gpu/drm/drm_writeback.c  | 6 +-
 drivers/gpu/drm/rcar-du/rcar_du_writeback.c  | 4 ++--
 drivers/gpu/drm/vc4/vc4_txp.c| 3 ++-
 drivers/gpu/drm/vkms/vkms_writeback.c| 4 ++--
 include/drm/drm_writeback.h  | 2 +-
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c 
b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
index e465cc4..40774e6 100644
--- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
+++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
@@ -155,7 +155,6 @@ static int komeda_wb_connector_add(struct komeda_kms_dev 
*kms,
kwb_conn->wb_layer = kcrtc->master->wb_layer;
 
wb_conn = _conn->base;
-   wb_conn->encoder.possible_crtcs = BIT(drm_crtc_index(>base));
 
formats = komeda_get_layer_fourcc_list(>fmt_tbl,
   kwb_conn->wb_layer->layer_type,
@@ -164,7 +163,7 @@ static int komeda_wb_connector_add(struct komeda_kms_dev 
*kms,
err = drm_writeback_connector_init(>base, wb_conn,
   _wb_connector_funcs,
   _wb_encoder_helper_funcs,
-  formats, n_formats);
+  formats, n_formats, 
BIT(drm_crtc_index(>base)));
komeda_put_fourcc_list(formats);
if (err) {
kfree(kwb_conn);
diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c
index f5847a7..e54921d 100644
--- a/drivers/gpu/drm/arm/malidp_mw.c
+++ b/drivers/gpu/drm/arm/malidp_mw.c
@@ -212,7 +212,6 @@ int malidp_mw_connector_init(struct drm_device *drm)
if (!malidp->dev->hw->enable_memwrite)
return 0;
 
-   malidp->mw_connector.encoder.possible_crtcs = 1 << 
drm_crtc_index(>crtc);
drm_connector_helper_add(>mw_connector.base,
 _mw_connector_helper_funcs);
 
@@ -223,7 +222,8 @@ int malidp_mw_connector_init(struct drm_device *drm)
ret = drm_writeback_connector_init(drm, >mw_connector,
   _mw_connector_funcs,
   _mw_encoder_helper_funcs,
-  formats, n_formats);
+  formats, n_formats,
+ (1 << drm_crtc_index(>crtc)));
kfree(formats);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
index dccf4504..dc2ef12 100644
--- a/drivers/gpu/drm/drm_writeback.c
+++ b/drivers/gpu/drm/drm_writeback.c
@@ -157,6 +157,7 @@ static const struct drm_encoder_funcs 
drm_writeback_encoder_funcs = {
  * @enc_helper_funcs: Encoder helper funcs vtable to be used by the internal 
encoder
  * @formats: Array of supported pixel formats for the writeback engine
  * @n_formats: Length of the formats array
+ * @possible_crtcs: possible crtcs for the internal writeback encoder
  *
  * This function creates the writeback-connector-specific properties if they
  * have not been already created, initializes the connector as
@@ -174,7 +175,7 @@ int drm_writeback_connector_init(struct drm_device *dev,
 struct drm_writeback_connector *wb_connector,
 const struct drm_connector_funcs *con_funcs,
 const struct drm_encoder_helper_funcs 
*enc_helper_funcs,
-const u32 *formats, int n_formats)
+const u32 *formats, int n_formats, uint32_t 
possible_crtcs)
 {
struct drm_property_blob *blob;
struct drm_connector *connector = _connector->base;
@@ -190,6 +191,9 @@ int drm_writeback_connector_init(struct drm_device *dev,
return PTR_ERR(blob);
 
drm_encoder_helper_add(_connector->encoder, enc_helper_funcs);
+
+   wb_connector->encoder.possible_crtcs = possible_crtcs;
+
ret = drm_encoder_init(dev, _connector->encoder,
   _writeback_encoder_funcs,
   DRM_MODE_ENCODER_VIRTUAL, NULL);
diff --git a/drivers/gpu/drm/rcar-du/rcar_du_writeback.c 
b/drivers/gpu/drm/rcar-du/rcar_du_writeback.c
index 

[PATCH v5 0/4] Allow drm_writeback_connector to accept pointer to drm_encoder

2022-03-21 Thread Abhinav Kumar
There are some vendor drivers for which the writeback encoder shares
hardware resources such as clocks and interrupts with the rest of the
display pipeline. In addition, there can be use-cases where the
writeback encoder could be a shared encoder between the physical display
path and the writeback path.

To accommodate such cases, change the drm_writeback_connector to
accept a pointer to drm_encoder.

For existing users of drm_writeback_connector there will not be any
change in functionality due to this change.

This approach was first posted by Suraj Kandpal here [1] for both
encoder and connector. But after discussions [2], the consensus was
reached to split this change for the drm_encoder first and the
drm_connector part can be reworked in a subsequent change later.

Validation of this change was done using igt_writeback tests on
MSM based RB5 board using the changes posted here [3].

For all other chipsets, these changes were compile-tested.

[1] 
https://patchwork.kernel.org/project/dri-devel/patch/20220202081702.22119-1-suraj.kand...@intel.com/
[2] 
https://patchwork.kernel.org/project/dri-devel/patch/20220202085429.22261-6-suraj.kand...@intel.com/
[3] https://patchwork.freedesktop.org/series/99724/

changes in v5:
- re-order the series to make sure the encoder initialization
  is not broken due to incorrect order of changes

Abhinav Kumar (4):
  drm: allow passing possible_crtcs to drm_writeback_connector_init()
  drm: introduce drm_writeback_connector_init_with_encoder() API
  drm/vc4: change vc4 driver to use
drm_writeback_connector_init_with_encoder()
  drm: allow real encoder to be passed for drm_writeback_connector

 .../drm/arm/display/komeda/komeda_wb_connector.c   |   3 +-
 drivers/gpu/drm/arm/malidp_mw.c|   4 +-
 drivers/gpu/drm/drm_writeback.c| 143 +++--
 drivers/gpu/drm/rcar-du/rcar_du_writeback.c|   4 +-
 drivers/gpu/drm/vc4/vc4_txp.c  |  36 --
 drivers/gpu/drm/vkms/vkms_writeback.c  |   4 +-
 include/drm/drm_writeback.h|  25 +++-
 7 files changed, 161 insertions(+), 58 deletions(-)

-- 
2.7.4



[PATCH v2 3/4] drm/i915/selftest: Clear the output buffers before GPU writes

2022-03-21 Thread Ramalingam C
From: Chris Wilson 

When testing whether we can get the GPU to leak information about
non-privileged state, we first need to ensure that the output buffer is
set to a known value as the HW may opt to skip the write into memory for
a non-privileged read of a sensitive register. We chose POISON_INUSE (0x5a)
so that it is both non-zero and distinct from the poison values used during
the test.

v2:
  Use i915_gem_object_pin_map_unlocked

Reported-by: CQ Tang 
Signed-off-by: Chris Wilson 
Cc: CQ Tang 
cc: Joonas Lahtinen 
Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 32 ++
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c 
b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 0a8ed4246082..6c394d0c0fb0 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1346,6 +1346,30 @@ static int compare_isolation(struct intel_engine_cs 
*engine,
return err;
 }
 
+static struct i915_vma *
+create_result_vma(struct i915_address_space *vm, unsigned long sz)
+{
+   struct i915_vma *vma;
+   void *ptr;
+
+   vma = create_user_vma(vm, sz);
+   if (IS_ERR(vma))
+   return vma;
+
+   /* Set the results to a known value distinct from the poison */
+   ptr = i915_gem_object_pin_map_unlocked(vma->obj, I915_MAP_WC);
+   if (IS_ERR(ptr)) {
+   i915_vma_put(vma);
+   return ERR_CAST(ptr);
+   }
+
+   memset(ptr, POISON_INUSE, vma->size);
+   i915_gem_object_flush_map(vma->obj);
+   i915_gem_object_unpin_map(vma->obj);
+
+   return vma;
+}
+
 static int __lrc_isolation(struct intel_engine_cs *engine, u32 poison)
 {
u32 *sema = memset32(engine->status_page.addr + 1000, 0, 1);
@@ -1364,13 +1388,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
goto err_A;
}
 
-   ref[0] = create_user_vma(A->vm, SZ_64K);
+   ref[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[0])) {
err = PTR_ERR(ref[0]);
goto err_B;
}
 
-   ref[1] = create_user_vma(A->vm, SZ_64K);
+   ref[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(ref[1])) {
err = PTR_ERR(ref[1]);
goto err_ref0;
@@ -1392,13 +1416,13 @@ static int __lrc_isolation(struct intel_engine_cs 
*engine, u32 poison)
}
i915_request_put(rq);
 
-   result[0] = create_user_vma(A->vm, SZ_64K);
+   result[0] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[0])) {
err = PTR_ERR(result[0]);
goto err_ref1;
}
 
-   result[1] = create_user_vma(A->vm, SZ_64K);
+   result[1] = create_result_vma(A->vm, SZ_64K);
if (IS_ERR(result[1])) {
err = PTR_ERR(result[1]);
goto err_result0;
-- 
2.20.1



Re: [PATCH v11 5/7] dt-bindings: display: Add Loongson display controller

2022-03-21 Thread Rob Herring
On Tue, Mar 22, 2022 at 12:29:14AM +0800, Sui Jingfeng wrote:
> From: suijingfeng 
> 

Needs a commit message.

> Signed-off-by: suijingfeng 
> Signed-off-by: Sui Jingfeng <15330273...@189.cn>

Same person? Don't need both emails.

> ---
>  .../loongson/loongson,display-controller.yaml | 230 ++
>  1 file changed, 230 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
> 
> diff --git 
> a/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
>  
> b/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
> new file mode 100644
> index ..7be63346289e
> --- /dev/null
> +++ 
> b/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
> @@ -0,0 +1,230 @@
> +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +%YAML 1.2
> +---
> +$id: 
> http://devicetree.org/schemas/display/loongson/loongson,display-controller.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Loongson LS7A1000/LS2K1000/LS2K0500 Display Controller Device Tree 
> Bindings
> +
> +maintainers:
> +  - Sui Jingfeng 
> +
> +description: |+
> +
> +  Loongson display controllers are simple which require scanout buffers
> +  to be physically contiguous. LS2K1000/LS2K0500 is a SOC, only system
> +  memory is available. LS7A1000/LS7A2000 is bridge chip which is equipped
> +  with a dedicated video RAM which is 64MB or more, precise size can be
> +  read from the PCI BAR 2 of the GPU device(0x0014:0x7A15) in the bridge
> +  chip.
> +
> +  LSDC has two display pipes, each way has a DVO interface which provide
> +  RGB888 signals, vertical & horizontal synchronisations, data enable and
> +  the pixel clock. LSDC has two CRTC, each CRTC is able to scanout from
> +  1920x1080 resolution at 60Hz. Each CRTC has two FB address registers.
> +
> +  For LS7A1000, there are 4 dedicated GPIOs whose control register is
> +  located at the DC register space. They are used to emulate two way i2c,
> +  One for DVO0, another for DVO1.
> +
> +  LS2K1000 and LS2K0500 SoC grab i2c adapter from other module, either
> +  general purpose GPIO emulated i2c or hardware i2c in the SoC.
> +
> +  LSDC's display pipeline have several components as below description,
> +
> +  The display controller in LS7A1000:
> + ___ _
> +|---|   | |
> +|  CRTC0 --> | DVO0 > Encoder0 ---> Connector0 ---> | Monitor |
> +|  _   _ ---|^ ^|_|
> +| | | | |---|| |
> +| |_| |_|| i2c0 <+-+
> +|---|
> +|   DC IN LS7A1000  |
> +|  _   _ ---|
> +| | | | || i2c1 <+-+
> +| |_| |_|---|| | _
> +|---|| || |
> +|  CRTC1 --> | DVO1 > Encoder1 ---> Connector1 ---> |  Panel  |
> +|---|   |_|
> +|___|
> +
> +  Simple usage of LS7A1000 with LS3A4000 CPU:
> +
> ++--++---+
> +| DDR4 ||  +---+|
> ++--+|  | PCIe Root complex |   LS7A1000 |
> +   || MC0   |  +--++-+++|
> +  +--+  HT 3.0  | || || |
> +  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
> +  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
> +  +--+  | ++  +-+--+-++-+   +--+
> +   || MC1   +---|--|+
> ++--+|  |
> +| DDR4 |  +---+   DVO0  |  |  DVO1   +--+
> ++--+   VGA <--|ADV7125|<+  +>|TFP410|--> DVI/HDMI
> +  +---+  +--+
> +
> +  The display controller in LS2K1000/LS2K0500:
> + ___ _
> +|---|   | |
> +|  CRTC0 --> | DVO0 > Encoder0 ---> Connector0 ---> | Monitor |
> +|  _   _ ---|^  ^   |_|
> +| | | | |   ||  |
> +| |_| |_|   | +--+  |
> +|   <>| i2c0 |<-+
> +|   DC IN LS2K1000  | +--+
> +|  _   _| +--+
> +| | | | |   <>| i2c1 |--+
> +| |_| |_|   | +--+  |_
> +|---||  |   | |
> +|  

Re: [Intel-gfx] [PATCH v2 5/7] drm/i915/selftests: use the memcpy_from_wc call from the drm

2022-03-21 Thread Lucas De Marchi

Now Cc'ing Daniel properly

Lucas De Marchi

On Mon, Mar 21, 2022 at 04:00:56PM -0700, Lucas De Marchi wrote:

+Thomas Zimmermann and +Daniel Vetter

Could you take a look below regarding the I/O to I/O memory access?

On Thu, Mar 03, 2022 at 11:30:11PM +0530, Balasubramani Vivekanandan wrote:

memcpy_from_wc functions in i915_memcpy.c will be removed and replaced
by the implementation in drm_cache.c.
Updated to use the functions provided by drm_cache.c.

v2: check if the source and destination memory address is from local
  memory or system memory and initialize the iosys_map accordingly
  (Lucas)

Cc: Lucas De Marchi 
Cc: Matthew Auld 
Cc: Thomas Hellström 

Signed-off-by: Balasubramani Vivekanandan 
---
.../drm/i915/selftests/intel_memory_region.c  | 41 +--
1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c 
b/drivers/gpu/drm/i915/selftests/intel_memory_region.c
index ba32893e0873..d16ecb905f3b 100644
--- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c
@@ -7,6 +7,7 @@
#include 

#include 
+#include 

#include "../i915_selftest.h"

@@ -1133,7 +1134,7 @@ static const char *repr_type(u32 type)

static struct drm_i915_gem_object *
create_region_for_mapping(struct intel_memory_region *mr, u64 size, u32 type,
- void **out_addr)
+ struct iosys_map *out_addr)
{
struct drm_i915_gem_object *obj;
void *addr;
@@ -1153,7 +1154,11 @@ create_region_for_mapping(struct intel_memory_region 
*mr, u64 size, u32 type,
return addr;
}

-   *out_addr = addr;
+   if (i915_gem_object_is_lmem(obj))
+   iosys_map_set_vaddr_iomem(out_addr, (void __iomem *)addr);
+   else
+   iosys_map_set_vaddr(out_addr, addr);
+
return obj;
}

@@ -1164,24 +1169,33 @@ static int wrap_ktime_compare(const void *A, const void 
*B)
return ktime_compare(*a, *b);
}

-static void igt_memcpy_long(void *dst, const void *src, size_t size)
+static void igt_memcpy_long(struct iosys_map *dst, struct iosys_map *src,
+   size_t size)
{
-   unsigned long *tmp = dst;
-   const unsigned long *s = src;
+   unsigned long *tmp = dst->is_iomem ?
+   (unsigned long __force *)dst->vaddr_iomem :
+   dst->vaddr;


if we access vaddr_iomem/vaddr we basically break the promise of
abstracting system and I/O memory. There is no point in receiving
struct iosys_map as argument and then break the abstraction.


+   const unsigned long *s = src->is_iomem ?
+   (unsigned long __force *)src->vaddr_iomem :
+   src->vaddr;

size = size / sizeof(unsigned long);
while (size--)
*tmp++ = *s++;



so we basically want to copy from one place to the other on a word
boundary. And it may be

a) I/O -> I/O or
b) system -> I/O or
c) I/O -> system

(b) and (c) should work, but AFAICS (a) is not possible with the current
iosys-map API. Not even the underlying APIs have that abstracted. Both
memcpy_fromio() and memcpy_toio() expect one of them to be RAM (system
memory)

I remember seeing people using a temporary buffer in system memory
for proxying the copy. But maybe we need an abstraction for that?
Also adding Thomas Zimmermann here for that question.

and since this is a selftest testing the performance of the memcpy from
one memory region to the other, it would be good to have this test
executed to a) make sure it still works and b) record in the commit
message any possible slow down we are incurring.

thanks
Lucas De Marchi



}

-static inline void igt_memcpy(void *dst, const void *src, size_t size)
+static inline void igt_memcpy(struct iosys_map *dst, struct iosys_map *src,
+ size_t size)
{
-   memcpy(dst, src, size);
+   memcpy(dst->is_iomem ? (void __force *)dst->vaddr_iomem : dst->vaddr,
+  src->is_iomem ? (void __force *)src->vaddr_iomem : src->vaddr,
+  size);
}

-static inline void igt_memcpy_from_wc(void *dst, const void *src, size_t size)
+static inline void igt_memcpy_from_wc(struct iosys_map *dst, struct iosys_map 
*src,
+ size_t size)
{
-   i915_memcpy_from_wc(dst, src, size);
+   drm_memcpy_from_wc(dst, src, size);
}

static int _perf_memcpy(struct intel_memory_region *src_mr,
@@ -1191,7 +1205,8 @@ static int _perf_memcpy(struct intel_memory_region 
*src_mr,
struct drm_i915_private *i915 = src_mr->i915;
const struct {
const char *name;
-   void (*copy)(void *dst, const void *src, size_t size);
+   void (*copy)(struct iosys_map *dst, struct iosys_map *src,
+size_t size);
bool skip;

Re: [PATCH v4 1/8] drm/i915/gt: Use XY_FASR_COLOR_BLT to clear obj on graphics ver 12+

2022-03-21 Thread Ramalingam C
On 2022-03-21 at 14:19:01 +0530, Hellstrom, Thomas wrote:
> On Sun, 2022-03-20 at 02:12 +0530, Ramalingam C wrote:
> > XY_FAST_COLOR_BLT cmd is faster than the older XY_COLOR_BLT. Hence
> > for
> > clearing (Zero out) the pages of the newly allocated object, faster
> > cmd
> > is used.
> 
> NIT: Imperative wording
> 
> >
> > Signed-off-by: Ramalingam C 
> > Signed-off-by: Chris Wilson 
> 
> Also there's a typo in the patch title.
Fixed them in the next version. Thanks for the review Thomas.

Ram
> 
> With that fixed:
> Reviewed-by: Thomas Hellström 
> 
> 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 +++
> >  drivers/gpu/drm/i915/gt/intel_migrate.c  | 43 +-
> > --
> >  2 files changed, 43 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > index d112ffd56418..925e55b6a94f 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
> > @@ -205,6 +205,11 @@
> >
> >  #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 -
> > 2))
> >  #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
> > +#define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
> > +#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
> > +#define   XY_FAST_COLOR_BLT_DW 16
> > +#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
> > +#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
> >  #define SRC_COPY_BLT_CMD   (2 << 29 | 0x43 << 22)
> >  #define GEN9_XY_FAST_COPY_BLT_CMD  (2 << 29 | 0x42 << 22)
> >  #define XY_SRC_COPY_BLT_CMD(2 << 29 | 0x53 << 22)
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index 20444d6ceb3c..73199ebf0671 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -614,20 +614,53 @@ intel_context_migrate_copy(struct intel_context
> > *ce,
> > return err;
> >  }
> >
> > -static int emit_clear(struct i915_request *rq, u64 offset, int size,
> > u32 value)
> > +static int emit_clear(struct i915_request *rq, u64 offset, int size,
> > + u32 value, bool is_lmem)
> >  {
> > -   const int ver = GRAPHICS_VER(rq->engine->i915);
> > +   struct drm_i915_private *i915 = rq->engine->i915;
> > +   int mocs = rq->engine->gt->mocs.uc_index << 1;
> > +   const int ver = GRAPHICS_VER(i915);
> > +   int ring_sz;
> > u32 *cs;
> >
> > GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
> >
> > offset += (u64)rq->engine->instance << 32;
> >
> > -   cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
> > +   if (ver >= 12)
> > +   ring_sz = 16;
> > +   else if (ver >= 8)
> > +   ring_sz = 8;
> > +   else
> > +   ring_sz = 6;
> > +
> > +   cs = intel_ring_begin(rq, ring_sz);
> > if (IS_ERR(cs))
> > return PTR_ERR(cs);
> >
> > -   if (ver >= 8) {
> > +   if (ver >= 12) {
> > +   *cs++ = XY_FAST_COLOR_BLT_CMD |
> > XY_FAST_COLOR_BLT_DEPTH_32 |
> > +   (XY_FAST_COLOR_BLT_DW - 2);
> > +   *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs)
> > |
> > +   (PAGE_SIZE - 1);
> > +   *cs++ = 0;
> > +   *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> > +   *cs++ = lower_32_bits(offset);
> > +   *cs++ = upper_32_bits(offset);
> > +   *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
> > +   /* BG7 */
> > +   *cs++ = value;
> > +   *cs++ = 0;
> > +   *cs++ = 0;
> > +   *cs++ = 0;
> > +   /* BG11 */
> > +   *cs++ = 0;
> > +   *cs++ = 0;
> > +   /* BG13 */
> > +   *cs++ = 0;
> > +   *cs++ = 0;
> > +   *cs++ = 0;
> > +   } else if (ver >= 8) {
> > *cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
> > *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY |
> > PAGE_SIZE;
> > *cs++ = 0;
> > @@ -711,7 +744,7 @@ intel_context_migrate_clear(struct intel_context
> > *ce,
> > if (err)
> > goto out_rq;
> >
> > -   err = emit_clear(rq, offset, len, value);
> > +   err = emit_clear(rq, offset, len, value, is_lmem);
> >
> > /* Arbitration is re-enabled between requests. */
> >  out_rq:
> 


Re: [Intel-gfx] [PATCH v4 6/8] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-03-21 Thread Ramalingam C
On 2022-03-21 at 11:11:33 +0100, Das, Nirmoy wrote:
> In the previous version I replied only to the mailing list email so probably
> my email slipped through.

Sorry for the miss. Thanks so much for the review.

Ram
> 
> Reviewed-by: Nirmoy Das  for patch 6-7
> 
> On 3/19/2022 9:42 PM, Ramalingam C wrote:
> > Add a parameter called "extra_pages" for ttm_tt_init, to indicate that
> > driver needs extra pages in ttm_tt.
> > 
> > v2:
> >Used imperative wording [Thomas and Christian]
> > 
> > Signed-off-by: Ramalingam C 
> > cc: Christian Koenig 
> > cc: Hellstrom Thomas 
> > Reviewed-by: Thomas Hellstrom 
> > Reviewed-by: Christian Konig 
> > ---
> >   drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
> >   drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
> >   drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
> >   drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
> >   drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
> >   drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
> >   include/drm/ttm/ttm_tt.h   |  4 +++-
> >   7 files changed, 15 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
> > b/drivers/gpu/drm/drm_gem_vram_helper.c
> > index dc7f938bfff2..123045b58fec 100644
> > --- a/drivers/gpu/drm/drm_gem_vram_helper.c
> > +++ b/drivers/gpu/drm/drm_gem_vram_helper.c
> > @@ -867,7 +867,7 @@ static struct ttm_tt *bo_driver_ttm_tt_create(struct 
> > ttm_buffer_object *bo,
> > if (!tt)
> > return NULL;
> > -   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
> > +   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
> > if (ret < 0)
> > goto err_ttm_tt_init;
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
> > b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > index e4a06fcf741a..3b9f99c765c4 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> > @@ -290,7 +290,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
> > ttm_buffer_object *bo,
> > i915_tt->is_shmem = true;
> > }
> > -   ret = ttm_tt_init(_tt->ttm, bo, page_flags, caching);
> > +   ret = ttm_tt_init(_tt->ttm, bo, page_flags, caching, 0);
> > if (ret)
> > goto err_free;
> > diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
> > index b2e33d5ba5d0..52156b54498f 100644
> > --- a/drivers/gpu/drm/qxl/qxl_ttm.c
> > +++ b/drivers/gpu/drm/qxl/qxl_ttm.c
> > @@ -113,7 +113,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct 
> > ttm_buffer_object *bo,
> > ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL);
> > if (ttm == NULL)
> > return NULL;
> > -   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) {
> > +   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached, 0)) {
> > kfree(ttm);
> > return NULL;
> > }
> > diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c 
> > b/drivers/gpu/drm/ttm/ttm_agp_backend.c
> > index 6ddc16f0fe2b..d27691f2e451 100644
> > --- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
> > +++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
> > @@ -134,7 +134,7 @@ struct ttm_tt *ttm_agp_tt_create(struct 
> > ttm_buffer_object *bo,
> > agp_be->mem = NULL;
> > agp_be->bridge = bridge;
> > -   if (ttm_tt_init(_be->ttm, bo, page_flags, ttm_write_combined)) {
> > +   if (ttm_tt_init(_be->ttm, bo, page_flags, ttm_write_combined, 0)) {
> > kfree(agp_be);
> > return NULL;
> > }
> > diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
> > index d234aab800a0..1a66d9fc589a 100644
> > --- a/drivers/gpu/drm/ttm/ttm_tt.c
> > +++ b/drivers/gpu/drm/ttm/ttm_tt.c
> > @@ -134,9 +134,10 @@ void ttm_tt_destroy(struct ttm_device *bdev, struct 
> > ttm_tt *ttm)
> >   static void ttm_tt_init_fields(struct ttm_tt *ttm,
> >struct ttm_buffer_object *bo,
> >uint32_t page_flags,
> > -  enum ttm_caching caching)
> > +  enum ttm_caching caching,
> > +  unsigned long extra_pages)
> >   {
> > -   ttm->num_pages = PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT;
> > +   ttm->num_pages = (PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT) + 
> > extra_pages;
> > ttm->caching = ttm_cached;
> > ttm->page_flags = page_flags;
> > ttm->dma_address = NULL;
> > @@ -146,9 +147,10 @@ static void ttm_tt_init_fields(struct ttm_tt *ttm,
> >   }
> >   int ttm_tt_init(struct ttm_tt *ttm, struct ttm_buffer_object *bo,
> > -   uint32_t page_flags, enum ttm_caching caching)
> > +   uint32_t page_flags, enum ttm_caching caching,
> > +   unsigned long extra_pages)
> >   {
> > -   ttm_tt_init_fields(ttm, bo, page_flags, caching);
> > +   ttm_tt_init_fields(ttm, bo, page_flags, caching, extra_pages);
> > if (ttm_tt_alloc_page_directory(ttm)) {
> > pr_err("Failed allocating page table\n");
> > @@ -180,7 +182,7 @@ int 

Re: [PATCH v4 4/8] drm/i915/selftest_migrate: Check CCS meta data clear

2022-03-21 Thread Ramalingam C
On 2022-03-21 at 16:09:08 +0530, Hellstrom, Thomas wrote:
> On Sun, 2022-03-20 at 02:12 +0530, Ramalingam C wrote:
> > While clearing the Flat-CCS capable lmem object, we need to clear the
> > CCS
> > meta data corresponding to the memory.
> >
> > As part of live_migrate_clear add check for the ccs meta data clear
> > for
> > the Flat-CCS capable lmem object.
> >
> > Signed-off-by: Ramalingam C 
> > ---
> >  drivers/gpu/drm/i915/gt/intel_migrate.c|  32 +++
> >  drivers/gpu/drm/i915/gt/selftest_migrate.c | 274 ++-
> > --
> >  2 files changed, 278 insertions(+), 28 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > index c1db8daf994a..bbfea570c239 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -572,6 +572,38 @@ static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd,
> > u64 src_addr, u64 dst_addr,
> > return cmd;
> >  }
> >
> > +static int emit_copy_ccs(struct i915_request *rq,
> > +u32 dst_offset, u8 dst_access,
> > +u32 src_offset, u8 src_access, int size)
> > +{
> > +   struct drm_i915_private *i915 = rq->engine->i915;
> > +   int mocs = rq->engine->gt->mocs.uc_index << 1;
> > +   u32 num_ccs_blks, ccs_ring_size;
> > +   u32 *cs;
> > +
> > +   ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
> > +   WARN_ON(!ccs_ring_size);
> > +
> > +   cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
> > +   if (IS_ERR(cs))
> > +   return PTR_ERR(cs);
> > +
> > +   num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
> > +   NUM_CCS_BYTES_PER_BLOCK);
> > +
> > +   cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
> > +   cs = _i915_ctrl_surf_copy_blt(cs, src_offset, dst_offset,
> > + src_access, dst_access,
> > + mocs, mocs, num_ccs_blks);
> > +   cs = i915_flush_dw(cs, MI_FLUSH_DW_LLC | MI_FLUSH_DW_CCS);
> > +   if (ccs_ring_size & 1)
> > +   *cs++ = MI_NOOP;
> > +
> > +   intel_ring_advance(rq, cs);
> > +
> > +   return 0;
> > +}
> 
> 
> This would be an unused function if selftests are not configured,
> right?
No, Thomas. This is reused between the selftest and the eviction flow. In the next
version I am reusing it for evict_clear too.

> 
> 
> > +
> >  static int emit_copy(struct i915_request *rq,
> >  u32 dst_offset, u32 src_offset, int size)
> >  {
> > diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > index b5da8b8cd039..e32cc994f4a2 100644
> > --- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > +++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> > @@ -132,6 +132,126 @@ static int copy(struct intel_migrate *migrate,
> > return err;
> >  }
> >
> > +static int intel_context_copy_ccs(struct intel_context *ce,
> > + const struct i915_deps *deps,
> > + struct scatterlist *sg,
> > + enum i915_cache_level cache_level,
> > + bool write_to_ccs,
> > + struct i915_request **out)
> > +{
> > +   u8 src_access = write_to_ccs ? DIRECT_ACCESS :
> > INDIRECT_ACCESS;
> > +   u8 dst_access = write_to_ccs ? INDIRECT_ACCESS :
> > DIRECT_ACCESS;
> > +   struct sgt_dma it = sg_sgt(sg);
> > +   struct i915_request *rq;
> > +   u32 offset;
> > +   int err;
> > +
> > +   GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
> > +   *out = NULL;
> > +
> > +   GEM_BUG_ON(ce->ring->size < SZ_64K);
> > +
> > +   offset = 0;
> > +   if (HAS_64K_PAGES(ce->engine->i915))
> > +   offset = CHUNK_SZ;
> > +   offset += (u64)rq->engine->instance << 32;
> > +
> > +   do {
> > +   int len;
> > +
> > +   rq = i915_request_create(ce);
> > +   if (IS_ERR(rq)) {
> > +   err = PTR_ERR(rq);
> > +   goto out_ce;
> > +   }
> > +
> > +   if (deps) {
> > +   err = i915_request_await_deps(rq, deps);
> > +   if (err)
> > +   goto out_rq;
> > +
> > +   if (rq->engine->emit_init_breadcrumb) {
> > +   err = rq->engine-
> > >emit_init_breadcrumb(rq);
> > +   if (err)
> > +   goto out_rq;
> > +   }
> > +
> > +   deps = NULL;
> > +   }
> > +
> > +   /* The PTE updates + clear must not be interrupted.
> > */
> > +   err = emit_no_arbitration(rq);
> > +   if (err)
> > +   goto out_rq;

Re: [PATCH v2 5/7] drm/i915/selftests: use the memcpy_from_wc call from the drm

2022-03-21 Thread Lucas De Marchi

+Thomas Zimmermann and +Daniel Vetter

Could you take a look below regarding the I/O to I/O memory access?

On Thu, Mar 03, 2022 at 11:30:11PM +0530, Balasubramani Vivekanandan wrote:

memcpy_from_wc functions in i915_memcpy.c will be removed and replaced
by the implementation in drm_cache.c.
Updated to use the functions provided by drm_cache.c.

v2: check if the source and destination memory address is from local
   memory or system memory and initialize the iosys_map accordingly
   (Lucas)

Cc: Lucas De Marchi 
Cc: Matthew Auld 
Cc: Thomas Hellström 

Signed-off-by: Balasubramani Vivekanandan 
---
.../drm/i915/selftests/intel_memory_region.c  | 41 +--
1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_memory_region.c 
b/drivers/gpu/drm/i915/selftests/intel_memory_region.c
index ba32893e0873..d16ecb905f3b 100644
--- a/drivers/gpu/drm/i915/selftests/intel_memory_region.c
+++ b/drivers/gpu/drm/i915/selftests/intel_memory_region.c
@@ -7,6 +7,7 @@
#include 

#include 
+#include 

#include "../i915_selftest.h"

@@ -1133,7 +1134,7 @@ static const char *repr_type(u32 type)

static struct drm_i915_gem_object *
create_region_for_mapping(struct intel_memory_region *mr, u64 size, u32 type,
- void **out_addr)
+ struct iosys_map *out_addr)
{
struct drm_i915_gem_object *obj;
void *addr;
@@ -1153,7 +1154,11 @@ create_region_for_mapping(struct intel_memory_region 
*mr, u64 size, u32 type,
return addr;
}

-   *out_addr = addr;
+   if (i915_gem_object_is_lmem(obj))
+   iosys_map_set_vaddr_iomem(out_addr, (void __iomem *)addr);
+   else
+   iosys_map_set_vaddr(out_addr, addr);
+
return obj;
}

@@ -1164,24 +1169,33 @@ static int wrap_ktime_compare(const void *A, const void 
*B)
return ktime_compare(*a, *b);
}

-static void igt_memcpy_long(void *dst, const void *src, size_t size)
+static void igt_memcpy_long(struct iosys_map *dst, struct iosys_map *src,
+   size_t size)
{
-   unsigned long *tmp = dst;
-   const unsigned long *s = src;
+   unsigned long *tmp = dst->is_iomem ?
+   (unsigned long __force *)dst->vaddr_iomem :
+   dst->vaddr;


if we access vaddr_iomem/vaddr we basically break the promise of
abstracting system and I/O memory. There is no point in receiving
struct iosys_map as argument and then break the abstraction.


+   const unsigned long *s = src->is_iomem ?
+   (unsigned long __force *)src->vaddr_iomem :
+   src->vaddr;

size = size / sizeof(unsigned long);
while (size--)
*tmp++ = *s++;



so we basically want to copy from one place to the other on a word
boundary. And it may be

a) I/O -> I/O or
b) system -> I/O or
c) I/O -> system

(b) and (c) should work, but AFAICS (a) is not possible with the current
iosys-map API. Not even the underlying APIs have that abstracted. Both
memcpy_fromio() and memcpy_toio() expect one of them to be RAM (system
memory)

I remember seeing people using a temporary buffer in system memory
for proxying the copy. But maybe we need an abstraction for that?
Also adding Thomas Zimmermann here for that question.

and since this is a selftest testing the performance of the memcpy from
one memory region to the other, it would be good to have this test
executed to a) make sure it still works and b) record in the commit
message any possible slow down we are incurring.

thanks
Lucas De Marchi



}

-static inline void igt_memcpy(void *dst, const void *src, size_t size)
+static inline void igt_memcpy(struct iosys_map *dst, struct iosys_map *src,
+ size_t size)
{
-   memcpy(dst, src, size);
+   memcpy(dst->is_iomem ? (void __force *)dst->vaddr_iomem : dst->vaddr,
+  src->is_iomem ? (void __force *)src->vaddr_iomem : src->vaddr,
+  size);
}

-static inline void igt_memcpy_from_wc(void *dst, const void *src, size_t size)
+static inline void igt_memcpy_from_wc(struct iosys_map *dst, struct iosys_map 
*src,
+ size_t size)
{
-   i915_memcpy_from_wc(dst, src, size);
+   drm_memcpy_from_wc(dst, src, size);
}

static int _perf_memcpy(struct intel_memory_region *src_mr,
@@ -1191,7 +1205,8 @@ static int _perf_memcpy(struct intel_memory_region 
*src_mr,
struct drm_i915_private *i915 = src_mr->i915;
const struct {
const char *name;
-   void (*copy)(void *dst, const void *src, size_t size);
+   void (*copy)(struct iosys_map *dst, struct iosys_map *src,
+size_t size);
bool skip;
} tests[] = {
{
@@ -1205,11 +1220,11 @@ static int _perf_memcpy(struct intel_memory_region 

[PATCH v5 9/9] drm/i915/migrate: Evict and restore the flatccs capable lmem obj

2022-03-21 Thread Ramalingam C
When we are swapping out the local memory obj on flat-ccs capable platform,
we need to capture the ccs data too along with main meory and we need to
restore it when we are swapping in the content.

When lmem object is swapped into a smem obj, smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So main memory of lmem will be copied into the initial
pages of the smem and then ccs data corresponding to the main memory
will be copied to the subsequent pages of smem. ccs data is 1/256 of
lmem size.

Swapin happens exactly in reverse order. First main memory of lmem is
restored from the smem's initial pages and the ccs data will be restored
from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT

v2: Fixing the ccs handling
v3: Handle the ccs data at same loop as main memory [Thomas]
v4: changes for emit_copy_ccs

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 163 +++-
 1 file changed, 159 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 5f6341f91622..22e3c079468f 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -657,6 +657,65 @@ static int emit_copy(struct i915_request *rq,
return 0;
 }
 
+static int scatter_list_length(struct scatterlist *sg)
+{
+   int len = 0;
+
+   while (sg && sg_dma_len(sg)) {
+   len += sg_dma_len(sg);
+   sg = sg_next(sg);
+   };
+
+   return len;
+}
+
+static void
+calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
+  int *src_sz, int *ccs_sz, u32 bytes_to_cpy,
+  u32 ccs_bytes_to_cpy)
+{
+   if (ccs_bytes_to_cpy) {
+   /*
+* We can only copy the ccs data corresponding to
+* the CHUNK_SZ of lmem which is
+* GET_CCS_BYTES(i915, CHUNK_SZ))
+*/
+   *ccs_sz = min_t(int, ccs_bytes_to_cpy, GET_CCS_BYTES(i915, 
CHUNK_SZ));
+
+   if (!src_is_lmem)
+   /*
+* When CHUNK_SZ is passed all the pages upto CHUNK_SZ
+* will be taken for the blt. in Flat-ccs supported
+* platform Smem obj will have more pages than required
+* for main meory hence limit it to the required size
+* for main memory
+*/
+   *src_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
+   } else { /* ccs handling is not required */
+   *src_sz = CHUNK_SZ;
+   }
+}
+
+static void get_ccs_sg_sgt(struct sgt_dma *it, u32 bytes_to_cpy)
+{
+   u32 len;
+
+   do {
+   GEM_BUG_ON(!it->sg || !sg_dma_len(it->sg));
+   len = it->max - it->dma;
+   if (len > bytes_to_cpy) {
+   it->dma += bytes_to_cpy;
+   break;
+   }
+
+   bytes_to_cpy -= len;
+
+   it->sg = __sg_next(it->sg);
+   it->dma = sg_dma_address(it->sg);
+   it->max = it->dma + sg_dma_len(it->sg);
+   } while (bytes_to_cpy);
+}
+
 int
 intel_context_migrate_copy(struct intel_context *ce,
   const struct i915_deps *deps,
@@ -668,9 +727,15 @@ intel_context_migrate_copy(struct intel_context *ce,
   bool dst_is_lmem,
   struct i915_request **out)
 {
-   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst), it_ccs;
+   struct drm_i915_private *i915 = ce->engine->i915;
+   u32 ccs_bytes_to_cpy = 0, bytes_to_cpy;
+   enum i915_cache_level ccs_cache_level;
+   int src_sz, dst_sz, ccs_sz;
u32 src_offset, dst_offset;
+   u8 src_access, dst_access;
struct i915_request *rq;
+   bool ccs_is_src;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -678,6 +743,38 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_sz = scatter_list_length(src);
+   bytes_to_cpy = src_sz;
+
+   if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
+   src_access = !src_is_lmem && dst_is_lmem;
+   dst_access = !src_access;
+
+   dst_sz = scatter_list_length(dst);
+   if (src_is_lmem) {
+   it_ccs = it_dst;
+   ccs_cache_level = dst_cache_level;
+   ccs_is_src = false;
+   } else if (dst_is_lmem) {
+   bytes_to_cpy = dst_sz;
+   it_ccs = it_src;
+   ccs_cache_level = src_cache_level;
+  

[PATCH v5 8/9] drm/i915/gem: Add extra pages in ttm_tt for ccs data

2022-03-21 Thread Ramalingam C
On Xe-HP and later devices, dedicated compression control state (CCS)
stored in local memory is used for each surface, to support the
3D and media compression formats.

The memory required for the CCS of the entire local memory is 1/256 of
the local memory size. So before the kernel boot, the required memory
is reserved for the CCS data and a secure register will be programmed
with the CCS base address.

So when an object is allocated in local memory, there is no need to
explicitly allocate space for the ccs data. But when the obj is evicted
into smem, extra space is needed in smem to hold the compression-related
data along with the obj, i.e. obj_size + (obj_size/256).

Hence when smem pages are allocated for an obj with a possible lmem
placement, we allocate them with the extra pages required for the ccs
data of that obj size.

v2:
  Used imperative wording [Thomas]
v3:
  Inflate the pages only when obj's placement is lmem only

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 29 -
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index 3b9f99c765c4..0305a150b9d4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -20,6 +20,7 @@
 #include "gem/i915_gem_ttm.h"
 #include "gem/i915_gem_ttm_move.h"
 #include "gem/i915_gem_ttm_pm.h"
+#include "gt/intel_gpu_commands.h"
 
 #define I915_TTM_PRIO_PURGE 0
 #define I915_TTM_PRIO_NO_PAGES  1
@@ -262,12 +263,33 @@ static const struct i915_refct_sgt_ops tt_rsgt_ops = {
.release = i915_ttm_tt_release
 };
 
+static inline bool
+i915_gem_object_needs_ccs_pages(struct drm_i915_gem_object *obj)
+{
+   bool lmem_placement = false;
+   int i;
+
+   for (i = 0; i < obj->mm.n_placements; i++) {
+   /* Compression is not allowed for the objects with smem 
placement */
+   if (obj->mm.placements[i]->type == INTEL_MEMORY_SYSTEM)
+   return false;
+   if (!lmem_placement &&
+   obj->mm.placements[i]->type == INTEL_MEMORY_LOCAL)
+   lmem_placement = true;
+   }
+
+   return lmem_placement;
+}
+
 static struct ttm_tt *i915_ttm_tt_create(struct ttm_buffer_object *bo,
 uint32_t page_flags)
 {
+   struct drm_i915_private *i915 = container_of(bo->bdev, typeof(*i915),
+bdev);
struct ttm_resource_manager *man =
ttm_manager_type(bo->bdev, bo->resource->mem_type);
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
+   unsigned long ccs_pages = 0;
enum ttm_caching caching;
struct i915_ttm_tt *i915_tt;
int ret;
@@ -290,7 +312,12 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(_tt->ttm, bo, page_flags, caching, 0);
+   if (HAS_FLAT_CCS(i915) && i915_gem_object_needs_ccs_pages(obj))
+   ccs_pages = DIV_ROUND_UP(DIV_ROUND_UP(bo->base.size,
+ NUM_BYTES_PER_CCS_BYTE),
+PAGE_SIZE);
+
+   ret = ttm_tt_init(_tt->ttm, bo, page_flags, caching, ccs_pages);
if (ret)
goto err_free;
 
-- 
2.20.1



[PATCH v5 7/9] drm/ttm: Add a parameter to add extra pages into ttm_tt

2022-03-21 Thread Ramalingam C
Add a parameter called "extra_pages" to ttm_tt_init, to indicate that
the driver needs extra pages in ttm_tt.

v2:
  Used imperative wording [Thomas and Christian]

Signed-off-by: Ramalingam C 
cc: Christian Koenig 
cc: Hellstrom Thomas 
Reviewed-by: Thomas Hellstrom 
Reviewed-by: Christian Konig 
Reviewed-by: Nirmoy Das 
---
 drivers/gpu/drm/drm_gem_vram_helper.c  |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c|  2 +-
 drivers/gpu/drm/qxl/qxl_ttm.c  |  2 +-
 drivers/gpu/drm/ttm/ttm_agp_backend.c  |  2 +-
 drivers/gpu/drm/ttm/ttm_tt.c   | 12 +++-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c |  2 +-
 include/drm/ttm/ttm_tt.h   |  4 +++-
 7 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/drm_gem_vram_helper.c 
b/drivers/gpu/drm/drm_gem_vram_helper.c
index dc7f938bfff2..123045b58fec 100644
--- a/drivers/gpu/drm/drm_gem_vram_helper.c
+++ b/drivers/gpu/drm/drm_gem_vram_helper.c
@@ -867,7 +867,7 @@ static struct ttm_tt *bo_driver_ttm_tt_create(struct 
ttm_buffer_object *bo,
if (!tt)
return NULL;
 
-   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached);
+   ret = ttm_tt_init(tt, bo, page_flags, ttm_cached, 0);
if (ret < 0)
goto err_ttm_tt_init;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index e4a06fcf741a..3b9f99c765c4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -290,7 +290,7 @@ static struct ttm_tt *i915_ttm_tt_create(struct 
ttm_buffer_object *bo,
i915_tt->is_shmem = true;
}
 
-   ret = ttm_tt_init(_tt->ttm, bo, page_flags, caching);
+   ret = ttm_tt_init(_tt->ttm, bo, page_flags, caching, 0);
if (ret)
goto err_free;
 
diff --git a/drivers/gpu/drm/qxl/qxl_ttm.c b/drivers/gpu/drm/qxl/qxl_ttm.c
index b2e33d5ba5d0..52156b54498f 100644
--- a/drivers/gpu/drm/qxl/qxl_ttm.c
+++ b/drivers/gpu/drm/qxl/qxl_ttm.c
@@ -113,7 +113,7 @@ static struct ttm_tt *qxl_ttm_tt_create(struct 
ttm_buffer_object *bo,
ttm = kzalloc(sizeof(struct ttm_tt), GFP_KERNEL);
if (ttm == NULL)
return NULL;
-   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached)) {
+   if (ttm_tt_init(ttm, bo, page_flags, ttm_cached, 0)) {
kfree(ttm);
return NULL;
}
diff --git a/drivers/gpu/drm/ttm/ttm_agp_backend.c 
b/drivers/gpu/drm/ttm/ttm_agp_backend.c
index 6ddc16f0fe2b..d27691f2e451 100644
--- a/drivers/gpu/drm/ttm/ttm_agp_backend.c
+++ b/drivers/gpu/drm/ttm/ttm_agp_backend.c
@@ -134,7 +134,7 @@ struct ttm_tt *ttm_agp_tt_create(struct ttm_buffer_object 
*bo,
agp_be->mem = NULL;
agp_be->bridge = bridge;
 
-   if (ttm_tt_init(_be->ttm, bo, page_flags, ttm_write_combined)) {
+   if (ttm_tt_init(_be->ttm, bo, page_flags, ttm_write_combined, 0)) {
kfree(agp_be);
return NULL;
}
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index d234aab800a0..1a66d9fc589a 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -134,9 +134,10 @@ void ttm_tt_destroy(struct ttm_device *bdev, struct ttm_tt 
*ttm)
 static void ttm_tt_init_fields(struct ttm_tt *ttm,
   struct ttm_buffer_object *bo,
   uint32_t page_flags,
-  enum ttm_caching caching)
+  enum ttm_caching caching,
+  unsigned long extra_pages)
 {
-   ttm->num_pages = PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT;
+   ttm->num_pages = (PAGE_ALIGN(bo->base.size) >> PAGE_SHIFT) + 
extra_pages;
ttm->caching = ttm_cached;
ttm->page_flags = page_flags;
ttm->dma_address = NULL;
@@ -146,9 +147,10 @@ static void ttm_tt_init_fields(struct ttm_tt *ttm,
 }
 
 int ttm_tt_init(struct ttm_tt *ttm, struct ttm_buffer_object *bo,
-   uint32_t page_flags, enum ttm_caching caching)
+   uint32_t page_flags, enum ttm_caching caching,
+   unsigned long extra_pages)
 {
-   ttm_tt_init_fields(ttm, bo, page_flags, caching);
+   ttm_tt_init_fields(ttm, bo, page_flags, caching, extra_pages);
 
if (ttm_tt_alloc_page_directory(ttm)) {
pr_err("Failed allocating page table\n");
@@ -180,7 +182,7 @@ int ttm_sg_tt_init(struct ttm_tt *ttm, struct 
ttm_buffer_object *bo,
 {
int ret;
 
-   ttm_tt_init_fields(ttm, bo, page_flags, caching);
+   ttm_tt_init_fields(ttm, bo, page_flags, caching, 0);
 
if (page_flags & TTM_TT_FLAG_EXTERNAL)
ret = ttm_sg_tt_alloc_page_directory(ttm);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c 
b/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
index b84ecc6d6611..4e3938e62c08 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c
+++ 

[PATCH v5 6/9] drm/i915/gt: offset handling for multiple copy engines

2022-03-21 Thread Ramalingam C
Handle the src and dst chunk offsets for different instances of the copy
engines.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 39a5f8ae664d..5f6341f91622 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -614,6 +614,9 @@ static int emit_copy(struct i915_request *rq,
u32 instance = rq->engine->instance;
u32 *cs;
 
+   src_offset += (u64)rq->engine->instance << 32;
+   dst_offset += (u64)rq->engine->instance << 32;
+
cs = intel_ring_begin(rq, ver >= 8 ? 10 : 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
-- 
2.20.1



[PATCH v5 4/9] drm/i915/selftest_migrate: Consider the possible roundup of size

2022-03-21 Thread Ramalingam C
Account for the possible round-up that happens when the obj size is
aligned to min_page_size during the obj allocation.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index c9c4f391c5cc..b5da8b8cd039 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -152,6 +152,9 @@ static int clear(struct intel_migrate *migrate,
if (IS_ERR(obj))
return 0;
 
+   /* Consider the rounded up memory too */
+   sz = obj->base.size;
+
for_i915_gem_ww(, err, true) {
err = i915_gem_object_lock(obj, );
if (err)
-- 
2.20.1



[PATCH v5 3/9] drm/i915/gt: Clear compress metadata for Flat-ccs objects

2022-03-21 Thread Ramalingam C
Xe-HP and later devices support Flat CCS, which reserves a portion of
the device memory to store compression metadata. While clearing a
device memory buffer object we also need to clear the associated
CCS buffer.

XY_CTRL_SURF_COPY_BLT is a BLT cmd used for reading and writing the
ccs surface of lmem. So on Flat-CCS capable platforms we use
XY_CTRL_SURF_COPY_BLT to clear the CCS metadata.

v2: Fixed issues with platform naming [Lucas]
v3: Rebased [Ram]
Used the round_up funcs [Bob]
v4: Fixed ccs blk calculation [Ram]
Added Kdoc on flat-ccs.
v5: GENMASK is used [Matt]
mocs fix [Matt]
Comments Fix [Matt]
Flush address programming [Ram]
v6: FLUSH_DW is fixed
Few coding style fix
v7: Adopting the XY_FAST_COLOR_BLT [Thomas]
v8: XY_CTRL_SURF_COPY_BLT for ccs clearing.
v9: emit_copy_ccs is used.

Signed-off-by: Ramalingam C 
Signed-off-by: Ayaz A Siddiqui 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  15 ++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 164 ++-
 2 files changed, 175 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 925e55b6a94f..6b4eb7927ec7 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -153,8 +153,10 @@
 #define   MI_FLUSH_DW_PROTECTED_MEM_EN (1 << 22)
 #define   MI_FLUSH_DW_STORE_INDEX  (1<<21)
 #define   MI_INVALIDATE_TLB(1<<18)
+#define   MI_FLUSH_DW_CCS  (1<<16)
 #define   MI_FLUSH_DW_OP_STOREDW   (1<<14)
 #define   MI_FLUSH_DW_OP_MASK  (3<<14)
+#define   MI_FLUSH_DW_LLC  (1<<9)
 #define   MI_FLUSH_DW_NOTIFY   (1<<8)
 #define   MI_INVALIDATE_BSD(1<<7)
 #define   MI_FLUSH_DW_USE_GTT  (1<<2)
@@ -203,6 +205,19 @@
 #define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
 #define GFX_OP_DRAWRECT_INFO_I965  ((0x7900<<16)|0x2)
 
+#define XY_CTRL_SURF_INSTR_SIZE5
+#define MI_FLUSH_DW_SIZE   3
+#define XY_CTRL_SURF_COPY_BLT  ((2 << 29) | (0x48 << 22) | 3)
+#define   SRC_ACCESS_TYPE_SHIFT21
+#define   DST_ACCESS_TYPE_SHIFT20
+#define   CCS_SIZE_MASKGENMASK(17, 8)
+#define   XY_CTRL_SURF_MOCS_MASK   GENMASK(31, 25)
+#define   NUM_CCS_BYTES_PER_BLOCK  256
+#define   NUM_BYTES_PER_CCS_BYTE   256
+#define   NUM_CCS_BLKS_PER_XFER1024
+#define   INDIRECT_ACCESS  0
+#define   DIRECT_ACCESS1
+
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
 #define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index b656685a486d..39a5f8ae664d 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -16,7 +16,8 @@ struct insert_pte_data {
 };
 
 #define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
-
+#define GET_CCS_BYTES(i915, size)  (HAS_FLAT_CCS(i915) ? \
+DIV_ROUND_UP(size, 
NUM_BYTES_PER_CCS_BYTE) : 0)
 static bool engine_supports_migration(struct intel_engine_cs *engine)
 {
if (!engine)
@@ -467,6 +468,145 @@ static bool wa_1209644611_applies(int ver, u32 size)
return height % 4 == 3 && height <= 8;
 }
 
+/**
+ * DOC: Flat-CCS - Memory compression for Local memory
+ *
+ * On Xe-HP and later devices, we use dedicated compression control state (CCS)
+ * stored in local memory for each surface, to support the 3D and media
+ * compression formats.
+ *
+ * The memory required for the CCS of the entire local memory is 1/256 of the
+ * local memory size. So before the kernel boot, the required memory is 
reserved
+ * for the CCS data and a secure register will be programmed with the CCS base
+ * address.
+ *
+ * Flat CCS data needs to be cleared when a lmem object is allocated.
+ * And CCS data can be copied in and out of CCS region through
+ * XY_CTRL_SURF_COPY_BLT. CPU can't access the CCS data directly.
+ *
+ * When we exhaust the lmem, if the object's placements support smem, then we 
can
+ * directly decompress the compressed lmem object into smem and start using it
+ * from smem itself.
+ *
+ * But when we need to swapout the compressed lmem object into a smem region
+ * though objects' placement doesn't support smem, then we copy the lmem 
content
+ * as it is into smem region along with ccs data (using XY_CTRL_SURF_COPY_BLT).
+ * When the object is referred, lmem content will be swaped in along with
+ * restoration of the CCS data (using XY_CTRL_SURF_COPY_BLT) at corresponding
+ * location.
+ */
+
+static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
+{
+   *cmd++ = MI_FLUSH_DW | flags;
+   *cmd++ = 0;
+   *cmd++ = 0;
+
+   return cmd;

[PATCH v5 5/9] drm/i915/selftest_migrate: Check CCS meta data clear

2022-03-21 Thread Ramalingam C
Extend the live migrate selftest, to verify the ccs surface clearing
during the Flat-CCS capable lmem obj clear.

v2:
  Look at right places for ccs data [Thomas]

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/selftest_migrate.c | 250 ++---
 1 file changed, 222 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c 
b/drivers/gpu/drm/i915/gt/selftest_migrate.c
index b5da8b8cd039..8cd9a22054f3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_migrate.c
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -132,6 +132,124 @@ static int copy(struct intel_migrate *migrate,
return err;
 }
 
+static int intel_context_copy_ccs(struct intel_context *ce,
+ const struct i915_deps *deps,
+ struct scatterlist *sg,
+ enum i915_cache_level cache_level,
+ bool write_to_ccs,
+ struct i915_request **out)
+{
+   u8 src_access = write_to_ccs ? DIRECT_ACCESS : INDIRECT_ACCESS;
+   u8 dst_access = write_to_ccs ? INDIRECT_ACCESS : DIRECT_ACCESS;
+   struct sgt_dma it = sg_sgt(sg);
+   struct i915_request *rq;
+   u32 offset;
+   int err;
+
+   GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
+   *out = NULL;
+
+   GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915))
+   offset = CHUNK_SZ;
+
+   do {
+   int len;
+
+   rq = i915_request_create(ce);
+   if (IS_ERR(rq)) {
+   err = PTR_ERR(rq);
+   goto out_ce;
+   }
+
+   if (deps) {
+   err = i915_request_await_deps(rq, deps);
+   if (err)
+   goto out_rq;
+
+   if (rq->engine->emit_init_breadcrumb) {
+   err = rq->engine->emit_init_breadcrumb(rq);
+   if (err)
+   goto out_rq;
+   }
+
+   deps = NULL;
+   }
+
+   /* The PTE updates + clear must not be interrupted. */
+   err = emit_no_arbitration(rq);
+   if (err)
+   goto out_rq;
+
+   len = emit_pte(rq, , cache_level, true, offset, CHUNK_SZ);
+   if (len <= 0) {
+   err = len;
+   goto out_rq;
+   }
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+   if (err)
+   goto out_rq;
+
+   err = emit_copy_ccs(rq, offset, dst_access,
+   offset, src_access, len);
+   if (err)
+   goto out_rq;
+
+   err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+
+   /* Arbitration is re-enabled between requests. */
+out_rq:
+   if (*out)
+   i915_request_put(*out);
+   *out = i915_request_get(rq);
+   i915_request_add(rq);
+   if (err || !it.sg || !sg_dma_len(it.sg))
+   break;
+
+   cond_resched();
+   } while (1);
+
+out_ce:
+   return err;
+}
+
+static int
+intel_migrate_ccs_copy(struct intel_migrate *m,
+  struct i915_gem_ww_ctx *ww,
+  const struct i915_deps *deps,
+  struct scatterlist *sg,
+  enum i915_cache_level cache_level,
+  bool write_to_ccs,
+  struct i915_request **out)
+{
+   struct intel_context *ce;
+   int err;
+
+   *out = NULL;
+   if (!m->context)
+   return -ENODEV;
+
+   ce = intel_migrate_create_context(m);
+   if (IS_ERR(ce))
+   ce = intel_context_get(m->context);
+   GEM_BUG_ON(IS_ERR(ce));
+
+   err = intel_context_pin_ww(ce, ww);
+   if (err)
+   goto out;
+
+   err = intel_context_copy_ccs(ce, deps, sg, cache_level,
+write_to_ccs, out);
+
+   intel_context_unpin(ce);
+out:
+   intel_context_put(ce);
+   return err;
+}
+
 static int clear(struct intel_migrate *migrate,
 int (*fn)(struct intel_migrate *migrate,
   struct i915_gem_ww_ctx *ww,
@@ -144,7 +262,8 @@ static int clear(struct intel_migrate *migrate,
struct drm_i915_gem_object *obj;
struct i915_request *rq;
struct i915_gem_ww_ctx ww;
-   u32 *vaddr;
+   u32 *vaddr, val = 0;
+   bool ccs_cap = false;
int err = 0;
int i;
 
@@ -155,7 +274,12 @@ static int clear(struct intel_migrate *migrate,
/* Consider the rounded up memory too */
sz = obj->base.size;
 
+   if 

[PATCH v5 1/9] drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+

2022-03-21 Thread Ramalingam C
Use the faster XY_FAST_COLOR_BLT cmd on graphics version 12 and later
for clearing (zeroing out) the pages of a newly allocated object.

XY_FAST_COLOR_BLT is faster than the older XY_COLOR_BLT.

v2:
  Typo fix at title [Thomas]

Signed-off-by: Ramalingam C 
Signed-off-by: Chris Wilson 
Reviewed-by: Thomas Hellstrom 
---
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  5 +++
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 43 +---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h 
b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index d112ffd56418..925e55b6a94f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -205,6 +205,11 @@
 
 #define COLOR_BLT_CMD  (2 << 29 | 0x40 << 22 | (5 - 2))
 #define XY_COLOR_BLT_CMD   (2 << 29 | 0x50 << 22)
+#define XY_FAST_COLOR_BLT_CMD  (2 << 29 | 0x44 << 22)
+#define   XY_FAST_COLOR_BLT_DEPTH_32   (2 << 19)
+#define   XY_FAST_COLOR_BLT_DW 16
+#define   XY_FAST_COLOR_BLT_MOCS_MASK  GENMASK(27, 21)
+#define   XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT 31
 #define SRC_COPY_BLT_CMD   (2 << 29 | 0x43 << 22)
 #define GEN9_XY_FAST_COPY_BLT_CMD  (2 << 29 | 0x42 << 22)
 #define XY_SRC_COPY_BLT_CMD(2 << 29 | 0x53 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 20444d6ceb3c..73199ebf0671 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -614,20 +614,53 @@ intel_context_migrate_copy(struct intel_context *ce,
return err;
 }
 
-static int emit_clear(struct i915_request *rq, u64 offset, int size, u32 value)
+static int emit_clear(struct i915_request *rq, u64 offset, int size,
+ u32 value, bool is_lmem)
 {
-   const int ver = GRAPHICS_VER(rq->engine->i915);
+   struct drm_i915_private *i915 = rq->engine->i915;
+   int mocs = rq->engine->gt->mocs.uc_index << 1;
+   const int ver = GRAPHICS_VER(i915);
+   int ring_sz;
u32 *cs;
 
GEM_BUG_ON(size >> PAGE_SHIFT > S16_MAX);
 
offset += (u64)rq->engine->instance << 32;
 
-   cs = intel_ring_begin(rq, ver >= 8 ? 8 : 6);
+   if (ver >= 12)
+   ring_sz = 16;
+   else if (ver >= 8)
+   ring_sz = 8;
+   else
+   ring_sz = 6;
+
+   cs = intel_ring_begin(rq, ring_sz);
if (IS_ERR(cs))
return PTR_ERR(cs);
 
-   if (ver >= 8) {
+   if (ver >= 12) {
+   *cs++ = XY_FAST_COLOR_BLT_CMD | XY_FAST_COLOR_BLT_DEPTH_32 |
+   (XY_FAST_COLOR_BLT_DW - 2);
+   *cs++ = FIELD_PREP(XY_FAST_COLOR_BLT_MOCS_MASK, mocs) |
+   (PAGE_SIZE - 1);
+   *cs++ = 0;
+   *cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+   *cs++ = lower_32_bits(offset);
+   *cs++ = upper_32_bits(offset);
+   *cs++ = !is_lmem << XY_FAST_COLOR_BLT_MEM_TYPE_SHIFT;
+   /* BG7 */
+   *cs++ = value;
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG11 */
+   *cs++ = 0;
+   *cs++ = 0;
+   /* BG13 */
+   *cs++ = 0;
+   *cs++ = 0;
+   *cs++ = 0;
+   } else if (ver >= 8) {
*cs++ = XY_COLOR_BLT_CMD | BLT_WRITE_RGBA | (7 - 2);
*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | PAGE_SIZE;
*cs++ = 0;
@@ -711,7 +744,7 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   err = emit_clear(rq, offset, len, value);
+   err = emit_clear(rq, offset, len, value, is_lmem);
 
/* Arbitration is re-enabled between requests. */
 out_rq:
-- 
2.20.1



[PATCH v5 2/9] drm/i915/gt: Optimize the migration and clear loop

2022-03-21 Thread Ramalingam C
Move the static calculations out of the loops for copy and clear.

Signed-off-by: Ramalingam C 
---
 drivers/gpu/drm/i915/gt/intel_migrate.c | 44 -
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c 
b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 73199ebf0671..b656685a486d 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -526,6 +526,7 @@ intel_context_migrate_copy(struct intel_context *ce,
   struct i915_request **out)
 {
struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+   u32 src_offset, dst_offset;
struct i915_request *rq;
int err;
 
@@ -534,8 +535,20 @@ intel_context_migrate_copy(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   src_offset = 0;
+   dst_offset = CHUNK_SZ;
+   if (HAS_64K_PAGES(ce->engine->i915)) {
+   GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
+
+   src_offset = 0;
+   dst_offset = 0;
+   if (src_is_lmem)
+   src_offset = CHUNK_SZ;
+   if (dst_is_lmem)
+   dst_offset = 2 * CHUNK_SZ;
+   }
+
do {
-   u32 src_offset, dst_offset;
int len;
 
rq = i915_request_create(ce);
@@ -563,19 +576,6 @@ intel_context_migrate_copy(struct intel_context *ce,
if (err)
goto out_rq;
 
-   src_offset = 0;
-   dst_offset = CHUNK_SZ;
-   if (HAS_64K_PAGES(ce->engine->i915)) {
-   GEM_BUG_ON(!src_is_lmem && !dst_is_lmem);
-
-   src_offset = 0;
-   dst_offset = 0;
-   if (src_is_lmem)
-   src_offset = CHUNK_SZ;
-   if (dst_is_lmem)
-   dst_offset = 2 * CHUNK_SZ;
-   }
-
len = emit_pte(rq, _src, src_cache_level, src_is_lmem,
   src_offset, CHUNK_SZ);
if (len <= 0) {
@@ -585,12 +585,10 @@ intel_context_migrate_copy(struct intel_context *ce,
 
err = emit_pte(rq, _dst, dst_cache_level, dst_is_lmem,
   dst_offset, len);
-   if (err < 0)
-   goto out_rq;
-   if (err < len) {
+   if (err < len)
err = -EINVAL;
+   if (err < 0)
goto out_rq;
-   }
 
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
@@ -694,6 +692,7 @@ intel_context_migrate_clear(struct intel_context *ce,
 {
struct sgt_dma it = sg_sgt(sg);
struct i915_request *rq;
+   u32 offset;
int err;
 
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -701,8 +700,11 @@ intel_context_migrate_clear(struct intel_context *ce,
 
GEM_BUG_ON(ce->ring->size < SZ_64K);
 
+   offset = 0;
+   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
+   offset = CHUNK_SZ;
+
do {
-   u32 offset;
int len;
 
rq = i915_request_create(ce);
@@ -730,10 +732,6 @@ intel_context_migrate_clear(struct intel_context *ce,
if (err)
goto out_rq;
 
-   offset = 0;
-   if (HAS_64K_PAGES(ce->engine->i915) && is_lmem)
-   offset = CHUNK_SZ;
-
len = emit_pte(rq, , cache_level, is_lmem, offset, CHUNK_SZ);
if (len <= 0) {
err = len;
-- 
2.20.1



[PATCH v5 0/9] drm/i915/ttm: Evict and restore of compressed object

2022-03-21 Thread Ramalingam C
On Xe-HP and later devices, we use dedicated compression control
state (CCS) stored in local memory for each surface, to support
the 3D and media compression formats.

The memory required for the CCS of the entire local memory is
1/256 of the local memory size. So before the kernel
boot, the required memory is reserved for the CCS data and a
secure register will be programmed with the CCS base address.

So when we allocate an object in local memory we don't need to explicitly
allocate space for the ccs data. But when we evict the obj into smem,
to hold the compression-related data along with the obj we need smem
space of obj_size + (obj_size/256).

Hence when we create smem for an obj with lmem placement possibility we
create with the extra space.

When we are swapping out a local memory obj on a flat-ccs capable platform,
we need to capture the ccs data along with the main memory, and we need to
restore it when we are swapping the content back in.

When lmem object is swapped into a smem obj, smem obj will
have the extra pages required to hold the ccs data corresponding to the
lmem main memory. So main memory of lmem will be copied into the initial
pages of the smem and then ccs data corresponding to the main memory
will be copied to the subsequent pages of smem.

Swapin happens exactly in reverse order. First main memory of lmem is
restored from the smem's initial pages and the ccs data will be restored
from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special cmd called
XY_CTRL_SURF_COPY_BLT.

v5:
  Correct locating of ccs data [Thomas]
  Reuse of emit_copy_ccs.
  R-bs picked.

Test-with: 20220314051432.15785-1-ramalinga...@intel.com

Ramalingam C (9):
  drm/i915/gt: Use XY_FAST_COLOR_BLT to clear obj on graphics ver 12+
  drm/i915/gt: Optimize the migration and clear loop
  drm/i915/gt: Clear compress metadata for Flat-ccs objects
  drm/i915/selftest_migrate: Consider the possible roundup of size
  drm/i915/selftest_migrate: Check CCS meta data clear
  drm/i915/gt: offset handling for multiple copy engines
  drm/ttm: Add a parameter to add extra pages into ttm_tt
  drm/i915/gem: Add extra pages in ttm_tt for ccs data
  drm/i915/migrate: Evict and restore the flatccs capable lmem obj

 drivers/gpu/drm/drm_gem_vram_helper.c|   2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c  |  29 +-
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h |  20 +
 drivers/gpu/drm/i915/gt/intel_migrate.c  | 409 +--
 drivers/gpu/drm/i915/gt/selftest_migrate.c   | 253 ++--
 drivers/gpu/drm/qxl/qxl_ttm.c|   2 +-
 drivers/gpu/drm/ttm/ttm_agp_backend.c|   2 +-
 drivers/gpu/drm/ttm/ttm_tt.c |  12 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_buffer.c   |   2 +-
 include/drm/ttm/ttm_tt.h |   4 +-
 10 files changed, 664 insertions(+), 71 deletions(-)

-- 
2.20.1



[PATCH v13 4/5] drm/i915/: Re-work clflush_write32

2022-03-21 Thread Michael Cheng
Use drm_clflush_virt_range instead of clflushopt and remove the memory
barrier, since drm_clflush_virt_range takes care of that.

v2(Michael Cheng): Use sizeof(*addr) instead of sizeof(addr) to get the
   actual size of the page. Thanks to Matt Roper for
   pointing this out.

Signed-off-by: Michael Cheng 
Reviewed-by: Matt Roper 
---
 drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 631bc268e7c8..42a49fd2f2ab 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1334,10 +1334,8 @@ static void *reloc_vaddr(struct i915_vma *vma,
 static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 {
if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
-   if (flushes & CLFLUSH_BEFORE) {
-   clflushopt(addr);
-   mb();
-   }
+   if (flushes & CLFLUSH_BEFORE)
+   drm_clflush_virt_range(addr, sizeof(*addr));
 
*addr = value;
 
@@ -1349,7 +1347,7 @@ static void clflush_write32(u32 *addr, u32 value, 
unsigned int flushes)
 * to ensure ordering of clflush wrt to the system.
 */
if (flushes & CLFLUSH_AFTER)
-   clflushopt(addr);
+   drm_clflush_virt_range(addr, sizeof(*addr));
} else
*addr = value;
 }
-- 
2.25.1



[PATCH v13 3/5] drm/i915/gt: Re-work reset_csb

2022-03-21 Thread Michael Cheng
Use drm_clflush_virt_range instead of directly invoking clflush. This
will prevent compiler errors when building for non-x86 architectures.

v2(Michael Cheng): Remove extra clflush

v3(Michael Cheng): Remove memory barrier since drm_clflush_virt_range
   takes care of it.

v4(Michael Cheng): Get the size of value and not the size of the pointer
   when passing in execlists->csb_write. Thanks to Matt
   Roper for pointing this out.

Signed-off-by: Michael Cheng 
Reviewed-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_execlists_submission.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 5f8cf4942f07..46a2087a66de 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2953,9 +2953,8 @@ reset_csb(struct intel_engine_cs *engine, struct 
i915_request **inactive)
 {
struct intel_engine_execlists * const execlists = >execlists;
 
-   mb(); /* paranoia: read the CSB pointers from after the reset */
-   clflush(execlists->csb_write);
-   mb();
+   drm_clflush_virt_range(execlists->csb_write,
+  sizeof(execlists->csb_write[0]));
 
inactive = process_csb(engine, inactive); /* drain preemption events */
 
-- 
2.25.1



[PATCH v13 5/5] drm/i915/gt: replace cache_clflush_range

2022-03-21 Thread Michael Cheng
Replace all occurrences of clflush_cache_range with drm_clflush_virt_range.
This will prevent compile errors on non-x86 platforms.

Signed-off-by: Michael Cheng 
Reviewed-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 12 ++--
 drivers/gpu/drm/i915/gt/intel_execlists_submission.c |  2 +-
 drivers/gpu/drm/i915/gt/intel_gtt.c  |  2 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c|  2 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c|  2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c 
b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index f574da00eff1..c7bd5d71b03e 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -454,11 +454,11 @@ gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt,
pd = pdp->entry[gen8_pd_index(idx, 2)];
}
 
-   clflush_cache_range(vaddr, PAGE_SIZE);
+   drm_clflush_virt_range(vaddr, PAGE_SIZE);
vaddr = px_vaddr(i915_pt_entry(pd, gen8_pd_index(idx, 
1)));
}
} while (1);
-   clflush_cache_range(vaddr, PAGE_SIZE);
+   drm_clflush_virt_range(vaddr, PAGE_SIZE);
 
return idx;
 }
@@ -631,7 +631,7 @@ static void gen8_ppgtt_insert_huge(struct 
i915_address_space *vm,
}
} while (rem >= page_size && index < I915_PDES);
 
-   clflush_cache_range(vaddr, PAGE_SIZE);
+   drm_clflush_virt_range(vaddr, PAGE_SIZE);
 
/*
 * Is it safe to mark the 2M block as 64K? -- Either we have
@@ -647,7 +647,7 @@ static void gen8_ppgtt_insert_huge(struct 
i915_address_space *vm,
  I915_GTT_PAGE_SIZE_2M {
vaddr = px_vaddr(pd);
vaddr[maybe_64K] |= GEN8_PDE_IPS_64K;
-   clflush_cache_range(vaddr, PAGE_SIZE);
+   drm_clflush_virt_range(vaddr, PAGE_SIZE);
page_size = I915_GTT_PAGE_SIZE_64K;
 
/*
@@ -668,7 +668,7 @@ static void gen8_ppgtt_insert_huge(struct 
i915_address_space *vm,
for (i = 1; i < index; i += 16)
memset64(vaddr + i, encode, 15);
 
-   clflush_cache_range(vaddr, PAGE_SIZE);
+   drm_clflush_virt_range(vaddr, PAGE_SIZE);
}
}
 
@@ -722,7 +722,7 @@ static void gen8_ppgtt_insert_entry(struct 
i915_address_space *vm,
 
vaddr = px_vaddr(pt);
vaddr[gen8_pd_index(idx, 0)] = gen8_pte_encode(addr, level, flags);
-   clflush_cache_range([gen8_pd_index(idx, 0)], sizeof(*vaddr));
+   drm_clflush_virt_range([gen8_pd_index(idx, 0)], sizeof(*vaddr));
 }
 
 static void __xehpsdv_ppgtt_insert_entry_lm(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 46a2087a66de..68a8160ce3e3 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2828,7 +2828,7 @@ static void execlists_sanitize(struct intel_engine_cs 
*engine)
sanitize_hwsp(engine);
 
/* And scrub the dirty cachelines for the HWSP */
-   clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
+   drm_clflush_virt_range(engine->status_page.addr, PAGE_SIZE);
 
intel_engine_reset_pinned_contexts(engine);
 }
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index aed6de2d5a79..719fd31eee80 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -298,7 +298,7 @@ fill_page_dma(struct drm_i915_gem_object *p, const u64 val, 
unsigned int count)
void *vaddr = __px_vaddr(p);
 
memset64(vaddr, val, count);
-   clflush_cache_range(vaddr, PAGE_SIZE);
+   drm_clflush_virt_range(vaddr, PAGE_SIZE);
 }
 
 static void poison_scratch_page(struct drm_i915_gem_object *scratch)
diff --git a/drivers/gpu/drm/i915/gt/intel_ppgtt.c 
b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
index d91e2beb7517..d8b94d638559 100644
--- a/drivers/gpu/drm/i915/gt/intel_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ppgtt.c
@@ -91,7 +91,7 @@ write_dma_entry(struct drm_i915_gem_object * const pdma,
u64 * const vaddr = __px_vaddr(pdma);
 
vaddr[idx] = encoded_entry;
-   clflush_cache_range([idx], sizeof(u64));
+   drm_clflush_virt_range([idx], sizeof(u64));
 }
 
 void
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 9ec03234d2c2..42c9e8b7bf42 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ 

[PATCH v13 2/5] drm/i915/gt: Drop invalidate_csb_entries

2022-03-21 Thread Michael Cheng
Drop invalidate_csb_entries and directly call drm_clflush_virt_range.
This allows for one less function call, and prevents compiler errors when
building for non-x86 architectures.

v2(Michael Cheng): Drop invalidate_csb_entries function and directly
   invoke drm_clflush_virt_range. Thanks to Tvrtko for the
   suggestion.

v3(Michael Cheng): Use correct parameters for drm_clflush_virt_range.
   Thanks to Tvrtko for pointing this out.

v4(Michael Cheng): Simplify &execlists->csb_status[0] to
   execlists->csb_status. Thanks to Matt Roper for the
   suggestion.

Signed-off-by: Michael Cheng 
Reviewed-by: Matt Roper 
---
 .../gpu/drm/i915/gt/intel_execlists_submission.c| 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index e1470bb60f34..5f8cf4942f07 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1651,12 +1651,6 @@ cancel_port_requests(struct intel_engine_execlists * 
const execlists,
return inactive;
 }
 
-static void invalidate_csb_entries(const u64 *first, const u64 *last)
-{
-   clflush((void *)first);
-   clflush((void *)last);
-}
-
 /*
  * Starting with Gen12, the status has a new format:
  *
@@ -2004,7 +1998,7 @@ process_csb(struct intel_engine_cs *engine, struct 
i915_request **inactive)
 * the wash as hardware, working or not, will need to do the
 * invalidation before.
 */
-   invalidate_csb_entries([0], [num_entries - 1]);
+   drm_clflush_virt_range([0], num_entries * sizeof(buf[0]));
 
/*
 * We assume that any event reflects a change in context flow
@@ -2788,8 +2782,9 @@ static void reset_csb_pointers(struct intel_engine_cs 
*engine)
 
/* Check that the GPU does indeed update the CSB entries! */
memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
-   invalidate_csb_entries(>csb_status[0],
-  >csb_status[reset_value]);
+   drm_clflush_virt_range(execlists->csb_status,
+  execlists->csb_size *
+  sizeof(execlists->csb_status));
 
/* Once more for luck and our trusty paranoia */
ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
-- 
2.25.1



[PATCH v13 1/5] drm/i915/gt: Re-work intel_write_status_page

2022-03-21 Thread Michael Cheng
Re-work intel_write_status_page to use drm_clflush_virt_range. This
will prevent compiler errors when building for non-x86 architectures.

Signed-off-by: Michael Cheng 
Reviewed-by: Matt Roper 
---
 drivers/gpu/drm/i915/gt/intel_engine.h | 13 -
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h 
b/drivers/gpu/drm/i915/gt/intel_engine.h
index 1c0ab05c3c40..1431f1e9dbee 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -143,15 +144,9 @@ intel_write_status_page(struct intel_engine_cs *engine, 
int reg, u32 value)
 * of extra paranoia to try and ensure that the HWS takes the value
 * we give and that it doesn't end up trapped inside the CPU!
 */
-   if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
-   mb();
-   clflush(>status_page.addr[reg]);
-   engine->status_page.addr[reg] = value;
-   clflush(>status_page.addr[reg]);
-   mb();
-   } else {
-   WRITE_ONCE(engine->status_page.addr[reg], value);
-   }
+   drm_clflush_virt_range(>status_page.addr[reg], sizeof(value));
+   WRITE_ONCE(engine->status_page.addr[reg], value);
+   drm_clflush_virt_range(>status_page.addr[reg], sizeof(value));
 }
 
 /*
-- 
2.25.1



[PATCH v13 0/5] Use drm_clflush* instead of clflush

2022-03-21 Thread Michael Cheng
This patch series re-works a few i915 functions to use drm_clflush_virt_range
instead of calling clflush or clflushopt directly. This will prevent errors
when building for non-x86 architectures.

v2: s/PAGE_SIZE/sizeof(value) for Re-work intel_write_status_page and added
more patches to convert additional clflush/clflushopt to use drm_clflush*.
(Michael Cheng)

v3: Drop invalidate_csb_entries and directly invoke drm_clflush_virt_range.

v4: Remove extra memory barriers

v5: s/cache_clflush_range/drm_clflush_virt_range

v6: Fix up "Drop invalidate_csb_entries" to use correct parameters. Also
added in arm64 support for drm_clflush_virt_range.

v7: Re-order patches, and use correct macro for dcache flush for arm64.

v8: Remove ifdef for asm/cacheflush.

v9: Rebased

v10: Replaced asm/cacheflush with linux/cacheflush

v11: Correctly get the sizeof certain addresses. Also rebased to the latest.

v12: Drop include of cacheflush.h and use caches_clean_inval_pou instead of
dcache_clean_inval_poc, since it is not exported for other modules to use.

v13: Drop arm64 implementation for drm_clflush_virt_range. This series will
focus more on making i915 more architecture neutral by abstracting all clflush
and clflushopt to the drm layer.

Michael Cheng (5):
  drm/i915/gt: Re-work intel_write_status_page
  drm/i915/gt: Drop invalidate_csb_entries
  drm/i915/gt: Re-work reset_csb
  drm/i915/: Re-work clflush_write32
  drm/i915/gt: replace cache_clflush_range

 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  8 +++-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 12 +--
 drivers/gpu/drm/i915/gt/intel_engine.h| 13 
 .../drm/i915/gt/intel_execlists_submission.c  | 20 +++
 drivers/gpu/drm/i915/gt/intel_gtt.c   |  2 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |  2 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  2 +-
 7 files changed, 23 insertions(+), 36 deletions(-)

-- 
2.25.1



Re: [PATCH 00/22] drm: Review of mode copies

2022-03-21 Thread Ville Syrjälä
On Tue, Mar 15, 2022 at 02:52:38PM -0400, Alex Deucher wrote:
> On Mon, Mar 14, 2022 at 6:12 PM Ville Syrjälä
>  wrote:
> >
> > On Fri, Feb 18, 2022 at 12:03:41PM +0200, Ville Syrjala wrote:
> > >   drm: Add drm_mode_init()
> > >   drm/bridge: Use drm_mode_copy()
> > >   drm/imx: Use drm_mode_duplicate()
> > >   drm/panel: Use drm_mode_duplicate()
> > >   drm/vc4: Use drm_mode_copy()
> > These have been pushed to drm-misc-next.
> >
> > >   drm/amdgpu: Remove pointless on stack mode copies
> > >   drm/amdgpu: Use drm_mode_init() for on-stack modes
> > >   drm/amdgpu: Use drm_mode_copy()
> > amdgpu ones are reviewed, but I'll leave them for the
> > AMD folks to push to whichever tree they prefer.
> 
> I pulled patches 2, 4, 5 into my tree.

Thanks.

> For 3, I'm happy to have it
> land via drm-misc with the rest of the mode_init changes if you'd
> prefer.

Either way works for me. I don't have reviews yet for
the other drivers, so I'll probably hold off for a bit more
at least. And the i915 patch I'll be merging via drm-intel.

> > >   drm/radeon: Use drm_mode_copy()
> > >   drm/gma500: Use drm_mode_copy()
> > >   drm/tilcdc: Use drm_mode_copy()
> > >   drm/i915: Use drm_mode_copy()

Those are now all in.

Which leaves us with these stragglers:
> > >   drm/hisilicon: Use drm_mode_init() for on-stack modes
> > >   drm/msm: Nuke weird on stack mode copy
> > >   drm/msm: Use drm_mode_init() for on-stack modes
> > >   drm/msm: Use drm_mode_copy()
> > >   drm/mtk: Use drm_mode_init() for on-stack modes
> > >   drm/rockchip: Use drm_mode_copy()
> > >   drm/sti: Use drm_mode_copy()
> > >   drm: Use drm_mode_init() for on-stack modes
> > >   drm: Use drm_mode_copy()

-- 
Ville Syrjälä
Intel


[PATCH 0/2] drm/v3d: replace objs lookup steps with drm_gem_objects_lookup

2022-03-21 Thread Melissa Wen
The first patch just prevents iterating over a NULL job->bo array during
job cleanup. This situation can happen when v3d_lookup_bos() fails to
allocate memory for the job->bo array and job->bo_count was already set.
The second replaces the BO lookup steps in v3d_lookup_bos() with the
common code in drm_gem_objects_lookup().

Melissa Wen (2):
  drm/v3d: cleanup BOs properly when lookup_bos fails
  drm/v3d: replace obj lookup steps with drm_gem_objects_lookup

 drivers/gpu/drm/v3d/v3d_gem.c | 55 ---
 1 file changed, 6 insertions(+), 49 deletions(-)

-- 
2.35.1



[PATCH 1/2] drm/v3d: cleanup BOs properly when lookup_bos fails

2022-03-21 Thread Melissa Wen
When v3d_lookup_bos fails to `allocate validated BO pointers`,
job->bo_count was already set to args->bo_count, but job->bo points to
NULL. In this scenario, we must verify that job->bo is not NULL before
iterating on it to properly clean up a job. Also, drm_gem_object_put
already checks that the object passed is not NULL, making the job->bo[i]
check redundant.

Signed-off-by: Melissa Wen 
---
 drivers/gpu/drm/v3d/v3d_gem.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 92bc0faee84f..926bfc7e07fb 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -358,11 +358,11 @@ v3d_job_free(struct kref *ref)
struct v3d_job *job = container_of(ref, struct v3d_job, refcount);
int i;
 
-   for (i = 0; i < job->bo_count; i++) {
-   if (job->bo[i])
+   if (job->bo) {
+   for (i = 0; i < job->bo_count; i++)
drm_gem_object_put(job->bo[i]);
+   kvfree(job->bo);
}
-   kvfree(job->bo);
 
dma_fence_put(job->irq_fence);
dma_fence_put(job->done_fence);
-- 
2.35.1



[PATCH 2/2] drm/v3d: replace obj lookup steps with drm_gem_objects_lookup

2022-03-21 Thread Melissa Wen
As v3d_lookup_bos() performs the same steps as drm_gem_objects_lookup(),
replace the explicit code in v3d to simply use the DRM function.

Signed-off-by: Melissa Wen 
---
 drivers/gpu/drm/v3d/v3d_gem.c | 49 +++
 1 file changed, 3 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
index 926bfc7e07fb..738b1080143d 100644
--- a/drivers/gpu/drm/v3d/v3d_gem.c
+++ b/drivers/gpu/drm/v3d/v3d_gem.c
@@ -294,10 +294,6 @@ v3d_lookup_bos(struct drm_device *dev,
   u64 bo_handles,
   u32 bo_count)
 {
-   u32 *handles;
-   int ret = 0;
-   int i;
-
job->bo_count = bo_count;
 
if (!job->bo_count) {
@@ -308,48 +304,9 @@ v3d_lookup_bos(struct drm_device *dev,
return -EINVAL;
}
 
-   job->bo = kvmalloc_array(job->bo_count,
-sizeof(struct drm_gem_cma_object *),
-GFP_KERNEL | __GFP_ZERO);
-   if (!job->bo) {
-   DRM_DEBUG("Failed to allocate validated BO pointers\n");
-   return -ENOMEM;
-   }
-
-   handles = kvmalloc_array(job->bo_count, sizeof(u32), GFP_KERNEL);
-   if (!handles) {
-   ret = -ENOMEM;
-   DRM_DEBUG("Failed to allocate incoming GEM handles\n");
-   goto fail;
-   }
-
-   if (copy_from_user(handles,
-  (void __user *)(uintptr_t)bo_handles,
-  job->bo_count * sizeof(u32))) {
-   ret = -EFAULT;
-   DRM_DEBUG("Failed to copy in GEM handles\n");
-   goto fail;
-   }
-
-   spin_lock(_priv->table_lock);
-   for (i = 0; i < job->bo_count; i++) {
-   struct drm_gem_object *bo = idr_find(_priv->object_idr,
-handles[i]);
-   if (!bo) {
-   DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
- i, handles[i]);
-   ret = -ENOENT;
-   spin_unlock(_priv->table_lock);
-   goto fail;
-   }
-   drm_gem_object_get(bo);
-   job->bo[i] = bo;
-   }
-   spin_unlock(_priv->table_lock);
-
-fail:
-   kvfree(handles);
-   return ret;
+   return drm_gem_objects_lookup(file_priv,
+ (void __user *)(uintptr_t)bo_handles,
+ job->bo_count, >bo);
 }
 
 static void
-- 
2.35.1



Re: [PATCH v2 4/7] drm/i915/guc: use the memcpy_from_wc call from the drm

2022-03-21 Thread Lucas De Marchi

On Thu, Mar 03, 2022 at 11:30:10PM +0530, Balasubramani Vivekanandan wrote:

memcpy_from_wc functions in i915_memcpy.c will be removed and replaced
by the implementation in drm_cache.c.
Updated to use the functions provided by drm_cache.c.

v2: Check whether the log object is allocated from local memory or system
   memory and set up the iosys_map accordingly (Lucas)

Cc: Lucas De Marchi 

Signed-off-by: Balasubramani Vivekanandan 
---
drivers/gpu/drm/i915/gt/uc/intel_guc_log.c | 15 ---
1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index a24dc6441872..b9db765627ea 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -3,6 +3,7 @@
 * Copyright © 2014-2019 Intel Corporation
 */

+#include 
#include 
#include 

@@ -206,6 +207,7 @@ static void guc_read_update_log_buffer(struct intel_guc_log 
*log)
enum guc_log_buffer_type type;
void *src_data, *dst_data;
bool new_overflow;
+   struct iosys_map src_map;

mutex_lock(>relay.lock);

@@ -282,14 +284,21 @@ static void guc_read_update_log_buffer(struct 
intel_guc_log *log)
}

/* Just copy the newly written data */
+   if (i915_gem_object_is_lmem(log->vma->obj))
+   iosys_map_set_vaddr_iomem(_map, (void __iomem 
*)src_data);
+   else
+   iosys_map_set_vaddr(_map, src_data);


It would be better to keep this outside of the loop. So inside the loop
we can use only iosys_map_incr(&src_map, buffer_size). However you'd
also have to handle the read_offset. The iosys_map_ API has both a
src_offset and dst_offset due to situations like that. Maybe this is
missing in the new drm_memcpy_* function you're adding?

This function was not correct wrt IO memory access with the other
2 places in this function doing plain memcpy(). Since we are starting to
use iosys_map here, we probably should handle this commit as "migrate to
iosys_map", and convert those. In your current final state
we have 3 variables aliasing the same memory location. IMO it will be
error prone to keep it like that

+Michal, some questions:

- I'm not very familiar with the relayfs API. Is the `dst_data += PAGE_SIZE;`
really correct?

- Could you double check this patch and ack if ok?

Heads up that since the log buffer is potentially in lmem, we will need
to convert this function to take that into account. All those accesses
to log_buf_state need to use the proper kernel abstraction for system vs
I/O memory.

thanks
Lucas De Marchi


+
if (read_offset > write_offset) {
-   i915_memcpy_from_wc(dst_data, src_data, write_offset);
+   drm_memcpy_from_wc_vaddr(dst_data, _map,
+write_offset);
bytes_to_copy = buffer_size - read_offset;
} else {
bytes_to_copy = write_offset - read_offset;
}
-   i915_memcpy_from_wc(dst_data + read_offset,
-   src_data + read_offset, bytes_to_copy);
+   iosys_map_incr(_map, read_offset);
+   drm_memcpy_from_wc_vaddr(dst_data + read_offset, _map,
+bytes_to_copy);

src_data += buffer_size;
dst_data += buffer_size;
--
2.25.1



Re: [Intel-gfx] [PATCH 18/22] drm/i915: Use drm_mode_init() for on-stack modes

2022-03-21 Thread Julia Lawall


On Mon, 21 Mar 2022, Ville Syrjälä wrote:

> On Wed, Mar 16, 2022 at 10:00:06AM +0200, Jani Nikula wrote:
> > On Fri, 18 Feb 2022, Ville Syrjala  wrote:
> > > From: Ville Syrjälä 
> > >
> > > Initialize on-stack modes with drm_mode_init() to guarantee
> > > no stack garbage in the list head, or that we aren't copying
> > > over another mode's list head.
> > >
> > > Based on the following cocci script, with manual fixups:
> > > @decl@
> > > identifier M;
> > > expression E;
> > > @@
> > > - struct drm_display_mode M = E;
> > > + struct drm_display_mode M;
> > >
> > > @@
> > > identifier decl.M;
> > > expression decl.E;
> > > statement S, S1;
> > > @@
> > > struct drm_display_mode M;
> > > ... when != S
> > > + drm_mode_init(, );
> > > +
> > > S1
> > >
> > > @@
> > > expression decl.E;
> > > @@
> > > - &*E
> > > + E
> > >
> > > Signed-off-by: Ville Syrjälä 
> >
> > I wonder if that cocci could be added to scripts/coccinelle or something
> > to detect anyone adding new ones?
>
> Maybe.
>
> Julia & co, would you be open to having drm subsystem specific
> coccinelle scripts? If so where should we put the?
> scripts/coccinelle/drm perhaps?

That would be fine.  It is possible to make a script only apply to a
specific directory, but I think that that is not necessary in this case,
since you mention types that are only relevant to drm code.

julia

Re: [Intel-gfx] [PATCH v2 2/7] drm: Add drm_memcpy_from_wc() variant which accepts destination address

2022-03-21 Thread Lucas De Marchi

On Thu, Mar 03, 2022 at 11:30:08PM +0530, Balasubramani Vivekanandan wrote:

Fast copy using non-temporal instructions for x86 currently exists at two
locations. One is implemented in i915 driver at i915/i915_memcpy.c and
another copy at drm_cache.c. The plan is to remove the duplicate
implementation in i915 driver and use the functions from drm_cache.c.

A variant of drm_memcpy_from_wc() is added in drm_cache.c which accepts
address as argument instead of iosys_map for destination. It is a very
common scenario in i915 to copy from a WC memory type, which may be an
io memory or a system memory to a destination address pointing to system
memory. To avoid the overhead of creating iosys_map type for the
destination, new variant is created to accept the address directly.

Also a new function is exported in drm_cache.c to find if the fast copy
is supported by the platform or not. It is required for i915.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Thomas Hellström
Cc: Lucas De Marchi 

Signed-off-by: Balasubramani Vivekanandan 
---
drivers/gpu/drm/drm_cache.c | 54 +
include/drm/drm_cache.h |  3 +++
2 files changed, 57 insertions(+)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index a21c1350eb09..97959eecc300 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -358,6 +358,54 @@ void drm_memcpy_from_wc(struct iosys_map *dst,
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

+/**
+ * drm_memcpy_from_wc_vaddr - Perform the fastest available memcpy from a 
source
+ * that may be WC to a destination in system memory.
+ * @dst: The destination pointer
+ * @src: The source pointer
+ * @len: The size of the area to transfer in bytes
+ *
+ * Same as drm_memcpy_from_wc except destination is accepted as system memory


drm_memcpy_from_wc() for kernel doc


+ * address. Useful in situations where passing destination address as iosys_map
+ * is simply an overhead and can be avoided.
+ */
+void drm_memcpy_from_wc_vaddr(void *dst, const struct iosys_map *src,


As in the first version, still don't like the name, but ok.


Reviewed-by: Lucas De Marchi 


Lucas De Marchi


Re: Regression from 3c196f056666 ("drm/amdgpu: always reset the asic in suspend (v2)") on suspend?

2022-03-21 Thread Thorsten Leemhuis
On 21.03.22 19:49, Dominique Dumont wrote:
> On Monday, 21 March 2022 09:57:59 CET Thorsten Leemhuis wrote:
>> Dominique/Salvatore/Eric, what's the status of this regression?
>> According to the debian bug tracker the problem is solved with 5.16 and
>> 5.17, but was 5.15 ever fixed?
> 
> I don't think so.
> 
> On kernel side, the commit fixing this issue is
> e55a3aea418269266d84f426b3bd70794d3389c8 . 
> 
> According to the logs of [1] , this commit landed in v5.17-rc3
> 
> HTH
> 
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

And from there it among others got backported to 5.15.22:

https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=linux-5.15.y&id=8a15ac1786c92dce6ecbeb4e4c237f5f80c2c703

https://lwn.net/Articles/884107/

Another indicator that Eric's problem is something else.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)

P.S.: As the Linux kernel's regression tracker I'm getting a lot of
reports on my table. I can only look briefly into most of them and lack
knowledge about most of the areas they concern. I thus unfortunately
will sometimes get things wrong or miss something important. I hope
that's not the case here; if you think it is, don't hesitate to tell me
in a public reply, it's in everyone's interest to set the public record
straight.




Re: [Intel-gfx] [PATCH 18/22] drm/i915: Use drm_mode_init() for on-stack modes

2022-03-21 Thread Ville Syrjälä
On Wed, Mar 16, 2022 at 10:00:06AM +0200, Jani Nikula wrote:
> On Fri, 18 Feb 2022, Ville Syrjala  wrote:
> > From: Ville Syrjälä 
> >
> > Initialize on-stack modes with drm_mode_init() to guarantee
> > no stack garbage in the list head, or that we aren't copying
> > over another mode's list head.
> >
> > Based on the following cocci script, with manual fixups:
> > @decl@
> > identifier M;
> > expression E;
> > @@
> > - struct drm_display_mode M = E;
> > + struct drm_display_mode M;
> >
> > @@
> > identifier decl.M;
> > expression decl.E;
> > statement S, S1;
> > @@
> > struct drm_display_mode M;
> > ... when != S
> > + drm_mode_init(, );
> > +
> > S1
> >
> > @@
> > expression decl.E;
> > @@
> > - &*E
> > + E
> >
> > Signed-off-by: Ville Syrjälä 
> 
> I wonder if that cocci could be added to scripts/coccinelle or something
> to detect anyone adding new ones?

Maybe.

Julia & co, would you be open to having drm subsystem specific
coccinelle scripts? If so where should we put them?
scripts/coccinelle/drm perhaps?

-- 
Ville Syrjälä
Intel


Re: [PATCH 1/4] i915/gem: drop wbinvd_on_all_cpus usage

2022-03-21 Thread Michael Cheng


On 2022-03-21 4:07 a.m., Thomas Hellström wrote:


On 3/21/22 11:30, Tvrtko Ursulin wrote:


On 19/03/2022 19:42, Michael Cheng wrote:
Previous concern with using drm_clflush_sg was that we don't know 
what the

sg_table is pointing to, thus the usage of wbinvd_on_all_cpus to flush
everything at once to avoid paranoia.


And now we know, or we know it is not a concern?

To make i915 more architecture-neutral and be less paranoid, lets 
attempt to


"Lets attempt" as we don't know if this will work and/or what 
can/will break?



use drm_clflush_sg to flush the pages for when the GPU wants to read
from main memory.

Signed-off-by: Michael Cheng 
---
  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c

index f5062d0c6333..b0a5baaebc43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -8,6 +8,7 @@
  #include 
  #include 
  #include 
+#include 
    #include 
  @@ -250,16 +251,10 @@ static int 
i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj)
   * DG1 is special here since it still snoops transactions even 
with
   * CACHE_NONE. This is not the case with other HAS_SNOOP 
platforms. We

   * might need to revisit this as we add new discrete platforms.
- *
- * XXX: Consider doing a vmap flush or something, where possible.
- * Currently we just do a heavy handed wbinvd_on_all_cpus() 
here since
- * the underlying sg_table might not even point to struct 
pages, so we
- * can't just call drm_clflush_sg or similar, like we do 
elsewhere in

- * the driver.
   */
  if (i915_gem_object_can_bypass_llc(obj) ||
  (!HAS_LLC(i915) && !IS_DG1(i915)))
-    wbinvd_on_all_cpus();
+    drm_clflush_sg(pages);


And as noticed before, drm_clflush_sg still can call
wbinvd_on_all_cpus so are you just punting the issue somewhere else? 
How will it be solved there?


I think in this case, drm_clflush_sg() can't be immediately used, 
because pages may not contain actual page pointers; might be just the 
dma address. It needs to be preceded with a dmabuf vmap.


Could you elaborate more on using a dmabuf vmap?

Doing a quick grep on drm_clflush_sg, were you thinking about something 
similar to the following?


if (obj->cache_dirty) {
WARN_ON_ONCE(IS_DGFX(i915));
obj->write_domain = 0;
if (i915_gem_object_has_struct_page(obj))
drm_clflush_sg(pages);
obj->cache_dirty = false;
}


Thanks,

Michael Cheng

But otherwise this change, I figure, falls into the "prefer 
range-aware apis" category; If the CPU supports it, flush the range 
only, otherwise fall back to wbinvd().


/Thomas




Regards,

Tvrtko


    sg_page_sizes = i915_sg_dma_sizes(pages->sgl);
  __i915_gem_object_set_pages(obj, pages, sg_page_sizes);

Re: [PATCH v3 1/3] drm: allow real encoder to be passed for drm_writeback_connector

2022-03-21 Thread Abhinav Kumar

Hi Liviu

On 3/21/2022 11:07 AM, Liviu Dudau wrote:

On Thu, Mar 17, 2022 at 10:26:38AM -0700, Abhinav Kumar wrote:

Hi Laurent

Thanks for the review.

On 3/17/2022 1:51 AM, Laurent Pinchart wrote:

Hi Abhinav,

Thank you for the patch.

On Wed, Mar 16, 2022 at 11:48:16AM -0700, Abhinav Kumar wrote:

For some vendor driver implementations, display hardware can
be shared between the encoder used for writeback and the physical
display.

In addition resources such as clocks and interrupts can
also be shared between writeback and the real encoder.

To accommodate such vendor drivers and hardware, allow
real encoder to be passed for drm_writeback_connector using a new
drm_writeback_connector_init_with_encoder() API.


The commit message doesn't match the commit.

Sorry, while splitting the change, I missed this part of the commit text.
Will fix it up.



In addition, to preserve the same call flows for the existing
users of drm_writeback_connector_init(), also allow passing
possible_crtcs as a parameter so that encoder can be initialized
with it.

changes in v3:
- allow passing possible_crtcs for existing users of
  drm_writeback_connector_init()
- squash the vendor changes into the same commit so
  that each patch in the series can compile individually

Co-developed-by: Kandpal Suraj 
Signed-off-by: Abhinav Kumar 
---
   .../drm/arm/display/komeda/komeda_wb_connector.c   |   3 +-
   drivers/gpu/drm/arm/malidp_mw.c|   5 +-
   drivers/gpu/drm/drm_writeback.c| 103 
+
   drivers/gpu/drm/rcar-du/rcar_du_writeback.c|   5 +-
   drivers/gpu/drm/vc4/vc4_txp.c  |  19 ++--
   drivers/gpu/drm/vkms/vkms_writeback.c  |   3 +-
   include/drm/drm_writeback.h|  22 -
   7 files changed, 103 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c 
b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
index e465cc4..40774e6 100644
--- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
+++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
@@ -155,7 +155,6 @@ static int komeda_wb_connector_add(struct komeda_kms_dev 
*kms,
kwb_conn->wb_layer = kcrtc->master->wb_layer;
wb_conn = _conn->base;
-   wb_conn->encoder.possible_crtcs = BIT(drm_crtc_index(>base));
formats = komeda_get_layer_fourcc_list(>fmt_tbl,
   kwb_conn->wb_layer->layer_type,
@@ -164,7 +163,7 @@ static int komeda_wb_connector_add(struct komeda_kms_dev 
*kms,
err = drm_writeback_connector_init(>base, wb_conn,
   _wb_connector_funcs,
   _wb_encoder_helper_funcs,
-  formats, n_formats);
+  formats, n_formats, 
BIT(drm_crtc_index(>base)));
komeda_put_fourcc_list(formats);
if (err) {
kfree(kwb_conn);
diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c
index f5847a7..b882066 100644
--- a/drivers/gpu/drm/arm/malidp_mw.c
+++ b/drivers/gpu/drm/arm/malidp_mw.c
@@ -208,11 +208,12 @@ int malidp_mw_connector_init(struct drm_device *drm)
struct malidp_drm *malidp = drm->dev_private;
u32 *formats;
int ret, n_formats;
+   uint32_t possible_crtcs;
if (!malidp->dev->hw->enable_memwrite)
return 0;
-   malidp->mw_connector.encoder.possible_crtcs = 1 << 
drm_crtc_index(>crtc);
+   possible_crtcs = 1 << drm_crtc_index(>crtc);
drm_connector_helper_add(>mw_connector.base,
 _mw_connector_helper_funcs);
@@ -223,7 +224,7 @@ int malidp_mw_connector_init(struct drm_device *drm)
ret = drm_writeback_connector_init(drm, >mw_connector,
   _mw_connector_funcs,
   _mw_encoder_helper_funcs,
-  formats, n_formats);
+  formats, n_formats, possible_crtcs);


Do you need the local variable ?


Yes, we can drop this. I just used this instead of "1 <<
drm_crtc_index(>crtc)" to simplify it.
No strong preference.




kfree(formats);
if (ret)
return ret;
diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
index dccf4504..17c1471 100644
--- a/drivers/gpu/drm/drm_writeback.c
+++ b/drivers/gpu/drm/drm_writeback.c
@@ -149,36 +149,15 @@ static const struct drm_encoder_funcs 
drm_writeback_encoder_funcs = {
.destroy = drm_encoder_cleanup,
   };
-/**
- * drm_writeback_connector_init - Initialize a writeback connector and its 
properties
- * @dev: DRM device
- * @wb_connector: Writeback connector to initialize
- * @con_funcs: Connector funcs vtable
- * @enc_helper_funcs: 

Re: [PATCH v4 2/4] drm: allow real encoder to be passed for drm_writeback_connector

2022-03-21 Thread Abhinav Kumar

Hi Liviu

Thanks for your review.

All your comments are valid. I think I should re-order the patches like 
you have suggested. That should address all comments.


Thanks

Abhinav

On 3/21/2022 10:24 AM, Liviu Dudau wrote:

On Thu, Mar 17, 2022 at 06:45:34PM -0700, Abhinav Kumar wrote:

For some vendor driver implementations, display hardware can
be shared between the encoder used for writeback and the physical
display.

In addition resources such as clocks and interrupts can
also be shared between writeback and the real encoder.

To accommodate such vendor drivers and hardware, allow
real encoder to be passed for drm_writeback_connector.

changes in v4:
- split the possible_crtcs change and the parts which should
  belong to the addition of new API to the next change

Co-developed-by: Kandpal Suraj 
Signed-off-by: Abhinav Kumar 
---
  drivers/gpu/drm/drm_writeback.c | 12 +++-
  drivers/gpu/drm/vc4/vc4_txp.c   | 14 ++
  include/drm/drm_writeback.h | 18 --
  3 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
index dc2ef12..a4c17d6 100644
--- a/drivers/gpu/drm/drm_writeback.c
+++ b/drivers/gpu/drm/drm_writeback.c
@@ -190,11 +190,13 @@ int drm_writeback_connector_init(struct drm_device *dev,
if (IS_ERR(blob))
return PTR_ERR(blob);
  
-	drm_encoder_helper_add(_connector->encoder, enc_helper_funcs);

+   drm_encoder_helper_add(wb_connector->encoder, enc_helper_funcs);
  
-	wb_connector->encoder.possible_crtcs = possible_crtcs;

+   wb_connector->encoder = _connector->internal_encoder;


You need to check here that the wb_connector doesn't already have an attached
encoder before you overwrite the pointer with the internal encoder.

  
-	ret = drm_encoder_init(dev, _connector->encoder,

+   wb_connector->encoder->possible_crtcs = possible_crtcs;
+
+   ret = drm_encoder_init(dev, wb_connector->encoder,
   _writeback_encoder_funcs,
   DRM_MODE_ENCODER_VIRTUAL, NULL);


Here you have initialised the encoder pointed at by wb_connector->encoder, 
which is
always wb_connector->internal_encoder with your code.


if (ret)
@@ -208,7 +210,7 @@ int drm_writeback_connector_init(struct drm_device *dev,
goto connector_fail;
  
  	ret = drm_connector_attach_encoder(connector,

-   _connector->encoder);
+   wb_connector->encoder);
if (ret)
goto attach_fail;
  
@@ -237,7 +239,7 @@ int drm_writeback_connector_init(struct drm_device *dev,

  attach_fail:
drm_connector_cleanup(connector);
  connector_fail:
-   drm_encoder_cleanup(_connector->encoder);
+   drm_encoder_cleanup(wb_connector->encoder);
  fail:
drm_property_blob_put(blob);
return ret;
diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c
index 3447eb6..341a9be5 100644
--- a/drivers/gpu/drm/vc4/vc4_txp.c
+++ b/drivers/gpu/drm/vc4/vc4_txp.c
@@ -151,6 +151,8 @@ struct vc4_txp {
  
  	struct platform_device *pdev;
  
+	struct drm_encoder drm_enc;

+
struct drm_writeback_connector connector;
  
  	void __iomem *regs;

@@ -159,7 +161,7 @@ struct vc4_txp {
  
  static inline struct vc4_txp *encoder_to_vc4_txp(struct drm_encoder *encoder)

  {
-   return container_of(encoder, struct vc4_txp, connector.encoder);
+   return container_of(encoder, struct vc4_txp, drm_enc);
  }
  
  static inline struct vc4_txp *connector_to_vc4_txp(struct drm_connector *conn)

@@ -467,6 +469,7 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
struct vc4_txp *txp;
struct drm_crtc *crtc;
struct drm_encoder *encoder;
+   struct drm_writeback_connector *wb_conn;
int ret, irq;
  
  	irq = platform_get_irq(pdev, 0);

@@ -492,9 +495,12 @@ static int vc4_txp_bind(struct device *dev, struct device 
*master, void *data)
txp->regset.regs = txp_regs;
txp->regset.nregs = ARRAY_SIZE(txp_regs);
  
-	drm_connector_helper_add(>connector.base,

+   wb_conn = >connector;
+   wb_conn->encoder = >drm_enc;
+
+   drm_connector_helper_add(_conn->base,
 _txp_connector_helper_funcs);
-   ret = drm_writeback_connector_init(drm, >connector,
+   ret = drm_writeback_connector_init(drm, wb_conn,
   _txp_connector_funcs,
   _txp_encoder_helper_funcs,
   drm_fmts, ARRAY_SIZE(drm_fmts),


This call will never initialise the txp->drm_enc, as per my comments above. 
However
if this was the intent, it's fine, but then you need to add a 
drm_encoder_init() call
here for txp->drm_enc. Otherwise, you need to stop overwriting the pointer in

Re: [PATCH v3 1/3] drm: allow real encoder to be passed for drm_writeback_connector

2022-03-21 Thread Liviu Dudau
On Thu, Mar 17, 2022 at 10:26:38AM -0700, Abhinav Kumar wrote:
> Hi Laurent
> 
> Thanks for the review.
> 
> On 3/17/2022 1:51 AM, Laurent Pinchart wrote:
> > Hi Abhinav,
> > 
> > Thank you for the patch.
> > 
> > On Wed, Mar 16, 2022 at 11:48:16AM -0700, Abhinav Kumar wrote:
> > > For some vendor driver implementations, display hardware can
> > > be shared between the encoder used for writeback and the physical
> > > display.
> > > 
> > > In addition resources such as clocks and interrupts can
> > > also be shared between writeback and the real encoder.
> > > 
> > > To accommodate such vendor drivers and hardware, allow
> > > real encoder to be passed for drm_writeback_connector using a new
> > > drm_writeback_connector_init_with_encoder() API.
> > 
> > The commit message doesn't match the commit.
> Sorry, while splitting the change, I missed this part of the commit text.
> Will fix it up.
> > 
> > > In addition, to preserve the same call flows for the existing
> > > users of drm_writeback_connector_init(), also allow passing
> > > possible_crtcs as a parameter so that encoder can be initialized
> > > with it.
> > > 
> > > changes in v3:
> > >   - allow passing possible_crtcs for existing users of
> > > drm_writeback_connector_init()
> > >   - squash the vendor changes into the same commit so
> > > that each patch in the series can compile individually
> > > 
> > > Co-developed-by: Kandpal Suraj 
> > > Signed-off-by: Abhinav Kumar 
> > > ---
> > >   .../drm/arm/display/komeda/komeda_wb_connector.c   |   3 +-
> > >   drivers/gpu/drm/arm/malidp_mw.c|   5 +-
> > >   drivers/gpu/drm/drm_writeback.c| 103 
> > > +
> > >   drivers/gpu/drm/rcar-du/rcar_du_writeback.c|   5 +-
> > >   drivers/gpu/drm/vc4/vc4_txp.c  |  19 ++--
> > >   drivers/gpu/drm/vkms/vkms_writeback.c  |   3 +-
> > >   include/drm/drm_writeback.h|  22 -
> > >   7 files changed, 103 insertions(+), 57 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c 
> > > b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
> > > index e465cc4..40774e6 100644
> > > --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
> > > +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
> > > @@ -155,7 +155,6 @@ static int komeda_wb_connector_add(struct 
> > > komeda_kms_dev *kms,
> > >   kwb_conn->wb_layer = kcrtc->master->wb_layer;
> > >   wb_conn = _conn->base;
> > > - wb_conn->encoder.possible_crtcs = BIT(drm_crtc_index(>base));
> > >   formats = komeda_get_layer_fourcc_list(>fmt_tbl,
> > >  
> > > kwb_conn->wb_layer->layer_type,
> > > @@ -164,7 +163,7 @@ static int komeda_wb_connector_add(struct 
> > > komeda_kms_dev *kms,
> > >   err = drm_writeback_connector_init(>base, wb_conn,
> > >  _wb_connector_funcs,
> > >  
> > > _wb_encoder_helper_funcs,
> > > -formats, n_formats);
> > > +formats, n_formats, 
> > > BIT(drm_crtc_index(>base)));
> > >   komeda_put_fourcc_list(formats);
> > >   if (err) {
> > >   kfree(kwb_conn);
> > > diff --git a/drivers/gpu/drm/arm/malidp_mw.c 
> > > b/drivers/gpu/drm/arm/malidp_mw.c
> > > index f5847a7..b882066 100644
> > > --- a/drivers/gpu/drm/arm/malidp_mw.c
> > > +++ b/drivers/gpu/drm/arm/malidp_mw.c
> > > @@ -208,11 +208,12 @@ int malidp_mw_connector_init(struct drm_device *drm)
> > >   struct malidp_drm *malidp = drm->dev_private;
> > >   u32 *formats;
> > >   int ret, n_formats;
> > > + uint32_t possible_crtcs;
> > >   if (!malidp->dev->hw->enable_memwrite)
> > >   return 0;
> > > - malidp->mw_connector.encoder.possible_crtcs = 1 << 
> > > drm_crtc_index(>crtc);
> > > + possible_crtcs = 1 << drm_crtc_index(>crtc);
> > >   drm_connector_helper_add(>mw_connector.base,
> > >_mw_connector_helper_funcs);
> > > @@ -223,7 +224,7 @@ int malidp_mw_connector_init(struct drm_device *drm)
> > >   ret = drm_writeback_connector_init(drm, >mw_connector,
> > >  _mw_connector_funcs,
> > >  
> > > _mw_encoder_helper_funcs,
> > > -formats, n_formats);
> > > +formats, n_formats, possible_crtcs);
> > 
> > Do you need the local variable ?
> 
> Yes, we can drop this. I just used this instead of "1 <<
> drm_crtc_index(&malidp->crtc)" to simplify it.
> No strong preference.
> 
> > 
> > >   kfree(formats);
> > >   if (ret)
> > >   return ret;
> > > diff --git a/drivers/gpu/drm/drm_writeback.c 
> > > 

Re: [PATCH 1/4] i915/gem: drop wbinvd_on_all_cpus usage

2022-03-21 Thread Michael Cheng



On 2022-03-21 10:28 a.m., Tvrtko Ursulin wrote:


On 21/03/2022 16:31, Michael Cheng wrote:

On 2022-03-21 3:30 a.m., Tvrtko Ursulin wrote:



On 19/03/2022 19:42, Michael Cheng wrote:
Previous concern with using drm_clflush_sg was that we don't know 
what the

sg_table is pointing to, thus the usage of wbinvd_on_all_cpus to flush
everything at once to avoid paranoia.


And now we know, or we know it is not a concern?

To make i915 more architecture-neutral and be less paranoid, lets 
attempt to


"Lets attempt" as we don't know if this will work and/or what 
can/will break?


Yes, but it seems like there's no regression with IGT .

If there's a big hit in performance, or if this solution gets 
accepted and the bug reports come flying in, we can explore other 
solutions. But speaking to Dan Vetter, ideal solution would be to 
avoid any calls directly to wbinvd, and use drm helpers in place.


+Daniel for any extra input.


use drm_clflush_sg to flush the pages for when the GPU wants to read
from main memory.

Signed-off-by: Michael Cheng 
---
  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c

index f5062d0c6333..b0a5baaebc43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -8,6 +8,7 @@
  #include 
  #include 
  #include 
+#include 
    #include 
  @@ -250,16 +251,10 @@ static int 
i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj)
   * DG1 is special here since it still snoops transactions 
even with
   * CACHE_NONE. This is not the case with other HAS_SNOOP 
platforms. We

   * might need to revisit this as we add new discrete platforms.
- *
- * XXX: Consider doing a vmap flush or something, where possible.
- * Currently we just do a heavy handed wbinvd_on_all_cpus() 
here since
- * the underlying sg_table might not even point to struct 
pages, so we
- * can't just call drm_clflush_sg or similar, like we do 
elsewhere in

- * the driver.
   */
  if (i915_gem_object_can_bypass_llc(obj) ||
  (!HAS_LLC(i915) && !IS_DG1(i915)))
-    wbinvd_on_all_cpus();
+    drm_clflush_sg(pages);


And as noticed before, drm_clflush_sg can still call
wbinvd_on_all_cpus, so are you just punting the issue somewhere else?
How will it be solved there?


Instead of calling an x86 asm directly, we are using what's available 
to use to make the driver more architecture neutral. Agreeing with 
Thomas, this solution falls within the "prefer range-aware clflush 
apis", and since some other generation platform doesn't support 
clflushopt, it will fall back to using wbinvd.


Right, I was trying to get the information on what will drm_clflush_sg 
do on Arm. Is it range based or global there, or if the latter exists.



CCing a few ARM folks to see if they have any inputs.

+ Catalin and Robin


Regards,

Tvrtko


Re: [PATCH 1/4] i915/gem: drop wbinvd_on_all_cpus usage

2022-03-21 Thread Michael Cheng



On 2022-03-21 10:28 a.m., Tvrtko Ursulin wrote:


On 21/03/2022 16:31, Michael Cheng wrote:

On 2022-03-21 3:30 a.m., Tvrtko Ursulin wrote:



On 19/03/2022 19:42, Michael Cheng wrote:
Previous concern with using drm_clflush_sg was that we don't know 
what the

sg_table is pointing to, thus the usage of wbinvd_on_all_cpus to flush
everything at once to avoid paranoia.


And now we know, or we know it is not a concern?

To make i915 more architecture-neutral and be less paranoid, lets 
attempt to


"Lets attempt" as we don't know if this will work and/or what 
can/will break?


Yes, but it seems like there's no regression with IGT .

If there's a big hit in performance, or if this solution gets 
accepted and the bug reports come flying in, we can explore other 
solutions. But speaking to Dan Vetter, ideal solution would be to 
avoid any calls directly to wbinvd, and use drm helpers in place.


+Daniel for any extra input.


use drm_clflush_sg to flush the pages for when the GPU wants to read
from main memory.

Signed-off-by: Michael Cheng 
---
  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c

index f5062d0c6333..b0a5baaebc43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -8,6 +8,7 @@
  #include 
  #include 
  #include 
+#include 
    #include 
  @@ -250,16 +251,10 @@ static int 
i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj)
   * DG1 is special here since it still snoops transactions 
even with
   * CACHE_NONE. This is not the case with other HAS_SNOOP 
platforms. We

   * might need to revisit this as we add new discrete platforms.
- *
- * XXX: Consider doing a vmap flush or something, where possible.
- * Currently we just do a heavy handed wbinvd_on_all_cpus() 
here since
- * the underlying sg_table might not even point to struct 
pages, so we
- * can't just call drm_clflush_sg or similar, like we do 
elsewhere in

- * the driver.
   */
  if (i915_gem_object_can_bypass_llc(obj) ||
  (!HAS_LLC(i915) && !IS_DG1(i915)))
-    wbinvd_on_all_cpus();
+    drm_clflush_sg(pages);


And as noticed before, drm_clflush_sg can still call
wbinvd_on_all_cpus, so are you just punting the issue somewhere else?
How will it be solved there?


Instead of calling an x86 asm directly, we are using what's available 
to use to make the driver more architecture neutral. Agreeing with 
Thomas, this solution falls within the "prefer range-aware clflush 
apis", and since some other generation platform doesn't support 
clflushopt, it will fall back to using wbinvd.


Right, I was trying to get the information on what will drm_clflush_sg 
do on Arm. Is it range based or global there, or if the latter exists.


I am not too sure about the ARM side. We are currently working that out 
with the ARM folks in a different thread.

Regards,

Tvrtko


Re: [PATCH v6 1/5] drm/msm/disp/dpu1: set mdp clk to the maximum frequency in opp table during probe

2022-03-21 Thread Dmitry Baryshkov
On Mon, 21 Mar 2022 at 19:21, Vinod Polimera  wrote:
>
>
>
> > -Original Message-
> > From: Stephen Boyd 
> > Sent: Friday, March 18, 2022 2:41 AM
> > To: quic_vpolimer ;
> > devicet...@vger.kernel.org; dri-devel@lists.freedesktop.org;
> > freedr...@lists.freedesktop.org; linux-arm-...@vger.kernel.org
> > Cc: linux-ker...@vger.kernel.org; robdcl...@gmail.com;
> > dmitry.barysh...@linaro.org; diand...@chromium.org; quic_kalyant
> > 
> > Subject: Re: [PATCH v6 1/5] drm/msm/disp/dpu1: set mdp clk to the
> > maximum frequency in opp table during probe
> >
> > WARNING: This email originated from outside of Qualcomm. Please be wary
> > of any links or attachments, and do not enable macros.
> >
> > Quoting Vinod Polimera (2022-03-14 07:46:53)
> > > use max clock during probe/bind sequence from the opp table.
> > > The clock will be scaled down when framework sends an update.
> >
> > Capitalize 'use'.
> >
> > Why is it important to use max frequency during probe/bind? Does not
> > setting the clk rate during probe mean that we'll never use the max
> > rate? Does it speed things up during probe?
>
> We need to vote on the mdp clock during probe/bind so that the rails are not
> left in an undetermined state, as pointed out by Dmitry.
> Since we don't know what rate will be set in the bootloader, it would be
> ideal to vote at max frequency.
> There could be a firmware display programmed in the bootloader and we want to
> transition it to the kernel without underflowing.

This should be expressed in the commit message.


-- 
With best wishes
Dmitry


Re: [PATCH 1/4] i915/gem: drop wbinvd_on_all_cpus usage

2022-03-21 Thread Tvrtko Ursulin



On 21/03/2022 16:31, Michael Cheng wrote:

On 2022-03-21 3:30 a.m., Tvrtko Ursulin wrote:



On 19/03/2022 19:42, Michael Cheng wrote:
Previous concern with using drm_clflush_sg was that we don't know 
what the

sg_table is pointing to, thus the usage of wbinvd_on_all_cpus to flush
everything at once to avoid paranoia.


And now we know, or we know it is not a concern?

To make i915 more architecture-neutral and be less paranoid, lets 
attempt to


"Lets attempt" as we don't know if this will work and/or what can/will 
break?


Yes, but it seems like there's no regression with IGT .

If there's a big hit in performance, or if this solution gets accepted 
and the bug reports come flying in, we can explore other solutions. But 
speaking to Dan Vetter, ideal solution would be to avoid any calls 
directly to wbinvd, and use drm helpers in place.


+Daniel for any extra input.


use drm_clflush_sg to flush the pages for when the GPU wants to read
from main memory.

Signed-off-by: Michael Cheng 
---
  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c

index f5062d0c6333..b0a5baaebc43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -8,6 +8,7 @@
  #include 
  #include 
  #include 
+#include 
    #include 
  @@ -250,16 +251,10 @@ static int 
i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj)
   * DG1 is special here since it still snoops transactions even 
with
   * CACHE_NONE. This is not the case with other HAS_SNOOP 
platforms. We

   * might need to revisit this as we add new discrete platforms.
- *
- * XXX: Consider doing a vmap flush or something, where possible.
- * Currently we just do a heavy handed wbinvd_on_all_cpus() here 
since
- * the underlying sg_table might not even point to struct pages, 
so we
- * can't just call drm_clflush_sg or similar, like we do 
elsewhere in

- * the driver.
   */
  if (i915_gem_object_can_bypass_llc(obj) ||
  (!HAS_LLC(i915) && !IS_DG1(i915)))
-    wbinvd_on_all_cpus();
+    drm_clflush_sg(pages);


And as noticed before, drm_clflush_sg can still call wbinvd_on_all_cpus,
so are you just punting the issue somewhere else? How will it be
solved there?


Instead of calling an x86 asm directly, we are using what's available to 
use to make the driver more architecture neutral. Agreeing with Thomas, 
this solution falls within the "prefer range-aware clflush apis", and 
since some other generation platform doesn't support clflushopt, it will 
fall back to using wbinvd.


Right, I was trying to get the information on what will drm_clflush_sg 
do on Arm. Is it range based or global there, or if the latter exists.


Regards,

Tvrtko


Re: [PATCH v4 2/4] drm: allow real encoder to be passed for drm_writeback_connector

2022-03-21 Thread Liviu Dudau
On Thu, Mar 17, 2022 at 06:45:34PM -0700, Abhinav Kumar wrote:
> For some vendor driver implementations, display hardware can
> be shared between the encoder used for writeback and the physical
> display.
> 
> In addition resources such as clocks and interrupts can
> also be shared between writeback and the real encoder.
> 
> To accommodate such vendor drivers and hardware, allow
> real encoder to be passed for drm_writeback_connector.
> 
> changes in v4:
>   - split the possible_crtcs change and the parts which should
> belong to the addition of new API to the next change
> 
> Co-developed-by: Kandpal Suraj 
> Signed-off-by: Abhinav Kumar 
> ---
>  drivers/gpu/drm/drm_writeback.c | 12 +++-
>  drivers/gpu/drm/vc4/vc4_txp.c   | 14 ++
>  include/drm/drm_writeback.h | 18 --
>  3 files changed, 33 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> index dc2ef12..a4c17d6 100644
> --- a/drivers/gpu/drm/drm_writeback.c
> +++ b/drivers/gpu/drm/drm_writeback.c
> @@ -190,11 +190,13 @@ int drm_writeback_connector_init(struct drm_device *dev,
>   if (IS_ERR(blob))
>   return PTR_ERR(blob);
>  
> - drm_encoder_helper_add(_connector->encoder, enc_helper_funcs);
> + drm_encoder_helper_add(wb_connector->encoder, enc_helper_funcs);
>  
> - wb_connector->encoder.possible_crtcs = possible_crtcs;
> + wb_connector->encoder = _connector->internal_encoder;

You need to check here that the wb_connector doesn't already have an attached
encoder before you overwrite the pointer with the internal encoder.

>  
> - ret = drm_encoder_init(dev, _connector->encoder,
> + wb_connector->encoder->possible_crtcs = possible_crtcs;
> +
> + ret = drm_encoder_init(dev, wb_connector->encoder,
>  _writeback_encoder_funcs,
>  DRM_MODE_ENCODER_VIRTUAL, NULL);

Here you have initialised the encoder pointed at by wb_connector->encoder, 
which is
always wb_connector->internal_encoder with your code.

>   if (ret)
> @@ -208,7 +210,7 @@ int drm_writeback_connector_init(struct drm_device *dev,
>   goto connector_fail;
>  
>   ret = drm_connector_attach_encoder(connector,
> - _connector->encoder);
> + wb_connector->encoder);
>   if (ret)
>   goto attach_fail;
>  
> @@ -237,7 +239,7 @@ int drm_writeback_connector_init(struct drm_device *dev,
>  attach_fail:
>   drm_connector_cleanup(connector);
>  connector_fail:
> - drm_encoder_cleanup(_connector->encoder);
> + drm_encoder_cleanup(wb_connector->encoder);
>  fail:
>   drm_property_blob_put(blob);
>   return ret;
> diff --git a/drivers/gpu/drm/vc4/vc4_txp.c b/drivers/gpu/drm/vc4/vc4_txp.c
> index 3447eb6..341a9be5 100644
> --- a/drivers/gpu/drm/vc4/vc4_txp.c
> +++ b/drivers/gpu/drm/vc4/vc4_txp.c
> @@ -151,6 +151,8 @@ struct vc4_txp {
>  
>   struct platform_device *pdev;
>  
> + struct drm_encoder drm_enc;
> +
>   struct drm_writeback_connector connector;
>  
>   void __iomem *regs;
> @@ -159,7 +161,7 @@ struct vc4_txp {
>  
>  static inline struct vc4_txp *encoder_to_vc4_txp(struct drm_encoder *encoder)
>  {
> - return container_of(encoder, struct vc4_txp, connector.encoder);
> + return container_of(encoder, struct vc4_txp, drm_enc);
>  }
>  
>  static inline struct vc4_txp *connector_to_vc4_txp(struct drm_connector 
> *conn)
> @@ -467,6 +469,7 @@ static int vc4_txp_bind(struct device *dev, struct device 
> *master, void *data)
>   struct vc4_txp *txp;
>   struct drm_crtc *crtc;
>   struct drm_encoder *encoder;
> + struct drm_writeback_connector *wb_conn;
>   int ret, irq;
>  
>   irq = platform_get_irq(pdev, 0);
> @@ -492,9 +495,12 @@ static int vc4_txp_bind(struct device *dev, struct 
> device *master, void *data)
>   txp->regset.regs = txp_regs;
>   txp->regset.nregs = ARRAY_SIZE(txp_regs);
>  
> - drm_connector_helper_add(>connector.base,
> + wb_conn = >connector;
> + wb_conn->encoder = >drm_enc;
> +
> + drm_connector_helper_add(_conn->base,
>_txp_connector_helper_funcs);
> - ret = drm_writeback_connector_init(drm, >connector,
> + ret = drm_writeback_connector_init(drm, wb_conn,
>  _txp_connector_funcs,
>  _txp_encoder_helper_funcs,
>  drm_fmts, ARRAY_SIZE(drm_fmts),

This call will never initialise the txp->drm_enc, as per my comments above. 
However
if this was the intent, it's fine, but then you need to add a 
drm_encoder_init() call
here for txp->drm_enc. Otherwise, you need to stop overwriting the pointer in
drm_writeback_connector_init().

> @@ -507,7 +513,7 @@ static int vc4_txp_bind(struct device *dev, struct 

[Bug 205089] amdgpu : drm:amdgpu_cs_ioctl : Failed to initialize parser -125

2022-03-21 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205089

Joris L. (commandl...@protonmail.com) changed:

   What|Removed |Added

 CC||commandl...@protonmail.com

--- Comment #36 from Joris L. (commandl...@protonmail.com) ---
I also see these kinds of errors on EL8 with kernel 4.18.0-348.20.1.el8_5.x86_64

I've been tracking a webkit bug for some time with similar impact. That webkit
bug caused hard freezes, but here the system does not always freeze; it can
recover.

Since the webkit bug originated in the browser and was specific to some URLs
only, I considered it highly likely to be specific to JavaScript.

Now also the impact is Javascript/NodeJS specific.

The URL which now caused this freeze was while writing content on LinkedIn.com

Before the most recent 'partial freeze' there was a 'full freeze' where the
messages such as '[drm:amdgpu_cs_ioctl [amdgpu]] *ERROR* Failed to initialize
parser -125!' were preceded by a lengthy evolution of the problem.

--

[ma mrt 21 17:06:55 2022] perf: interrupt took too long (2510 > 2500), lowering
kernel.perf_event_max_sample_rate to 79000
[ma mrt 21 17:09:27 2022] [drm:amdgpu_dm_commit_planes [amdgpu]] *ERROR*
Waiting for fences timed out!
[ma mrt 21 17:09:32 2022] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring sdma0
timeout, signaled seq=266035, emitted seq=266036
[ma mrt 21 17:09:32 2022] [drm:amdgpu_dm_commit_planes [amdgpu]] *ERROR*
Waiting for fences timed out!
[ma mrt 21 17:09:32 2022] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process
information: process  pid 0 thread  pid 0
[ma mrt 21 17:09:32 2022] amdgpu :05:00.0: GPU reset begin!
[ma mrt 21 17:09:32 2022] [drm] free PSP TMR buffer
[ma mrt 21 17:09:32 2022] amdgpu :05:00.0: MODE2 reset
[ma mrt 21 17:09:32 2022] amdgpu :05:00.0: GPU reset succeeded, trying to
resume
[ma mrt 21 17:09:32 2022] [drm] PCIE GART of 1024M enabled (table at
0x00F40090).
[ma mrt 21 17:09:32 2022] [drm] PSP is resuming...
[ma mrt 21 17:09:32 2022] [drm] reserve 0x40 from 0xf47fc0 for PSP TMR
[ma mrt 21 17:09:32 2022] amdgpu :05:00.0: RAS: optional ras ta ucode is
not available
[ma mrt 21 17:09:32 2022] amdgpu :05:00.0: RAP: optional rap ta ucode is
not available
[ma mrt 21 17:09:32 2022] [drm] kiq ring mec 2 pipe 1 q 0
[ma mrt 21 17:09:33 2022] WARNING: CPU: 5 PID: 25470 at
drivers/gpu/drm/amd/amdgpu/../display/dc/core/dc.c:942
dc_commit_state_no_check+0x404/0x980 [amdgpu]
[ma mrt 21 17:09:33 2022] Modules linked in: snd_seq_dummy snd_hrtimer uinput
xt_CHECKSUM ipt_MASQUERADE xt_conntrack ipt_REJECT nft_compat nf_nat_tftp
nft_objref nf_conntrack_tftp nft_counter tun bridge stp llc nft_fib_inet
nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6
nft_reject nft_ct nf_tables_set nft_chain_nat nf_nat nf_conntrack
nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables libcrc32c nfnetlink sunrpc vfat
fat intel_rapl_msr wmi_bmof intel_rapl_common edac_mce_amd rtw88_8822be
snd_ctl_led rtw88_8822b snd_hda_codec_conexant kvm_amd rtw88_pci
snd_hda_codec_generic snd_hda_codec_hdmi uvcvideo ccp kvm rtw88_core
videobuf2_vmalloc irqbypass rapl snd_hda_intel joydev mac80211 videobuf2_memops
videobuf2_v4l2 pcspkr videobuf2_common snd_intel_dspcfg videodev
snd_intel_sdw_acpi snd_hda_codec snd_hda_core cfg80211 k10temp snd_hwdep
snd_seq snd_seq_device snd_pcm libarc4 snd_timer rtsx_pci_ms thinkpad_acpi
sp5100_tco ledtrig_audio snd_rn_pci_acp3x memstick snd i2c_piix4
[ma mrt 21 17:09:33 2022]  soundcore rfkill wmi video i2c_scmi acpi_cpufreq
ext4 mbcache jbd2 dm_crypt mmc_block sd_mod sg amdgpu rtsx_pci_sdmmc mmc_core
drm_ttm_helper ttm iommu_v2 gpu_sched i2c_algo_bit drm_kms_helper
crct10dif_pclmul crc32_pclmul syscopyarea sysfillrect crc32c_intel ahci
sysimgblt fb_sys_fops libahci drm ghash_clmulni_intel libata serio_raw nvme
nvme_core r8169 rtsx_pci realtek t10_pi dm_mirror dm_region_hash dm_log dm_mod
fuse
[ma mrt 21 17:09:33 2022] CPU: 5 PID: 25470 Comm: kworker/5:3 Kdump: loaded Not
tainted 4.18.0-348.20.1.el8_5.x86_64 #1
[ma mrt 21 17:09:33 2022] Hardware name: LENOVO 20NFGE/20NFGE, BIOS
R11ET44P (1.24 ) 01/26/2022
[ma mrt 21 17:09:33 2022] Workqueue: events drm_sched_job_timedout [gpu_sched]
[ma mrt 21 17:09:33 2022] RIP: 0010:dc_commit_state_no_check+0x404/0x980
[amdgpu]
[ma mrt 21 17:09:33 2022] Code: 74 e2 49 3b 56 08 75 dc 48 8b 93 f8 e8 00 00 48
85 d2 74 d0 48 89 de 4c 89 f7 e8 d7 58 9c c6 eb c3 80 b8 80 03 00 00 00 74 02
<0f> 0b 48 81 c5 d8 04 00 00 49 39 ed 0f 85 d9 02 00 00 48 8b 93 b8
[ma mrt 21 17:09:33 2022] RSP: 0018:a2e14ae7bc20 EFLAGS: 00010202
[ma mrt 21 17:09:33 2022] RAX: 89a339309400 RBX: 89a1e440 RCX:
0002
[ma mrt 21 17:09:33 2022] RDX: 0e60 RSI: 08f8 RDI:
0baa349077ea
[ma mrt 21 17:09:33 2022] RBP: 89a3441e06c0 R08: a2e14ae7bb74 R09:

[ma mrt 

Re: [PATCH v4 1/4] drm: allow passing possible_crtcs to drm_writeback_connector_init()

2022-03-21 Thread Liviu Dudau
Hi,

On Thu, Mar 17, 2022 at 06:45:33PM -0700, Abhinav Kumar wrote:
> Clients of drm_writeback_connector_init() initialize the
> possible_crtcs and then invoke the call to this API.
> 
> To simplify things, allow passing possible_crtcs as a parameter
> to drm_writeback_connector_init() and make changes to the
> other drm drivers to make them compatible with this change.
> 
> changes in v4:
>  - keep only changes related to possible_crtcs
>- add line breaks after ARRAY_SIZE
>- stop using temporary variables for possible_crtcs
> 
> Signed-off-by: Abhinav Kumar 
> ---
>  drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c | 3 +--
>  drivers/gpu/drm/arm/malidp_mw.c  | 4 ++--
>  drivers/gpu/drm/drm_writeback.c  | 6 +-
>  drivers/gpu/drm/rcar-du/rcar_du_writeback.c  | 4 ++--
>  drivers/gpu/drm/vc4/vc4_txp.c| 3 ++-
>  drivers/gpu/drm/vkms/vkms_writeback.c| 4 ++--
>  include/drm/drm_writeback.h  | 2 +-
>  7 files changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c 
> b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
> index e465cc4..40774e6 100644
> --- a/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
> +++ b/drivers/gpu/drm/arm/display/komeda/komeda_wb_connector.c
> @@ -155,7 +155,6 @@ static int komeda_wb_connector_add(struct komeda_kms_dev 
> *kms,
>   kwb_conn->wb_layer = kcrtc->master->wb_layer;
>  
>   wb_conn = _conn->base;
> - wb_conn->encoder.possible_crtcs = BIT(drm_crtc_index(>base));
>  
>   formats = komeda_get_layer_fourcc_list(>fmt_tbl,
>  kwb_conn->wb_layer->layer_type,
> @@ -164,7 +163,7 @@ static int komeda_wb_connector_add(struct komeda_kms_dev 
> *kms,
>   err = drm_writeback_connector_init(>base, wb_conn,
>  _wb_connector_funcs,
>  _wb_encoder_helper_funcs,
> -formats, n_formats);
> +formats, n_formats, 
> BIT(drm_crtc_index(>base)));
>   komeda_put_fourcc_list(formats);
>   if (err) {
>   kfree(kwb_conn);
> diff --git a/drivers/gpu/drm/arm/malidp_mw.c b/drivers/gpu/drm/arm/malidp_mw.c
> index f5847a7..e54921d 100644
> --- a/drivers/gpu/drm/arm/malidp_mw.c
> +++ b/drivers/gpu/drm/arm/malidp_mw.c
> @@ -212,7 +212,6 @@ int malidp_mw_connector_init(struct drm_device *drm)
>   if (!malidp->dev->hw->enable_memwrite)
>   return 0;
>  
> - malidp->mw_connector.encoder.possible_crtcs = 1 << 
> drm_crtc_index(>crtc);
>   drm_connector_helper_add(>mw_connector.base,
>_mw_connector_helper_funcs);
>  
> @@ -223,7 +222,8 @@ int malidp_mw_connector_init(struct drm_device *drm)
>   ret = drm_writeback_connector_init(drm, >mw_connector,
>  _mw_connector_funcs,
>  _mw_encoder_helper_funcs,
> -formats, n_formats);
> +formats, n_formats,
> +   (1 << drm_crtc_index(>crtc)));
>   kfree(formats);
>   if (ret)
>   return ret;
> diff --git a/drivers/gpu/drm/drm_writeback.c b/drivers/gpu/drm/drm_writeback.c
> index dccf4504..dc2ef12 100644
> --- a/drivers/gpu/drm/drm_writeback.c
> +++ b/drivers/gpu/drm/drm_writeback.c
> @@ -157,6 +157,7 @@ static const struct drm_encoder_funcs 
> drm_writeback_encoder_funcs = {
>   * @enc_helper_funcs: Encoder helper funcs vtable to be used by the internal 
> encoder
>   * @formats: Array of supported pixel formats for the writeback engine
>   * @n_formats: Length of the formats array
> + * @possible_crtcs: possible crtcs for the internal writeback encoder
>   *
>   * This function creates the writeback-connector-specific properties if they
>   * have not been already created, initializes the connector as
> @@ -174,7 +175,7 @@ int drm_writeback_connector_init(struct drm_device *dev,
>struct drm_writeback_connector *wb_connector,
>const struct drm_connector_funcs *con_funcs,
>const struct drm_encoder_helper_funcs 
> *enc_helper_funcs,
> -  const u32 *formats, int n_formats)
> +  const u32 *formats, int n_formats, uint32_t 
> possible_crtcs)
>  {
>   struct drm_property_blob *blob;
>   struct drm_connector *connector = _connector->base;
> @@ -190,6 +191,9 @@ int drm_writeback_connector_init(struct drm_device *dev,
>   return PTR_ERR(blob);
>  
>   drm_encoder_helper_add(_connector->encoder, enc_helper_funcs);
> +
> + wb_connector->encoder.possible_crtcs 

[PATCH v13 10/13] drm/i915/guc: Extract GuC error capture lists on G2H notification.

2022-03-21 Thread Alan Previn
- Upon the G2H Notify-Err-Capture event, parse through the
  GuC Log Buffer (error-capture-subregion) and generate one or
  more capture-nodes. A single node represents a single "engine-
  instance-capture-dump" and contains at least 3 register lists:
  global, engine-class and engine-instance. An internal link
  list is maintained to store one or more nodes.
- Because the link-list node generation happens before the call
  to i915_gpu_coredump, duplicate global and engine-class register
  lists for each engine-instance register dump if we find
  dependent-engine resets in an engine-capture-group.
- When i915_gpu_coredump calls into capture_engine (in a
  subsequent patch), we detach the matching node (guc-id,
  LRCA, etc.) from the link list above and attach it to
  i915_gpu_coredump's intel_engine_coredump structure when we have
  a matching LRCA/guc-id/engine-instance.

Additional notes to be aware of:
- GuC generates the error capture dump into the GuC log buffer, but
  this buffer is one big log buffer with 3 independent subregions
  within it. Each subregion is populated with different content
  and used in different ways and timings, but all regions
  behave as independent ring buffers. Each guc-log subregion
  (general-logs, crash-dump and error-capture) has its own
  guc_log_buffer_state that contains independent read and write
  pointers.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/abi/guc_actions_abi.h  |   7 +
 drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h |  56 ++
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 561 +-
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h|   1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c|  26 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.h|   4 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  10 +-
 7 files changed, 652 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h 
b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
index f9b3dd146a7e..9ad6df1b6fbc 100644
--- a/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
+++ b/drivers/gpu/drm/i915/gt/uc/abi/guc_actions_abi.h
@@ -172,4 +172,11 @@ enum intel_guc_sleep_state_status {
 #define GUC_LOG_CONTROL_VERBOSITY_MASK (0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT)
 #define GUC_LOG_CONTROL_DEFAULT_LOGGING(1 << 8)
 
+enum intel_guc_state_capture_event_status {
+   INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_SUCCESS = 0x0,
+   INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE = 0x1,
+};
+
+#define INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_MASK  0x00FF
+
 #endif /* _ABI_GUC_ACTIONS_ABI_H */
diff --git a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h 
b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
index 8824c5eba355..5d959e62d146 100644
--- a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
@@ -12,6 +12,52 @@
 struct intel_guc;
 struct file;
 
+/**
+ * struct __guc_capture_bufstate
+ *
+ * Book-keeping structure used to track read and write pointers
+ * as we extract error capture data from the GuC-log-buffer's
+ * error-capture region as a stream of dwords.
+ */
+struct __guc_capture_bufstate {
+   u32 size;
+   void *data;
+   u32 rd;
+   u32 wr;
+};
+
+/**
+ * struct __guc_capture_parsed_output - extracted error capture node
+ *
+ * A single unit of extracted error-capture output data grouped together
+ * at an engine-instance level. We keep these nodes in a linked list.
+ * See outlist below.
+ */
+struct __guc_capture_parsed_output {
+   /*
+* A single set of 3 capture lists: a global-list
+* an engine-class-list and an engine-instance list.
+* outlist in __guc_capture_parsed_output will keep
+* a linked list of these nodes that will eventually
+* be detached from outlist and attached into to
+* i915_gpu_codedump in response to a context reset
+*/
+   struct list_head link;
+   bool is_partial;
+   u32 eng_class;
+   u32 eng_inst;
+   u32 guc_id;
+   u32 lrca;
+   struct gcap_reg_list_info {
+   u32 vfid;
+   u32 num_regs;
+   struct guc_mmio_reg *regs;
+   } reginfo[GUC_CAPTURE_LIST_TYPE_MAX];
+#define GCAP_PARSED_REGLIST_INDEX_GLOBAL   BIT(GUC_CAPTURE_LIST_TYPE_GLOBAL)
+#define GCAP_PARSED_REGLIST_INDEX_ENGCLASS 
BIT(GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS)
+#define GCAP_PARSED_REGLIST_INDEX_ENGINST  
BIT(GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE)
+};
+
 /**
  * struct guc_debug_capture_list_header / struct guc_debug_capture_list
  *
@@ -142,6 +188,16 @@ struct intel_guc_state_capture {
[GUC_CAPTURE_LIST_TYPE_MAX]
[GUC_MAX_ENGINE_CLASSES];
void *ads_null_cache;
+
+   /**
+* @outlist: allocated nodes with parsed engine-instance error capture 
data
+*
+* A linked list of parsed GuC 

[PATCH v13 12/13] drm/i915/guc: Plumb GuC-capture into gpu_coredump

2022-03-21 Thread Alan Previn
Add a flags parameter through all of the coredump creation
functions. Add a bitmask flag to indicate if the top
level gpu_coredump event is triggered in response to
a GuC context reset notification.

Using that flag, ensure all coredump functions that
read or print mmio-register values related to work submission
or command-streamer engines are skipped and replaced with
calls to the equivalent guc-capture module functions to retrieve
or print the register dump.

While here, split out display related register reading
and printing into its own function that is called agnostic
to whether GuC had triggered the reset.

For now, introduce an empty printing function that can be
filled in by a subsequent patch just to handle formatting.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 .../drm/i915/gt/intel_execlists_submission.c  |   4 +-
 drivers/gpu/drm/i915/gt/intel_reset.c |   2 +-
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c|  70 +
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h|   9 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   2 +-
 drivers/gpu/drm/i915/i915_debugfs.c   |   2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c | 266 --
 drivers/gpu/drm/i915/i915_gpu_error.h |  30 +-
 8 files changed, 288 insertions(+), 97 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c 
b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index e1470bb60f34..738c120490fc 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -2236,11 +2236,11 @@ static struct execlists_capture *capture_regs(struct 
intel_engine_cs *engine)
if (!cap->error)
goto err_cap;
 
-   cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
+   cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp, 
CORE_DUMP_FLAG_NONE);
if (!cap->error->gt)
goto err_gpu;
 
-   cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
+   cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp, 
CORE_DUMP_FLAG_NONE);
if (!cap->error->gt->engine)
goto err_gt;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c 
b/drivers/gpu/drm/i915/gt/intel_reset.c
index a6ae213c7d89..f52015e79fdf 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -1319,7 +1319,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
engine_mask &= gt->info.engine_mask;
 
if (flags & I915_ERROR_CAPTURE) {
-   i915_capture_error_state(gt, engine_mask);
+   i915_capture_error_state(gt, engine_mask, CORE_DUMP_FLAG_NONE);
intel_gt_clear_error_registers(gt, engine_mask);
}
 
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index b279d80e4772..a4f44755d3d3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -10,6 +10,7 @@
 #include "gt/intel_engine_regs.h"
 #include "gt/intel_gt.h"
 #include "gt/intel_gt_regs.h"
+#include "gt/intel_lrc.h"
 #include "guc_capture_fwif.h"
 #include "intel_guc_capture.h"
 #include "intel_guc_fwif.h"
@@ -754,6 +755,18 @@ intel_guc_capture_output_min_size_est(struct intel_guc 
*guc)
  *   data from GuC and then it's added into 
guc->capture->outlist linked
  *   list. This list is used for matchup and printout by 
i915_gpu_coredump
  *   and err_print_gt, (when user invokes the error capture 
sysfs).
+ *
+ * GUC --> notify context reset:
+ * -
+ * --> G2H CONTEXT RESET
+ *   L--> guc_handle_context_reset --> i915_capture_error_state
+ *  L--> i915_gpu_coredump(..IS_GUC_CAPTURE) --> 
gt_record_engines
+ *   --> capture_engine(..IS_GUC_CAPTURE)
+ *   L--> intel_guc_capture_get_matching_node is 
where
+ *detach C from internal linked list and 
add it into
+ *intel_engine_coredump struct (if the 
context and
+ *engine of the event notification matches 
a node
+ *in the link list).
  */
 
 static int guc_capture_buf_cnt(struct __guc_capture_bufstate *buf)
@@ -1369,6 +1382,63 @@ static void __guc_capture_process_output(struct 
intel_guc *guc)
__guc_capture_flushlog_complete(guc);
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
+
+int intel_guc_capture_print_engine_node(struct drm_i915_error_state_buf *ebuf,
+   const struct intel_engine_coredump *ee)
+{
+   return 0;
+}
+
+#endif //CONFIG_DRM_I915_CAPTURE_ERROR
+
+void intel_guc_capture_free_node(struct intel_engine_coredump *ee)
+{
+   if (!ee || 

[PATCH v13 08/13] drm/i915/guc: Add capture region into intel_guc_log

2022-03-21 Thread Alan Previn
GuC log buffer regions for debug-log-events, crash-dumps and
error-state-capture are all part of a single bo allocation that
also includes the guc_log_buffer_state structures. Now that we
support it, increase the size allocation for error-capture.

Since the error-capture region is accessed at non-deterministic
times (as part of GuC triggered context reset) while debug-log-
events region is accessed as part of relay logging or during
debugfs triggered dumps, move the mapping and unmapping of the
shared buffer into intel_guc_log_create and intel_guc_log_destroy
so that it's always mapped throughout the life of GuC operation.

Additionally, while here, update the guc log region layout
diagram to follow the order according to the enum definition
as per the GuC interface.

NOTE: A future effort to visit (part of baseline code) is that
buf_addr should be updated to be an io_sys_map and use the
io_sys_map wrapper functions to access the various GuC log
buffer regions.

Signed-off-by: Alan Previn 
Reviewed-by: Matthew Brost 
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c | 59 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.h |  3 +-
 2 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index 0d63c411080f..fe4b2d3f305d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -26,7 +26,8 @@ static void guc_log_copy_debuglogs_for_relay(struct 
intel_guc_log *log);
 static int guc_action_flush_log_complete(struct intel_guc *guc)
 {
u32 action[] = {
-   INTEL_GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE
+   INTEL_GUC_ACTION_LOG_BUFFER_FILE_FLUSH_COMPLETE,
+   GUC_DEBUG_LOG_BUFFER
};
 
return intel_guc_send(guc, action, ARRAY_SIZE(action));
@@ -137,7 +138,7 @@ static void guc_move_to_next_buf(struct intel_guc_log *log)
smp_wmb();
 
/* All data has been written, so now move the offset of sub buffer. */
-   relay_reserve(log->relay.channel, log->vma->obj->base.size);
+   relay_reserve(log->relay.channel, log->vma->obj->base.size - 
CAPTURE_BUFFER_SIZE);
 
/* Switch to the next sub buffer */
relay_flush(log->relay.channel);
@@ -213,7 +214,8 @@ static void _guc_log_copy_debuglogs_for_relay(struct 
intel_guc_log *log)
goto out_unlock;
 
/* Get the pointer to shared GuC log buffer */
-   log_buf_state = src_data = log->relay.buf_addr;
+   src_data = log->buf_addr;
+   log_buf_state = src_data;
 
/* Get the pointer to local buffer to store the logs */
log_buf_snapshot_state = dst_data = guc_get_write_buffer(log);
@@ -233,7 +235,8 @@ static void _guc_log_copy_debuglogs_for_relay(struct 
intel_guc_log *log)
src_data += PAGE_SIZE;
dst_data += PAGE_SIZE;
 
-   for (type = GUC_DEBUG_LOG_BUFFER; type < GUC_MAX_LOG_BUFFER; type++) {
+   /* For relay logging, we exclude error state capture */
+   for (type = GUC_DEBUG_LOG_BUFFER; type <= GUC_CRASH_DUMP_LOG_BUFFER; 
type++) {
/*
 * Make a copy of the state structure, inside GuC log buffer
 * (which is uncached mapped), on the stack to avoid reading
@@ -311,23 +314,17 @@ static void copy_debug_logs_work(struct work_struct *work)
 
 static int guc_log_relay_map(struct intel_guc_log *log)
 {
-   void *vaddr;
-
lockdep_assert_held(>relay.lock);
 
-   if (!log->vma)
+   if (!log->vma || !log->buf_addr)
return -ENODEV;
 
/*
-* Create a WC (Uncached for read) vmalloc mapping of log
-* buffer pages, so that we can directly get the data
-* (up-to-date) from memory.
+* WC vmalloc mapping of log buffer pages was done at
+* GuC Log Init time, but lets keep a ref for book-keeping
 */
-   vaddr = i915_gem_object_pin_map_unlocked(log->vma->obj, I915_MAP_WC);
-   if (IS_ERR(vaddr))
-   return PTR_ERR(vaddr);
-
-   log->relay.buf_addr = vaddr;
+   i915_gem_object_get(log->vma->obj);
+   log->relay.buf_in_use = true;
 
return 0;
 }
@@ -336,8 +333,8 @@ static void guc_log_relay_unmap(struct intel_guc_log *log)
 {
lockdep_assert_held(>relay.lock);
 
-   i915_gem_object_unpin_map(log->vma->obj);
-   log->relay.buf_addr = NULL;
+   i915_gem_object_put(log->vma->obj);
+   log->relay.buf_in_use = false;
 }
 
 void intel_guc_log_init_early(struct intel_guc_log *log)
@@ -443,6 +440,7 @@ int intel_guc_log_create(struct intel_guc_log *log)
 {
struct intel_guc *guc = log_to_guc(log);
struct i915_vma *vma;
+   void *vaddr;
u32 guc_log_size;
int ret;
 
@@ -450,20 +448,21 @@ int intel_guc_log_create(struct intel_guc_log *log)
 
/*
 *  GuC Log buffer Layout
+* (this ordering must follow "enum guc_log_buffer_type" 

[PATCH v13 13/13] drm/i915/guc: Print the GuC error capture output register list.

2022-03-21 Thread Alan Previn
Print the GuC captured error state register list (string names
and values) when gpu_coredump_state printout is invoked via
the i915 debugfs for flushing the gpu error-state that was
captured prior.

Since GuC could have reported multiple engine register dumps
in a single notification event, parse the captured data
(appearing as a stream of structures) to identify each dump as
a different 'engine-capture-group-output'.

Finally, for each 'engine-capture-group-output' that is found,
verify if the engine register dump corresponds to the
engine_coredump content that was previously populated by the
i915_gpu_coredump function. That function would have copied
the context's vma's including the batch buffer during the
G2H-context-reset notification that occurred earlier. Perform
this verification check by comparing guc_id, lrca and engine-
instance obtained from the 'engine-capture-group-output' vs a
copy of that same info taken during i915_gpu_coredump. If
they match, then print those vma's as well (such as the batch
buffers).

NOTE: the output format was verified using the gem_exec_capture
IGT test.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   4 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|   3 +
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 161 ++
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h|   2 +-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |   6 +-
 drivers/gpu/drm/i915/i915_debugfs.c   |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c |  16 +-
 drivers/gpu/drm/i915/i915_gpu_error.h |   5 +
 8 files changed, 183 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index a900fbb21884..98b61ff13c95 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1714,9 +1714,7 @@ static void intel_engine_print_registers(struct 
intel_engine_cs *engine,
drm_printf(m, "\tIPEHR: 0x%08x\n", ENGINE_READ(engine, IPEHR));
}
 
-   if (intel_engine_uses_guc(engine)) {
-   /* nothing to print yet */
-   } else if (HAS_EXECLISTS(dev_priv)) {
+   if (HAS_EXECLISTS(dev_priv) && !intel_engine_uses_guc(engine)) {
struct i915_request * const *port, *rq;
const u32 *hws =
>status_page.addr[I915_HWS_CSB_BUF0_INDEX];
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index de32367831c6..4e431c14b118 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -438,6 +438,9 @@ int intel_guc_engine_failure_process_msg(struct intel_guc 
*guc,
 int intel_guc_error_capture_process_msg(struct intel_guc *guc,
const u32 *msg, u32 len);
 
+struct intel_engine_cs *
+intel_guc_lookup_engine(struct intel_guc *guc, u8 guc_class, u8 instance);
+
 void intel_guc_find_hung_context(struct intel_engine_cs *engine);
 
 int intel_guc_global_policies_update(struct intel_guc *guc);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index a4f44755d3d3..eac2dc4bdfae 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -767,6 +767,21 @@ intel_guc_capture_output_min_size_est(struct intel_guc 
*guc)
  *intel_engine_coredump struct (if the 
context and
  *engine of the event notification matches 
a node
  *in the link list).
+ *
+ * User Sysfs / Debugfs
+ * 
+ *  --> i915_gpu_coredump_copy_to_buffer->
+ *   L--> err_print_to_sgl --> err_print_gt
+ *L--> error_print_guc_captures
+ * L--> intel_guc_capture_print_node prints the
+ *  register lists values of the attached node
+ *  on the error-engine-dump being reported.
+ *   L--> i915_reset_error_state ... 
-->__i915_gpu_coredump_free
+ *L--> ... cleanup_gt -->
+ * L--> intel_guc_capture_free_node returns the
+ *  capture-output-node back to the internal
+ *  cachelist for reuse.
+ *
  */
 
 static int guc_capture_buf_cnt(struct __guc_capture_bufstate *buf)
@@ -1384,9 +1399,155 @@ static void __guc_capture_process_output(struct 
intel_guc *guc)
 
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 
+static const char *
+guc_capture_reg_to_str(const struct intel_guc *guc, u32 owner, u32 type,
+  u32 class, u32 id, u32 offset, u32 *is_ext)
+{
+   const struct __guc_mmio_reg_descr_group *reglists = 

[PATCH v13 01/13] drm/i915/guc: Update GuC ADS size for error capture lists

2022-03-21 Thread Alan Previn
Update GuC ADS size allocation to include space for
the lists of error state capture register descriptors.

Then, populate GuC ADS with the lists of registers we want
GuC to report back to host on engine reset events. This list
should include global, engine-class and engine-instance
registers for every engine-class type on the current hardware.

Ensure we allocate a persistent store for the register lists
that are populated into ADS so that we don't need to allocate
memory during GT resets when GuC is reloaded and ADS population
happens again.

NOTE: Start with a sample static table of register lists to
lay out the framework before adding real registers in a subsequent
patch. These static register tables are in a different format from
the ADS populated list.

Signed-off-by: Alan Previn 
Reviewed-by: Matthew Brost 
---
 drivers/gpu/drm/i915/Makefile |   1 +
 drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h |  91 +
 drivers/gpu/drm/i915/gt/uc/intel_guc.c|  13 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc.h|   9 +-
 drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c| 127 +-
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 374 ++
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h|  22 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   8 +
 8 files changed, 628 insertions(+), 17 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
 create mode 100644 drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
 create mode 100644 drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index a54e84e05466..d34f625221e2 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -184,6 +184,7 @@ i915-y += gt/uc/intel_uc.o \
  gt/uc/intel_uc_fw.o \
  gt/uc/intel_guc.o \
  gt/uc/intel_guc_ads.o \
+ gt/uc/intel_guc_capture.o \
  gt/uc/intel_guc_ct.o \
  gt/uc/intel_guc_debugfs.o \
  gt/uc/intel_guc_fw.o \
diff --git a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h 
b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
new file mode 100644
index ..919ed985f09a
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2021-2022 Intel Corporation
+ */
+
+#ifndef _INTEL_GUC_CAPTURE_FWIF_H
+#define _INTEL_GUC_CAPTURE_FWIF_H
+
+#include 
+#include "intel_guc_fwif.h"
+
+struct intel_guc;
+struct file;
+
+/**
+ * struct guc_debug_capture_list_header / struct guc_debug_capture_list
+ *
+ * As part of ADS registration, these header structures (followed by
+ * an array of 'struct guc_mmio_reg' entries) are used to register with
+ * GuC microkernel the list of registers we want it to dump out prior
+ * to a engine reset.
+ */
+struct guc_debug_capture_list_header {
+   u32 info;
+#define GUC_CAPTURELISTHDR_NUMDESCR GENMASK(15, 0)
+} __packed;
+
+struct guc_debug_capture_list {
+   struct guc_debug_capture_list_header header;
+   struct guc_mmio_reg regs[0];
+} __packed;
+
+/**
+ * struct __guc_mmio_reg_descr / struct __guc_mmio_reg_descr_group
+ *
+ * intel_guc_capture module uses these structures to maintain static
+ * tables (per unique platform) that consists of lists of registers
+ * (offsets, names, flags,...) that are used at the ADS regisration
+ * time as well as during runtime processing and reporting of error-
+ * capture states generated by GuC just prior to engine reset events.
+ */
+struct __guc_mmio_reg_descr {
+   i915_reg_t reg;
+   u32 flags;
+   u32 mask;
+   const char *regname;
+};
+
+struct __guc_mmio_reg_descr_group {
+   const struct __guc_mmio_reg_descr *list;
+   u32 num_regs;
+   u32 owner; /* see enum guc_capture_owner */
+   u32 type; /* see enum guc_capture_type */
+   u32 engine; /* as per MAX_ENGINE_CLASS */
+};
+
+/**
+ * struct __guc_capture_ads_cache
+ *
+ * A structure to cache register lists that were populated and registered
+ * with GuC at startup during ADS registration. This allows much quicker
+ * GuC resets without re-parsing all the tables for the given gt.
+ */
+struct __guc_capture_ads_cache {
+   bool is_valid;
+   void *ptr;
+   size_t size;
+   int status;
+};
+
+/**
+ * struct intel_guc_state_capture
+ *
+ * Internal context of the intel_guc_capture module.
+ */
+struct intel_guc_state_capture {
+   /**
+* @reglists: static table of register lists used for error-capture 
state.
+*/
+   const struct __guc_mmio_reg_descr_group *reglists;
+
+   /**
+* @ads_cache: cached register lists that is ADS format ready
+*/
+   struct __guc_capture_ads_cache ads_cache[GUC_CAPTURE_LIST_INDEX_MAX]
+   [GUC_CAPTURE_LIST_TYPE_MAX]
+   [GUC_MAX_ENGINE_CLASSES];
+   void *ads_null_cache;
+};
+
+#endif /* 

[PATCH v13 09/13] drm/i915/guc: Check sizing of guc_capture output

2022-03-21 Thread Alan Previn
Add intel_guc_capture_output_min_size_est function to
provide a reasonable minimum size for error-capture
region before allocating the shared buffer.

Signed-off-by: Alan Previn 
Reviewed-by: Matthew Brost 
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 48 +++
 .../gpu/drm/i915/gt/uc/intel_guc_capture.h|  1 +
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c|  7 ++-
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 63ef407a2fd0..f87fee216430 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -663,6 +663,54 @@ intel_guc_capture_getnullheader(struct intel_guc *guc,
return 0;
 }
 
+#define GUC_CAPTURE_OVERBUFFER_MULTIPLIER 3
+int
+intel_guc_capture_output_min_size_est(struct intel_guc *guc)
+{
+   struct intel_gt *gt = guc_to_gt(guc);
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   int worst_min_size = 0, num_regs = 0;
+   size_t tmp = 0;
+
+   /*
+* If every single engine-instance suffered a failure in quick 
succession but
+* were all unrelated, then a burst of multiple error-capture events 
would dump
+* registers for every one engine instance, one at a time. In this 
case, GuC
+* would even dump the global-registers repeatedly.
+*
+* For each engine instance, there would be 1 x 
guc_state_capture_group_t output
+* followed by 3 x guc_state_capture_t lists. The latter is how the 
register
+* dumps are split across different register types (where the '3' are 
global vs class
+* vs instance). Finally, let's multiply the whole thing by 3x (just so 
we are
+* not limited to just 1 round of data in a worst case full register 
dump log)
+*
+* NOTE: intel_guc_log that allocates the log buffer would round this 
size up to
+* a power of two.
+*/
+
+   for_each_engine(engine, gt, id) {
+   worst_min_size += sizeof(struct 
guc_state_capture_group_header_t) +
+ (3 * sizeof(struct 
guc_state_capture_header_t));
+
+   if (!intel_guc_capture_getlistsize(guc, 0, 
GUC_CAPTURE_LIST_TYPE_GLOBAL, 0, ))
+   num_regs += tmp;
+
+   if (!intel_guc_capture_getlistsize(guc, 0, 
GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS,
+  engine->class, )) {
+   num_regs += tmp;
+   }
+   if (!intel_guc_capture_getlistsize(guc, 0, 
GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE,
+  engine->class, )) {
+   num_regs += tmp;
+   }
+   }
+
+   worst_min_size += (num_regs * sizeof(struct guc_mmio_reg));
+
+   return (worst_min_size * GUC_CAPTURE_OVERBUFFER_MULTIPLIER);
+}
+
 static void
 guc_capture_free_ads_cache(struct intel_guc_state_capture *gc)
 {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
index 8de7704e12eb..540d72079462 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.h
@@ -11,6 +11,7 @@
 struct guc_gt_system_info;
 struct intel_guc;
 
+int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
 int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 
classid,
  void **outptr);
 int intel_guc_capture_getlistsize(struct intel_guc *guc, u32 owner, u32 type, 
u32 classid,
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index fe4b2d3f305d..ed05b1a04f9c 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -7,10 +7,11 @@
 #include 
 
 #include "gt/intel_gt.h"
+#include "intel_guc_capture.h"
+#include "intel_guc_log.h"
 #include "i915_drv.h"
 #include "i915_irq.h"
 #include "i915_memcpy.h"
-#include "intel_guc_log.h"
 
 static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);
 
@@ -466,6 +467,10 @@ int intel_guc_log_create(struct intel_guc_log *log)
 *  | Capture logs  |
 *  +===+ + CAPTURE_SIZE
 */
+   if (intel_guc_capture_output_min_size_est(guc) > CAPTURE_BUFFER_SIZE)
+   DRM_WARN("GuC log buffer for state_capture maybe too small. %d 
< %d\n",
+CAPTURE_BUFFER_SIZE, 
intel_guc_capture_output_min_size_est(guc));
+
guc_log_size = PAGE_SIZE + CRASH_BUFFER_SIZE + DEBUG_BUFFER_SIZE +
   CAPTURE_BUFFER_SIZE;
 
-- 
2.25.1



[PATCH v13 06/13] drm/i915/guc: Add GuC's error state capture output structures.

2022-03-21 Thread Alan Previn
Add GuC's error capture output structures and definitions as
they would appear in GuC log buffer's error capture subregion after
an error state capture G2H event notification.

Signed-off-by: Alan Previn 
Reviewed-by: Matthew Brost 
---
 drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h 
b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
index 6c199433945d..8824c5eba355 100644
--- a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
@@ -55,6 +55,53 @@ struct __guc_mmio_reg_descr_group {
struct __guc_mmio_reg_descr *extlist; /* only used for steered 
registers */
 };
 
+/**
+ * struct guc_state_capture_header_t / struct guc_state_capture_t /
+ * guc_state_capture_group_header_t / guc_state_capture_group_t
+ *
+ * Prior to resetting engines that have hung or faulted, GuC microkernel
+ * reports the engine error-state (register values that was read) by
+ * logging them into the shared GuC log buffer using these hierarchy
+ * of structures.
+ */
+struct guc_state_capture_header_t {
+   u32 owner;
+#define CAP_HDR_CAPTURE_VFID GENMASK(7, 0)
+   u32 info;
+#define CAP_HDR_CAPTURE_TYPE GENMASK(3, 0) /* see enum guc_capture_type */
+#define CAP_HDR_ENGINE_CLASS GENMASK(7, 4) /* see GUC_MAX_ENGINE_CLASSES */
+#define CAP_HDR_ENGINE_INSTANCE GENMASK(11, 8)
+   u32 lrca; /* if type-instance, LRCA (address) that hung, else set to ~0 
*/
+   u32 guc_id; /* if type-instance, context index of hung context, else 
set to ~0 */
+   u32 num_mmios;
+#define CAP_HDR_NUM_MMIOS GENMASK(9, 0)
+} __packed;
+
+struct guc_state_capture_t {
+   struct guc_state_capture_header_t header;
+   struct guc_mmio_reg mmio_entries[0];
+} __packed;
+
+enum guc_capture_group_types {
+   GUC_STATE_CAPTURE_GROUP_TYPE_FULL,
+   GUC_STATE_CAPTURE_GROUP_TYPE_PARTIAL,
+   GUC_STATE_CAPTURE_GROUP_TYPE_MAX,
+};
+
+struct guc_state_capture_group_header_t {
+   u32 owner;
+#define CAP_GRP_HDR_CAPTURE_VFID GENMASK(7, 0)
+   u32 info;
+#define CAP_GRP_HDR_NUM_CAPTURES GENMASK(7, 0)
+#define CAP_GRP_HDR_CAPTURE_TYPE GENMASK(15, 8) /* guc_capture_group_types */
+} __packed;
+
+/* this is the top level structure where an error-capture dump starts */
+struct guc_state_capture_group_t {
+   struct guc_state_capture_group_header_t grp_header;
+   struct guc_state_capture_t capture_entries[0];
+} __packed;
+
 /**
  * struct __guc_capture_ads_cache
  *
-- 
2.25.1



[PATCH v13 07/13] drm/i915/guc: Update GuC-log relay function names

2022-03-21 Thread Alan Previn
For the sake of better code readability, change previous
relay logging function names with "capture_logs" to
"copy_debug_logs" to differentiate from error capture
functions that will use a different region of the same buffer.

Signed-off-by: Alan Previn 
Reviewed-by: Matthew Brost 
---
 drivers/gpu/drm/i915/gt/uc/intel_guc_log.c | 35 --
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
index a24dc6441872..0d63c411080f 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_log.c
@@ -12,7 +12,7 @@
 #include "i915_memcpy.h"
 #include "intel_guc_log.h"
 
-static void guc_log_capture_logs(struct intel_guc_log *log);
+static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log);
 
 /**
  * DOC: GuC firmware log
@@ -198,7 +198,7 @@ static unsigned int guc_get_log_buffer_size(enum 
guc_log_buffer_type type)
return 0;
 }
 
-static void guc_read_update_log_buffer(struct intel_guc_log *log)
+static void _guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log)
 {
unsigned int buffer_size, read_offset, write_offset, bytes_to_copy, 
full_cnt;
struct guc_log_buffer_state *log_buf_state, *log_buf_snapshot_state;
@@ -223,7 +223,7 @@ static void guc_read_update_log_buffer(struct intel_guc_log 
*log)
 * Used rate limited to avoid deluge of messages, logs might be
 * getting consumed by User at a slow rate.
 */
-   DRM_ERROR_RATELIMITED("no sub-buffer to capture logs\n");
+   DRM_ERROR_RATELIMITED("no sub-buffer to copy general logs\n");
log->relay.full_count++;
 
goto out_unlock;
@@ -301,15 +301,15 @@ static void guc_read_update_log_buffer(struct 
intel_guc_log *log)
mutex_unlock(>relay.lock);
 }
 
-static void capture_logs_work(struct work_struct *work)
+static void copy_debug_logs_work(struct work_struct *work)
 {
struct intel_guc_log *log =
container_of(work, struct intel_guc_log, relay.flush_work);
 
-   guc_log_capture_logs(log);
+   guc_log_copy_debuglogs_for_relay(log);
 }
 
-static int guc_log_map(struct intel_guc_log *log)
+static int guc_log_relay_map(struct intel_guc_log *log)
 {
void *vaddr;
 
@@ -332,7 +332,7 @@ static int guc_log_map(struct intel_guc_log *log)
return 0;
 }
 
-static void guc_log_unmap(struct intel_guc_log *log)
+static void guc_log_relay_unmap(struct intel_guc_log *log)
 {
lockdep_assert_held(>relay.lock);
 
@@ -343,7 +343,7 @@ static void guc_log_unmap(struct intel_guc_log *log)
 void intel_guc_log_init_early(struct intel_guc_log *log)
 {
mutex_init(>relay.lock);
-   INIT_WORK(>relay.flush_work, capture_logs_work);
+   INIT_WORK(>relay.flush_work, copy_debug_logs_work);
log->relay.started = false;
 }
 
@@ -358,8 +358,11 @@ static int guc_log_relay_create(struct intel_guc_log *log)
lockdep_assert_held(>relay.lock);
GEM_BUG_ON(!log->vma);
 
-/* Keep the size of sub buffers same as shared log buffer */
-   subbuf_size = log->vma->size;
+/*
+ * Keep the size of sub buffers same as shared log buffer
+ * but GuC log-events excludes the error-state-capture logs
+ */
+   subbuf_size = log->vma->size - CAPTURE_BUFFER_SIZE;
 
/*
 * Store up to 8 snapshots, which is large enough to buffer sufficient
@@ -394,13 +397,13 @@ static void guc_log_relay_destroy(struct intel_guc_log 
*log)
log->relay.channel = NULL;
 }
 
-static void guc_log_capture_logs(struct intel_guc_log *log)
+static void guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log)
 {
struct intel_guc *guc = log_to_guc(log);
struct drm_i915_private *dev_priv = guc_to_gt(guc)->i915;
intel_wakeref_t wakeref;
 
-   guc_read_update_log_buffer(log);
+   _guc_log_copy_debuglogs_for_relay(log);
 
/*
 * Generally device is expected to be active only at this
@@ -566,7 +569,7 @@ int intel_guc_log_relay_open(struct intel_guc_log *log)
if (ret)
goto out_unlock;
 
-   ret = guc_log_map(log);
+   ret = guc_log_relay_map(log);
if (ret)
goto out_relay;
 
@@ -616,8 +619,8 @@ void intel_guc_log_relay_flush(struct intel_guc_log *log)
with_intel_runtime_pm(guc_to_gt(guc)->uncore->rpm, wakeref)
guc_action_flush_log(guc);
 
-   /* GuC would have updated log buffer by now, so capture it */
-   guc_log_capture_logs(log);
+   /* GuC would have updated log buffer by now, so copy it */
+   guc_log_copy_debuglogs_for_relay(log);
 }
 
 /*
@@ -646,7 +649,7 @@ void intel_guc_log_relay_close(struct intel_guc_log *log)
 
mutex_lock(&log->relay.lock);
GEM_BUG_ON(!intel_guc_log_relay_created(log));
-   guc_log_unmap(log);
+   

[PATCH v13 03/13] drm/i915/guc: Add XE_LP steered register lists support

2022-03-21 Thread Alan Previn
Add the ability for runtime allocation and freeing of
steered register list extensions that depend on the
detected HW config fuses.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h |   9 +
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 176 --
 2 files changed, 174 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h 
b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
index 919ed985f09a..6c199433945d 100644
--- a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
@@ -52,6 +52,7 @@ struct __guc_mmio_reg_descr_group {
u32 owner; /* see enum guc_capture_owner */
u32 type; /* see enum guc_capture_type */
u32 engine; /* as per MAX_ENGINE_CLASS */
+   struct __guc_mmio_reg_descr *extlist; /* only used for steered 
registers */
 };
 
 /**
@@ -79,6 +80,14 @@ struct intel_guc_state_capture {
 */
const struct __guc_mmio_reg_descr_group *reglists;
 
+   /**
+* @extlists: allocated table of steered register lists used for 
error-capture state.
+*
+* NOTE: steered registers have multiple instances depending on the HW 
configuration
+* (slices or dual-sub-slices) and thus depends on HW fuses discovered 
at startup
+*/
+   struct __guc_mmio_reg_descr_group *extlists;
+
/**
 * @ads_cache: cached register lists that is ADS format ready
 */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 478308f33d34..45ed09f3453d 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -133,6 +133,7 @@ static const struct __guc_mmio_reg_descr empty_regs_list[] 
= {
TO_GCAP_DEF_OWNER(regsowner), \
TO_GCAP_DEF_TYPE(regstype), \
class, \
+   NULL, \
}
 
 /* List of lists */
@@ -150,28 +151,33 @@ static const struct __guc_mmio_reg_descr_group 
xe_lpd_lists[] = {
 };
 
 static const struct __guc_mmio_reg_descr_group *
-guc_capture_get_device_reglist(struct intel_guc *guc)
+guc_capture_get_one_list(const struct __guc_mmio_reg_descr_group *reglists,
+u32 owner, u32 type, u32 id)
 {
-   struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
+   int i;
 
-   if (IS_TIGERLAKE(i915) || IS_ROCKETLAKE(i915) ||
-   IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915)) {
-   return xe_lpd_lists;
+   if (!reglists)
+   return NULL;
+
+   for (i = 0; reglists[i].list; ++i) {
+   if (reglists[i].owner == owner && reglists[i].type == type &&
+   (reglists[i].engine == id || reglists[i].type == 
GUC_CAPTURE_LIST_TYPE_GLOBAL))
+   return &reglists[i];
}
 
return NULL;
 }
 
-static const struct __guc_mmio_reg_descr_group *
-guc_capture_get_one_list(const struct __guc_mmio_reg_descr_group *reglists,
-u32 owner, u32 type, u32 id)
+static struct __guc_mmio_reg_descr_group *
+guc_capture_get_one_ext_list(struct __guc_mmio_reg_descr_group *reglists,
+u32 owner, u32 type, u32 id)
 {
int i;
 
if (!reglists)
return NULL;
 
-   for (i = 0; reglists[i].list; ++i) {
+   for (i = 0; reglists[i].extlist; ++i) {
if (reglists[i].owner == owner && reglists[i].type == type &&
(reglists[i].engine == id || reglists[i].type == 
GUC_CAPTURE_LIST_TYPE_GLOBAL))
return &reglists[i];
@@ -180,6 +186,127 @@ guc_capture_get_one_list(const struct 
__guc_mmio_reg_descr_group *reglists,
return NULL;
 }
 
+static void guc_capture_free_extlists(struct __guc_mmio_reg_descr_group 
*reglists)
+{
+   int i = 0;
+
+   if (!reglists)
+   return;
+
+   while (reglists[i].extlist)
+   kfree(reglists[i++].extlist);
+}
+
+struct __ext_steer_reg {
+   const char *name;
+   i915_reg_t reg;
+};
+
+static const struct __ext_steer_reg xe_extregs[] = {
+   {"GEN7_SAMPLER_INSTDONE", GEN7_SAMPLER_INSTDONE},
+   {"GEN7_ROW_INSTDONE", GEN7_ROW_INSTDONE}
+};
+
+static void __fill_ext_reg(struct __guc_mmio_reg_descr *ext,
+  const struct __ext_steer_reg *extlist,
+  int slice_id, int subslice_id)
+{
+   ext->reg = extlist->reg;
+   ext->flags = FIELD_PREP(GUC_REGSET_STEERING_GROUP, slice_id);
+   ext->flags |= FIELD_PREP(GUC_REGSET_STEERING_INSTANCE, subslice_id);
+   ext->regname = extlist->name;
+}
+
+static int
+__alloc_ext_regs(struct __guc_mmio_reg_descr_group *newlist,
+const struct __guc_mmio_reg_descr_group *rootlist, int 
num_regs)
+{
+   struct __guc_mmio_reg_descr *list;
+
+   list = kcalloc(num_regs, sizeof(struct 

[PATCH v13 04/13] drm/i915/guc: Add DG2 registers for GuC error state capture.

2022-03-21 Thread Alan Previn
Add additional DG2 registers for GuC error state capture.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 80 ++-
 1 file changed, 77 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 45ed09f3453d..7c3d9943ecdd 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -285,20 +285,94 @@ guc_capture_alloc_steered_lists_xe_lpd(struct intel_guc 
*guc,
guc->capture->extlists = extlists;
 }
 
+static const struct __ext_steer_reg xehpg_extregs[] = {
+   {"XEHPG_INSTDONE_GEOM_SVG", XEHPG_INSTDONE_GEOM_SVG}
+};
+
+static bool __has_xehpg_extregs(u32 ipver)
+{
+   return (ipver >= IP_VER(12, 55));
+}
+
+static void
+guc_capture_alloc_steered_lists_xe_hpg(struct intel_guc *guc,
+  const struct __guc_mmio_reg_descr_group 
*lists,
+  u32 ipver)
+{
+   struct intel_gt *gt = guc_to_gt(guc);
+   struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
+   struct sseu_dev_info *sseu;
+   int slice, subslice, i, iter, num_steer_regs, num_tot_regs = 0;
+   const struct __guc_mmio_reg_descr_group *list;
+   struct __guc_mmio_reg_descr_group *extlists;
+   struct __guc_mmio_reg_descr *extarray;
+
+   /* In XE_LP / HPG we only have render-class steering registers during 
error-capture */
+   list = guc_capture_get_one_list(lists, GUC_CAPTURE_LIST_INDEX_PF,
+   GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS, 
GUC_RENDER_CLASS);
+   /* skip if extlists was previously allocated */
+   if (!list || guc->capture->extlists)
+   return;
+
+   num_steer_regs = ARRAY_SIZE(xe_extregs);
+   if (__has_xehpg_extregs(ipver))
+   num_steer_regs += ARRAY_SIZE(xehpg_extregs);
+
+   sseu = &gt->info.sseu;
+   for_each_instdone_gslice_dss_xehp(i915, sseu, iter, slice, subslice) {
+   num_tot_regs += num_steer_regs;
+   }
+
+   if (!num_tot_regs)
+   return;
+
+   /* allocate an extra for an end marker */
+   extlists = kcalloc(2, sizeof(struct __guc_mmio_reg_descr_group), 
GFP_KERNEL);
+   if (!extlists)
+   return;
+
+   if (__alloc_ext_regs(&extlists[0], list, num_tot_regs)) {
+   kfree(extlists);
+   return;
+   }
+
+   extarray = extlists[0].extlist;
+   for_each_instdone_gslice_dss_xehp(i915, sseu, iter, slice, subslice) {
+   for (i = 0; i < ARRAY_SIZE(xe_extregs); ++i) {
+   __fill_ext_reg(extarray, &xe_extregs[i], slice, 
subslice);
+   ++extarray;
+   }
+   if (__has_xehpg_extregs(ipver)) {
+   for (i = 0; i < ARRAY_SIZE(xehpg_extregs); ++i) {
+   __fill_ext_reg(extarray, &xehpg_extregs[i], 
slice, subslice);
+   ++extarray;
+   }
+   }
+   }
+
+   drm_dbg(&i915->drm, "GuC-capture found %d-ext-regs.\n", num_tot_regs);
+   guc->capture->extlists = extlists;
+}
+
 static const struct __guc_mmio_reg_descr_group *
 guc_capture_get_device_reglist(struct intel_guc *guc)
 {
struct drm_i915_private *i915 = guc_to_gt(guc)->i915;
 
-   if (IS_TIGERLAKE(i915) || IS_ROCKETLAKE(i915) ||
-   IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915)) {
+   if (GRAPHICS_VER(i915) > 11) {
/*
 * For certain engine classes, there are slice and subslice
 * level registers requiring steering. We allocate and populate
 * these at init time based on hw config add it as an extension
 * list at the end of the pre-populated render list.
 */
-   guc_capture_alloc_steered_lists_xe_lpd(guc, xe_lpd_lists);
+   if (IS_DG2(i915))
+   guc_capture_alloc_steered_lists_xe_hpg(guc, 
xe_lpd_lists, IP_VER(12, 55));
+   else if (IS_XEHPSDV(i915))
+   guc_capture_alloc_steered_lists_xe_hpg(guc, 
xe_lpd_lists, IP_VER(12, 50));
+   else
+   guc_capture_alloc_steered_lists_xe_lpd(guc, 
xe_lpd_lists);
+
return xe_lpd_lists;
}
 
-- 
2.25.1



[PATCH v13 02/13] drm/i915/guc: Add XE_LP static registers for GuC error capture.

2022-03-21 Thread Alan Previn
Add device specific tables and register lists to cover different engines
class types for GuC error state capture for XE_LP products.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 112 ++
 drivers/gpu/drm/i915/gt/uc/intel_guc_fwif.h   |   6 +-
 2 files changed, 95 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 4adfa14699d4..478308f33d34 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -22,40 +22,106 @@
  * NOTE: For engine-registers, GuC only needs the register offsets
  *   from the engine-mmio-base
  */
+#define COMMON_GEN12BASE_GLOBAL \
+   { GEN12_FAULT_TLB_DATA0,0,  0, "GEN12_FAULT_TLB_DATA0" }, \
+   { GEN12_FAULT_TLB_DATA1,0,  0, "GEN12_FAULT_TLB_DATA1" }, \
+   { FORCEWAKE_MT, 0,  0, "FORCEWAKE" }, \
+   { GEN12_AUX_ERR_DBG,0,  0, "AUX_ERR_DBG" }, \
+   { GEN12_GAM_DONE,   0,  0, "GAM_DONE" }, \
+   { GEN12_RING_FAULT_REG, 0,  0, "FAULT_REG" }
+
+#define COMMON_GEN12BASE_ENGINE_INSTANCE \
+   { RING_PSMI_CTL(0), 0,  0, "RC PSMI" }, \
+   { RING_ESR(0),  0,  0, "ESR" }, \
+   { RING_DMA_FADD(0), 0,  0, "RING_DMA_FADD_LDW" }, \
+   { RING_DMA_FADD_UDW(0), 0,  0, "RING_DMA_FADD_UDW" }, \
+   { RING_IPEIR(0),0,  0, "IPEIR" }, \
+   { RING_IPEHR(0),0,  0, "IPEHR" }, \
+   { RING_INSTPS(0),   0,  0, "INSTPS" }, \
+   { RING_BBADDR(0),   0,  0, "RING_BBADDR_LOW32" }, \
+   { RING_BBADDR_UDW(0),   0,  0, "RING_BBADDR_UP32" }, \
+   { RING_BBSTATE(0),  0,  0, "BB_STATE" }, \
+   { CCID(0),  0,  0, "CCID" }, \
+   { RING_ACTHD(0),0,  0, "ACTHD_LDW" }, \
+   { RING_ACTHD_UDW(0),0,  0, "ACTHD_UDW" }, \
+   { RING_INSTPM(0),   0,  0, "INSTPM" }, \
+   { RING_INSTDONE(0), 0,  0, "INSTDONE" }, \
+   { RING_NOPID(0),0,  0, "RING_NOPID" }, \
+   { RING_START(0),0,  0, "START" }, \
+   { RING_HEAD(0), 0,  0, "HEAD" }, \
+   { RING_TAIL(0), 0,  0, "TAIL" }, \
+   { RING_CTL(0),  0,  0, "CTL" }, \
+   { RING_MI_MODE(0),  0,  0, "MODE" }, \
+   { RING_CONTEXT_CONTROL(0),  0,  0, "RING_CONTEXT_CONTROL" }, \
+   { RING_HWS_PGA(0),  0,  0, "HWS" }, \
+   { RING_MODE_GEN7(0),0,  0, "GFX_MODE" }, \
+   { GEN8_RING_PDP_LDW(0, 0),  0,  0, "PDP0_LDW" }, \
+   { GEN8_RING_PDP_UDW(0, 0),  0,  0, "PDP0_UDW" }, \
+   { GEN8_RING_PDP_LDW(0, 1),  0,  0, "PDP1_LDW" }, \
+   { GEN8_RING_PDP_UDW(0, 1),  0,  0, "PDP1_UDW" }, \
+   { GEN8_RING_PDP_LDW(0, 2),  0,  0, "PDP2_LDW" }, \
+   { GEN8_RING_PDP_UDW(0, 2),  0,  0, "PDP2_UDW" }, \
+   { GEN8_RING_PDP_LDW(0, 3),  0,  0, "PDP3_LDW" }, \
+   { GEN8_RING_PDP_UDW(0, 3),  0,  0, "PDP3_UDW" }
+
+#define COMMON_GEN12BASE_HAS_EU \
+   { EIR,  0,  0, "EIR" }
+
+#define COMMON_GEN12BASE_RENDER \
+   { GEN7_SC_INSTDONE, 0,  0, "GEN7_SC_INSTDONE" }, \
+   { GEN12_SC_INSTDONE_EXTRA,  0,  0, "GEN12_SC_INSTDONE_EXTRA" }, \
+   { GEN12_SC_INSTDONE_EXTRA2, 0,  0, "GEN12_SC_INSTDONE_EXTRA2" }
+
+#define COMMON_GEN12BASE_VEC \
+   { GEN12_SFC_DONE(0),0,  0, "SFC_DONE[0]" }, \
+   { GEN12_SFC_DONE(1),0,  0, "SFC_DONE[1]" }, \
+   { GEN12_SFC_DONE(2),0,  0, "SFC_DONE[2]" }, \
+   { GEN12_SFC_DONE(3),0,  0, "SFC_DONE[3]" }
+
 /* XE_LPD - Global */
 static const struct __guc_mmio_reg_descr xe_lpd_global_regs[] = {
-   { GEN12_RING_FAULT_REG, 0,  0, "GEN12_RING_FAULT_REG" }
+   COMMON_GEN12BASE_GLOBAL,
 };
 
 /* XE_LPD - Render / Compute Per-Class */
 static const struct __guc_mmio_reg_descr xe_lpd_rc_class_regs[] = {
-   { EIR,  0,  0, "EIR" }
+   COMMON_GEN12BASE_HAS_EU,
+   COMMON_GEN12BASE_RENDER,
 };
 
 /* XE_LPD - Render / Compute Per-Engine-Instance */
 static const struct __guc_mmio_reg_descr xe_lpd_rc_inst_regs[] = {
-   { RING_HEAD(0), 0,  0, "RING_HEAD" },
-   { RING_TAIL(0), 0,  0, "RING_TAIL" },
+   COMMON_GEN12BASE_ENGINE_INSTANCE,
 };
 
 /* XE_LPD - Media Decode/Encode Per-Class */
 static const struct __guc_mmio_reg_descr xe_lpd_vd_class_regs[] = {
+   COMMON_GEN12BASE_ENGINE_INSTANCE,
 };
 
 /* XE_LPD - Media Decode/Encode Per-Engine-Instance */
 static const struct __guc_mmio_reg_descr xe_lpd_vd_inst_regs[] = {
-   { RING_HEAD(0), 0,  0, 

[PATCH v13 11/13] drm/i915/guc: Pre-allocate output nodes for extraction

2022-03-21 Thread Alan Previn
In the rare but possible scenario where we are in the midst of
multiple GuC error-capture (and engine reset) events and the
user also triggers a forced full GT reset or the internal watchdog
triggers the same, intel_guc_submission_reset_prepare's call
to flush_work(&guc->ct.requests.worker) can cause the G2H message
handler to trigger intel_guc_capture_store_snapshot upon
receiving new G2H error-capture notifications. This can happen
despite the prior call to disable_submission(guc);. However,
there's no race-free way for intel_guc_capture_store_snapshot to
know that we are in the midst of a reset. That said, we can never
dynamically allocate the output nodes in this handler. Thus, we
shall pre-allocate a fixed number of empty nodes up front (at the
time of ADS registration) that we can consume from or return to
an internal cached list of nodes.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h |  19 +-
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 177 ++
 2 files changed, 161 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h 
b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
index 5d959e62d146..3624abfd22d1 100644
--- a/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
+++ b/drivers/gpu/drm/i915/gt/uc/guc_capture_fwif.h
@@ -31,7 +31,7 @@ struct __guc_capture_bufstate {
  *
  * A single unit of extracted error-capture output data grouped together
  * at an engine-instance level. We keep these nodes in a linked list.
- * See outlist below.
+ * See cachelist and outlist below.
  */
 struct __guc_capture_parsed_output {
/*
@@ -190,7 +190,22 @@ struct intel_guc_state_capture {
void *ads_null_cache;
 
/**
-* @outlist: allocated nodes with parsed engine-instance error capture 
data
+* @cachelist: Pool of pre-allocated nodes for error capture output
+*
+* We need this pool of pre-allocated nodes because we cannot
+* dynamically allocate new nodes when receiving the G2H notification
+* because the event handlers for all G2H event-processing is called
+* by the ct processing worker queue and when that queue is being
+* processed, there is no absolute guarantee that we are not in the
+* midst of a GT reset operation (which doesn't allow allocations).
+*/
+   struct list_head cachelist;
+#define PREALLOC_NODES_MAX_COUNT (3 * GUC_MAX_ENGINE_CLASSES * 
GUC_MAX_INSTANCES_PER_CLASS)
+#define PREALLOC_NODES_DEFAULT_NUMREGS 64
+   int max_mmio_per_node;
+
+   /**
+* @outlist: Pool of pre-allocated nodes for error capture output
 *
 * A linked list of parsed GuC error-capture output data before
 * reporting with formatting via i915_gpu_coredump. Each node in this 
linked list shall
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index af82f6ff3534..b279d80e4772 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -581,6 +581,8 @@ intel_guc_capture_getlistsize(struct intel_guc *guc, u32 
owner, u32 type, u32 cl
return 0;
 }
 
+static void guc_capture_create_prealloc_nodes(struct intel_guc *guc);
+
 int
 intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 
classid,
  void **outptr)
@@ -601,6 +603,12 @@ intel_guc_capture_getlist(struct intel_guc *guc, u32 
owner, u32 type, u32 classi
return cache->status;
}
 
+   /*
+* ADS population of input registers is a good
+* time to pre-allocate cachelist output nodes
+*/
+   guc_capture_create_prealloc_nodes(guc);
+
ret = intel_guc_capture_getlistsize(guc, owner, type, classid, );
if (ret) {
cache->is_valid = true;
@@ -741,7 +749,8 @@ intel_guc_capture_output_min_size_est(struct intel_guc *guc)
  *err-state-captured register-list we find, we 
alloc 'C':
  *  --> alloc C: A capture-output-node structure that includes misc 
capture info along
  *   with 3 register list dumps (global, engine-class and 
engine-instance)
- *   This node is dynamically allocated and populated with the 
error-capture
+ *   This node is created from a pre-allocated list of blank 
nodes in
+ *   guc->capture->cachelist and populated with the 
error-capture
  *   data from GuC and then it's added into 
guc->capture->outlist linked
  *   list. This list is used for matchup and printout by 
i915_gpu_coredump
  *   and err_print_gt, (when user invokes the error capture 
sysfs).
@@ -901,19 +910,20 @@ guc_capture_delete_one_node(struct intel_guc *guc, struct 
__guc_capture_parsed_o
 }
 
 static void
-guc_capture_delete_nodes(struct intel_guc *guc)

[PATCH v13 05/13] drm/i915/guc: Add Gen9 registers for GuC error state capture.

2022-03-21 Thread Alan Previn
Abstract out a Gen9 register list as the default for all other
platforms we don't yet formally support GuC submission on.

Signed-off-by: Alan Previn 
Reviewed-by: Umesh Nerlige Ramappa 
---
 .../gpu/drm/i915/gt/uc/intel_guc_capture.c| 82 +--
 1 file changed, 59 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c 
b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
index 7c3d9943ecdd..63ef407a2fd0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_capture.c
@@ -22,15 +22,24 @@
  * NOTE: For engine-registers, GuC only needs the register offsets
  *   from the engine-mmio-base
  */
+#define COMMON_BASE_GLOBAL \
+   { FORCEWAKE_MT, 0,  0, "FORCEWAKE" }
+
+#define COMMON_GEN9BASE_GLOBAL \
+   { GEN8_FAULT_TLB_DATA0, 0,  0, "GEN8_FAULT_TLB_DATA0" }, \
+   { GEN8_FAULT_TLB_DATA1, 0,  0, "GEN8_FAULT_TLB_DATA1" }, \
+   { ERROR_GEN6,   0,  0, "ERROR_GEN6" }, \
+   { DONE_REG, 0,  0, "DONE_REG" }, \
+   { HSW_GTT_CACHE_EN, 0,  0, "HSW_GTT_CACHE_EN" }
+
 #define COMMON_GEN12BASE_GLOBAL \
{ GEN12_FAULT_TLB_DATA0,0,  0, "GEN12_FAULT_TLB_DATA0" }, \
{ GEN12_FAULT_TLB_DATA1,0,  0, "GEN12_FAULT_TLB_DATA1" }, \
-   { FORCEWAKE_MT, 0,  0, "FORCEWAKE" }, \
{ GEN12_AUX_ERR_DBG,0,  0, "AUX_ERR_DBG" }, \
{ GEN12_GAM_DONE,   0,  0, "GAM_DONE" }, \
{ GEN12_RING_FAULT_REG, 0,  0, "FAULT_REG" }
 
-#define COMMON_GEN12BASE_ENGINE_INSTANCE \
+#define COMMON_BASE_ENGINE_INSTANCE \
{ RING_PSMI_CTL(0), 0,  0, "RC PSMI" }, \
{ RING_ESR(0),  0,  0, "ESR" }, \
{ RING_DMA_FADD(0), 0,  0, "RING_DMA_FADD_LDW" }, \
@@ -64,11 +73,13 @@
{ GEN8_RING_PDP_LDW(0, 3),  0,  0, "PDP3_LDW" }, \
{ GEN8_RING_PDP_UDW(0, 3),  0,  0, "PDP3_UDW" }
 
-#define COMMON_GEN12BASE_HAS_EU \
+#define COMMON_BASE_HAS_EU \
{ EIR,  0,  0, "EIR" }
 
+#define COMMON_BASE_RENDER \
+   { GEN7_SC_INSTDONE, 0,  0, "GEN7_SC_INSTDONE" }
+
 #define COMMON_GEN12BASE_RENDER \
-   { GEN7_SC_INSTDONE, 0,  0, "GEN7_SC_INSTDONE" }, \
{ GEN12_SC_INSTDONE_EXTRA,  0,  0, "GEN12_SC_INSTDONE_EXTRA" }, \
{ GEN12_SC_INSTDONE_EXTRA2, 0,  0, "GEN12_SC_INSTDONE_EXTRA2" }
 
@@ -80,28 +91,26 @@
 
 /* XE_LPD - Global */
 static const struct __guc_mmio_reg_descr xe_lpd_global_regs[] = {
+   COMMON_BASE_GLOBAL,
+   COMMON_GEN9BASE_GLOBAL,
COMMON_GEN12BASE_GLOBAL,
 };
 
 /* XE_LPD - Render / Compute Per-Class */
 static const struct __guc_mmio_reg_descr xe_lpd_rc_class_regs[] = {
-   COMMON_GEN12BASE_HAS_EU,
+   COMMON_BASE_HAS_EU,
+   COMMON_BASE_RENDER,
COMMON_GEN12BASE_RENDER,
 };
 
-/* XE_LPD - Render / Compute Per-Engine-Instance */
+/* GEN9/XE_LPD - Render / Compute Per-Engine-Instance */
 static const struct __guc_mmio_reg_descr xe_lpd_rc_inst_regs[] = {
-   COMMON_GEN12BASE_ENGINE_INSTANCE,
+   COMMON_BASE_ENGINE_INSTANCE,
 };
 
-/* XE_LPD - Media Decode/Encode Per-Class */
-static const struct __guc_mmio_reg_descr xe_lpd_vd_class_regs[] = {
-   COMMON_GEN12BASE_ENGINE_INSTANCE,
-};
-
-/* XE_LPD - Media Decode/Encode Per-Engine-Instance */
+/* GEN9/XE_LPD - Media Decode/Encode Per-Engine-Instance */
 static const struct __guc_mmio_reg_descr xe_lpd_vd_inst_regs[] = {
-   COMMON_GEN12BASE_ENGINE_INSTANCE,
+   COMMON_BASE_ENGINE_INSTANCE,
 };
 
 /* XE_LPD - Video Enhancement Per-Class */
@@ -109,18 +118,33 @@ static const struct __guc_mmio_reg_descr 
xe_lpd_vec_class_regs[] = {
COMMON_GEN12BASE_VEC,
 };
 
-/* XE_LPD - Video Enhancement Per-Engine-Instance */
+/* GEN9/XE_LPD - Video Enhancement Per-Engine-Instance */
 static const struct __guc_mmio_reg_descr xe_lpd_vec_inst_regs[] = {
-   COMMON_GEN12BASE_ENGINE_INSTANCE,
+   COMMON_BASE_ENGINE_INSTANCE,
 };
 
-/* XE_LPD - Blitter Per-Engine-Instance */
+/* GEN9/XE_LPD - Blitter Per-Engine-Instance */
 static const struct __guc_mmio_reg_descr xe_lpd_blt_inst_regs[] = {
-   COMMON_GEN12BASE_ENGINE_INSTANCE,
+   COMMON_BASE_ENGINE_INSTANCE,
 };
 
-/* XE_LPD - Blitter Per-Class */
-/* XE_LPD - Media Decode/Encode Per-Class */
+/* GEN9 - Global */
+static const struct __guc_mmio_reg_descr default_global_regs[] = {
+   COMMON_BASE_GLOBAL,
+   COMMON_GEN9BASE_GLOBAL,
+};
+
+static const struct __guc_mmio_reg_descr default_rc_class_regs[] = {
+   COMMON_BASE_HAS_EU,
+   COMMON_BASE_RENDER,
+};
+
+/*
+ * Empty lists:
+ * GEN9/XE_LPD - Blitter Per-Class
+ * GEN9/XE_LPD - Media Decode/Encode Per-Class
+ * GEN9 - VEC Class
+ */
 static const struct __guc_mmio_reg_descr empty_regs_list[] = {
 };
 
@@ -137,6 +161,19 @@ static const struct __guc_mmio_reg_descr 

[PATCH v13 00/13] Add GuC Error Capture Support

2022-03-21 Thread Alan Previn
This series:
  1. Enables support of GuC to report error-state-capture
 using a list of MMIO registers the driver registers
 and GuC will dump, log and notify right before a GuC
 triggered engine-reset event.
  2. Updates the ADS blob creation to register said lists
 of global, engine class and engine instance registers
 with GuC.
  3. Defines tables of register lists that are global or
 engine class or engine instance in scope.
  4. Updates usage and buffer-state data for the regions
     of the shared GuC log-buffer to accommodate both
 the existing relay logging of general debug logs
 along with the new error state capture usage.
  5. Using a pool of preallocated memory, provide ability
 to extract and format the GuC reported register-capture
 data into chunks consistent with existing i915 error-
 state collection flows and structures.
  6. Connects the i915_gpu_coredump reporting function
 to the GuC error capture module to print all GuC
     error state capture dumps that are reported.

This is the 13th rev of this series where the first 3 revs
are RFC

Prior receipts of rvb's:
  - Patch #2, #3, #4, #5, #10, #11, #12, #13 have received
R-v-b's from Umesh Nerlige Ramappa 
  - Patch #1, #6, #7, #8, #9 has received an R-v-b from Matthew Brost
. NOTE: some of these came in on the
trybot series. https://patchwork.freedesktop.org/series/100831/

Changes from prior revs:
  v13:- Fixing register list definition styling as per Jani's request.
  v12:- Re-sending it because previous revs only got to intel-gfx,
and only cover letter was in dri-devel. Also rebased again.
  v11:- Rebase again on latest drm-tip to fix merge error.
  v10:- Rebase on latest drm-tip again. Fix a number of checkpatch
warnings and an error Reported-by: kernel test robot .
  v9: - Rebase on latest drm-tip to solve CI merge-build error.
  v8: - Fix a bug found by CI in rev7: Create a cached ADS
capture list for null-header like the other lists.
  - Fixed a bug on the ggtt offset calculation in the
ADS population loop. Thanks to Matt Brost.
  - Change the storage uses for initial allocation and
caching of the ADS register lists so we only store
a regular pointer instead of file handle.
  - Multiple improvements on code styling, variable names,
comments and code reduction from Umesh suggestions
across multiple patches.

  v7: - Rebased on latest drm_tip that has the ADS now using
shmem based ads_blob_write utilities. Stress test
was performed with this patch included to fix a
legacy bug:
https://patchwork.freedesktop.org/series/100768/

  v6: - In patch #1, ADS reg-list population, we now alloc
regular memory to create the lists and cache them for
simpler and faster use by GuC ADS module at init, 
suspend-resume and reset cycles. This was in response
to review comments from Lucas De Marchi that also
wanted to ensure the GuC ADS module owns the final
copying into the ADS physical memory.
  - Thanks to Jani Nikula for pointing out that patch #2
and #3 should ensure static tables as constant and
dynamic lists should be allocated and cached but
attached to the GT level for the case of multiple
cards with different fusings for steered registers.
These are addressed now along with multiple code
style fixups (thanks to review comment from Umesh)
and splitting the steered register list generation
as a separate patch.
  - The extraction functionality, Patch #10 and #11 (was
patch #7), has fixed all of Umesh's review comments
related to the code styling. Additionally, it was
discovered during stress tests that the extraction
function could be called by the ct processing thread
at the same time as the start of a GT reset event.
Thus, a redesign was done whereby the linked list of
processed capture-output-nodes are allocated up
front and reused throughout the driver's life to
ensure no memory locks are taken during extraction.
  - For patch #6 (now 7, 8 and 9), updates to
intel_guc_log was split into smaller chunks and the
log_state structure was returned back to inside of
the intel_guc_log struct as opposed to the
intel_guc struct in prior rev. This is in response
to review comments by Matt Brost.
  - #Patch 13 (previously #10) is mostly identical but
addresses all of the code styling comments reviews
from Umesh.

  v5: - Added Gen9->Gen11 register list for CI coverage that
included Gen9 with GuC submission.
  - Redesigned the extraction of the GuC error-capture
dumps by grouping them into complete per-engine-reset
nodes. Complete here means each node includes the
global, engine-class and 

Re: [PATCH 1/4] i915/gem: drop wbinvd_on_all_cpus usage

2022-03-21 Thread Michael Cheng

On 2022-03-21 3:30 a.m., Tvrtko Ursulin wrote:



On 19/03/2022 19:42, Michael Cheng wrote:
Previous concern with using drm_clflush_sg was that we don't know 
what the

sg_table is pointing to, thus the usage of wbinvd_on_all_cpus to flush
everything at once to avoid paranoia.


And now we know, or we know it is not a concern?

To make i915 more architecture-neutral and be less paranoid, lets 
attempt to


"Lets attempt" as we don't know if this will work and/or what can/will 
break?


Yes, but it seems like there's no regression with IGT .

If there's a big hit in performance, or if this solution gets accepted 
and the bug reports come flying in, we can explore other solutions. But 
speaking to Dan Vetter, ideal solution would be to avoid any calls 
directly to wbinvd, and use drm helpers in place.


+Daniel for any extra input.


use drm_clflush_sg to flush the pages for when the GPU wants to read
from main memory.

Signed-off-by: Michael Cheng 
---
  drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c 
b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c

index f5062d0c6333..b0a5baaebc43 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c
@@ -8,6 +8,7 @@
  #include 
  #include 
  #include 
+#include 
    #include 
  @@ -250,16 +251,10 @@ static int 
i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj)
   * DG1 is special here since it still snoops transactions even 
with
   * CACHE_NONE. This is not the case with other HAS_SNOOP 
platforms. We

   * might need to revisit this as we add new discrete platforms.
- *
- * XXX: Consider doing a vmap flush or something, where possible.
- * Currently we just do a heavy handed wbinvd_on_all_cpus() here 
since
- * the underlying sg_table might not even point to struct pages, 
so we
- * can't just call drm_clflush_sg or similar, like we do 
elsewhere in

- * the driver.
   */
  if (i915_gem_object_can_bypass_llc(obj) ||
  (!HAS_LLC(i915) && !IS_DG1(i915)))
-    wbinvd_on_all_cpus();
+    drm_clflush_sg(pages);


And as noticed before, drm_clflush_sg still can call wbinvd_on_all_cpus 
so are you just punting the issue somewhere else? How will it be 
solved there?


Instead of calling an x86 asm directly, we are using what's available to 
use to make the driver more architecture neutral. Agreeing with Thomas, 
this solution falls within the "prefer range-aware clflush apis", and 
since some other generation platform doesn't support clflushopt, it will 
fall back to using wbinvd.

Regards,

Tvrtko


    sg_page_sizes = i915_sg_dma_sizes(pages->sgl);
  __i915_gem_object_set_pages(obj, pages, sg_page_sizes);


[PATCH v11 6/7] MIPS: Loongson64: defconfig: enable display bridge drivers on Loongson64

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

ls3A4000 evb board is shipped with adv7123 and tfp410 while ls2k1000
PI board uses a DPI panel from FORLINX and a sii9022 HDMI transmitter.

Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
---
 arch/mips/configs/loongson2k_defconfig | 5 +
 arch/mips/configs/loongson3_defconfig  | 5 +
 2 files changed, 10 insertions(+)

diff --git a/arch/mips/configs/loongson2k_defconfig 
b/arch/mips/configs/loongson2k_defconfig
index e948ca487e2d..0a97c332a5c3 100644
--- a/arch/mips/configs/loongson2k_defconfig
+++ b/arch/mips/configs/loongson2k_defconfig
@@ -243,6 +243,11 @@ CONFIG_MEDIA_USB_SUPPORT=y
 CONFIG_USB_VIDEO_CLASS=m
 CONFIG_DRM=y
 CONFIG_DRM_RADEON=y
+CONFIG_DRM_DISPLAY_CONNECTOR=m
+CONFIG_DRM_PANEL_SIMPLE=m
+CONFIG_DRM_SII902X=m
+CONFIG_DRM_SIMPLE_BRIDGE=m
+CONFIG_DRM_TI_TFP410=m
 CONFIG_FB_RADEON=y
 CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_PLATFORM=m
diff --git a/arch/mips/configs/loongson3_defconfig 
b/arch/mips/configs/loongson3_defconfig
index 25ecd15bc952..35e2fc998768 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -280,6 +280,11 @@ CONFIG_MEDIA_USB_SUPPORT=y
 CONFIG_USB_VIDEO_CLASS=m
 CONFIG_DRM=y
 CONFIG_DRM_RADEON=m
+CONFIG_DRM_DISPLAY_CONNECTOR=m
+CONFIG_DRM_PANEL_SIMPLE=m
+CONFIG_DRM_SII902X=m
+CONFIG_DRM_SIMPLE_BRIDGE=m
+CONFIG_DRM_TI_TFP410=m
 CONFIG_DRM_QXL=y
 CONFIG_DRM_VIRTIO_GPU=y
 CONFIG_FB=y
-- 
2.25.1



[PATCH v11 7/7] drm/lsdc: add drm driver for loongson display controller

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

There is a display controller in Loongson's LS2K1000 SoC and LS7A1000
bridge chip; the display controller is a PCI device in those chips. It
has two display pipes but only one hardware cursor. Each pipe has
a DVO interface which provides RGB888 signals, vertical & horizontal
synchronisations, data enable and the pixel clock. Each CRTC is able to
scanout from 1920x1080 resolution at 60Hz; the maximum resolution is
2048x2048 according to the hardware spec. Loongson display controllers
are simple and require scanout buffers to be physically contiguous.

For the LS7A1000 bridge chip, the DC is equipped with a dedicated video RAM
which is typically 64MB or more. In this case, the VRAM helper based driver
is intended to be used. LS2K1000, by contrast, is a SoC where only system
memory is available, so the CMA helper based driver is intended to be used.
It is possible to use a VRAM helper based solution by carving out part of
system memory as VRAM though.

For LS7A1000, there are 4 dedicated GPIOs whose control register is
located in the DC register space. They are used to emulate two i2c buses,
one for DVO0 and another for DVO1. The LS2K1000 and LS2K0500 SoCs don't have
such GPIO hardware; they grab an i2c adapter from another module, either a
general purpose GPIO emulated i2c or a hardware i2c adapter.

+--++---+
| DDR4 ||  +---+|
+--+|  | PCIe Root complex |   LS7A1000 |
   || MC0   |  +--++-+++|
  +--+  HT 3.0  | || || |
  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
  +--+  | ++  +-+--+-++-+   +--+
   || MC1   +---|--|+
+--+|  |
| DDR4 |  +---+   DVO0  |  |  DVO1   +--+
+--+   VGA <--|ADV7125|<+  +>|TFP410|--> DVI/HDMI
  +---+  +--+

The above picture gives a simple usage of LS7A1000; note that the encoder
is not necessarily adv7125 or tfp410, other candidates can be ch7034b,
sil9022, ite66121 and lt8618 etc.

v2: Fixup warnings reported by kernel test robot

v3: Fix more grammar mistakes in Kconfig reported by Randy Dunlap and give
more details about lsdc.

v4:
   1) Add dts required and explain why device tree is required.
   2) Give more description about lsdc and VRAM helper based driver.
   3) Fix warnings reported by kernel test robot.
   4) Introduce stride_alignment member into struct lsdc_chip_desc, the
  stride alignment is 256 bytes for ls7a1000, ls2k1000 and ls2k0500.

v5:
   1) Use writel and readl instead of writeq and readq, to fix a build
  error reported by the kernel test robot on other architectures.
   2) Set default fb format to XRGB at crtc reset time.

v6:
   1) Explain why we are not switching to the drm bridge subsystem on ls2k1000.
   2) Explain why tiny drm driver is not suitable for us.
   3) Give a short description of the trivial dirty update implementation based
  on CMA helper.

v7:
   1) Remove select I2C_GPIO and I2C_LS2X in Kconfig, it is not ready now
   2) Licensing issues are fixed suggested by Krzysztof Kozlowski.
   3) Remove lsdc_pixpll_print(), part of it move to debugfs.
   4) Set prefer_shadow to true if vram based driver is in use.
   5) Replace double blank lines with single line in all files.
   6) Verbose cmd line parameter is replaced with drm_dbg()
   7) All warnings reported by ./scripts/checkpatch.pl --strict are fixed
   8) Get edid from dtb support is removed as suggested by Maxime Ripard
   9) Fix typos and various improvement

v8:
   1) Drop damage update implement and its command line.
   2) Drop DRM_LSDC_VRAM_DRIVER config option as suggested by Maxime.
   3) Deduce DC's identification from its compatible property.
   4) Drop the board specific dts patch.
   5) Add documentation about the display controller device node.

v9:
   1) Fix the warnings reported by checkpatch script and fix typos

v10:
   1) Pass `make dt_binding_check` validation
   2) Fix warnings reported by kernel test robot

v11:
   1) Convert the driver to use drm bridge and of graph framework.
   2) Dump register value support through debugfs.

Reported-by: kernel test robot 
Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
Signed-off-by: suijingfeng 
---
 drivers/gpu/drm/Kconfig |   2 +
 drivers/gpu/drm/Makefile|   1 +
 drivers/gpu/drm/lsdc/Kconfig|  23 ++
 drivers/gpu/drm/lsdc/Makefile   |  13 +
 drivers/gpu/drm/lsdc/lsdc_crtc.c| 396 +++
 drivers/gpu/drm/lsdc/lsdc_drv.c | 547 ++
 drivers/gpu/drm/lsdc/lsdc_drv.h | 197 ++
 drivers/gpu/drm/lsdc/lsdc_i2c.c | 235 
 

[PATCH v11 5/7] dt-bindings: display: Add Loongson display controller

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
---
 .../loongson/loongson,display-controller.yaml | 230 ++
 1 file changed, 230 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml

diff --git 
a/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
 
b/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
new file mode 100644
index ..7be63346289e
--- /dev/null
+++ 
b/Documentation/devicetree/bindings/display/loongson/loongson,display-controller.yaml
@@ -0,0 +1,230 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: 
http://devicetree.org/schemas/display/loongson/loongson,display-controller.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Loongson LS7A1000/LS2K1000/LS2K0500 Display Controller Device Tree 
Bindings
+
+maintainers:
+  - Sui Jingfeng 
+
+description: |+
+
+  Loongson display controllers are simple which require scanout buffers
+  to be physically contiguous. LS2K1000/LS2K0500 is a SOC, only system
+  memory is available. LS7A1000/LS7A2000 is bridge chip which is equipped
+  with a dedicated video RAM which is 64MB or more, precise size can be
+  read from the PCI BAR 2 of the GPU device(0x0014:0x7A15) in the bridge
+  chip.
+
+  LSDC has two display pipes, each way has a DVO interface which provide
+  RGB888 signals, vertical & horizontal synchronisations, data enable and
+  the pixel clock. LSDC has two CRTC, each CRTC is able to scanout from
+  1920x1080 resolution at 60Hz. Each CRTC has two FB address registers.
+
+  For LS7A1000, there are 4 dedicated GPIOs whose control register is
+  located at the DC register space. They are used to emulate two way i2c,
+  One for DVO0, another for DVO1.
+
+  LS2K1000 and LS2K0500 SoC grab i2c adapter from other module, either
+  general purpose GPIO emulated i2c or hardware i2c in the SoC.
+
+  LSDC's display pipeline have several components as below description,
+
+  The display controller in LS7A1000:
+ ___ _
+|---|   | |
+|  CRTC0 --> | DVO0 > Encoder0 ---> Connector0 ---> | Monitor |
+|  _   _ ---|^ ^|_|
+| | | | |---|| |
+| |_| |_|| i2c0 <+-+
+|---|
+|   DC IN LS7A1000  |
+|  _   _ ---|
+| | | | || i2c1 <+-+
+| |_| |_|---|| | _
+|---|| || |
+|  CRTC1 --> | DVO1 > Encoder1 ---> Connector1 ---> |  Panel  |
+|---|   |_|
+|___|
+
+  Simple usage of LS7A1000 with LS3A4000 CPU:
+
++--++---+
+| DDR4 ||  +---+|
++--+|  | PCIe Root complex |   LS7A1000 |
+   || MC0   |  +--++-+++|
+  +--+  HT 3.0  | || || |
+  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
+  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
+  +--+  | ++  +-+--+-++-+   +--+
+   || MC1   +---|--|+
++--+|  |
+| DDR4 |  +---+   DVO0  |  |  DVO1   +--+
++--+   VGA <--|ADV7125|<+  +>|TFP410|--> DVI/HDMI
+  +---+  +--+
+
+  The display controller in LS2K1000/LS2K0500:
+ ___ _
+|---|   | |
+|  CRTC0 --> | DVO0 > Encoder0 ---> Connector0 ---> | Monitor |
+|  _   _ ---|^  ^   |_|
+| | | | |   ||  |
+| |_| |_|   | +--+  |
+|   <>| i2c0 |<-+
+|   DC IN LS2K1000  | +--+
+|  _   _| +--+
+| | | | |   <>| i2c1 |--+
+| |_| |_|   | +--+  |_
+|---||  |   | |
+|  CRTC1 --> | DVO1 > Encoder1 ---> Connector1 ---> |  Panel  |
+|---|   |_|
+|___|
+
+properties:
+  $nodename:
+pattern: "^display-controller@[0-9a-f],[0-9a-f]$"
+
+  compatible:
+oneOf:
+  - items:
+  - enum:
+  - 

[PATCH v11 4/7] MIPS: Loongson64: dts: introduce ls2k1000 pai evaluation board

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

   ___   
  |---| ||
  |  CRTC0 --> | DVO0 > | 1024x600 DPI Panel |
  |  _   _ ---|  | Which panel to use   ||
  | | | | |   |  | with this board is a  ___
  | |_| |_|   |  | choice of the user   |   |
  |   |  +> | 800x480 DPI Panel |
  |   DC In LS2K1000  | |___|
  |  _   _| +--+
  | | | | |   <>| i2c1 |---+
  | |_| |_|   | +--+   |
  |   ||   |   _
  |---|+-+ |  | |
  |  CRTC1 --> | DVO1 ---> | sii9022 | --> HDMI connector --> | Monitor |
  |---|+-+|_|
  |___|

The sii9022 HDMI transmitter works in transparent mode; in this case
the edid is read from the monitor directly, not through sii9022's ddc
channel. The PMON[2] firmware of this board is responsible for configuring
the sii9022 encoder at boot time. Because the i2c driver for the ls2k1000 SoC
is not upstream yet, we simply replace the sii9022 with a 1024x768 panel.

The i2c0 is not used by the lsdc driver for this board, so there is no
need to worry about DVO0.

[1] https://wiki.debian.org/InstallingDebianOn/Lemote/Loongson2K1000
[2] https://github.com/loongson-community/pmon

Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
---
 arch/mips/boot/dts/loongson/ls2k1000_pai.dts | 102 +++
 1 file changed, 102 insertions(+)
 create mode 100644 arch/mips/boot/dts/loongson/ls2k1000_pai.dts

diff --git a/arch/mips/boot/dts/loongson/ls2k1000_pai.dts 
b/arch/mips/boot/dts/loongson/ls2k1000_pai.dts
new file mode 100644
index ..0b0172d90677
--- /dev/null
+++ b/arch/mips/boot/dts/loongson/ls2k1000_pai.dts
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/dts-v1/;
+
+#include "loongson64-2k1000.dtsi"
+
+/ {
+   model = "LS2K1000_PAI_UDB_V1.5";
+
+   panel: display@0 {
+   compatible = "hontron,070JII2135-A2", "panel-dpi";
+   label = "LCD070CG1024600+DC21";
+
+   rotation = <0>;
+   width-mm = <86>;
+   height-mm = <154>;
+
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   panel_in: endpoint@0 {
+   reg = <0>;
+   remote-endpoint = <_out_rgb0>;
+   };
+   };
+
+   panel-timing {
+   clock-frequency = <5120>;
+   hactive = <1024>;
+   vactive = <600>;
+   hsync-len = <4>;
+   hfront-porch = <160>;
+   hback-porch = <156>;
+   vfront-porch = <11>;
+   vback-porch = <23>;
+   vsync-len = <1>;
+
+   hsync-active = <0>;
+   vsync-active = <0>;
+   de-active = <1>;
+   pixelclk-active = <1>;
+   };
+   };
+
+   monitor: display@1 {
+   compatible = "panel-dpi";
+
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   monitor_in: endpoint@0 {
+   reg = <0>;
+   remote-endpoint = <_out_rgb1>;
+   };
+   };
+
+   panel-timing {
+   clock-frequency = <6500>;
+   hactive = <1024>;
+   vactive = <768>;
+   hfront-porch = <24>;
+   hsync-len = <136>;
+   hback-porch = <160>;
+   vfront-porch = <3>;
+   vback-porch = <6>;
+   vsync-len = <29>;
+
+   hsync-active = <0>;
+   vsync-active = <0>;
+   de-active = <1>;
+   pixelclk-active = <1>;
+   };
+   };
+};
+
+ {
+   ports {
+   port@0 {
+   endpoint {
+   remote-endpoint = <_in>;
+   };
+   };
+
+   port@1 {
+   endpoint {
+   remote-endpoint = 

[PATCH v11 3/7] MIPS: Loongson64: dts: introduce lemote A1901 motherboard

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

This board is made by LEMOTE corporation, it has two names: one
is LX-6901, the other is A1901.

This board has only one VGA output which is connected to the DVO1 of
the display controller.

+--++---+
| DDR4 ||  +---+|
+--+|  | PCIe Root complex |   LS7A1000 |
   || MC0   |  +--++-+++|
  +--+  HT 3.0  | || || |
  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
  +--+  | ++  +-+--+-++-+   +--+
   || MC1   +---|--|+
+--+|  |
| DDR4 |   DVO0 is not get used |  |  DVO1   +---+
+--+   <+  +>|ADV7125|---> VGA
 +---+
The model property added can provide board specific information,
the mips kernel uses it as the machine name.

$ cat /proc/cpuinfo

system type : Generic Loongson64 System
machine : LX-6901  < notice here
processor   : 0
cpu model   : ICT Loongson-3 V0.1  FPU V0.1
BogoMIPS: 3594.02
tlb_entries : 2112
isa : mips64r2
ASEs implemented: vz msa loongson-ext2
...

Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
---
 arch/mips/boot/dts/loongson/lemote_a1901.dts | 92 
 1 file changed, 92 insertions(+)
 create mode 100644 arch/mips/boot/dts/loongson/lemote_a1901.dts

diff --git a/arch/mips/boot/dts/loongson/lemote_a1901.dts 
b/arch/mips/boot/dts/loongson/lemote_a1901.dts
new file mode 100644
index ..f0443bc43af9
--- /dev/null
+++ b/arch/mips/boot/dts/loongson/lemote_a1901.dts
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/dts-v1/;
+
+#include "loongson64g-package.dtsi"
+#include "ls7a-pch.dtsi"
+
+/ {
+   model = "LX-6901";
+
+   vga-encoder {
+   compatible = "adi,adv7123", "dumb-vga-dac";
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+   adv7123_in: endpoint {
+   remote-endpoint = <_out_rgb1>;
+   };
+   };
+
+   port@1 {
+   reg = <1>;
+   adv7123_out: endpoint {
+   remote-endpoint = <_connector_in>;
+   };
+   };
+   };
+   };
+
+   vga-connector {
+   compatible = "vga-connector";
+   label = "vga";
+
+   ddc-i2c-bus = <>;
+
+   port {
+   vga_connector_in: endpoint {
+   remote-endpoint = <_out>;
+   };
+   };
+   };
+};
+
+ {
+   htvec: interrupt-controller@efdfb80 {
+   compatible = "loongson,htvec-1.0";
+   reg = <0xefd 0xfb80 0x40>;
+   interrupt-controller;
+   #interrupt-cells = <1>;
+
+   interrupt-parent = <>;
+   interrupts = <24 IRQ_TYPE_LEVEL_HIGH>,
+<25 IRQ_TYPE_LEVEL_HIGH>,
+<26 IRQ_TYPE_LEVEL_HIGH>,
+<27 IRQ_TYPE_LEVEL_HIGH>,
+<28 IRQ_TYPE_LEVEL_HIGH>,
+<29 IRQ_TYPE_LEVEL_HIGH>,
+<30 IRQ_TYPE_LEVEL_HIGH>,
+<31 IRQ_TYPE_LEVEL_HIGH>;
+   };
+};
+
+ {
+   msi: msi-controller@2ff0 {
+   compatible = "loongson,pch-msi-1.0";
+   reg = <0 0x2ff0 0 0x8>;
+   interrupt-controller;
+   msi-controller;
+   loongson,msi-base-vec = <64>;
+   loongson,msi-num-vecs = <192>;
+   interrupt-parent = <>;
+   };
+};
+
+ {
+   ports {
+   port@0 {
+   status = "disabled";
+   };
+
+   port@1 {
+   status = "ok";
+   endpoint {
+   remote-endpoint = <_in>;
+   };
+   };
+   };
+};
-- 
2.25.1



[PATCH v11 2/7] MIPS: Loongson64: dts: introduce ls3A4000 evaluation board

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

The board name is LS3A4000_7A1000_EVB_BOARD_V1.4, it consists of a 1.8GHz
mips64r5 4-core CPU and an LS7A1000 bridge chip. It has a PCIe GEN2 x8 slot,
therefore it can be used with a discrete graphics card.

The integrated display controller is equipped with a VGA output
and a DVI output: the VGA is connected to the DVO0 output port of the
display controller, and the DVI is connected to the DVO1 output port of the
display controller.

+--++---+
| DDR4 ||  +---+|
+--+|  | PCIe Root complex |   LS7A1000 |
   || MC0   |  +--++-+++|
  +--+  HT 3.0  | || || |
  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
  +--+  | ++  +-+--+-++-+   +--+
   || MC1   +---|--|+
+--+|  |
| DDR4 |  +---+   DVO0  |  |  DVO1   +--+
+--+   VGA <--|ADV7125|<+  +>|TFP410|--> DVI/HDMI
  +---+  +--+

Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
---
 .../boot/dts/loongson/ls3a4000_7a1000_evb.dts | 136 ++
 1 file changed, 136 insertions(+)
 create mode 100644 arch/mips/boot/dts/loongson/ls3a4000_7a1000_evb.dts

diff --git a/arch/mips/boot/dts/loongson/ls3a4000_7a1000_evb.dts 
b/arch/mips/boot/dts/loongson/ls3a4000_7a1000_evb.dts
new file mode 100644
index ..f467eddccdac
--- /dev/null
+++ b/arch/mips/boot/dts/loongson/ls3a4000_7a1000_evb.dts
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/dts-v1/;
+
+#include "loongson64g-package.dtsi"
+#include "ls7a-pch.dtsi"
+
+/ {
+   compatible = "loongson,loongson64g-4core-ls7a";
+   model = "LS3A4000_7A1000_EVB_BOARD_V1.4";
+
+   vga-encoder {
+   compatible = "adi,adv7123", "dumb-vga-dac";
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+   adv7123_in: endpoint {
+   remote-endpoint = <_out_rgb0>;
+   };
+   };
+
+   port@1 {
+   reg = <1>;
+   adv7123_out: endpoint {
+   remote-endpoint = <_connector_in>;
+   };
+   };
+   };
+   };
+
+   vga-connector {
+   compatible = "vga-connector";
+   label = "vga";
+
+   ddc-i2c-bus = <>;
+
+   port {
+   vga_connector_in: endpoint {
+   remote-endpoint = <_out>;
+   };
+   };
+   };
+
+   tfp410: dvi-encoder {
+   compatible = "ti,tfp410";
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+   tfp410_in: endpoint {
+   pclk-sample = <1>;
+   bus-width = <24>;
+   remote-endpoint = <_out_rgb1>;
+   };
+   };
+
+   port@1 {
+   reg = <1>;
+   tfp410_out: endpoint {
+   remote-endpoint = <_connector_in>;
+   };
+   };
+   };
+   };
+
+   dvi-connector {
+   compatible = "dvi-connector";
+   label = "dvi";
+   digital;
+
+   ddc-i2c-bus = <>;
+
+   port {
+   dvi_connector_in: endpoint {
+   remote-endpoint = <_out>;
+   };
+   };
+   };
+};
+
+ {
+   htvec: interrupt-controller@efdfb80 {
+   compatible = "loongson,htvec-1.0";
+   reg = <0xefd 0xfb80 0x40>;
+   interrupt-controller;
+   #interrupt-cells = <1>;
+
+   interrupt-parent = <>;
+   interrupts = <24 IRQ_TYPE_LEVEL_HIGH>,
+<25 IRQ_TYPE_LEVEL_HIGH>,
+<26 IRQ_TYPE_LEVEL_HIGH>,
+<27 IRQ_TYPE_LEVEL_HIGH>,
+<28 IRQ_TYPE_LEVEL_HIGH>,
+<29 IRQ_TYPE_LEVEL_HIGH>,
+<30 

[PATCH v11 0/7] drm/lsdc: add drm driver for loongson display controller

2022-03-21 Thread Sui Jingfeng
There is a display controller in Loongson's LS2K1000 SoC and LS7A1000
bridge chip; the display controller is a PCI device in those chips. It
has two display pipes but only one hardware cursor. Each pipe has
a DVO interface which provides RGB888 signals, vertical & horizontal
synchronisations, data enable and the pixel clock. Each CRTC is able to
scanout from 1920x1080 resolution at 60Hz; the maximum resolution is
2048x2048 according to the hardware spec. Loongson display controllers
are simple and require scanout buffers to be physically contiguous.

For the LS7A1000 bridge chip, the DC is equipped with a dedicated video RAM
which is typically 64MB or more. In this case, the VRAM helper based driver
is supposed to be used. LS2K1000, by contrast, is a SoC where only system
memory is available, so the CMA helper based driver is intended to be used.
It is possible to use a VRAM helper based solution by carving out part of
system memory as VRAM though.

For LS7A1000, there are 4 dedicated GPIOs whose control register is
located in the DC register space. They are used to emulate two i2c buses,
one for DVO0 and another for DVO1. The LS2K1000 and LS2K0500 SoCs don't have
such GPIO hardware; they grab an i2c adapter from another module, either a
general purpose GPIO emulated i2c or a hardware i2c adapter.

+--++---+
| DDR4 ||  +---+|
+--+|  | PCIe Root complex |   LS7A1000 |
   || MC0   |  +--++-+++|
  +--+  HT 3.0  | || || |
  | LS3A4000 |<>| +---++---+  +--++--++-+   +--+
  |   CPU|<>| | GC1000 |  | LSDC |<-->| DDR3 MC |<->| VRAM |
  +--+  | ++  +-+--+-++-+   +--+
   || MC1   +---|--|+
+--+|  |
| DDR4 |  +---+   DVO0  |  |  DVO1   +--+
+--+   VGA <--|ADV7125|<+  +>|TFP410|--> DVI/HDMI
  +---+  +--+

The above picture gives a simple usage of LS7A1000; note that the encoder
is not necessarily adv7125 or tfp410, other candidates can be ch7034b,
sil9022, ite66121 and lt8618 etc.

v2: Fixup warnings reported by kernel test robot

v3: Fix more grammar mistakes in Kconfig reported by Randy Dunlap and give
more details about lsdc.

v4:
   1) Add dts required and explain why device tree is required.
   2) Give more description about lsdc and VRAM helper based driver.
   3) Fix warnings reported by kernel test robot.
   4) Introduce stride_alignment member into struct lsdc_chip_desc, the
  stride alignment is 256 bytes for ls7a1000, ls2k1000 and ls2k0500.

v5:
   1) Use writel and readl instead of writeq and readq, to fix a build
  error reported by the kernel test robot on other architectures.
   2) Set default fb format to XRGB at crtc reset time.

v6:
   1) Explain why we are not switching to the drm bridge subsystem on ls2k1000.
   2) Explain why tiny drm driver is not suitable for us.
   3) Give a short description of the trivial dirty update implementation based
  on CMA helper.

v7:
   1) Remove select I2C_GPIO and I2C_LS2X in Kconfig, it is not ready now
   2) Licensing issues are fixed suggested by Krzysztof Kozlowski.
   3) Remove lsdc_pixpll_print(), part of it move to debugfs.
   4) Set prefer_shadow to true if vram based driver is in use.
   5) Replace double blank lines with single line in all files.
   6) Verbose cmd line parameter is replaced with drm_dbg()
   7) All warnings reported by ./scripts/checkpatch.pl --strict are fixed
   8) Get edid from dtb support is removed as suggested by Maxime Ripard
   9) Fix typos and various improvement

v8:
   1) Drop damage update implement and its command line.
   2) Drop DRM_LSDC_VRAM_DRIVER config option as suggested by Maxime.
   3) Deduce DC's identification from its compatible property.
   4) Drop the board specific dts patch.
   5) Add documentation about the display controller device node.

v9:
   1) Fix the warnings reported by checkpatch script and fix typos

v10:
   1) Pass `make dt_binding_check` validation
   2) Fix warnings reported by kernel test robot

v11:
   1) Convert the driver to use drm bridge and of graph framework.
   2) Dump register value support through debugfs.

Below is a brief introduction of loongson's CPU, bridge chip and SoC.
LS2K1000 is a double core 1.0Ghz mips64r2 compatible SoC[1]. LS7A1000 is
a bridge chip made by Loongson corporation which acts as north and/or south
bridge of Loongson's desktop and server level processors. It is equivalent
to AMD RS780E+SB710 or something like that. More details can be read from
its user manual[2].

This bridge chip is typically used with LS3A3000, LS3A4000 and LS3A5000 cpu.
LS3A3000 is 4 core 1.45gHz mips64r2 compatible cpu.
LS3A4000 is 4 core 1.8gHz mips64r5 compatible cpu[3].
LS3A5000 is 4 core 2.5gHz loongarch 

[PATCH v11 1/7] MIPS: Loongson64: dts: update the display controller device node

2022-03-21 Thread Sui Jingfeng
From: suijingfeng 

The display controller is a pci device, it is used in ls2k1000 SoC and
LS7A1000 bridge. Its PCI vendor id is 0x0014, its PCI device id is 0x7a06.
In order to let the driver know which chip the DC is contained in,
the compatible of the display controller is named according to the chip's
name.

For LS7A1000, there are 4 dedicated GPIOs whose control register is
located at the DC register space. They are used to emulate i2c for reading
edid from the monitor. One for DVO0, another for DVO1.

LS2K1000 and LS2K0500 SoC don't have such GPIOs, they grab i2c adapter
from other module, either general purpose GPIO emulated i2c or hardware
i2c adapter.

Signed-off-by: suijingfeng 
Signed-off-by: Sui Jingfeng <15330273...@189.cn>
---
 .../boot/dts/loongson/loongson64-2k1000.dtsi  | 24 +
 arch/mips/boot/dts/loongson/ls7a-pch.dtsi | 36 ---
 2 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/arch/mips/boot/dts/loongson/loongson64-2k1000.dtsi 
b/arch/mips/boot/dts/loongson/loongson64-2k1000.dtsi
index 8143a6e3..b1683399 100644
--- a/arch/mips/boot/dts/loongson/loongson64-2k1000.dtsi
+++ b/arch/mips/boot/dts/loongson/loongson64-2k1000.dtsi
@@ -198,6 +198,30 @@ sata@8,0 {
interrupt-parent = <>;
};
 
+   lsdc: display-controller@6,0 {
+   compatible = "loongson,ls2k1000-dc";
+
+   reg = <0x3000 0x0 0x0 0x0 0x0>;
+   interrupts = <28 IRQ_TYPE_LEVEL_LOW>;
+   interrupt-parent = <>;
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+   dc_out_rgb0: endpoint {
+   };
+   };
+   port@1 {
+   reg = <1>;
+   dc_out_rgb1: endpoint {
+   };
+   };
+   };
+   };
+
pci_bridge@9,0 {
compatible = "pci0014,7a19.0",
   "pci0014,7a19",
diff --git a/arch/mips/boot/dts/loongson/ls7a-pch.dtsi 
b/arch/mips/boot/dts/loongson/ls7a-pch.dtsi
index 2f45fce2cdc4..fcea73006f7a 100644
--- a/arch/mips/boot/dts/loongson/ls7a-pch.dtsi
+++ b/arch/mips/boot/dts/loongson/ls7a-pch.dtsi
@@ -160,15 +160,41 @@ gpu@6,0 {
interrupt-parent = <>;
};
 
-   dc@6,1 {
-   compatible = "pci0014,7a06.0",
-  "pci0014,7a06",
-  "pciclass03",
-  "pciclass0300";
+   lsdc: display-controller@6,1 {
+   compatible = "loongson,ls7a1000-dc";
 
reg = <0x3100 0x0 0x0 0x0 0x0>;
interrupts = <28 IRQ_TYPE_LEVEL_HIGH>;
interrupt-parent = <>;
+
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   i2c6: i2c-gpio@0 {
+   compatible = "lsdc,i2c-gpio-0";
+   reg = <6>;
+   };
+
+   i2c7: i2c-gpio@1 {
+   compatible = "lsdc,i2c-gpio-1";
+   reg = <7>;
+   };
+
+   ports {
+   #address-cells = <1>;
+   #size-cells = <0>;
+
+   port@0 {
+   reg = <0>;
+   dc_out_rgb0: endpoint {
+   };
+   };
+   port@1 {
+   reg = <1>;
+   dc_out_rgb1: endpoint {
+   };
+   };
+   };
};
 
hda@7,0 {
-- 
2.25.1



RE: [PATCH v6 1/5] drm/msm/disp/dpu1: set mdp clk to the maximum frequency in opp table during probe

2022-03-21 Thread Vinod Polimera


> -Original Message-
> From: Stephen Boyd 
> Sent: Friday, March 18, 2022 2:41 AM
> To: quic_vpolimer ;
> devicet...@vger.kernel.org; dri-devel@lists.freedesktop.org;
> freedr...@lists.freedesktop.org; linux-arm-...@vger.kernel.org
> Cc: linux-ker...@vger.kernel.org; robdcl...@gmail.com;
> dmitry.barysh...@linaro.org; diand...@chromium.org; quic_kalyant
> 
> Subject: Re: [PATCH v6 1/5] drm/msm/disp/dpu1: set mdp clk to the
> maximum frequency in opp table during probe
> 
> WARNING: This email originated from outside of Qualcomm. Please be wary
> of any links or attachments, and do not enable macros.
> 
> Quoting Vinod Polimera (2022-03-14 07:46:53)
> > use max clock during probe/bind sequence from the opp table.
> > The clock will be scaled down when framework sends an update.
> 
> Capitalize 'use'.
> 
> Why is it important to use max frequency during probe/bind? Does not
> setting the clk rate during probe mean that we'll never use the max
> rate? Does it speed things up during probe?

We need to vote on the mdp clock during probe/bind so that rails are not left in an 
undetermined state, as pointed out by Dmitry.
Since we don't know what rate will be set in the boot loader, it would be ideal 
to vote at max frequency. 
There could be a firmware display programmed in the bootloader and we want to 
transition it to the kernel without underflowing.

Thanks,
Vinod P.


Re: [PATCH v2 1/2] drm: Add GPU reset sysfs event

2022-03-21 Thread Rob Clark
On Mon, Mar 21, 2022 at 2:30 AM Christian König
 wrote:
>
> Am 18.03.22 um 16:12 schrieb Rob Clark:
> > On Fri, Mar 18, 2022 at 12:42 AM Christian König
> >  wrote:
> >> Am 17.03.22 um 18:31 schrieb Rob Clark:
> >>> On Thu, Mar 17, 2022 at 10:27 AM Daniel Vetter  wrote:
>  [SNIP]
> > (At some point, I'd like to use scheduler for the replay, and actually
> > use drm_sched_stop()/etc.. but last time I looked there were still
> > some sched bugs in that area which prevented me from deleting a bunch
> > of code ;-))
>  Not sure about your hw, but at least on intel replaying tends to just
>  result in follow-on fun. And that holds even more so the more complex a
>  workload is. This is why vk just dies immediately and does not try to
>  replay anything, offloading it to the app. Same with arb robustness.
>  Afaik it's really only media and classic gl which insist that the driver
>  stack somehow recover.
> >>> At least for us, each submit must be self-contained (ie. not rely on
> >>> previous GPU hw state), so in practice replay works out pretty well.
> >>> The worst case is subsequent submits from same process fail as well
> >>> (if they depended on something that crashing submit failed to write
> >>> back to memory.. but in that case they just crash as well and we move
> >>> on to the next one.. the recent gens (a5xx+ at least) are pretty good
> >>> about quickly detecting problems and giving us an error irq.
> >> Well I absolutely agree with Daniel.
> >>
> >> The whole replay thing AMD did in the scheduler is an absolute mess
> >> and should probably be killed with fire.
> >>
> >> I strongly recommend not to make the same mistake in other drivers.
> >>
> >> If you want to have some replay feature then please make it driver
> >> specific and don't use anything from the infrastructure in the DRM
> >> scheduler.
> > hmm, perhaps I was not clear, but I'm only talking about re-emitting
> > jobs *following* the faulting one (which could be from other contexts,
> > etc).. not trying to restart the faulting job.
> >
> > You *absolutely* need to replay jobs following the faulting one, they
> > could be from unrelated contexts/processes.  You can't just drop them
> > on the floor.
>
> Well you can, it just means that their contexts are lost as well.

Which is rather inconvenient when deqp-egl reset tests, for example,
take down your compositor ;-)

(Which for even more lolz, in CrOS restarts the android container or
vm.. which makes running android-cts deqp kinda funny)

> If you re-submit jobs which were already pushed to the hardware you
> absolutely need to make a couple of things sure:
>
> 1. Don't race with your hardware. E.g. you need a way to stop processing
> in case of a timeout and then double check once more if things haven't
> finished in the meantime.
>
> 2. Make absolutely sure you never re-submit an operation when its
> dma-fence is already signaled. Otherwise you run into memory corruption.
>
> 3. When you have multiple engines it becomes really tricky because then
> even innocent jobs might have already been started on different queues
> which now hang.

We force power-off/on the GPU to reset it which is a pretty good way
to make sure we aren't racing with the GPU.

It's worked like this since pretty much the beginning, and in the
early days of bringing up mesa support for a new gen we tend to
exercise the gpu hang/recovery path quite a lot.. so it at least seems
pretty robust ;-)

BR,
-R

>
> > Currently it is all driver specific, but I wanted to delete a lot of
> > code and move to using scheduler to handle faults/timeouts (but
> > blocked on that until [1] is resolved)
>
> Please don't.
>
> Especially don't use the pending_list or any of the scheduler
> infrastructure for GPU reset. We need to get rid of that again sooner or
> later.
>
> This is extremely hardware dependent and pushing the amdgpu specific
> handling into the GPU scheduler was a mistake we shouldn't repeat for
> other drivers.
>
> Regards,
> Christian.
>
> >
> > [1] 
> > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.kernel.org%2Fproject%2Fdri-devel%2Fpatch%2F1630457207-13107-2-git-send-email-Monk.Liu%40amd.com%2Fdata=04%7C01%7Cchristian.koenig%40amd.com%7C1f6ddc253f9341231fa108da08f1afa9%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637832131381866493%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000sdata=e%2F1tOh3nxH3QfzKQKiJKjCU7Z5S6haX07F8rzwZhRVY%3Dreserved=0
> >
> > BR,
> > -R
> >
> >> Thanks,
> >> Christian.
> >>
> >>> BR,
> >>> -R
> >>>
>  And recovering from a mess in userspace is a lot simpler than trying to
>  pull off the same magic in the kernel. Plus it also helps with a few of 
>  the
>  dma_fence rules, which is a nice bonus.
>  -Daniel
> 
>


Re: [PATCH] drm/ttm: fix potential null ptr deref in when mem space alloc fails

2022-03-21 Thread Robert Beckett




On 21/03/2022 09:51, Christian König wrote:

Am 18.03.22 um 20:50 schrieb Robert Beckett:

when allocating a resource in place it is common to free the buffer's
resource, then allocate a new resource in a different placement.

e.g. amdgpu_bo_create_kernel_at calls ttm_resource_free, then calls
ttm_bo_mem_space.


Well yes I'm working the drivers towards this, but NAK at the moment. 
Currently bo->resource is never expected to be NULL.


And yes I'm searching for this bug in amdgpu for quite a while. Where 
exactly does that happen?


in my case, I am writing new code for i915 that does this. I will switch 
it to allocate the new resource first, then free the old one if successful.


For the existing amd case, see 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c?h=v5.17#n384



amdgpu_bo_create_kernel_at calls ttm_resource_free, then calls 
ttm_bo_mem_space. If the ttm_bo_mem_space call fails (e.g. due to memory 
pressure), then the error path will try to deref bo->resource, which 
will be null at that point.



to fix this, I honestly don't see a reason to not also have the safety 
check for null there. It could check early and return an error if it is 
null. I think that defensive programming here makes sense, better than a 
null deref if someone programs it wrong.






Amdgpu is supposed to allocate a new resource first, then do a swap and 
then free the old one.


Thanks,
Christian.



In this situation, bo->resource will be null as it is cleared during
the initial freeing of the previous resource.
This leads to a null deref.

Fixes: d3116756a710 (drm/ttm: rename bo->mem and make it a pointer)

Signed-off-by: Robert Beckett 
---
  drivers/gpu/drm/ttm/ttm_bo.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index db3dc7ef5382..62b29ee7d040 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -875,7 +875,7 @@ int ttm_bo_mem_space(struct ttm_buffer_object *bo,
  }
  error:
-    if (bo->resource->mem_type == TTM_PL_SYSTEM && !bo->pin_count)
+    if (bo->resource && bo->resource->mem_type == TTM_PL_SYSTEM && 
!bo->pin_count)

  ttm_bo_move_to_lru_tail_unlocked(bo);
  return ret;




Re: [PATCH 0/4] Drop wbinvd_on_all_cpus usage

2022-03-21 Thread Thomas Hellström
On Mon, 2022-03-21 at 14:43 +0000, Tvrtko Ursulin wrote:
> 
> On 21/03/2022 13:40, Thomas Hellström wrote:
> > Hi,
> > 
> > On Mon, 2022-03-21 at 13:12 +0000, Tvrtko Ursulin wrote:
> > > 
> > > On 21/03/2022 12:33, Thomas Hellström wrote:
> > > > On Mon, 2022-03-21 at 12:22 +0000, Tvrtko Ursulin wrote:
> > > > > 
> > > > > On 21/03/2022 11:03, Thomas Hellström wrote:
> > > > > > Hi, Tvrtko.
> > > > > > 
> > > > > > On 3/21/22 11:27, Tvrtko Ursulin wrote:
> > > > > > > 
> > > > > > > On 19/03/2022 19:42, Michael Cheng wrote:
> > > > > > > > To align with the discussion in [1][2], this patch
> > > > > > > > series
> > > > > > > > drops
> > > > > > > > all
> > > > > > > > usage of
> > > > > > > > > wbinvd_on_all_cpus within i915 by either replacing the
> > > > > > > > call
> > > > > > > > with certain
> > > > > > > > drm clflush helpers, or reverting to a previous logic.
> > > > > > > 
> > > > > > > AFAIU, complaint from [1] was that it is wrong to provide
> > > > > > > non
> > > > > > > x86
> > > > > > > implementations under the wbinvd_on_all_cpus name.
> > > > > > > Instead an
> > > > > > > arch
> > > > > > > agnostic helper which achieves the same effect could be
> > > > > > > created.
> > > > > > > Does
> > > > > > > Arm have such concept?
> > > > > > 
> > > > > > I also understand Linus' email like we shouldn't leak
> > > > > > incoherent
> > > > > > IO
> > > > > > to
> > > > > > other architectures, meaning any remaining wbinvd()s should
> > > > > > be
> > > > > > X86
> > > > > > only.
> > > > > 
> > > > > The last part is completely obvious since it is a x86
> > > > > instruction
> > > > > name.
> > > > 
> > > > Yeah, I meant the function implementing wbinvd() semantics.
> > > > 
> > > > > 
> > > > > But I think we can't pick a solution until we know how the
> > > > > concept
> > > > > maps
> > > > > to Arm and that will also include seeing how the
> > > > > drm_clflush_sg for
> > > > > Arm
> > > > > would look. Is there a range based solution, or just a big
> > > > > hammer
> > > > > there.
> > > > > If the latter, then it is no good to churn all these reverts
> > > > > but
> > > > > instead
> > > > > an arch agnostic wrapper, with a generic name, would be the
> > > > > way to
> > > > > go.
> > > > 
> > > > But my impression was that ARM would not need the range-based
> > > > interface
> > > > either, because ARM is only for discrete and with discrete
> > > > we're
> > > > always
> > > > coherent.
> > > 
> > > Not sure what you mean here - what about flushing system memory
> > > objects
> > > on discrete? Those still need flushing on paths like suspend
> > > which this
> > > series touches. Am I missing something?
> > 
> > System bos on discrete should always have
> > 
> > I915_BO_CACHE_COHERENT_FOR_READ | I915_BO_CACHE_COHERENT_FOR_WRITE
> > 
> > either by the gpu being fully cache coherent (or us mapping system
> > write-combined). Hence no need for cache clflushes or wbinvd() for
> > incoherent IO.
> 
> Hmm so you are talking about the shmem ttm backend. It ends up
> depending on the result of i915_ttm_cache_level, yes? It cannot end
> up with I915_CACHE_NONE from that function?

If the object is allocated with allowable placement in either LMEM or
SYSTEM, and it ends in system, it gets allocated with I915_CACHE_NONE,
but then the shmem ttm backend isn't used but TTM's wc pools, and the
object should *always* be mapped wc. Even in system.

> 
> I also found in i915_drm.h:
> 
>  * As caching mode when specifying `I915_MMAP_OFFSET_FIXED`,
> WC or WB will
>  * be used, depending on the object placement on creation. WB
> will be used
>  * when the object can only exist in system memory, WC
> otherwise.
> 
> If what you say is true, that on discrete it is _always_ WC, then
> that needs updating as well.

If an object is allocated as system only, then it is mapped WB, and
we're relying on the gpu being cache coherent to avoid clflushes. Same
is actually currently true if the object happens to be accessed by the
cpu while evicted. Might need an update for that.

> 
> > 
> > That's adhering to Linus'
> > 
> > "And I sincerely hope to the gods that no cache-incoherent i915
> > mess
> > ever makes it out of the x86 world. Incoherent IO was always a
> > historical mistake and should never ever happen again, so we should
> > not spread that horrific pattern around."
> 
> Sure, but I was not talking about IO - just the CPU side access to
> CPU side objects.

OK, I was under the impression that clflushes() and wbinvd()s in i915
was only ever used to make data visible to non-snooping GPUs. 

Do you mean that there are other uses as well? Agreed the wb cache
flush on suspend only if gpu is !I915_BO_CACHE_COHERENT_FOR_READ?
looks to not fit this pattern completely.

Otherwise, for architectures where memory isn't always fully coherent
with the cpu cache, I'd expect them to use the apis in
asm/cacheflush.h, like flush_cache_range() and similar, which are nops
on x86.

Thanks,
Thomas


> 
> 

Re: [PATCH 07/23] drm/vmwgfx: stop using dma_resv_excl_fence

2022-03-21 Thread Zack Rusin
On Mon, 2022-03-21 at 15:12 +0100, Christian König wrote:
> Am 21.03.22 um 15:02 schrieb Zack Rusin:
> > On Mon, 2022-03-21 at 14:58 +0100, Christian König wrote:
> > > ⚠ External Email: This email originated from outside of the
> > > organization. Do not click links or open attachments unless you
> > > recognize the sender.
> > > 
> > > Instead use the new dma_resv_get_singleton function.
> > > 
> > > Signed-off-by: Christian König 
> > > Reviewed-by: Daniel Vetter 
> > > Cc: VMware Graphics 
> > > Cc: Zack Rusin 
> > > ---
> > > >   drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 6 ++++--
> > >   1 file changed, 4 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> > > b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> > > index 708899ba2102..36c3b5db7e69 100644
> > > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> > > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> > > @@ -1165,8 +1165,10 @@ int vmw_resources_clean(struct
> > > vmw_buffer_object *vbo, pgoff_t start,
> > >  vmw_bo_fence_single(bo, NULL);
> > >  if (bo->moving)
> > >  dma_fence_put(bo->moving);
> > > -   bo->moving = dma_fence_get
> > > -   (dma_resv_excl_fence(bo->base.resv));
> > > +
> > > +   /* TODO: This is actually a memory management
> > > dependency */
> > > +   return dma_resv_get_singleton(bo->base.resv,
> > > false,
> > > > +                                     &bo->moving);
> > >  }
> > > 
> > >  return 0;
> > > --
> > > 2.25.1
> > > 
> > Sorry, I haven't had the time to go over the entire series, the
> > patch
> > looks good, but what's the memory management dependency the todo
> > mentions?
> 
> Previously the function installed only the exclusive fence as moving
> fence into the BO.
> Now it grabs all fences and installs them as moving fence into the
> BO.
> 
> But what we really need is tracking if a fence in the reservation
> object
> is a kernel memory management dependency or not.
> 
> Patch #19 adds that and patch #23 then finally gets rid of the whole
> bo->moving handling here because it becomes completely unnecessary.
> 
> I can drop the comment if you want or just note that it is only
> temporary until the follow up patches are merged.

Ah, yes, if you could remove it that'd be great. The patch will never
be backported anywhere without the rest of the series, so it shouldn't
be a problem. 

z


[CI] drm/i915/uapi: Add struct drm_i915_query_hwconfig_blob_item

2022-03-21 Thread Tvrtko Ursulin
From: Jordan Justen 

Also, document DRM_I915_QUERY_HWCONFIG_BLOB with this struct.

v3:
 * Add various changes suggested by Tvrtko

v5:
 * Fix documentation formatting and verified with `make htmldocs` as
   suggested by Daniel

Cc: Daniel Vetter 
Signed-off-by: Jordan Justen 
Acked-by: Jon Bloomfield 
Acked-by: Daniel Vetter 
---
I am resurrecting this since I think it is the minimum we need for an
open source friendly stack.
---
 include/uapi/drm/i915_drm.h | 43 +
 1 file changed, 43 insertions(+)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 071ffd9d51f1..34ad66ddd320 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -3314,6 +3314,49 @@ struct drm_i915_gem_create_ext_protected_content {
 /* ID of the protected content session managed by i915 when PXP is active */
 #define I915_PROTECTED_CONTENT_DEFAULT_SESSION 0xf
 
+/**
+ * DOC: GuC HWCONFIG blob uAPI
+ *
+ * The GuC produces a blob with information about the current device.
+ * i915 reads this blob from GuC and makes it available via this uAPI.
+ *
+ * The returned blob is a sequence of items of variable length
+ * described by struct drm_i915_query_hwconfig_blob_item.
+ *
+ * The overall blob returned by DRM_I915_QUERY_HWCONFIG_BLOB will end
+ * at the same location as the end of the final struct
+ * drm_i915_query_hwconfig_blob_item. In other words, walking through
+ * the individual items is guaranteed to eventually arrive at the
+ * exact end of the entire blob.
+ */
+
+/**
+ * struct drm_i915_query_hwconfig_blob_item - A single hwconfig item
+ * within the sequence of hwconfig items returned by
+ * DRM_I915_QUERY_HWCONFIG_BLOB.
+ *
+ * The length field gives the length of the data[] array. The length
+ * is the number of u32 items in the data[] array, and *not* the
+ * number of bytes.
+ *
+ * The key and length fields are required, so the minimum item size is
+ * 2 x u32, or 8 bytes, when the length field is 0. If the length
+ * field is 1, then the item's size is 12 bytes.
+ *
+ * The meaning of the key field and the data values are documented in
+ * the Programmer's Reference Manual.
+ */
+struct drm_i915_query_hwconfig_blob_item {
+   /** @key: Enum which defines how to interpret @data values. */
+   __u32 key;
+
+   /** @length: The number of u32 values in the @data array. */
+   __u32 length;
+
+   /** @data: Array of values with meaning defined by @key. */
+   __u32 data[];
+};
+
 #if defined(__cplusplus)
 }
 #endif
-- 
2.32.0



Re: [PATCH 0/4] Drop wbinvd_on_all_cpus usage

2022-03-21 Thread Tvrtko Ursulin



On 21/03/2022 13:40, Thomas Hellström wrote:

Hi,

On Mon, 2022-03-21 at 13:12 +0000, Tvrtko Ursulin wrote:


On 21/03/2022 12:33, Thomas Hellström wrote:

On Mon, 2022-03-21 at 12:22 +0000, Tvrtko Ursulin wrote:


On 21/03/2022 11:03, Thomas Hellström wrote:

Hi, Tvrtko.

On 3/21/22 11:27, Tvrtko Ursulin wrote:


On 19/03/2022 19:42, Michael Cheng wrote:

To align with the discussion in [1][2], this patch series
drops
all
usage of
wbinvd_on_all_cpus within i915 by either replacing the call
with certain
drm clflush helpers, or reverting to a previous logic.


AFAIU, complaint from [1] was that it is wrong to provide non
x86
implementations under the wbinvd_on_all_cpus name. Instead an
arch
agnostic helper which achieves the same effect could be
created.
Does
Arm have such concept?


I also understand Linus' email like we shouldn't leak incoherent
IO
to
other architectures, meaning any remaining wbinvd()s should be
X86
only.


The last part is completely obvious since it is a x86 instruction
name.


Yeah, I meant the function implementing wbinvd() semantics.



But I think we can't pick a solution until we know how the concept
maps
to Arm and that will also include seeing how the drm_clflush_sg for
Arm
would look. Is there a range based solution, or just a big hammer
there.
If the latter, then it is no good to churn all these reverts but
instead
an arch agnostic wrapper, with a generic name, would be the way to
go.


But my impression was that ARM would not need the range-based
interface
either, because ARM is only for discrete and with discrete we're
always
coherent.


Not sure what you mean here - what about flushing system memory objects
on discrete? Those still need flushing on paths like suspend which this
series touches. Am I missing something?


System bos on discrete should always have

I915_BO_CACHE_COHERENT_FOR_READ | I915_BO_CACHE_COHERENT_FOR_WRITE

either by the gpu being fully cache coherent (or us mapping system
write-combined). Hence no need for cache clflushes or wbinvd() for
incoherent IO.


Hmm so you are talking about the shmem ttm backend. It ends up depending on the 
result of i915_ttm_cache_level, yes? It cannot end up with I915_CACHE_NONE from 
that function?

I also found in i915_drm.h:

 * As caching mode when specifying `I915_MMAP_OFFSET_FIXED`, WC or WB 
will
 * be used, depending on the object placement on creation. WB will be 
used
 * when the object can only exist in system memory, WC otherwise.

If what you say is true, that on discrete it is _always_ WC, then that needs 
updating as well.



That's adhering to Linus'

"And I sincerely hope to the gods that no cache-incoherent i915 mess
ever makes it out of the x86 world. Incoherent IO was always a
historical mistake and should never ever happen again, so we should
not spread that horrific pattern around."


Sure, but I was not talking about IO - just the CPU side access to CPU side 
objects.

Regards,

Tvrtko


Re: [PATCH v5 3/3] drm/panel : innolux-ej030na and abt-y030xx067a : add .enable and .disable

2022-03-21 Thread Christophe Branchereau
Sorry I meant "sleep out" not "sleep in" obviously

On Mon, Mar 21, 2022 at 3:39 PM Christophe Branchereau
 wrote:
>
> Following the introduction of bridge_atomic_enable in the ingenic
> drm driver, the crtc is enabled between .prepare and .enable, if
> it exists. Add it so the backlight is only enabled after the crtc is, to
> avoid graphical issues.
>
> As we're moving the "sleep in" command out of the init sequence
> into .enable for the ABT, we need to switch the regmap cache
> to REGCACHE_FLAT to be able to use regmap_set_bits, given this
> panel's registers are write-only and read as 0.
>
> On Mon, Mar 21, 2022 at 3:21 PM Paul Cercueil  wrote:
> >
> > Hi Christophe,
> >
> > Le lun., mars 21 2022 at 14:36:51 +0100, Christophe Branchereau
> >  a écrit :
> > > Following the introduction of bridge_atomic_enable in the ingenic
> > > drm driver, the crtc is enabled between .prepare and .enable, if
> > > it exists.
> > >
> > > Add it so the backlight is only enabled after the crtc is, to avoid
> > > graphical issues.
> > >
> > > Signed-off-by: Christophe Branchereau 
> >
> > Didn't Sam ack it?
> >
> > > ---
> > >  drivers/gpu/drm/panel/panel-abt-y030xx067a.c  | 31
> > > +--
> > >  drivers/gpu/drm/panel/panel-innolux-ej030na.c | 31
> > > ---
> > >  2 files changed, 55 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > > b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > > index f043b484055b..ddfacaeac1d4 100644
> > > --- a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > > +++ b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > > @@ -140,7 +140,7 @@ static const struct reg_sequence
> > > y030xx067a_init_sequence[] = {
> > >   { 0x03, REG03_VPOSITION(0x0a) },
> > >   { 0x04, REG04_HPOSITION1(0xd2) },
> > >   { 0x05, REG05_CLIP | REG05_NVM_VREFRESH | REG05_SLBRCHARGE(0x2) },
> > > - { 0x06, REG06_XPSAVE | REG06_NT },
> > > + { 0x06, REG06_NT },
> > >   { 0x07, 0 },
> > >   { 0x08, REG08_PANEL(0x1) | REG08_CLOCK_DIV(0x2) },
> > >   { 0x09, REG09_SUB_BRIGHT_R(0x20) },
> > > @@ -183,8 +183,6 @@ static int y030xx067a_prepare(struct drm_panel
> > > *panel)
> > >   goto err_disable_regulator;
> > >   }
> > >
> > > - msleep(120);
> > > -
> > >   return 0;
> > >
> > >  err_disable_regulator:
> > > @@ -202,6 +200,30 @@ static int y030xx067a_unprepare(struct drm_panel
> > > *panel)
> > >   return 0;
> > >  }
> > >
> > > +static int y030xx067a_enable(struct drm_panel *panel)
> > > +{
> > > +
> > > + struct y030xx067a *priv = to_y030xx067a(panel);
> > > +
> > > + regmap_set_bits(priv->map, 0x06, REG06_XPSAVE);
> > > +
> > > + if (panel->backlight) {
> > > + /* Wait for the picture to be ready before enabling 
> > > backlight */
> > > + msleep(120);
> > > + }
> > > +
> > > + return 0;
> > > +}
> > > +
> > > +static int y030xx067a_disable(struct drm_panel *panel)
> > > +{
> > > + struct y030xx067a *priv = to_y030xx067a(panel);
> > > +
> > > + regmap_clear_bits(priv->map, 0x06, REG06_XPSAVE);
> > > +
> > > + return 0;
> > > +}
> > > +
> > >  static int y030xx067a_get_modes(struct drm_panel *panel,
> > >   struct drm_connector *connector)
> > >  {
> > > @@ -239,6 +261,8 @@ static int y030xx067a_get_modes(struct drm_panel
> > > *panel,
> > >  static const struct drm_panel_funcs y030xx067a_funcs = {
> > >   .prepare= y030xx067a_prepare,
> > >   .unprepare  = y030xx067a_unprepare,
> > > + .enable = y030xx067a_enable,
> > > + .disable= y030xx067a_disable,
> > >   .get_modes  = y030xx067a_get_modes,
> > >  };
> > >
> > > @@ -246,6 +270,7 @@ static const struct regmap_config
> > > y030xx067a_regmap_config = {
> > >   .reg_bits = 8,
> > >   .val_bits = 8,
> > >   .max_register = 0x15,
> > > + .cache_type = REGCACHE_FLAT,
> >
> > I understand that this is added because the panel registers are
> > write-only and read as zero, and it is needed for
> > regmap_{clear,set}_bits to work.
> >
> > This information should definitely be added to the commit message.
> >
> > If you can add it inline here, and I'll update the commit message when
> > applying the patch.
> >
> > Cheers,
> > -Paul
> >
> > >  };
> > >
> > >  static int y030xx067a_probe(struct spi_device *spi)
> > > diff --git a/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > > b/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > > index c558de3f99be..6de7370185cd 100644
> > > --- a/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > > +++ b/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > > @@ -80,8 +80,6 @@ static const struct reg_sequence
> > > ej030na_init_sequence[] = {
> > >   { 0x47, 0x08 },
> > >   { 0x48, 0x0f },
> > >   { 0x49, 0x0f },
> > > -
> > > - { 0x2b, 0x01 },
> > >  };
> > >
> > >  static int ej030na_prepare(struct drm_panel *panel)
> > > @@ -109,8 +107,6 @@ 

Re: [PATCH v5 3/3] drm/panel : innolux-ej030na and abt-y030xx067a : add .enable and .disable

2022-03-21 Thread Christophe Branchereau
Following the introduction of bridge_atomic_enable in the ingenic
drm driver, the crtc is enabled between .prepare and .enable, if
it exists. Add it so the backlight is only enabled after the crtc is, to
avoid graphical issues.

As we're moving the "sleep in" command out of the init sequence
into .enable for the ABT, we need to switch the regmap cache
to REGCACHE_FLAT to be able to use regmap_set_bits, given this
panel's registers are write-only and read as 0.

On Mon, Mar 21, 2022 at 3:21 PM Paul Cercueil  wrote:
>
> Hi Christophe,
>
> Le lun., mars 21 2022 at 14:36:51 +0100, Christophe Branchereau
>  a écrit :
> > Following the introduction of bridge_atomic_enable in the ingenic
> > drm driver, the crtc is enabled between .prepare and .enable, if
> > it exists.
> >
> > Add it so the backlight is only enabled after the crtc is, to avoid
> > graphical issues.
> >
> > Signed-off-by: Christophe Branchereau 
>
> Didn't Sam ack it?
>
> > ---
> >  drivers/gpu/drm/panel/panel-abt-y030xx067a.c  | 31
> > +--
> >  drivers/gpu/drm/panel/panel-innolux-ej030na.c | 31
> > ---
> >  2 files changed, 55 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > index f043b484055b..ddfacaeac1d4 100644
> > --- a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > +++ b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
> > @@ -140,7 +140,7 @@ static const struct reg_sequence
> > y030xx067a_init_sequence[] = {
> >   { 0x03, REG03_VPOSITION(0x0a) },
> >   { 0x04, REG04_HPOSITION1(0xd2) },
> >   { 0x05, REG05_CLIP | REG05_NVM_VREFRESH | REG05_SLBRCHARGE(0x2) },
> > - { 0x06, REG06_XPSAVE | REG06_NT },
> > + { 0x06, REG06_NT },
> >   { 0x07, 0 },
> >   { 0x08, REG08_PANEL(0x1) | REG08_CLOCK_DIV(0x2) },
> >   { 0x09, REG09_SUB_BRIGHT_R(0x20) },
> > @@ -183,8 +183,6 @@ static int y030xx067a_prepare(struct drm_panel
> > *panel)
> >   goto err_disable_regulator;
> >   }
> >
> > - msleep(120);
> > -
> >   return 0;
> >
> >  err_disable_regulator:
> > @@ -202,6 +200,30 @@ static int y030xx067a_unprepare(struct drm_panel
> > *panel)
> >   return 0;
> >  }
> >
> > +static int y030xx067a_enable(struct drm_panel *panel)
> > +{
> > +
> > + struct y030xx067a *priv = to_y030xx067a(panel);
> > +
> > + regmap_set_bits(priv->map, 0x06, REG06_XPSAVE);
> > +
> > + if (panel->backlight) {
> > + /* Wait for the picture to be ready before enabling backlight 
> > */
> > + msleep(120);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +static int y030xx067a_disable(struct drm_panel *panel)
> > +{
> > + struct y030xx067a *priv = to_y030xx067a(panel);
> > +
> > + regmap_clear_bits(priv->map, 0x06, REG06_XPSAVE);
> > +
> > + return 0;
> > +}
> > +
> >  static int y030xx067a_get_modes(struct drm_panel *panel,
> >   struct drm_connector *connector)
> >  {
> > @@ -239,6 +261,8 @@ static int y030xx067a_get_modes(struct drm_panel
> > *panel,
> >  static const struct drm_panel_funcs y030xx067a_funcs = {
> >   .prepare= y030xx067a_prepare,
> >   .unprepare  = y030xx067a_unprepare,
> > + .enable = y030xx067a_enable,
> > + .disable= y030xx067a_disable,
> >   .get_modes  = y030xx067a_get_modes,
> >  };
> >
> > @@ -246,6 +270,7 @@ static const struct regmap_config
> > y030xx067a_regmap_config = {
> >   .reg_bits = 8,
> >   .val_bits = 8,
> >   .max_register = 0x15,
> > + .cache_type = REGCACHE_FLAT,
>
> I understand that this is added because the panel registers are
> write-only and read as zero, and it is needed for
> regmap_{clear,set}_bits to work.
>
> This information should definitely be added to the commit message.
>
> If you can add it inline here, and I'll update the commit message when
> applying the patch.
>
> Cheers,
> -Paul
>
> >  };
> >
> >  static int y030xx067a_probe(struct spi_device *spi)
> > diff --git a/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > b/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > index c558de3f99be..6de7370185cd 100644
> > --- a/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > +++ b/drivers/gpu/drm/panel/panel-innolux-ej030na.c
> > @@ -80,8 +80,6 @@ static const struct reg_sequence
> > ej030na_init_sequence[] = {
> >   { 0x47, 0x08 },
> >   { 0x48, 0x0f },
> >   { 0x49, 0x0f },
> > -
> > - { 0x2b, 0x01 },
> >  };
> >
> >  static int ej030na_prepare(struct drm_panel *panel)
> > @@ -109,8 +107,6 @@ static int ej030na_prepare(struct drm_panel
> > *panel)
> >   goto err_disable_regulator;
> >   }
> >
> > - msleep(120);
> > -
> >   return 0;
> >
> >  err_disable_regulator:
> > @@ -128,6 +124,31 @@ static int ej030na_unprepare(struct drm_panel
> > *panel)
> >   return 0;
> >  }
> >
> > +static int ej030na_enable(struct drm_panel *panel)
> > +{
> > + struct 

Re: Regression from 3c196f056666 ("drm/amdgpu: always reset the asic in suspend (v2)") on suspend?

2022-03-21 Thread Thorsten Leemhuis
On 21.03.22 13:07, Éric Valette wrote:
> My problem has never been fixed.
>
> The proposed patch has been applied to 5.15. I do not remember which version 
> 28 maybe.
> 
> I still have a RIP in pm_suspend. Did not test the last two 15 versions.
> 
> I can leave with 5.10 est using own compiled kernels.
> 
> Thanks for asking.

This thread/the debian bug report
(https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1005005 ) is getting
long which makes things hard to grasp. But to me it looks a lot like the
problem you are facing is different from the problem that others ran
into and bisected -- but I might be totally wrong there. Have you ever
tried reverting 3c196f056666 to see if it helps (sorry if that's
mentioned in the bug report somewhere, as I said, it became long)? I
guess a bisection from your side really would help a lot; but before you
go down that route you might want to give 5.17 and the latest 5.15.y
kernel a try.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)

P.S.: As the Linux kernel's regression tracker I'm getting a lot of
reports on my table. I can only look briefly into most of them and lack
knowledge about most of the areas they concern. I thus unfortunately
will sometimes get things wrong or miss something important. I hope
that's not the case here; if you think it is, don't hesitate to tell me
in a public reply, it's in everyone's interest to set the public record
straight.



Re: [PATCH v5 3/3] drm/panel : innolux-ej030na and abt-y030xx067a : add .enable and .disable

2022-03-21 Thread Paul Cercueil

Hi Christophe,

Le lun., mars 21 2022 at 14:36:51 +0100, Christophe Branchereau 
 a écrit :

Following the introduction of bridge_atomic_enable in the ingenic
drm driver, the crtc is enabled between .prepare and .enable, if
it exists.

Add it so the backlight is only enabled after the crtc is, to avoid
graphical issues.

Signed-off-by: Christophe Branchereau 


Didn't Sam ack it?


---
 drivers/gpu/drm/panel/panel-abt-y030xx067a.c  | 31 
+--
 drivers/gpu/drm/panel/panel-innolux-ej030na.c | 31 
---

 2 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c 
b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c

index f043b484055b..ddfacaeac1d4 100644
--- a/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
+++ b/drivers/gpu/drm/panel/panel-abt-y030xx067a.c
@@ -140,7 +140,7 @@ static const struct reg_sequence 
y030xx067a_init_sequence[] = {

{ 0x03, REG03_VPOSITION(0x0a) },
{ 0x04, REG04_HPOSITION1(0xd2) },
{ 0x05, REG05_CLIP | REG05_NVM_VREFRESH | REG05_SLBRCHARGE(0x2) },
-   { 0x06, REG06_XPSAVE | REG06_NT },
+   { 0x06, REG06_NT },
{ 0x07, 0 },
{ 0x08, REG08_PANEL(0x1) | REG08_CLOCK_DIV(0x2) },
{ 0x09, REG09_SUB_BRIGHT_R(0x20) },
@@ -183,8 +183,6 @@ static int y030xx067a_prepare(struct drm_panel 
*panel)

goto err_disable_regulator;
}

-   msleep(120);
-
return 0;

 err_disable_regulator:
@@ -202,6 +200,30 @@ static int y030xx067a_unprepare(struct drm_panel 
*panel)

return 0;
 }

+static int y030xx067a_enable(struct drm_panel *panel)
+{
+
+   struct y030xx067a *priv = to_y030xx067a(panel);
+
+   regmap_set_bits(priv->map, 0x06, REG06_XPSAVE);
+
+   if (panel->backlight) {
+   /* Wait for the picture to be ready before enabling backlight */
+   msleep(120);
+   }
+
+   return 0;
+}
+
+static int y030xx067a_disable(struct drm_panel *panel)
+{
+   struct y030xx067a *priv = to_y030xx067a(panel);
+
+   regmap_clear_bits(priv->map, 0x06, REG06_XPSAVE);
+
+   return 0;
+}
+
 static int y030xx067a_get_modes(struct drm_panel *panel,
struct drm_connector *connector)
 {
@@ -239,6 +261,8 @@ static int y030xx067a_get_modes(struct drm_panel 
*panel,

 static const struct drm_panel_funcs y030xx067a_funcs = {
.prepare= y030xx067a_prepare,
.unprepare  = y030xx067a_unprepare,
+   .enable = y030xx067a_enable,
+   .disable= y030xx067a_disable,
.get_modes  = y030xx067a_get_modes,
 };

@@ -246,6 +270,7 @@ static const struct regmap_config 
y030xx067a_regmap_config = {

.reg_bits = 8,
.val_bits = 8,
.max_register = 0x15,
+   .cache_type = REGCACHE_FLAT,


I understand that this is added because the panel registers are 
write-only and read as zero, and it is needed for 
regmap_{clear,set}_bits to work.


This information should definitely be added to the commit message.

If you can add it inline here, and I'll update the commit message when 
applying the patch.


Cheers,
-Paul


 };

 static int y030xx067a_probe(struct spi_device *spi)
diff --git a/drivers/gpu/drm/panel/panel-innolux-ej030na.c 
b/drivers/gpu/drm/panel/panel-innolux-ej030na.c

index c558de3f99be..6de7370185cd 100644
--- a/drivers/gpu/drm/panel/panel-innolux-ej030na.c
+++ b/drivers/gpu/drm/panel/panel-innolux-ej030na.c
@@ -80,8 +80,6 @@ static const struct reg_sequence 
ej030na_init_sequence[] = {

{ 0x47, 0x08 },
{ 0x48, 0x0f },
{ 0x49, 0x0f },
-
-   { 0x2b, 0x01 },
 };

 static int ej030na_prepare(struct drm_panel *panel)
@@ -109,8 +107,6 @@ static int ej030na_prepare(struct drm_panel 
*panel)

goto err_disable_regulator;
}

-   msleep(120);
-
return 0;

 err_disable_regulator:
@@ -128,6 +124,31 @@ static int ej030na_unprepare(struct drm_panel 
*panel)

return 0;
 }

+static int ej030na_enable(struct drm_panel *panel)
+{
+   struct ej030na *priv = to_ej030na(panel);
+
+   /* standby off */
+   regmap_write(priv->map, 0x2b, 0x01);
+
+   if (panel->backlight) {
+   /* Wait for the picture to be ready before enabling backlight */
+   msleep(120);
+   }
+
+   return 0;
+}
+
+static int ej030na_disable(struct drm_panel *panel)
+{
+   struct ej030na *priv = to_ej030na(panel);
+
+   /* standby on */
+   regmap_write(priv->map, 0x2b, 0x00);
+
+   return 0;
+}
+
 static int ej030na_get_modes(struct drm_panel *panel,
 struct drm_connector *connector)
 {
@@ -165,6 +186,8 @@ static int ej030na_get_modes(struct drm_panel 
*panel,

 static const struct drm_panel_funcs ej030na_funcs = {
.prepare= ej030na_prepare,
.unprepare  = ej030na_unprepare,
+   .enable = ej030na_enable,
+   .disable= ej030na_disable,

Re: [PATCH v2 1/2] drm: Add GPU reset sysfs event

2022-03-21 Thread Daniel Vetter
On Fri, Mar 18, 2022 at 08:12:54AM -0700, Rob Clark wrote:
> On Fri, Mar 18, 2022 at 12:42 AM Christian König
>  wrote:
> >
> > Am 17.03.22 um 18:31 schrieb Rob Clark:
> > > On Thu, Mar 17, 2022 at 10:27 AM Daniel Vetter  wrote:
> > >> [SNIP]
> > >>> (At some point, I'd like to use scheduler for the replay, and actually
> > >>> use drm_sched_stop()/etc.. but last time I looked there were still
> > >>> some sched bugs in that area which prevented me from deleting a bunch
> > >>> of code ;-))
> > >> Not sure about your hw, but at least on intel replaying tends to just
> > >> result in follow-on fun. And that holds even more so the more complex a
> > >> workload is. This is why vk just dies immediately and does not try to
> > >> replay anything, offloading it to the app. Same with arb robustness.
> > >> Afaik it's really only media and classic gl which insist that the driver
> > >> stack somehow recover.
> > > At least for us, each submit must be self-contained (ie. not rely on
> > > previous GPU hw state), so in practice replay works out pretty well.
> > > The worst case is subsequent submits from same process fail as well
> > > (if they depended on something that crashing submit failed to write
> > > back to memory.. but in that case they just crash as well and we move
> > > on to the next one.. the recent gens (a5xx+ at least) are pretty good
> > > about quickly detecting problems and giving us an error irq.
> >
> > Well I absolutely agree with Daniel.
> >
> > The whole replay thing AMD did in the scheduler is an absolute mess
> > and should probably be killed with fire.
> >
> > I strongly recommend not to do the same mistake in other drivers.
> >
> > If you want to have some replay feature then please make it driver
> > specific and don't use anything from the infrastructure in the DRM
> > scheduler.
> 
> hmm, perhaps I was not clear, but I'm only talking about re-emitting
> jobs *following* the faulting one (which could be from other contexts,
> etc).. not trying to restart the faulting job.

You absolutely can drop jobs on the floor, this is what both anv and iris
expect. They use what we call non-recoverable context, meaning when any
gpu hang happens and the context is affected (whether as the guilty one, or
because it was a multi-engine reset and it was victimized) we kill it
entirely. No replaying, and any further execbuf ioctl fails with -EIO.

Userspace then gets to sort out the mess, which for vk is
VK_ERROR_DEVICE_LOST, for robust gl it's the same, and for non-robust gl
iris re-creates a pile of things.

Anything in-between _is_ dropped on the floor completely.

Also note that this is obviously uapi, if you have a userspace which
expects contexts to survive, then replaying makes some sense.

> You *absolutely* need to replay jobs following the faulting one, they
> could be from unrelated contexts/processes.  You can't just drop them
> on the floor.
> 
> Currently it is all driver specific, but I wanted to delete a lot of
> code and move to using scheduler to handle faults/timeouts (but
> blocked on that until [1] is resolved)

Yeah for the drivers where the uapi is "you can safely replay after a
hang, and you're supposed to", then sharing the code is ofc a good idea.

Just wanted to make it clear that this is only one of many uapi flavours
you can pick from, dropping it all on the floor is a perfectly legit
approach :-) And imo it's the more robust one, and also better fits with
latest apis like gl_arb_robustness or vk.

Cheers, Daniel


> 
> [1] 
> https://patchwork.kernel.org/project/dri-devel/patch/1630457207-13107-2-git-send-email-monk@amd.com/
> 
> BR,
> -R
> 
> > Thanks,
> > Christian.
> >
> > >
> > > BR,
> > > -R
> > >
> > >> And recovering from a mess in userspace is a lot simpler than trying to
> > >> pull of the same magic in the kernel. Plus it also helps with a few of 
> > >> the
> > >> dma_fence rules, which is a nice bonus.
> > >> -Daniel
> > >>
> >

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: [PATCH 07/23] drm/vmwgfx: stop using dma_resv_excl_fence

2022-03-21 Thread Christian König

Am 21.03.22 um 15:02 schrieb Zack Rusin:

On Mon, 2022-03-21 at 14:58 +0100, Christian König wrote:

⚠ External Email: This email originated from outside of the
organization. Do not click links or open attachments unless you
recognize the sender.

Instead use the new dma_resv_get_singleton function.

Signed-off-by: Christian König 
Reviewed-by: Daniel Vetter 
Cc: VMware Graphics 
Cc: Zack Rusin 
---
  drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 6 --
  1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
index 708899ba2102..36c3b5db7e69 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
@@ -1165,8 +1165,10 @@ int vmw_resources_clean(struct
vmw_buffer_object *vbo, pgoff_t start,
     vmw_bo_fence_single(bo, NULL);
     if (bo->moving)
     dma_fence_put(bo->moving);
-   bo->moving = dma_fence_get
-   (dma_resv_excl_fence(bo->base.resv));
+
+   /* TODO: This is actually a memory management
dependency */
+   return dma_resv_get_singleton(bo->base.resv, false,
+ >moving);
     }

     return 0;
--
2.25.1


Sorry, I haven't had the time to go over the entire series, the patch
looks good, but what's the memory management dependency the todo
mentions?


Previously the function installed only the exclusive fence as moving 
fence into the BO.

Now it grabs all fences and installs them as moving fence into the BO.

But what we really need is tracking if a fence in the reservation object 
is a kernel memory management dependency or not.


Patch #19 adds that and patch #23 then finally gets rid of the whole 
bo->moving handling here because it becomes completely unnecessary.


I can drop the comment if you want or just note that it is only 
temporary until the follow up patches are merged.


Regards,
Christian.




z





Re: [PATCH 01/23] dma-buf: add dma_resv_replace_fences v2

2022-03-21 Thread Christian König
Mhm, crap, I forgot the cover letter. Anyway, this should be a well-known 
set by now.


I've polished the documentation a bit more compared to the last version 
and with this finally managed to correctly CC all the driver maintainers 
on the relevant patches.


Please review and comment.

Cheers,
Christian.

Am 21.03.22 um 14:58 schrieb Christian König:

This function allows replacing fences from the shared fence list when
we can guarantee that the operation represented by the original fence has
finished or no longer accesses the resources protected by the dma_resv
object when the new fence finishes.

Then use this function in the amdkfd code when BOs are unmapped from the
process.

v2: add an example of when this is useful.

Signed-off-by: Christian König 
---
  drivers/dma-buf/dma-resv.c| 45 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 49 +++
  include/linux/dma-resv.h  |  2 +
  3 files changed, 54 insertions(+), 42 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index b51416405e86..509060861cf3 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -289,6 +289,51 @@ void dma_resv_add_shared_fence(struct dma_resv *obj, 
struct dma_fence *fence)
  }
  EXPORT_SYMBOL(dma_resv_add_shared_fence);
  
+/**

+ * dma_resv_replace_fences - replace fences in the dma_resv obj
+ * @obj: the reservation object
+ * @context: the context of the fences to replace
+ * @replacement: the new fence to use instead
+ *
+ * Replace fences with a specified context with a new fence. Only valid if the
+ * operation represented by the original fence has no longer access to the
+ * resources represented by the dma_resv object when the new fence completes.
+ *
+ * And example for using this is replacing a preemption fence with a page table
+ * update fence which makes the resource inaccessible.
+ */
+void dma_resv_replace_fences(struct dma_resv *obj, uint64_t context,
+struct dma_fence *replacement)
+{
+   struct dma_resv_list *list;
+   struct dma_fence *old;
+   unsigned int i;
+
+   dma_resv_assert_held(obj);
+
+   write_seqcount_begin(>seq);
+
+   old = dma_resv_excl_fence(obj);
+   if (old->context == context) {
+   RCU_INIT_POINTER(obj->fence_excl, dma_fence_get(replacement));
+   dma_fence_put(old);
+   }
+
+   list = dma_resv_shared_list(obj);
+   for (i = 0; list && i < list->shared_count; ++i) {
+   old = rcu_dereference_protected(list->shared[i],
+   dma_resv_held(obj));
+   if (old->context != context)
+   continue;
+
+   rcu_assign_pointer(list->shared[i], dma_fence_get(replacement));
+   dma_fence_put(old);
+   }
+
+   write_seqcount_end(>seq);
+}
+EXPORT_SYMBOL(dma_resv_replace_fences);
+
  /**
   * dma_resv_add_excl_fence - Add an exclusive fence.
   * @obj: the reservation object
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index f9bab963a948..b6f266f612ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -253,53 +253,18 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
  static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
struct amdgpu_amdkfd_fence *ef)
  {
-   struct dma_resv *resv = bo->tbo.base.resv;
-   struct dma_resv_list *old, *new;
-   unsigned int i, j, k;
+   struct dma_fence *replacement;
  
  	if (!ef)

return -EINVAL;
  
-	old = dma_resv_shared_list(resv);

-   if (!old)
-   return 0;
-
-   new = kmalloc(struct_size(new, shared, old->shared_max), GFP_KERNEL);
-   if (!new)
-   return -ENOMEM;
-
-   /* Go through all the shared fences in the resevation object and sort
-* the interesting ones to the end of the list.
+   /* TODO: Instead of block before we should use the fence of the page
+* table update and TLB flush here directly.
 */
-   for (i = 0, j = old->shared_count, k = 0; i < old->shared_count; ++i) {
-   struct dma_fence *f;
-
-   f = rcu_dereference_protected(old->shared[i],
- dma_resv_held(resv));
-
-   if (f->context == ef->base.context)
-   RCU_INIT_POINTER(new->shared[--j], f);
-   else
-   RCU_INIT_POINTER(new->shared[k++], f);
-   }
-   new->shared_max = old->shared_max;
-   new->shared_count = k;
-
-   /* Install the new fence list, seqcount provides the barriers */
-   write_seqcount_begin(>seq);
-   RCU_INIT_POINTER(resv->fence, new);
-   write_seqcount_end(>seq);
-
-   /* Drop 

Re: [PATCH 07/23] drm/vmwgfx: stop using dma_resv_excl_fence

2022-03-21 Thread Zack Rusin
On Mon, 2022-03-21 at 14:58 +0100, Christian König wrote:
> ⚠ External Email: This email originated from outside of the
> organization. Do not click links or open attachments unless you
> recognize the sender.
> 
> Instead use the new dma_resv_get_singleton function.
> 
> Signed-off-by: Christian König 
> Reviewed-by: Daniel Vetter 
> Cc: VMware Graphics 
> Cc: Zack Rusin 
> ---
>  drivers/gpu/drm/vmwgfx/vmwgfx_resource.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> index 708899ba2102..36c3b5db7e69 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_resource.c
> @@ -1165,8 +1165,10 @@ int vmw_resources_clean(struct
> vmw_buffer_object *vbo, pgoff_t start,
>     vmw_bo_fence_single(bo, NULL);
>     if (bo->moving)
>     dma_fence_put(bo->moving);
> -   bo->moving = dma_fence_get
> -   (dma_resv_excl_fence(bo->base.resv));
> +
> +   /* TODO: This is actually a memory management
> dependency */
> +   return dma_resv_get_singleton(bo->base.resv, false,
> + >moving);
>     }
> 
>     return 0;
> --
> 2.25.1
> 

Sorry, I haven't had the time to go over the entire series, the patch
looks good, but what's the memory management dependency the todo
mentions?

z



[PATCH 5.16 20/37] drm: Dont make DRM_PANEL_BRIDGE dependent on DRM_KMS_HELPERS

2022-03-21 Thread Greg Kroah-Hartman
From: Thomas Zimmermann 

[ Upstream commit 3c3384050d68570f9de0fec9e58824decfefba7a ]

Fix a number of undefined references to drm_kms_helper.ko in
drm_dp_helper.ko:

  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_mst_duplicate_state':
  drm_dp_mst_topology.c:(.text+0x2df0): undefined reference to 
`__drm_atomic_helper_private_obj_duplicate_state'
  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_delayed_destroy_work':
  drm_dp_mst_topology.c:(.text+0x370c): undefined reference to 
`drm_kms_helper_hotplug_event'
  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_mst_up_req_work':
  drm_dp_mst_topology.c:(.text+0x7938): undefined reference to 
`drm_kms_helper_hotplug_event'
  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_mst_link_probe_work':
  drm_dp_mst_topology.c:(.text+0x82e0): undefined reference to 
`drm_kms_helper_hotplug_event'

This happens if panel-edp.ko has been configured with

  DRM_PANEL_EDP=y
  DRM_DP_HELPER=y
  DRM_KMS_HELPER=m

which builds DP helpers into the kernel and KMS helpers as a module.
Making DRM_PANEL_EDP select DRM_KMS_HELPER resolves this problem.

To avoid a resulting cyclic dependency with DRM_PANEL_BRIDGE, don't
make the latter depend on DRM_KMS_HELPER and fix the one DRM bridge
driver that doesn't already select DRM_KMS_HELPER. As KMS helpers
cannot be selected directly by the user, config symbols should avoid
depending on it anyway.

Signed-off-by: Thomas Zimmermann 
Fixes: 3755d35ee1d2 ("drm/panel: Select DRM_DP_HELPER for DRM_PANEL_EDP")
Acked-by: Sam Ravnborg 
Tested-by: Brian Masney 
Reported-by: kernel test robot 
Cc: Thomas Zimmermann 
Cc: Naresh Kamboju 
Cc: Linux Kernel Functional Testing 
Cc: Lyude Paul 
Cc: Sam Ravnborg 
Cc: Daniel Vetter 
Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: dri-devel@lists.freedesktop.org
Cc: Dave Airlie 
Cc: Thierry Reding 
Link: https://patchwork.freedesktop.org/patch/478296/
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/bridge/Kconfig | 2 +-
 drivers/gpu/drm/panel/Kconfig  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/bridge/Kconfig b/drivers/gpu/drm/bridge/Kconfig
index 431b6e12a81f..68ec45abc1fb 100644
--- a/drivers/gpu/drm/bridge/Kconfig
+++ b/drivers/gpu/drm/bridge/Kconfig
@@ -8,7 +8,6 @@ config DRM_BRIDGE
 config DRM_PANEL_BRIDGE
def_bool y
depends on DRM_BRIDGE
-   depends on DRM_KMS_HELPER
select DRM_PANEL
help
  DRM bridge wrapper of DRM panels
@@ -30,6 +29,7 @@ config DRM_CDNS_DSI
 config DRM_CHIPONE_ICN6211
tristate "Chipone ICN6211 MIPI-DSI/RGB Converter bridge"
depends on OF
+   select DRM_KMS_HELPER
select DRM_MIPI_DSI
select DRM_PANEL_BRIDGE
help
diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig
index 0d3798354e6a..42011d884202 100644
--- a/drivers/gpu/drm/panel/Kconfig
+++ b/drivers/gpu/drm/panel/Kconfig
@@ -96,6 +96,7 @@ config DRM_PANEL_EDP
select VIDEOMODE_HELPERS
select DRM_DP_AUX_BUS
select DRM_DP_HELPER
+   select DRM_KMS_HELPER
help
  DRM panel driver for dumb eDP panels that need at most a regulator and
  a GPIO to be powered up. Optionally a backlight can be attached so
-- 
2.34.1





[PATCH 5.16 17/37] drm/imx: parallel-display: Remove bus flags check in imx_pd_bridge_atomic_check()

2022-03-21 Thread Greg Kroah-Hartman
From: Christoph Niedermaier 

[ Upstream commit 6061806a863e8b65b109eb06a280041cc7525442 ]

If display timings were read from the devicetree using
of_get_display_timing() and pixelclk-active is defined
there, the flag DISPLAY_FLAGS_SYNC_POSEDGE/NEGEDGE is
automatically generated. Through the function
drm_bus_flags_from_videomode() e.g. called in the
panel-simple driver this flag got into the bus flags,
but then in imx_pd_bridge_atomic_check() the bus flag
check failed and will not initialize the display. The
original commit fe141cedc433 does not explain why this
check was introduced. So remove the bus flags check,
because it stops the initialization of the display with
valid bus flags.

Fixes: fe141cedc433 ("drm/imx: pd: Use bus format/flags provided by the bridge 
when available")
Signed-off-by: Christoph Niedermaier 
Cc: Marek Vasut 
Cc: Boris Brezillon 
Cc: Philipp Zabel 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Shawn Guo 
Cc: Sascha Hauer 
Cc: Pengutronix Kernel Team 
Cc: Fabio Estevam 
Cc: NXP Linux Team 
Cc: linux-arm-ker...@lists.infradead.org
To: dri-devel@lists.freedesktop.org
Tested-by: Max Krummenacher 
Acked-by: Boris Brezillon 
Signed-off-by: Marek Vasut 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20220201113643.4638-1-cniederma...@dh-electronics.com
Signed-off-by: Maarten Lankhorst 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/imx/parallel-display.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/imx/parallel-display.c 
b/drivers/gpu/drm/imx/parallel-display.c
index a8aba0141ce7..06cb1a59b9bc 100644
--- a/drivers/gpu/drm/imx/parallel-display.c
+++ b/drivers/gpu/drm/imx/parallel-display.c
@@ -217,14 +217,6 @@ static int imx_pd_bridge_atomic_check(struct drm_bridge 
*bridge,
if (!imx_pd_format_supported(bus_fmt))
return -EINVAL;
 
-   if (bus_flags &
-   ~(DRM_BUS_FLAG_DE_LOW | DRM_BUS_FLAG_DE_HIGH |
- DRM_BUS_FLAG_PIXDATA_DRIVE_POSEDGE |
- DRM_BUS_FLAG_PIXDATA_DRIVE_NEGEDGE)) {
-   dev_warn(imxpd->dev, "invalid bus_flags (%x)\n", bus_flags);
-   return -EINVAL;
-   }
-
bridge_state->output_bus_cfg.flags = bus_flags;
bridge_state->input_bus_cfg.flags = bus_flags;
imx_crtc_state->bus_flags = bus_flags;
-- 
2.34.1





[PATCH 5.15 18/32] drm: Dont make DRM_PANEL_BRIDGE dependent on DRM_KMS_HELPERS

2022-03-21 Thread Greg Kroah-Hartman
From: Thomas Zimmermann 

[ Upstream commit 3c3384050d68570f9de0fec9e58824decfefba7a ]

Fix a number of undefined references to drm_kms_helper.ko in
drm_dp_helper.ko:

  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_mst_duplicate_state':
  drm_dp_mst_topology.c:(.text+0x2df0): undefined reference to 
`__drm_atomic_helper_private_obj_duplicate_state'
  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_delayed_destroy_work':
  drm_dp_mst_topology.c:(.text+0x370c): undefined reference to 
`drm_kms_helper_hotplug_event'
  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_mst_up_req_work':
  drm_dp_mst_topology.c:(.text+0x7938): undefined reference to 
`drm_kms_helper_hotplug_event'
  arm-suse-linux-gnueabi-ld: drivers/gpu/drm/dp/drm_dp_mst_topology.o: in 
function `drm_dp_mst_link_probe_work':
  drm_dp_mst_topology.c:(.text+0x82e0): undefined reference to 
`drm_kms_helper_hotplug_event'

This happens if panel-edp.ko has been configured with

  DRM_PANEL_EDP=y
  DRM_DP_HELPER=y
  DRM_KMS_HELPER=m

which builds DP helpers into the kernel and KMS helpers as a module.
Making DRM_PANEL_EDP select DRM_KMS_HELPER resolves this problem.

To avoid a resulting cyclic dependency with DRM_PANEL_BRIDGE, don't
make the latter depend on DRM_KMS_HELPER and fix the one DRM bridge
driver that doesn't already select DRM_KMS_HELPER. As KMS helpers
cannot be selected directly by the user, config symbols should avoid
depending on it anyway.

Signed-off-by: Thomas Zimmermann 
Fixes: 3755d35ee1d2 ("drm/panel: Select DRM_DP_HELPER for DRM_PANEL_EDP")
Acked-by: Sam Ravnborg 
Tested-by: Brian Masney 
Reported-by: kernel test robot 
Cc: Thomas Zimmermann 
Cc: Naresh Kamboju 
Cc: Linux Kernel Functional Testing 
Cc: Lyude Paul 
Cc: Sam Ravnborg 
Cc: Daniel Vetter 
Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: dri-devel@lists.freedesktop.org
Cc: Dave Airlie 
Cc: Thierry Reding 
Link: https://patchwork.freedesktop.org/patch/478296/
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/bridge/Kconfig | 2 +-
 drivers/gpu/drm/panel/Kconfig  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/bridge/Kconfig b/drivers/gpu/drm/bridge/Kconfig
index 431b6e12a81f..68ec45abc1fb 100644
--- a/drivers/gpu/drm/bridge/Kconfig
+++ b/drivers/gpu/drm/bridge/Kconfig
@@ -8,7 +8,6 @@ config DRM_BRIDGE
 config DRM_PANEL_BRIDGE
def_bool y
depends on DRM_BRIDGE
-   depends on DRM_KMS_HELPER
select DRM_PANEL
help
  DRM bridge wrapper of DRM panels
@@ -30,6 +29,7 @@ config DRM_CDNS_DSI
 config DRM_CHIPONE_ICN6211
tristate "Chipone ICN6211 MIPI-DSI/RGB Converter bridge"
depends on OF
+   select DRM_KMS_HELPER
select DRM_MIPI_DSI
select DRM_PANEL_BRIDGE
help
diff --git a/drivers/gpu/drm/panel/Kconfig b/drivers/gpu/drm/panel/Kconfig
index f63fd0f90360..af1402d83d51 100644
--- a/drivers/gpu/drm/panel/Kconfig
+++ b/drivers/gpu/drm/panel/Kconfig
@@ -84,6 +84,7 @@ config DRM_PANEL_SIMPLE
select VIDEOMODE_HELPERS
select DRM_DP_AUX_BUS
select DRM_DP_HELPER
+   select DRM_KMS_HELPER
help
  DRM panel driver for dumb panels that need at most a regulator and
  a GPIO to be powered up. Optionally a backlight can be attached so
-- 
2.34.1





[PATCH 5.15 15/32] drm/imx: parallel-display: Remove bus flags check in imx_pd_bridge_atomic_check()

2022-03-21 Thread Greg Kroah-Hartman
From: Christoph Niedermaier 

[ Upstream commit 6061806a863e8b65b109eb06a280041cc7525442 ]

If display timings were read from the devicetree using
of_get_display_timing() and pixelclk-active is defined
there, the flag DISPLAY_FLAGS_SYNC_POSEDGE/NEGEDGE is
automatically generated. Through the function
drm_bus_flags_from_videomode() e.g. called in the
panel-simple driver this flag got into the bus flags,
but then in imx_pd_bridge_atomic_check() the bus flag
check failed and will not initialize the display. The
original commit fe141cedc433 does not explain why this
check was introduced. So remove the bus flags check,
because it stops the initialization of the display with
valid bus flags.

Fixes: fe141cedc433 ("drm/imx: pd: Use bus format/flags provided by the bridge 
when available")
Signed-off-by: Christoph Niedermaier 
Cc: Marek Vasut 
Cc: Boris Brezillon 
Cc: Philipp Zabel 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Shawn Guo 
Cc: Sascha Hauer 
Cc: Pengutronix Kernel Team 
Cc: Fabio Estevam 
Cc: NXP Linux Team 
Cc: linux-arm-ker...@lists.infradead.org
To: dri-devel@lists.freedesktop.org
Tested-by: Max Krummenacher 
Acked-by: Boris Brezillon 
Signed-off-by: Marek Vasut 
Link: 
https://patchwork.freedesktop.org/patch/msgid/20220201113643.4638-1-cniederma...@dh-electronics.com
Signed-off-by: Maarten Lankhorst 
Signed-off-by: Sasha Levin 
---
 drivers/gpu/drm/imx/parallel-display.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/imx/parallel-display.c 
b/drivers/gpu/drm/imx/parallel-display.c
index a8aba0141ce7..06cb1a59b9bc 100644
--- a/drivers/gpu/drm/imx/parallel-display.c
+++ b/drivers/gpu/drm/imx/parallel-display.c
@@ -217,14 +217,6 @@ static int imx_pd_bridge_atomic_check(struct drm_bridge 
*bridge,
if (!imx_pd_format_supported(bus_fmt))
return -EINVAL;
 
-   if (bus_flags &
-   ~(DRM_BUS_FLAG_DE_LOW | DRM_BUS_FLAG_DE_HIGH |
- DRM_BUS_FLAG_PIXDATA_DRIVE_POSEDGE |
- DRM_BUS_FLAG_PIXDATA_DRIVE_NEGEDGE)) {
-   dev_warn(imxpd->dev, "invalid bus_flags (%x)\n", bus_flags);
-   return -EINVAL;
-   }
-
bridge_state->output_bus_cfg.flags = bus_flags;
bridge_state->input_bus_cfg.flags = bus_flags;
imx_crtc_state->bus_flags = bus_flags;
-- 
2.34.1





[PATCH 22/23] drm/i915: drop bo->moving dependency

2022-03-21 Thread Christian König
That should now be handled by the common dma_resv framework.

Signed-off-by: Christian König 
Cc: intel-...@lists.freedesktop.org
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c   | 29 ++--
 drivers/gpu/drm/i915/gem/i915_gem_object.h   |  5 ++--
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c | 15 +-
 drivers/gpu/drm/i915/i915_vma.c  |  9 +-
 4 files changed, 19 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index d87b508b59b1..fd240435ffef 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -742,18 +742,19 @@ static const struct drm_gem_object_funcs 
i915_gem_object_funcs = {
 /**
  * i915_gem_object_get_moving_fence - Get the object's moving fence if any
  * @obj: The object whose moving fence to get.
+ * @fence: The resulting fence
  *
  * A non-signaled moving fence means that there is an async operation
  * pending on the object that needs to be waited on before setting up
  * any GPU- or CPU PTEs to the object's pages.
  *
- * Return: A refcounted pointer to the object's moving fence if any,
- * NULL otherwise.
+ * Return: Negative error code or 0 for success.
  */
-struct dma_fence *
-i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj)
+int i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj,
+struct dma_fence **fence)
 {
-   return dma_fence_get(i915_gem_to_ttm(obj)->moving);
+   return dma_resv_get_singleton(obj->base.resv, DMA_RESV_USAGE_KERNEL,
+ fence);
 }
 
 /**
@@ -771,23 +772,9 @@ i915_gem_object_get_moving_fence(struct 
drm_i915_gem_object *obj)
 int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
  bool intr)
 {
-   struct dma_fence *fence = i915_gem_to_ttm(obj)->moving;
-   int ret;
-
assert_object_held(obj);
-   if (!fence)
-   return 0;
-
-   ret = dma_fence_wait(fence, intr);
-   if (ret)
-   return ret;
-
-   if (fence->error)
-   return fence->error;
-
-   i915_gem_to_ttm(obj)->moving = NULL;
-   dma_fence_put(fence);
-   return 0;
+   return dma_resv_wait_timeout(obj->base. resv, DMA_RESV_USAGE_KERNEL,
+intr, MAX_SCHEDULE_TIMEOUT);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index f66d46882ea7..be57af8bfb31 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -521,9 +521,8 @@ i915_gem_object_finish_access(struct drm_i915_gem_object 
*obj)
i915_gem_object_unpin_pages(obj);
 }
 
-struct dma_fence *
-i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj);
-
+int i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj,
+struct dma_fence **fence);
 int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
  bool intr);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index e4a232e22f9d..4d5d0cd64f23 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -452,19 +452,6 @@ __i915_ttm_move(struct ttm_buffer_object *bo,
return fence;
 }
 
-static int
-prev_deps(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx,
- struct i915_deps *deps)
-{
-   int ret;
-
-   ret = i915_deps_add_dependency(deps, bo->moving, ctx);
-   if (!ret)
-   ret = i915_deps_add_resv(deps, bo->base.resv, ctx);
-
-   return ret;
-}
-
 /**
  * i915_ttm_move - The TTM move callback used by i915.
  * @bo: The buffer object.
@@ -519,7 +506,7 @@ int i915_ttm_move(struct ttm_buffer_object *bo, bool evict,
struct i915_deps deps;
 
i915_deps_init(, GFP_KERNEL | __GFP_NORETRY | 
__GFP_NOWARN);
-   ret = prev_deps(bo, ctx, );
+   ret = i915_deps_add_resv(, bo->base.resv, ctx);
if (ret) {
i915_refct_sgt_put(dst_rsgt);
return ret;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 52fd6705a518..8737159f4706 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -1247,10 +1247,17 @@ int i915_vma_pin_ww(struct i915_vma *vma, struct 
i915_gem_ww_ctx *ww,
if (err)
return err;
 
+   if (vma->obj) {
+   err = i915_gem_object_get_moving_fence(vma->obj, );
+   if (err)
+   return err;
+   } else {
+   moving = NULL;
+   }
+
if (flags & PIN_GLOBAL)
wakeref = 

[PATCH 18/23] drm/amdgpu: remove dma_resv workaround

2022-03-21 Thread Christian König
We can now add multiple writers to the dma_resv object.

Also enable the check for not adding containers in dma_resv.c again.

Signed-off-by: Christian König 
Cc: amd-...@lists.freedesktop.org
---
 drivers/dma-buf/dma-resv.c  |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 51 ++---
 3 files changed, 8 insertions(+), 50 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index 26257ba1527e..10d70812373c 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -308,10 +308,10 @@ void dma_resv_add_fence(struct dma_resv *obj, struct 
dma_fence *fence,
 
dma_resv_assert_held(obj);
 
-   /* TODO: Drivers should not add containers here, instead add each fence
-* individually. Disabled for now until we cleaned up amdgpu/ttm.
+   /* Drivers should not add containers here, instead add each fence
+* individually.
 */
-   /* WARN_ON(dma_fence_is_container(fence)); */
+   WARN_ON(dma_fence_is_container(fence));
 
fobj = dma_resv_fences_list(obj);
count = fobj->num_fences;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
index 044b41f0bfd9..529d52a204cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_bo_list.h
@@ -34,7 +34,6 @@ struct amdgpu_fpriv;
 struct amdgpu_bo_list_entry {
struct ttm_validate_buffer  tv;
struct amdgpu_bo_va *bo_va;
-   struct dma_fence_chain  *chain;
uint32_tpriority;
struct page **user_pages;
booluser_invalidated;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1c039db976a9..88009833f523 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -575,14 +575,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
 
e->bo_va = amdgpu_vm_bo_find(vm, bo);
-
-   if (bo->tbo.base.dma_buf && !amdgpu_bo_explicit_sync(bo)) {
-   e->chain = dma_fence_chain_alloc();
-   if (!e->chain) {
-   r = -ENOMEM;
-   goto error_validate;
-   }
-   }
}
 
amdgpu_cs_get_threshold_for_moves(p->adev, >bytes_moved_threshold,
@@ -633,13 +625,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
}
 
 error_validate:
-   if (r) {
-   amdgpu_bo_list_for_each_entry(e, p->bo_list) {
-   dma_fence_chain_free(e->chain);
-   e->chain = NULL;
-   }
+   if (r)
ttm_eu_backoff_reservation(>ticket, >validated);
-   }
 out:
return r;
 }
@@ -679,17 +666,9 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
 {
unsigned i;
 
-   if (error && backoff) {
-   struct amdgpu_bo_list_entry *e;
-
-   amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
-   dma_fence_chain_free(e->chain);
-   e->chain = NULL;
-   }
-
+   if (error && backoff)
ttm_eu_backoff_reservation(>ticket,
   >validated);
-   }
 
for (i = 0; i < parser->num_post_deps; i++) {
drm_syncobj_put(parser->post_deps[i].syncobj);
@@ -1264,29 +1243,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 
amdgpu_vm_move_to_lru_tail(p->adev, >vm);
 
-   amdgpu_bo_list_for_each_entry(e, p->bo_list) {
-   struct dma_resv *resv = e->tv.bo->base.resv;
-   struct dma_fence_chain *chain = e->chain;
-   struct dma_resv_iter cursor;
-   struct dma_fence *fence;
-
-   if (!chain)
-   continue;
-
-   /*
-* Work around dma_resv shortcommings by wrapping up the
-* submission in a dma_fence_chain and add it as exclusive
-* fence.
-*/
-   dma_resv_for_each_fence(, resv,
-   DMA_RESV_USAGE_WRITE,
-   fence) {
-   break;
-   }
-   dma_fence_chain_init(chain, fence, dma_fence_get(p->fence), 1);
-   dma_resv_add_fence(resv, >base, DMA_RESV_USAGE_WRITE);
-   e->chain = NULL;
-   }
+   /* Make sure all BOs are remembered as writers */
+   amdgpu_bo_list_for_each_entry(e, p->bo_list)
+   e->tv.num_shared = 0;
 
ttm_eu_fence_buffer_objects(>ticket, >validated, 

[PATCH 17/23] dma-buf: specify usage while adding fences to dma_resv obj v5

2022-03-21 Thread Christian König
Instead of distinguishing between shared and exclusive fences, specify
the fence usage while adding fences.

Rework all drivers to use this interface instead and deprecate the old one.

v2: some kerneldoc comments suggested by Daniel
v3: fix a missing case in radeon
v4: rebase on nouveau changes, fix lockdep and temporarily disable warning
v5: more documentation updates

Signed-off-by: Christian König 
---
 drivers/dma-buf/dma-resv.c| 345 --
 drivers/dma-buf/st-dma-resv.c | 101 ++---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c|   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c|   6 +-
 drivers/gpu/drm/etnaviv/etnaviv_gem_submit.c  |  10 +-
 drivers/gpu/drm/i915/gem/i915_gem_busy.c  |  13 +-
 drivers/gpu/drm/i915/gem/i915_gem_clflush.c   |   3 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |   5 +-
 drivers/gpu/drm/i915/i915_vma.c   |   6 +-
 drivers/gpu/drm/lima/lima_gem.c   |   2 +-
 drivers/gpu/drm/msm/msm_gem_submit.c  |   2 +-
 drivers/gpu/drm/nouveau/nouveau_bo.c  |   9 +-
 drivers/gpu/drm/nouveau/nouveau_fence.c   |   4 +-
 drivers/gpu/drm/qxl/qxl_release.c |   3 +-
 drivers/gpu/drm/radeon/radeon_object.c|   6 +-
 drivers/gpu/drm/ttm/ttm_bo.c  |   2 +-
 drivers/gpu/drm/ttm/ttm_bo_util.c |   5 +-
 drivers/gpu/drm/ttm/ttm_execbuf_util.c|   6 +-
 drivers/gpu/drm/v3d/v3d_gem.c |   4 +-
 drivers/gpu/drm/vc4/vc4_gem.c |   2 +-
 drivers/gpu/drm/vgem/vgem_fence.c |   9 +-
 drivers/gpu/drm/virtio/virtgpu_gem.c  |   3 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_bo.c|   3 +-
 include/linux/dma-buf.h   |  17 +-
 include/linux/dma-resv.h  |  72 ++--
 26 files changed, 276 insertions(+), 370 deletions(-)

diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c
index bb7b023c2d33..26257ba1527e 100644
--- a/drivers/dma-buf/dma-resv.c
+++ b/drivers/dma-buf/dma-resv.c
@@ -44,12 +44,12 @@
 /**
  * DOC: Reservation Object Overview
  *
- * The reservation object provides a mechanism to manage shared and
- * exclusive fences associated with a buffer.  A reservation object
- * can have attached one exclusive fence (normally associated with
- * write operations) or N shared fences (read operations).  The RCU
- * mechanism is used to protect read access to fences from locked
- * write-side updates.
+ * The reservation object provides a mechanism to manage a container of
+ * dma_fence object associated with a resource. A reservation object
+ * can have any number of fences attaches to it. Each fence carring an usage
+ * parameter determining how the operation represented by the fence is using 
the
+ * resource. The RCU mechanism is used to protect read access to fences from
+ * locked write-side updates.
  *
  * See struct dma_resv for more details.
  */
@@ -57,29 +57,74 @@
 DEFINE_WD_CLASS(reservation_ww_class);
 EXPORT_SYMBOL(reservation_ww_class);
 
+/* Mask for the lower fence pointer bits */
+#define DMA_RESV_LIST_MASK 0x3
+
 struct dma_resv_list {
struct rcu_head rcu;
-   u32 shared_count, shared_max;
-   struct dma_fence __rcu *shared[];
+   u32 num_fences, max_fences;
+   struct dma_fence __rcu *table[];
 };
 
+/**
+ * dma_resv_list_entry - extract fence and usage from a list entry
+ * @list: the list to extract and entry from
+ * @index: which entry we want
+ * @resv: optional dma_resv obj for lockdep check that the access is allowed
+ * @fence: the resulting fence
+ * @usage: the resulting usage
+ *
+ * Extract the fence and usage flags from an RCU protected entry in the list.
+ */
+static void dma_resv_list_entry(struct dma_resv_list *list, unsigned int index,
+   struct dma_resv *resv, struct dma_fence **fence,
+   enum dma_resv_usage *usage)
+{
+   long tmp;
+
+   tmp = (long)rcu_dereference_check(list->table[index],
+ resv ? dma_resv_held(resv) : true);
+   *fence = (struct dma_fence *)(tmp & ~DMA_RESV_LIST_MASK);
+   if (usage)
+   *usage = tmp & DMA_RESV_LIST_MASK;
+}
+
+/**
+ * dma_resv_list_set - set fence and usage at a specific index
+ * @list: the list to modify
+ * @index: where to make the change
+ * @fence: the fence to set
+ * @usage: the usage to set
+ *
+ * Set the fence and usage flags at the specific index in the list.
+ */
+static void dma_resv_list_set(struct dma_resv_list *list,
+ unsigned int index,
+ struct dma_fence *fence,
+ enum dma_resv_usage usage)
+{
+   long tmp = ((long)fence) | usage;
+
+   RCU_INIT_POINTER(list->table[index], (struct dma_fence *)tmp);
+}
+
 /**
  * dma_resv_list_alloc - allocate fence list
- * @shared_max: 

  1   2   >