[PATCH 2/2] drm/amdgpu: Adjust ras support check condition for special asic

2023-01-12 Thread YiPeng Chai
[Why]:
 Amdgpu ras uses amdgpu_ras_is_supported to check whether
  a ras block supports the ras function. amdgpu_ras_is_supported
  uses .ras_enabled to determine whether the ras function of the
  block is enabled.
 But for special asics with mem ecc enabled but sram ecc not
  enabled, some ras blocks support poison mode even though their ras
  function is not enabled in .ras_enabled, so these ras blocks will
  run abnormally.

[How]:
If the ras block is not enabled in .ras_enabled but the asic
  supports poison mode and the ras block has registered a ras
  configuration, the ras block can be considered to support the
  ras function.

Signed-off-by: YiPeng Chai 
Reviewed-by: Tao Zhou 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3f1e987bdf83..6e543558386d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3022,11 +3022,26 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 int amdgpu_ras_is_supported(struct amdgpu_device *adev,
unsigned int block)
 {
+   int ret = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
if (block >= AMDGPU_RAS_BLOCK_COUNT)
return 0;
-   return ras && (adev->ras_enabled & (1 << block));
+
+   ret = ras && (adev->ras_enabled & (1 << block));
+
+   /* For the special asic with mem ecc enabled but sram ecc
+* not enabled, even if the ras block is not supported on
+* .ras_enabled, if the asic supports poison mode and the
+* ras block has ras configuration, it can be considered
+* that the ras block supports ras function.
+*/
+   if (!ret &&
+   amdgpu_ras_is_poison_mode_supported(adev) &&
+   amdgpu_ras_get_ras_block(adev, block, 0))
+   ret = 1;
+
+   return ret;
 }
 
 int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
-- 
2.25.1
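For context on what this changes for consumers, here is a hedged sketch
of the typical caller pattern that amdgpu_ras_is_supported gates; the
function below is illustrative and not part of this patch. With the
change, the RAS path is also taken on mem-ecc-only parts whenever poison
mode is supported and the block has registered a RAS configuration:

	/* Illustrative caller: gate a block's RAS setup on the helper. */
	static void example_block_ras_setup(struct amdgpu_device *adev)
	{
		if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
			return; /* RAS disabled for this block: skip handlers */

		/* ... register RAS interrupt handlers, sysfs nodes, etc. */
	}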



[PATCH 1/2] drm/amdgpu: Remove unnecessary ras block support check

2023-01-12 Thread YiPeng Chai
[Why]:
   For special asics with mem ecc enabled but sram ecc
not enabled, some ras blocks can register their ras
configuration to the ras list, but these ras blocks are not
enabled in .ras_enabled, so the ras block object cannot be
obtained using amdgpu_ras_get_ras_block.

[How]:
   Remove the ras block support check. Even if the ras block
being checked is not in the ras list, amdgpu_ras_get_ras_block
will simply return a null pointer, which has no ill effect.

Signed-off-by: YiPeng Chai 
Reviewed-by: Tao Zhou 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0a95d1c1e7ab..3f1e987bdf83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -920,9 +920,6 @@ static struct amdgpu_ras_block_object *amdgpu_ras_get_ras_block(struct amdgpu_de
if (block >= AMDGPU_RAS_BLOCK__LAST)
return NULL;
 
-   if (!amdgpu_ras_is_supported(adev, block))
-   return NULL;
-
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) {
		dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
-- 
2.25.1
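The safety argument above relies on every caller checking the returned
pointer; a minimal sketch of that pattern (illustrative, mirroring the
poison-consumption handler touched elsewhere in this series):

	struct amdgpu_ras_block_object *block_obj =
		amdgpu_ras_get_ras_block(adev, obj->head.block, 0);

	if (!block_obj)
		return; /* block not registered: nothing to do */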



[PATCH 5/5] drm/amdgpu: Perform gpu reset after gfx finishes processing ras poison consumption on gfx_v11_0_3

2023-01-12 Thread YiPeng Chai
Perform gpu reset after gfx finishes processing
ras poison consumption on gfx_v11_0_3.

V2:
 Move the gfx poison consumption handler from hw_ops to the ip
 function level.

V3:
 Adjust the calling position of amdgpu_gfx_poison_consumption_handler.

V4:
   Since gfx v11_0_3 does not have a .hw_ops instance, the .hw_ops null
 pointer checks in amdgpu_ras_interrupt_poison_consumption_handler
 need to be adjusted.

Signed-off-by: YiPeng Chai 
Reviewed-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  |  9 +++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++++---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 13 +++++++++++++
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 09c42c00e43c..caf7fd3adcbd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -731,6 +731,15 @@ int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
return 0;
 }
 
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+   struct amdgpu_iv_entry *entry)
+{
+   if (adev->gfx.ras && adev->gfx.ras->poison_consumption_handler)
+   return adev->gfx.ras->poison_consumption_handler(adev, entry);
+
+   return 0;
+}
+
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 0b39fe3cd624..86ec9d0d12c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -213,6 +213,8 @@ struct amdgpu_gfx_ras {
int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
+   int (*poison_consumption_handler)(struct amdgpu_device *adev,
+   struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_funcs {
@@ -437,4 +439,6 @@ int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t ucode_id);
 
 int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
+int amdgpu_gfx_poison_consumption_handler(struct amdgpu_device *adev,
+   struct amdgpu_iv_entry *entry);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d06beb884a16..0a95d1c1e7ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1620,14 +1620,14 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
struct amdgpu_ras_block_object *block_obj =
amdgpu_ras_get_ras_block(adev, obj->head.block, 0);
 
-   if (!block_obj || !block_obj->hw_ops)
+   if (!block_obj)
return;
 
/* both query_poison_status and handle_poison_consumption are optional,
 * but at least one of them should be implemented if we need poison
 * consumption handler
 */
-   if (block_obj->hw_ops->query_poison_status) {
+   if (block_obj->hw_ops && block_obj->hw_ops->query_poison_status) {
poison_stat = block_obj->hw_ops->query_poison_status(adev);
if (!poison_stat) {
			/* Not poison consumption interrupt, no need to handle it */
@@ -1641,7 +1641,7 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
if (!adev->gmc.xgmi.connected_to_cpu)
amdgpu_umc_poison_handler(adev, false);
 
-   if (block_obj->hw_ops->handle_poison_consumption)
+   if (block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption)
		poison_stat = block_obj->hw_ops->handle_poison_consumption(adev);
 
/* gpu reset is fallback for failed and default cases */
@@ -1649,6 +1649,8 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager *
		dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n",
block_obj->ras_comm.name);
amdgpu_ras_reset_gpu(adev);
+   } else {
+   amdgpu_gfx_poison_consumption_handler(adev, entry);
}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index a18e09de31dd..b07a72ca25d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -70,6 +70,19 @@ static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
return 0;
 }
 
+static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
+  
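The archived message is cut off here. Going by the commit description
(issue a GPU reset once gfx has finished processing the RAS poison
consumption) and the RLC FED interrupt wired up in patch 3/5, a minimal
sketch of what such a handler could look like follows; the body is an
assumption for illustration, not the truncated patch content:

	static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev,
						struct amdgpu_iv_entry *entry)
	{
		/* Assumption: request a GPU reset for the RLC FED interrupt
		 * once the gfx block has consumed the poisoned data.
		 */
		if (entry && entry->client_id == SOC21_IH_CLIENTID_GFX &&
		    entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT)
			amdgpu_ras_reset_gpu(adev);

		return 0;
	}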

[PATCH 4/5] drm/amdgpu: Add gfx cp ecc error irq handling on gfx v11_0_3

2023-01-12 Thread YiPeng Chai
V2:
  Optimize gfx_v11_0_set_cp_ecc_error_state function.

V3:
  Define macro constant for me pipe instance address interval.

V5:
  Register and handle gfx cp ecc error irq on gfx v11_0_3.

V6:
  Remove invalid intermediate function call.

Signed-off-by: YiPeng Chai 
Reviewed-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 47 ++
 1 file changed, 47 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index cc634cae77d7..39f48227bcf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1338,6 +1338,13 @@ static int gfx_v11_0_sw_init(void *handle)
if (r)
return r;
 
+   /* ECC error */
+   r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GRBM_CP,
+ GFX_11_0_0__SRCID__CP_ECC_ERROR,
+ &adev->gfx.cp_ecc_error_irq);
+   if (r)
+   return r;
+
/* FED error */
r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
  GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
@@ -4434,6 +4441,7 @@ static int gfx_v11_0_hw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int r;
 
+   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 
@@ -5865,6 +5873,36 @@ static void gfx_v11_0_set_compute_eop_interrupt_state(struct amdgpu_device *adev
}
 }
 
+#define CP_ME1_PIPE_INST_ADDR_INTERVAL  0x1
+#define SET_ECC_ME_PIPE_STATE(reg_addr, state) \
+   do { \
+   uint32_t tmp = RREG32_SOC15_IP(GC, reg_addr); \
+   tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE0_INT_CNTL, CP_ECC_ERROR_INT_ENABLE, state); \
+   WREG32_SOC15_IP(GC, reg_addr, tmp); \
+   } while (0)
+
+static int gfx_v11_0_set_cp_ecc_error_state(struct amdgpu_device *adev,
+   struct amdgpu_irq_src *source,
+   unsigned type,
+   enum amdgpu_interrupt_state state)
+{
+   uint32_t ecc_irq_state = 0;
+   uint32_t pipe0_int_cntl_addr = 0;
+   int i = 0;
+
+   ecc_irq_state = (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0;
+
+   pipe0_int_cntl_addr = SOC15_REG_OFFSET(GC, 0, regCP_ME1_PIPE0_INT_CNTL);
+
+   WREG32_FIELD15_PREREG(GC, 0, CP_INT_CNTL_RING0, CP_ECC_ERROR_INT_ENABLE, ecc_irq_state);
+
+   for (i = 0; i < adev->gfx.mec.num_pipe_per_mec; i++)
+   SET_ECC_ME_PIPE_STATE(pipe0_int_cntl_addr + i * CP_ME1_PIPE_INST_ADDR_INTERVAL,
+   ecc_irq_state);
+
+   return 0;
+}
+
 static int gfx_v11_0_set_eop_interrupt_state(struct amdgpu_device *adev,
struct amdgpu_irq_src *src,
unsigned type,
@@ -6281,6 +6319,11 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
.process = gfx_v11_0_priv_inst_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v11_0_cp_ecc_error_irq_funcs = {
+   .set = gfx_v11_0_set_cp_ecc_error_state,
+   .process = amdgpu_gfx_cp_ecc_error_irq,
+};
+
 static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
.process = gfx_v11_0_rlc_gc_fed_irq,
 };
@@ -6296,8 +6339,12 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
adev->gfx.priv_inst_irq.num_types = 1;
adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
 
+   adev->gfx.cp_ecc_error_irq.num_types = 1; /* CP ECC error */
+   adev->gfx.cp_ecc_error_irq.funcs = &gfx_v11_0_cp_ecc_error_irq_funcs;
+
adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;
+
 }
 
 static void gfx_v11_0_set_imu_funcs(struct amdgpu_device *adev)
-- 
2.25.1
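A note on the per-pipe register arithmetic in
gfx_v11_0_set_cp_ecc_error_state above: the ME1 pipe interrupt-control
registers are laid out at a fixed stride, so pipe N's register is the
pipe-0 offset plus N times CP_ME1_PIPE_INST_ADDR_INTERVAL. The same loop
with the macro expanded, as an illustrative fragment only (names taken
from the patch, ecc_irq_state as in the surrounding function):

	uint32_t ecc_irq_state = (state == AMDGPU_IRQ_STATE_ENABLE) ? 1 : 0;
	uint32_t pipe0 = SOC15_REG_OFFSET(GC, 0, regCP_ME1_PIPE0_INT_CNTL);
	int pipe;

	for (pipe = 0; pipe < adev->gfx.mec.num_pipe_per_mec; pipe++) {
		/* pipe N's INT_CNTL = pipe-0 offset + N * 0x1 stride */
		uint32_t reg = pipe0 + pipe * CP_ME1_PIPE_INST_ADDR_INTERVAL;
		uint32_t tmp = RREG32_SOC15_IP(GC, reg);

		tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE0_INT_CNTL,
				    CP_ECC_ERROR_INT_ENABLE, ecc_irq_state);
		WREG32_SOC15_IP(GC, reg, tmp);
	}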



[PATCH 3/5] drm/amdgpu: Add gfx ras poison consumption irq handling on gfx v11_0_3

2023-01-12 Thread YiPeng Chai
Add gfx ras poison consumption irq handling on gfx v11_0_3.

V2:
  Move ras poison consumption irq handling code of gfx
 v11_0_3 to gfx_v11_0_3.c.
V5:
  Create dedicated irq handler for RLC_GC_FED_INTERRUPT.

V6:
  Remove invalid function call.

Signed-off-by: YiPeng Chai 
Reviewed-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h   |  4 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c| 24 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c  | 50 ++-
 .../include/ivsrcid/gfx/irqsrcs_gfx_11_0_0.h  |  2 +
 4 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 6b26597217ed..0b39fe3cd624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -210,6 +210,9 @@ struct amdgpu_gfx_ras {
struct amdgpu_ras_block_object  ras_block;
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
bool (*query_utcl2_poison_status)(struct amdgpu_device *adev);
+   int (*rlc_gc_fed_irq)(struct amdgpu_device *adev,
+   struct amdgpu_irq_src *source,
+   struct amdgpu_iv_entry *entry);
 };
 
 struct amdgpu_gfx_funcs {
@@ -323,6 +326,7 @@ struct amdgpu_gfx {
struct amdgpu_irq_src   priv_inst_irq;
struct amdgpu_irq_src   cp_ecc_error_irq;
struct amdgpu_irq_src   sq_irq;
+   struct amdgpu_irq_src   rlc_gc_fed_irq;
struct sq_work  sq_work;
 
/* gfx status */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 82beb46788cf..cc634cae77d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -1338,6 +1338,13 @@ static int gfx_v11_0_sw_init(void *handle)
if (r)
return r;
 
+   /* FED error */
+   r = amdgpu_irq_add_id(adev, SOC21_IH_CLIENTID_GFX,
+ GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT,
+ &adev->gfx.rlc_gc_fed_irq);
+   if (r)
+   return r;
+
adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;
 
if (adev->gfx.imu.funcs) {
@@ -6034,6 +6041,16 @@ static int gfx_v11_0_priv_inst_irq(struct amdgpu_device *adev,
return 0;
 }
 
+static int gfx_v11_0_rlc_gc_fed_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+   if (adev->gfx.ras && adev->gfx.ras->rlc_gc_fed_irq)
+   return adev->gfx.ras->rlc_gc_fed_irq(adev, source, entry);
+
+   return 0;
+}
+
 #if 0
 static int gfx_v11_0_kiq_set_interrupt_state(struct amdgpu_device *adev,
 struct amdgpu_irq_src *src,
@@ -6264,6 +6281,10 @@ static const struct amdgpu_irq_src_funcs gfx_v11_0_priv_inst_irq_funcs = {
.process = gfx_v11_0_priv_inst_irq,
 };
 
+static const struct amdgpu_irq_src_funcs gfx_v11_0_rlc_gc_fed_irq_funcs = {
+   .process = gfx_v11_0_rlc_gc_fed_irq,
+};
+
 static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
 {
adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
@@ -6274,6 +6295,9 @@ static void gfx_v11_0_set_irq_funcs(struct amdgpu_device *adev)
 
adev->gfx.priv_inst_irq.num_types = 1;
adev->gfx.priv_inst_irq.funcs = &gfx_v11_0_priv_inst_irq_funcs;
+
+   adev->gfx.rlc_gc_fed_irq.num_types = 1; /* 0x80 FED error */
+   adev->gfx.rlc_gc_fed_irq.funcs = &gfx_v11_0_rlc_gc_fed_irq_funcs;
 }
 
 static void gfx_v11_0_set_imu_funcs(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 5966d984a30a..a18e09de31dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -22,6 +22,54 @@
  */
 
 #include "amdgpu.h"
+#include "soc21.h"
+#include "gc/gc_11_0_3_offset.h"
+#include "gc/gc_11_0_3_sh_mask.h"
+#include "ivsrcid/gfx/irqsrcs_gfx_11_0_0.h"
+#include "soc15.h"
+#include "soc15d.h"
+#include "gfx_v11_0.h"
 
 
-struct amdgpu_gfx_ras gfx_v11_0_3_ras;
+static int gfx_v11_0_3_rlc_gc_fed_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+   uint32_t rlc_status0 = 0, rlc_status1 = 0;
+   struct ras_common_if *ras_if = NULL;
+   struct ras_dispatch_if ih_data = {
+   .entry = entry,
+   };
+
+   rlc_status0 = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_RLCS_FED_STATUS_0));
+   rlc_status1 = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_RLCS_FED_STATUS_1));
+
+   if (!rlc_status0 && !rlc_status1) {
+   dev_warn(adev->dev, "RLC_GC_FED irq is generated, but rlc_status0 and rlc_status1 are
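The archive truncates the message here. A hedged sketch of how such a
handler plausibly finishes, given the ras_if/ih_data locals declared
above — warn and return when no FED status bit is set, otherwise forward
the IV entry through the common RAS dispatcher; the exact body is an
assumption, not recovered text:

		return 0;
	}

	/* assumed continuation: dispatch through the registered gfx ras_if */
	ras_if = adev->gfx.ras_if;
	if (ras_if) {
		ih_data.head = *ras_if;
		amdgpu_ras_interrupt_dispatch(adev, &ih_data);
	}

	return 0;
}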

[PATCH 2/5] amd/amdgpu: Add RLC_RLCS_FED_STATUS_* to gc v11_0_3 ip headers

2023-01-12 Thread YiPeng Chai
V2:
   Add RLC_RLCS_FED_STATUS_0 and RLC_RLCS_FED_STATUS_1 register
   offset and shift masks.

Signed-off-by: YiPeng Chai 
Reviewed-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
Reviewed-by: Alex Deucher 
---
 .../include/asic_reg/gc/gc_11_0_3_offset.h|  8 +++
 .../include/asic_reg/gc/gc_11_0_3_sh_mask.h   | 50 +++
 2 files changed, 58 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
index 3b95a59b196c..56e00252bff8 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_offset.h
@@ -3593,6 +3593,14 @@
 #define regGCL2TLB_PERFCOUNTER_RSLT_CNTL_BASE_IDX  
 1
 
 
+// addressBlock: gc_rlcsdec
+// base address: 0x3b980
+#define regRLC_RLCS_FED_STATUS_0   
 0x4eff
+#define regRLC_RLCS_FED_STATUS_0_BASE_IDX  
 1
+#define regRLC_RLCS_FED_STATUS_1   
 0x4f00
+#define regRLC_RLCS_FED_STATUS_1_BASE_IDX  
 1
+
+
 // addressBlock: gc_gcvml2pspdec
 // base address: 0x3f900
 #define regGCUTCL2_TRANSLATION_BYPASS_BY_VMID  
 0x5e41
diff --git a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
index ae3ef8a9e702..658e88a8e2ac 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/gc/gc_11_0_3_sh_mask.h
@@ -37642,6 +37642,56 @@
 #define RLC_RLCG_DOORBELL_RANGE__LOWER_ADDR_MASK   
   0x0FFCL
 #define RLC_RLCG_DOORBELL_RANGE__UPPER_ADDR_RESERVED_MASK  
   0x0003L
 #define RLC_RLCG_DOORBELL_RANGE__UPPER_ADDR_MASK   
   0x0FFCL
+//RLC_RLCS_FED_STATUS_0
+#define RLC_RLCS_FED_STATUS_0__RLC_FED_ERR__SHIFT  
   0x0
+#define RLC_RLCS_FED_STATUS_0__UTCL2_FED_ERR__SHIFT
   0x1
+#define RLC_RLCS_FED_STATUS_0__GE_FED_ERR__SHIFT   
   0x2
+#define RLC_RLCS_FED_STATUS_0__CPC_FED_ERR__SHIFT  
   0x3
+#define RLC_RLCS_FED_STATUS_0__CPF_FED_ERR__SHIFT  
   0x4
+#define RLC_RLCS_FED_STATUS_0__CPG_FED_ERR__SHIFT  
   0x5
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR__SHIFT
   0x6
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR__SHIFT
   0x7
+#define RLC_RLCS_FED_STATUS_0__RLC_FED_ERR_MASK
   0x0001L
+#define RLC_RLCS_FED_STATUS_0__UTCL2_FED_ERR_MASK  
   0x0002L
+#define RLC_RLCS_FED_STATUS_0__GE_FED_ERR_MASK 
   0x0004L
+#define RLC_RLCS_FED_STATUS_0__CPC_FED_ERR_MASK
   0x0008L
+#define RLC_RLCS_FED_STATUS_0__CPF_FED_ERR_MASK
   0x0010L
+#define RLC_RLCS_FED_STATUS_0__CPG_FED_ERR_MASK
   0x0020L
+#define RLC_RLCS_FED_STATUS_0__SDMA0_FED_ERR_MASK  
   0x0040L
+#define RLC_RLCS_FED_STATUS_0__SDMA1_FED_ERR_MASK  
   0x0080L
+//RLC_RLCS_FED_STATUS_1
+#define RLC_RLCS_FED_STATUS_1__GL2C0_FED_ERR__SHIFT
   0x0
+#define RLC_RLCS_FED_STATUS_1__GL2C1_FED_ERR__SHIFT
   0x1
+#define RLC_RLCS_FED_STATUS_1__GL2C2_FED_ERR__SHIFT
   0x2
+#define RLC_RLCS_FED_STATUS_1__GL2C3_FED_ERR__SHIFT
   0x3
+#define RLC_RLCS_FED_STATUS_1__GL2C4_FED_ERR__SHIFT
   0x4
+#define RLC_RLCS_FED_STATUS_1__GL2C5_FED_ERR__SHIFT
   0x5
+#define RLC_RLCS_FED_STATUS_1__GL2C6_FED_ERR__SHIFT
   0x6
+#define RLC_RLCS_FED_STATUS_

[PATCH 1/5] drm/amdgpu: Add gfx ras function on gfx v11_0_3

2023-01-12 Thread YiPeng Chai
Add gfx ras function on gfx v11_0_3.

V2:
 1. Add separate source files for gfx v11_0_3.
 2. Create a common function to initialize gfx ras block.

V3:
 1. Rename amdgpu_gfx_ras_block_init to amdgpu_gfx_ras_sw_init.
 2. Adjust the calling position of amdgpu_gfx_ras_sw_init.
 3. Remove gfx_v11_0_3_ras_ops.

V4:
 Revert changes in amdgpu_ras_interrupt_poison_consumption_handler.

V5:
 1. Remove invalid include file in gfx_v11_0_3.c.
 2. Reduce the number of parameters of amdgpu_gfx_ras_sw_init.

Signed-off-by: YiPeng Chai 
Reviewed-by: Hawking Zhang 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/Makefile  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 35 
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   | 13 +
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 27 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h | 29 
 6 files changed, 106 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 332cf8bda7a2..5df603192cdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -137,6 +137,7 @@ amdgpu-y += \
gfx_v10_0.o \
imu_v11_0.o \
gfx_v11_0.o \
+   gfx_v11_0_3.o \
imu_v11_0_3.o
 
 # add async DMA block
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 42a939cd2eac..09c42c00e43c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -696,6 +696,41 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
return r;
 }
 
+int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev)
+{
+   int err = 0;
+   struct amdgpu_gfx_ras *ras = NULL;
+
+   /* adev->gfx.ras is NULL, which means gfx does not
+* support ras function, then do nothing here.
+*/
+   if (!adev->gfx.ras)
+   return 0;
+
+   ras = adev->gfx.ras;
+
+   err = amdgpu_ras_register_ras_block(adev, &ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register gfx ras block!\n");
+   return err;
+   }
+
+   strcpy(ras->ras_block.ras_comm.name, "gfx");
+   ras->ras_block.ras_comm.block = AMDGPU_RAS_BLOCK__GFX;
+   ras->ras_block.ras_comm.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
+   adev->gfx.ras_if = &ras->ras_block.ras_comm;
+
+   /* If no special ras_late_init function is defined, use the gfx default ras_late_init */
+   if (!ras->ras_block.ras_late_init)
+   ras->ras_block.ras_late_init = amdgpu_ras_block_late_init;
+
+   /* If not defined special ras_cb function, use default ras_cb */
+   if (!ras->ras_block.ras_cb)
+   ras->ras_block.ras_cb = amdgpu_gfx_process_ras_data_cb;
+
+   return 0;
+}
+
 int amdgpu_gfx_process_ras_data_cb(struct amdgpu_device *adev,
void *err_data,
struct amdgpu_iv_entry *entry)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index b3df4787877e..6b26597217ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -432,4 +432,5 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v);
 int amdgpu_gfx_get_num_kcq(struct amdgpu_device *adev);
 void amdgpu_gfx_cp_init_microcode(struct amdgpu_device *adev, uint32_t ucode_id);
 
+int amdgpu_gfx_ras_sw_init(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 259ebf0356db..82beb46788cf 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -46,6 +46,7 @@
 #include "clearstate_gfx11.h"
 #include "v11_structs.h"
 #include "gfx_v11_0.h"
+#include "gfx_v11_0_3.h"
 #include "nbio_v4_3.h"
 #include "mes_v11_0.h"
 
@@ -852,7 +853,14 @@ static int gfx_v11_0_gpu_early_init(struct amdgpu_device *adev)
switch (adev->ip_versions[GC_HWIP][0]) {
case IP_VERSION(11, 0, 0):
case IP_VERSION(11, 0, 2):
+   adev->gfx.config.max_hw_contexts = 8;
+   adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
+   adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
+   adev->gfx.config.sc_hiz_tile_fifo_size = 0;
+   adev->gfx.config.sc_earlyz_tile_fifo_size = 0x4C0;
+   break;
case IP_VERSION(11, 0, 3):
+   adev->gfx.ras = &gfx_v11_0_3_ras;
adev->gfx.config.max_hw_contexts = 8;
adev->gfx.config.sc_prim_fifo_size_frontend = 0x20;
adev->gfx.config.sc_prim_fifo_size_backend = 0x100;
@@ -1422,6 +1430,11 @@ static int gfx_v11_0_sw_init(void *h
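The hunk is truncated here. Per the V3 note above ("Adjust the calling
position of amdgpu_gfx_ras_sw_init"), the addition to gfx_v11_0_sw_init
is presumably a call to the common initializer added earlier in this
patch; a sketch under that assumption:

	r = amdgpu_gfx_ras_sw_init(adev);
	if (r) {
		dev_err(adev->dev, "Failed to initialize gfx ras block!\n");
		return r;
	}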

Re: Wrong revert commit in stable channel

2023-01-12 Thread Yury Zhuravlev
Yes, this is right in 6.2-rc3

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit
/drivers/gpu/drm/amd/pm/powerplay/hwmgr?h=v6.2-rc3&id=f936f535fa70f35ce3369b1418ebae0e657cda6a


But somebody reverted it again for the stable stream:

https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=v6.1.4&id=9ccd11718d76b95c69aa773f2abedef560776037

On Wed, Jan 11, 2023 at 5:35 PM Chen, Guchun  wrote:

> Hi Yury,
>
>
>
> My understanding is though that’s a revert of your original patch, we did
> a revert again on top of the reverted patch later on. Can you please sync
> to below commit to check again? Or do I understand wrong?
>
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/gpu/drm/amd/pm/powerplay/hwmgr?h=v6.2-rc3&id=f936f535fa70f35ce3369b1418ebae0e657cda6a
>
>
>
> Regards,
>
> Guchun
>
>
>
> *From:* amd-gfx  *On Behalf Of *Yury
> Zhuravlev
> *Sent:* Wednesday, January 11, 2023 4:26 PM
> *To:* amd-gfx@lists.freedesktop.org
> *Subject:* Wrong revert commit in stable channel
>
>
>
> Hello,
>
>
>
> Something went wrong, and we committed what we decided not to commit.
>
>
>
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.2-rc3&id=e5b781c56d46c44c52caa915f1b65064f2f7c1ba
> 
>
>
>
> and
>
>
>
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v6.2-rc3&id=4545ae2ed3f2f7c3f615a53399c9c8460ee5bca7
> 
>
>
>
> These are wrong reverts because the issue was originally with a test case,
> not with the patch itself.
>
> My GPU is not working correctly again after such "stable" patch.
>


[PATCH 2/2] drm/ttm: Use debugfs_remove_recursive to remove ttm directory

2023-01-12 Thread Ma Jun
Use debugfs_remove_recursive to remove the /sys/kernel/debug/ttm
directory for better compatibility, because debugfs_remove does not
remove non-empty directories on older kernels.

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/ttm/ttm_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 967bc2244df3..590297123bb2 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -55,7 +55,7 @@ static void ttm_global_release(void)
goto out;
 
ttm_pool_mgr_fini();
-   debugfs_remove(ttm_debugfs_root);
+   debugfs_remove_recursive(ttm_debugfs_root);
 
__free_page(glob->dummy_read_page);
memset(glob, 0, sizeof(*glob));
-- 
2.25.1



[PATCH 1/2] drm/ttm: Check ttm_debugfs_root before creating files under it

2023-01-12 Thread Ma Jun
Check ttm_debugfs_root before creating files under it.
If ttm_debugfs_root is NULL, all the files created for
ttm/ will be placed in /sys/kernel/debug/ instead of
/sys/kernel/debug/ttm/.

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/ttm/ttm_device.c |  3 ++-
 drivers/gpu/drm/ttm/ttm_pool.c   | 10 ++
 drivers/gpu/drm/ttm/ttm_tt.c |  5 +++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index e7147e304637..967bc2244df3 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -105,7 +105,8 @@ static int ttm_global_init(void)
INIT_LIST_HEAD(&glob->device_list);
atomic_set(&glob->bo_count, 0);
 
-   debugfs_create_atomic_t("buffer_objects", 0444, ttm_debugfs_root,
+   if (ttm_debugfs_root)
+   debugfs_create_atomic_t("buffer_objects", 0444, ttm_debugfs_root,
+   &glob->bo_count);
 out:
if (ret && ttm_debugfs_root)
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 21b61631f73a..d95a65f759df 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -713,10 +713,12 @@ int ttm_pool_mgr_init(unsigned long num_pages)
}
 
 #ifdef CONFIG_DEBUG_FS
-   debugfs_create_file("page_pool", 0444, ttm_debugfs_root, NULL,
-   &ttm_pool_debugfs_globals_fops);
-   debugfs_create_file("page_pool_shrink", 0400, ttm_debugfs_root, NULL,
-   &ttm_pool_debugfs_shrink_fops);
+   if (ttm_debugfs_root) {
+   debugfs_create_file("page_pool", 0444, ttm_debugfs_root, NULL,
+   &ttm_pool_debugfs_globals_fops);
+   debugfs_create_file("page_pool_shrink", 0400, ttm_debugfs_root, NULL,
+   &ttm_pool_debugfs_shrink_fops);
+   }
 #endif
 
mm_shrinker.count_objects = ttm_pool_shrinker_count;
diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c
index d505603930a7..fec443494ef0 100644
--- a/drivers/gpu/drm/ttm/ttm_tt.c
+++ b/drivers/gpu/drm/ttm/ttm_tt.c
@@ -394,8 +394,9 @@ DEFINE_SHOW_ATTRIBUTE(ttm_tt_debugfs_shrink);
 void ttm_tt_mgr_init(unsigned long num_pages, unsigned long num_dma32_pages)
 {
 #ifdef CONFIG_DEBUG_FS
-   debugfs_create_file("tt_shrink", 0400, ttm_debugfs_root, NULL,
-   &ttm_tt_debugfs_shrink_fops);
+   if (ttm_debugfs_root)
+   debugfs_create_file("tt_shrink", 0400, ttm_debugfs_root, NULL,
+   &ttm_tt_debugfs_shrink_fops);
 #endif
 
if (!ttm_pages_limit)
-- 
2.25.1
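The rationale hinges on debugfs_create_dir's failure convention: on the
older kernels this targets it can return NULL, and passing a NULL parent
to debugfs_create_file silently places the file at the top of
/sys/kernel/debug. A minimal sketch of the guarded pattern (illustrative,
not the patch itself):

	struct dentry *root = debugfs_create_dir("ttm", NULL);

	/* only create children once the parent is known to exist */
	if (!IS_ERR_OR_NULL(root))
		debugfs_create_file("page_pool", 0444, root, NULL,
				    &ttm_pool_debugfs_globals_fops);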



RE: [PATCH] drm/amd/pm: Support RAS fatal error mode1 reset on smu v13_0_0 and v13_0_10

2023-01-12 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: amd-gfx  On Behalf Of Candice Li
Sent: Friday, January 13, 2023 10:40
To: amd-gfx@lists.freedesktop.org
Cc: Li, Candice 
Subject: [PATCH] drm/amd/pm: Support RAS fatal error mode1 reset on smu v13_0_0 
and v13_0_10

Support RAS fatal error mode1 reset on smu v13_0_0 and v13_0_10.

Signed-off-by: Candice Li 
Reviewed-by: Evan Quan 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  | 42 +--
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c|  6 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  3 ++
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 969e5f96554015..d0cdc578344d8d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -1904,15 +1904,51 @@ static int smu_v13_0_0_set_df_cstate(struct smu_context *smu,
   NULL);
 }

+static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu,
+   uint32_t supported_version,
+   uint32_t *param)
+{
+   uint32_t smu_version;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   smu_cmn_get_smc_version(smu, NULL, &smu_version);
+
+   if ((smu_version >= supported_version) &&
+   ras && atomic_read(&ras->in_recovery))
+   /* Set RAS fatal error reset flag */
+   *param = 1 << 16;
+   else
+   *param = 0;
+}
+
 static int smu_v13_0_0_mode1_reset(struct smu_context *smu)  {
int ret;
+   uint32_t param;
struct amdgpu_device *adev = smu->adev;

-   if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 10))
-   ret = smu_cmn_send_debug_smc_msg(smu, DEBUGSMC_MSG_Mode1Reset);
-   else
+   switch (adev->ip_versions[MP1_HWIP][0]) {
+   case IP_VERSION(13, 0, 0):
+   /* SMU 13_0_0 PMFW supports RAS fatal error reset from 78.77 */
+   smu_v13_0_0_set_mode1_reset_param(smu, 0x004e4d00, &param);
+
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_Mode1Reset, param, NULL);
+   break;
+
+   case IP_VERSION(13, 0, 10):
+   /* SMU 13_0_10 PMFW supports RAS fatal error reset from 80.28 */
+   smu_v13_0_0_set_mode1_reset_param(smu, 0x00501c00, &param);
+
+   ret = smu_cmn_send_debug_smc_msg_with_param(smu,
+   DEBUGSMC_MSG_Mode1Reset, param);
+   break;
+
+   default:
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
+   break;
+   }

if (!ret)
msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 768b6e7dbd7719..d5abafc5a68201 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -404,6 +404,12 @@ int smu_cmn_send_debug_smc_msg(struct smu_context *smu,
 	return __smu_cmn_send_debug_msg(smu, msg, 0);
 }

+int smu_cmn_send_debug_smc_msg_with_param(struct smu_context *smu,
+uint32_t msg, uint32_t param)
+{
+   return __smu_cmn_send_debug_msg(smu, msg, param);
+}
+
 int smu_cmn_to_asic_specific_index(struct smu_context *smu,
   enum smu_cmn2asic_mapping_type type,
   uint32_t index)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index f82cf76dd3a474..d7cd358a53bdcd 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -45,6 +45,9 @@ int smu_cmn_send_smc_msg(struct smu_context *smu,
 int smu_cmn_send_debug_smc_msg(struct smu_context *smu,
 uint32_t msg);

+int smu_cmn_send_debug_smc_msg_with_param(struct smu_context *smu,
+uint32_t msg, uint32_t param);
+
 int smu_cmn_wait_for_response(struct smu_context *smu);

 int smu_cmn_to_asic_specific_index(struct smu_context *smu,
--
2.17.1



[PATCH] drm/amd/pm: Support RAS fatal error mode1 reset on smu v13_0_0 and v13_0_10

2023-01-12 Thread Candice Li
Support RAS fatal error mode1 reset on smu v13_0_0 and v13_0_10.

Signed-off-by: Candice Li 
Reviewed-by: Evan Quan 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  | 42 +--
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c|  6 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  3 ++
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 969e5f96554015..d0cdc578344d8d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -1904,15 +1904,51 @@ static int smu_v13_0_0_set_df_cstate(struct smu_context *smu,
   NULL);
 }
 
+static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu,
+   uint32_t supported_version,
+   uint32_t *param)
+{
+   uint32_t smu_version;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   smu_cmn_get_smc_version(smu, NULL, &smu_version);
+
+   if ((smu_version >= supported_version) &&
+   ras && atomic_read(&ras->in_recovery))
+   /* Set RAS fatal error reset flag */
+   *param = 1 << 16;
+   else
+   *param = 0;
+}
+
 static int smu_v13_0_0_mode1_reset(struct smu_context *smu)
 {
int ret;
+   uint32_t param;
struct amdgpu_device *adev = smu->adev;
 
-   if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 10))
-   ret = smu_cmn_send_debug_smc_msg(smu, DEBUGSMC_MSG_Mode1Reset);
-   else
+   switch (adev->ip_versions[MP1_HWIP][0]) {
+   case IP_VERSION(13, 0, 0):
+   /* SMU 13_0_0 PMFW supports RAS fatal error reset from 78.77 */
+   smu_v13_0_0_set_mode1_reset_param(smu, 0x004e4d00, &param);
+
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_Mode1Reset, param, NULL);
+   break;
+
+   case IP_VERSION(13, 0, 10):
+   /* SMU 13_0_10 PMFW supports RAS fatal error reset from 80.28 */
+   smu_v13_0_0_set_mode1_reset_param(smu, 0x00501c00, &param);
+
+   ret = smu_cmn_send_debug_smc_msg_with_param(smu,
+   DEBUGSMC_MSG_Mode1Reset, param);
+   break;
+
+   default:
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
+   break;
+   }
 
if (!ret)
msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 768b6e7dbd7719..d5abafc5a68201 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -404,6 +404,12 @@ int smu_cmn_send_debug_smc_msg(struct smu_context *smu,
return __smu_cmn_send_debug_msg(smu, msg, 0);
 }
 
+int smu_cmn_send_debug_smc_msg_with_param(struct smu_context *smu,
+uint32_t msg, uint32_t param)
+{
+   return __smu_cmn_send_debug_msg(smu, msg, param);
+}
+
 int smu_cmn_to_asic_specific_index(struct smu_context *smu,
   enum smu_cmn2asic_mapping_type type,
   uint32_t index)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index f82cf76dd3a474..d7cd358a53bdcd 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -45,6 +45,9 @@ int smu_cmn_send_smc_msg(struct smu_context *smu,
 int smu_cmn_send_debug_smc_msg(struct smu_context *smu,
 uint32_t msg);
 
+int smu_cmn_send_debug_smc_msg_with_param(struct smu_context *smu,
+uint32_t msg, uint32_t param);
+
 int smu_cmn_wait_for_response(struct smu_context *smu);
 
 int smu_cmn_to_asic_specific_index(struct smu_context *smu,
-- 
2.17.1
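For reference, the magic constants compared against smu_version appear to
pack the PMFW version as 0x00MMmmpp (major/minor/patch bytes): 0x004e4d00
decodes to 78.77 and 0x00501c00 to 80.28, matching the comments in the
patch. Treating that layout as an assumption inferred from the constants,
the check could be written generically as:

	/* assumed PMFW version layout: 0x00MMmmpp (major.minor.patch) */
	static inline bool pmfw_at_least(uint32_t ver, uint8_t major, uint8_t minor)
	{
		return ver >= (((uint32_t)major << 16) | ((uint32_t)minor << 8));
	}

	/* pmfw_at_least(smu_version, 78, 77) mirrors the 0x004e4d00 check */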



RE: [PATCH] drm/amdgpu: Renoir/Cezanne GPU power reporting issue

2023-01-12 Thread Liu, Aaron
Reviewed-by: Aaron Liu aaron@amd.com

From: amd-gfx  On Behalf Of Zhang, 
Jesse(Jie)
Sent: Friday, January 13, 2023 10:07 AM
To: Deucher, Alexander 
Cc: amd-gfx@lists.freedesktop.org
Subject: [PATCH] drm/amdgpu: Renoir/Cezanne GPU power reporting issue


[AMD Official Use Only - General]


drm/amdgpu: Correct the power calculation for Renoir/Cezanne.
From SMU firmware, the value of power is reported in units of watts.
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2321
Fixes: 137aac26a2ed ("drm/amdgpu/smu12: fix power reporting on renoir")

Acked-by: Alex Deucher 
alexander.deuc...@amd.com
Signed-off-by: Jesse Zhang jesse.zh...@amd.com

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
index 85e22210963f..96a49a3b3ad9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
@@ -1171,6 +1171,7 @@ static int renoir_get_smu_metrics_data(struct smu_context *smu,
int ret = 0;
uint32_t apu_percent = 0;
uint32_t dgpu_percent = 0;
+   struct amdgpu_device *adev = smu->adev;


ret = smu_cmn_get_metrics_table(smu,
@@ -1196,7 +1197,11 @@ static int renoir_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageUvdActivity / 100;
break;
case METRICS_AVERAGE_SOCKETPOWER:
-   *value = (metrics->CurrentSocketPower << 8) / 1000;
+   if (((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(12, 0, 1)) && (adev->pm.fw_version >= 0x4f))
+   || ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(12, 0, 0)) && (adev->pm.fw_version >= 0x373200)))
+   *value = metrics->CurrentSocketPower << 8;
+   else
+   *value = (metrics->CurrentSocketPower << 8) / 1000;
break;
case METRICS_TEMPERATURE_EDGE:
*value = (metrics->GfxTemperature / 100) *


[PATCH] drm/amdgpu: Renoir/Cezanne GPU power reporting issue

2023-01-12 Thread Zhang, Jesse(Jie)
[AMD Official Use Only - General]


drm/amdgpu: Correct the power calculation for Renoir/Cezanne.
From SMU firmware, the value of power is reported in units of watts.
Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2321
Fixes: 137aac26a2ed ("drm/amdgpu/smu12: fix power reporting on renoir")

Acked-by: Alex Deucher 
alexander.deuc...@amd.com
Signed-off-by: Jesse Zhang jesse.zh...@amd.com

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
index 85e22210963f..96a49a3b3ad9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
@@ -1171,6 +1171,7 @@ static int renoir_get_smu_metrics_data(struct smu_context *smu,
int ret = 0;
uint32_t apu_percent = 0;
uint32_t dgpu_percent = 0;
+   struct amdgpu_device *adev = smu->adev;


ret = smu_cmn_get_metrics_table(smu,
@@ -1196,7 +1197,11 @@ static int renoir_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageUvdActivity / 100;
break;
case METRICS_AVERAGE_SOCKETPOWER:
-   *value = (metrics->CurrentSocketPower << 8) / 1000;
+   if (((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(12, 0, 1)) && (adev->pm.fw_version >= 0x4f))
+   || ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(12, 0, 0)) && (adev->pm.fw_version >= 0x373200)))
+   *value = metrics->CurrentSocketPower << 8;
+   else
+   *value = (metrics->CurrentSocketPower << 8) / 1000;
break;
case METRICS_TEMPERATURE_EDGE:
*value = (metrics->GfxTemperature / 100) *
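The archive truncates the hunk above. As a worked example of the unit
fix: METRICS_AVERAGE_SOCKETPOWER is exposed in Q8.8 fixed point
(value << 8 encodes value watts) — stated here as an inference from the
code, not from an SMU spec. Older firmware reports CurrentSocketPower in
milliwatts, newer firmware in watts:

	uint32_t mw = 15000;                 /* older FW: milliwatts      */
	uint32_t w  = 15;                    /* newer FW: watts           */

	uint32_t old_ok = (mw << 8) / 1000;  /* 3840 -> 15.0 W, correct   */
	uint32_t wrong  = (w << 8) / 1000;   /* 3 -> ~0.01 W, 1000x low   */
	uint32_t new_ok = w << 8;            /* 3840 -> 15.0 W, correct   */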


Re: [PATCH] drm/amd: fix some dead code in `gfx_v9_0_init_cp_compute_microcode`

2023-01-12 Thread Alex Deucher
On Thu, Jan 12, 2023 at 5:37 PM Mario Limonciello
 wrote:
>
> Some dead code was introduced as part of utilizing the `amdgpu_ucode_*`
> helpers. Adjust the control flow to make sure that firmware is released
> in the appropriate error flows.
>
> Reported-by: coverity-bot 
> Addresses-Coverity-ID: 1530548 ("Control flow issues")
> Fixes: ec787deb2ddf ("drm/amd: Use `amdgpu_ucode_*` helpers for GFX9")
> Signed-off-by: Mario Limonciello 

Reviewed-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 ++-
>  1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index e80685d1e6c6c..d4b0fa4b62a44 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -1345,7 +1345,7 @@ static int gfx_v9_0_init_cp_compute_microcode(struct 
> amdgpu_device *adev,
>
> err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw, fw_name);
> if (err)
> -   return err;
> +   goto out;
> amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC1);
> amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC1_JT);
>
> @@ -1355,13 +1355,14 @@ static int gfx_v9_0_init_cp_compute_microcode(struct 
> amdgpu_device *adev,
> else
> snprintf(fw_name, sizeof(fw_name), 
> "amdgpu/%s_mec2.bin", chip_name);
>
> +   /* ignore failures to load */
> err = amdgpu_ucode_request(adev, &adev->gfx.mec2_fw, fw_name);
> if (!err) {
> amdgpu_gfx_cp_init_microcode(adev, 
> AMDGPU_UCODE_ID_CP_MEC2);
> amdgpu_gfx_cp_init_microcode(adev, 
> AMDGPU_UCODE_ID_CP_MEC2_JT);
> } else {
> err = 0;
> -   adev->gfx.mec2_fw = NULL;
> +   amdgpu_ucode_release(&adev->gfx.mec2_fw);
> }
> } else {
> adev->gfx.mec2_fw_version = adev->gfx.mec_fw_version;
> @@ -1370,10 +1371,10 @@ static int gfx_v9_0_init_cp_compute_microcode(struct 
> amdgpu_device *adev,
>
> gfx_v9_0_check_if_need_gfxoff(adev);
> gfx_v9_0_check_fw_write_wait(adev);
> -   if (err) {
> +
> +out:
> +   if (err)
> amdgpu_ucode_release(&adev->gfx.mec_fw);
> -   amdgpu_ucode_release(&adev->gfx.mec2_fw);
> -   }
> return err;
>  }
>
> --
> 2.34.1
>


Re: [PATCH] drm/amd: fix some dead code in `gfx_v9_0_init_cp_compute_microcode`

2023-01-12 Thread Kees Cook
On Thu, Jan 12, 2023 at 04:37:01PM -0600, Mario Limonciello wrote:
> Some dead code was introduced as part of utilizing the `amdgpu_ucode_*`
> helpers. Adjust the control flow to make sure that firmware is released
> in the appropriate error flows.
> 
> Reported-by: coverity-bot 
> Addresses-Coverity-ID: 1530548 ("Control flow issues")
> Fixes: ec787deb2ddf ("drm/amd: Use `amdgpu_ucode_*` helpers for GFX9")
> Signed-off-by: Mario Limonciello 

Reviewed-by: Kees Cook 

-- 
Kees Cook


Re: Coverity: dm_dmub_sw_init(): Incorrect expression

2023-01-12 Thread Kees Cook
On Thu, Jan 12, 2023 at 10:39:20PM +, Limonciello, Mario wrote:
> This particular one was fixed already in 
> https://patchwork.freedesktop.org/patch/518050/ which got applied today.

Ah-ha; thanks!

-- 
Kees Cook


Re: [PATCH 1/6] drm/amdgpu: Generalize KFD dmabuf import

2023-01-12 Thread Chen, Xiaogang



On 1/11/2023 7:31 PM, Felix Kuehling wrote:

Use proper amdgpu_gem_prime_import function to handle all kinds of
imports. Remember the dmabuf reference to enable proper multi-GPU
attachment to multiple VMs without erroneously re-exporting the
underlying BO multiple times.

Signed-off-by: Felix Kuehling 
---
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 38 ++-
  1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 6f236ded5f12..e13c3493b786 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2209,30 +2209,27 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct 
amdgpu_device *adev,
struct amdgpu_bo *bo;
int ret;
  
-	if (dma_buf->ops != &amdgpu_dmabuf_ops)

-   /* Can't handle non-graphics buffers */
-   return -EINVAL;
-
-   obj = dma_buf->priv;
-   if (drm_to_adev(obj->dev) != adev)
-   /* Can't handle buffers from other devices */
-   return -EINVAL;
+   obj = amdgpu_gem_prime_import(adev_to_drm(adev), dma_buf);
+   if (IS_ERR(obj))
+   return PTR_ERR(obj);
  
  	bo = gem_to_amdgpu_bo(obj);

if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
-   AMDGPU_GEM_DOMAIN_GTT)))
+   AMDGPU_GEM_DOMAIN_GTT))) {
/* Only VRAM and GTT BOs are supported */
-   return -EINVAL;
+   ret = -EINVAL;
+   goto err_put_obj;
+   }
  
  	*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL);

-   if (!*mem)
-   return -ENOMEM;
+   if (!*mem) {
+   ret = -ENOMEM;
+   goto err_put_obj;
+   }
  
  	ret = drm_vma_node_allow(&obj->vma_node, drm_priv);

-   if (ret) {
-   kfree(*mem);
-   return ret;
-   }
+   if (ret)
+   goto err_free_mem;
  
  	if (size)

*size = amdgpu_bo_size(bo);


Hi Felix:

I have a question about using amdgpu_gem_prime_import. It allows
importing a dmabuf to different gpus; can we still call
amdgpu_bo_mmap_offset on the generated bo if
amdgpu_amdkfd_gpuvm_import_dmabuf also asks for the mmap offset?



@@ -2249,7 +2246,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct 
amdgpu_device *adev,
| KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE
| KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE;
  
-	drm_gem_object_get(&bo->tbo.base);

+   get_dma_buf(dma_buf);
+   (*mem)->dmabuf = dma_buf;
(*mem)->bo = bo;
(*mem)->va = va;
(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) ?
@@ -2261,6 +2259,12 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct 
amdgpu_device *adev,
(*mem)->is_imported = true;
  
  	return 0;

+
+err_free_mem:
+   kfree(*mem);
+err_put_obj:
+   drm_gem_object_put(obj);
+   return ret;
  }
  
  /* Evict a userptr BO by stopping the queues if necessary


RE: Coverity: dm_dmub_sw_init(): Incorrect expression

2023-01-12 Thread Limonciello, Mario
[AMD Official Use Only - General]

This particular one was fixed already in 
https://patchwork.freedesktop.org/patch/518050/ which got applied today.

> -Original Message-
> From: coverity-bot 
> Sent: Thursday, January 12, 2023 16:25
> To: Limonciello, Mario 
> Cc: linux-ker...@vger.kernel.org; amd-gfx@lists.freedesktop.org; Siqueira,
> Rodrigo ; Li, Sun peng (Leo)
> ; Li, Roman ; Zuo, Jerry
> ; Wu, Hersen ; dri-
> de...@lists.freedesktop.org; Koenig, Christian ;
> Lazar, Lijo ; Pillai, Aurabindo
> ; Wentland, Harry ;
> Deucher, Alexander ; Daniel Vetter
> ; David Airlie ; Pan, Xinhui
> ; Wheeler, Daniel ;
> Gustavo A. R. Silva ; linux-n...@vger.kernel.org;
> linux-harden...@vger.kernel.org
> Subject: Coverity: dm_dmub_sw_init(): Incorrect expression
> 
> Hello!
> 
> This is an experimental semi-automated report about issues detected by
> Coverity from a scan of next-20230111 as part of the linux-next scan project:
> https://scan.coverity.com/projects/linux-next-weekly-scan
> 
> You're getting this email because you were associated with the identified
> lines of code (noted below) that were touched by commits:
> 
>   Tue Jan 10 14:32:57 2023 -0500
> a7ab345149b8 ("drm/amd/display: Load DMUB microcode during early_init")
> 
> Coverity reported the following:
> 
> *** CID 1530544:  Incorrect expression  (IDENTICAL_BRANCHES)
> drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c:1951 in
> dm_dmub_sw_init()
> 1945
> 1946  switch (adev->ip_versions[DCE_HWIP][0]) {
> 1947  case IP_VERSION(2, 1, 0):
> 1948  dmub_asic = DMUB_ASIC_DCN21;
> 1949  break;
> 1950  case IP_VERSION(3, 0, 0):
> vvv CID 1530544:  Incorrect expression  (IDENTICAL_BRANCHES)
> vvv The same code is executed regardless of whether "adev-
> >ip_versions[GC_HWIP][0] == 656128U" is true, because the 'then' and 'else'
> branches are identical. Should one of the branches be modified, or the entire 
> 'if'
> statement replaced?
> 1951  if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 
> 0))
> 1952  dmub_asic = DMUB_ASIC_DCN30;
> 1953  else
> 1954  dmub_asic = DMUB_ASIC_DCN30;
> 1955  break;
> 1956  case IP_VERSION(3, 0, 1):
> 
> If this is a false positive, please let us know so we can mark it as
> such, or teach the Coverity rules to be smarter. If not, please make
> sure fixes get into linux-next. :) For patches fixing this, please
> include these lines (but double-check the "Fixes" first):
> 
> Reported-by: coverity-bot 
> Addresses-Coverity-ID: 1530544 ("Incorrect expression")
> Fixes: a7ab345149b8 ("drm/amd/display: Load DMUB microcode during
> early_init")
> 
> Thanks for your attention!
> 
> --
> Coverity-bot


[PATCH] drm/amd: fix some dead code in `gfx_v9_0_init_cp_compute_microcode`

2023-01-12 Thread Mario Limonciello
Some dead code was introduced as part of utilizing the `amdgpu_ucode_*`
helpers. Adjust the control flow to make sure that firmware is released
in the appropriate error flows.

Reported-by: coverity-bot 
Addresses-Coverity-ID: 1530548 ("Control flow issues")
Fixes: ec787deb2ddf ("drm/amd: Use `amdgpu_ucode_*` helpers for GFX9")
Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e80685d1e6c6c..d4b0fa4b62a44 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1345,7 +1345,7 @@ static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,
 
err = amdgpu_ucode_request(adev, &adev->gfx.mec_fw, fw_name);
if (err)
-   return err;
+   goto out;
amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC1);
amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC1_JT);
 
@@ -1355,13 +1355,14 @@ static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,
else
		snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_mec2.bin", chip_name);
 
+   /* ignore failures to load */
err = amdgpu_ucode_request(adev, &adev->gfx.mec2_fw, fw_name);
if (!err) {
		amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC2);
		amdgpu_gfx_cp_init_microcode(adev, AMDGPU_UCODE_ID_CP_MEC2_JT);
} else {
err = 0;
-   adev->gfx.mec2_fw = NULL;
+   amdgpu_ucode_release(&adev->gfx.mec2_fw);
}
} else {
adev->gfx.mec2_fw_version = adev->gfx.mec_fw_version;
@@ -1370,10 +1371,10 @@ static int gfx_v9_0_init_cp_compute_microcode(struct amdgpu_device *adev,
 
gfx_v9_0_check_if_need_gfxoff(adev);
gfx_v9_0_check_fw_write_wait(adev);
-   if (err) {
+
+out:
+   if (err)
amdgpu_ucode_release(&adev->gfx.mec_fw);
-   amdgpu_ucode_release(&adev->gfx.mec2_fw);
-   }
return err;
 }
 
-- 
2.34.1



Coverity: dm_dmub_sw_init(): Incorrect expression

2023-01-12 Thread coverity-bot
Hello!

This is an experimental semi-automated report about issues detected by
Coverity from a scan of next-20230111 as part of the linux-next scan project:
https://scan.coverity.com/projects/linux-next-weekly-scan

You're getting this email because you were associated with the identified
lines of code (noted below) that were touched by commits:

  Tue Jan 10 14:32:57 2023 -0500
a7ab345149b8 ("drm/amd/display: Load DMUB microcode during early_init")

Coverity reported the following:

*** CID 1530544:  Incorrect expression  (IDENTICAL_BRANCHES)
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c:1951 in dm_dmub_sw_init()
1945
1946switch (adev->ip_versions[DCE_HWIP][0]) {
1947case IP_VERSION(2, 1, 0):
1948dmub_asic = DMUB_ASIC_DCN21;
1949break;
1950case IP_VERSION(3, 0, 0):
vvv CID 1530544:  Incorrect expression  (IDENTICAL_BRANCHES)
vvv The same code is executed regardless of whether 
"adev->ip_versions[GC_HWIP][0] == 656128U" is true, because the 'then' and 
'else' branches are identical. Should one of the branches be modified, or the 
entire 'if' statement replaced?
1951if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 
0))
1952dmub_asic = DMUB_ASIC_DCN30;
1953else
1954dmub_asic = DMUB_ASIC_DCN30;
1955break;
1956case IP_VERSION(3, 0, 1):

If this is a false positive, please let us know so we can mark it as
such, or teach the Coverity rules to be smarter. If not, please make
sure fixes get into linux-next. :) For patches fixing this, please
include these lines (but double-check the "Fixes" first):

Reported-by: coverity-bot 
Addresses-Coverity-ID: 1530544 ("Incorrect expression")
Fixes: a7ab345149b8 ("drm/amd/display: Load DMUB microcode during early_init")

Thanks for your attention!

-- 
Coverity-bot
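As the follow-up in this thread notes, a fix for this had already landed
via https://patchwork.freedesktop.org/patch/518050/; the obvious shape of
such a fix is simply collapsing the identical branches in the
dm_dmub_sw_init switch (sketch only, not the applied patch):

	case IP_VERSION(3, 0, 0):
		dmub_asic = DMUB_ASIC_DCN30;
		break;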


Coverity: gfx_v9_0_init_cp_compute_microcode(): Control flow issues

2023-01-12 Thread coverity-bot
Hello!

This is an experimental semi-automated report about issues detected by
Coverity from a scan of next-20230111 as part of the linux-next scan project:
https://scan.coverity.com/projects/linux-next-weekly-scan

You're getting this email because you were associated with the identified
lines of code (noted below) that were touched by commits:

  Mon Jan 9 17:02:18 2023 -0500
ec787deb2ddf ("drm/amd: Use `amdgpu_ucode_*` helpers for GFX9")

Coverity reported the following:

*** CID 1530548:  Control flow issues  (DEADCODE)
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c:1374 in 
gfx_v9_0_init_cp_compute_microcode()
1368adev->gfx.mec2_feature_version = 
adev->gfx.mec_feature_version;
1369}
1370
1371gfx_v9_0_check_if_need_gfxoff(adev);
1372gfx_v9_0_check_fw_write_wait(adev);
1373if (err) {
vvv CID 1530548:  Control flow issues  (DEADCODE)
vvv Execution cannot reach this statement: "amdgpu_ucode_release(&adev-...".
1374amdgpu_ucode_release(&adev->gfx.mec_fw);
1375amdgpu_ucode_release(&adev->gfx.mec2_fw);
1376}
1377return err;
1378 }
1379

If this is a false positive, please let us know so we can mark it as
such, or teach the Coverity rules to be smarter. If not, please make
sure fixes get into linux-next. :) For patches fixing this, please
include these lines (but double-check the "Fixes" first):

Reported-by: coverity-bot 
Addresses-Coverity-ID: 1530548 ("Control flow issues")
Fixes: ec787deb2ddf ("drm/amd: Use `amdgpu_ucode_*` helpers for GFX9")

Thanks for your attention!

-- 
Coverity-bot


[PATCH] drm/amdkfd: Support process XNACK mode dynamic change

2023-01-12 Thread Philip Yang
Updating the queue qpd is done on the first queue creation of the
process. If the device supports XNACK mode per process, update the
qpd sh_mem_config setup based on the process XNACK mode, so that a
process can destroy all its queues, change its XNACK mode, and then
create queues again.

Add a helper macro KFD_SUPPORT_XNACK_PER_PROCESS to remove duplicated
code and to ease adding support for new ASICs in the future.

Signed-off-by: Philip Yang 
---
 .../amd/amdkfd/kfd_device_queue_manager_v9.c  | 27 +--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  2 +-
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index d119070956fb..8b2dd2670ab7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -59,30 +59,27 @@ static int update_qpd_v9(struct device_queue_manager *dqm,
 
/* check if sh_mem_config register already configured */
if (qpd->sh_mem_config == 0) {
-   qpd->sh_mem_config =
-   SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
+   qpd->sh_mem_config = SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
 
-   if (KFD_GC_VERSION(dqm->dev) == IP_VERSION(9, 4, 2)) {
-   /* Aldebaran can safely support different XNACK modes
-* per process
-*/
-   if (!pdd->process->xnack_enabled)
-   qpd->sh_mem_config |=
-   1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
-   } else if (dqm->dev->noretry &&
-  !dqm->dev->use_iommu_v2) {
-   qpd->sh_mem_config |=
-   1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
-   }
+   if (dqm->dev->noretry && !dqm->dev->use_iommu_v2)
+   qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
 
qpd->sh_mem_ape1_limit = 0;
qpd->sh_mem_ape1_base = 0;
}
 
+   if (KFD_SUPPORT_XNACK_PER_PROCESS(dqm->dev)) {
+   if (!pdd->process->xnack_enabled)
+   qpd->sh_mem_config |= 1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
+   else
+   qpd->sh_mem_config &= ~(1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT);
+   }
+
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
 
-   pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
+   pr_debug("sh_mem_bases 0x%X sh_mem_config 0x%X\n", qpd->sh_mem_bases,
+qpd->sh_mem_config);
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 552c3ac85a13..bfa30d12406b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -206,6 +206,8 @@ enum cache_policy {
 
 #define KFD_GC_VERSION(dev) ((dev)->adev->ip_versions[GC_HWIP][0])
 #define KFD_IS_SOC15(dev)   ((KFD_GC_VERSION(dev)) >= (IP_VERSION(9, 0, 1)))
+#define KFD_SUPPORT_XNACK_PER_PROCESS(dev)\
+   (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2))
 
 struct kfd_event_interrupt_class {
bool (*interrupt_isr)(struct kfd_dev *dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 71db24fefe05..72df6286e240 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1330,7 +1330,7 @@ bool kfd_process_xnack_mode(struct kfd_process *p, bool supported)
 * per-process XNACK mode selection. But let the dev->noretry
 * setting still influence the default XNACK mode.
 */
-   if (supported && KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2))
+   if (supported && KFD_SUPPORT_XNACK_PER_PROCESS(dev))
continue;
 
/* GFXv10 and later GPUs do not support shader preemption
-- 
2.35.1



Re: [PATCH] Revert "drm/display/dp_mst: Move all payload info into the atomic state"

2023-01-12 Thread Lyude Paul
Acked-by: Lyude Paul 

On Thu, 2023-01-12 at 16:50 +0800, Wayne Lin wrote:
> This reverts commit 4d07b0bc403403438d9cf88450506240c5faf92f.
> 
> [Why]
> Changes cause regression on amdgpu mst.
> E.g.
> In fill_dc_mst_payload_table_from_drm(), amdgpu expects to add/remove
> payloads one by one and calls fill_dc_mst_payload_table_from_drm() to
> update the HW maintained payload table. But the previous change goes
> through all the payloads in mst_state and updates the amdgpu HW maintained
> table at once, every time the driver only tries to add/remove a single
> payload stream. The new design conflicts with the current implementation
> in amdgpu.
> 
> [How]
> Revert this patch first. After addressing all regression problems caused by
> this patch, we will add it back and adjust it.
> 
> Signed-off-by: Wayne Lin 
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2171
> Cc: sta...@vger.kernel.org # 6.1
> Cc: Lyude Paul 
> Cc: Harry Wentland 
> Cc: Mario Limonciello 
> Cc: Ville Syrjälä 
> Cc: Ben Skeggs 
> Cc: Stanislav Lisovskiy 
> Cc: Fangzhi Zuo 
> ---
>  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  53 +-
>  .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 106 ++-
>  .../display/amdgpu_dm/amdgpu_dm_mst_types.c   |  87 ++-
>  .../amd/display/include/link_service_types.h  |   3 -
>  drivers/gpu/drm/display/drm_dp_mst_topology.c | 724 --
>  drivers/gpu/drm/i915/display/intel_dp_mst.c   |  67 +-
>  drivers/gpu/drm/i915/display/intel_hdcp.c |  24 +-
>  drivers/gpu/drm/nouveau/dispnv50/disp.c   | 167 ++--
>  include/drm/display/drm_dp_mst_helper.h   | 177 +++--
>  9 files changed, 878 insertions(+), 530 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 77277d90b6e2..674f5dc1102b 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -6548,7 +6548,6 @@ static int dm_encoder_helper_atomic_check(struct 
> drm_encoder *encoder,
>   const struct drm_display_mode *adjusted_mode = 
> &crtc_state->adjusted_mode;
>   struct drm_dp_mst_topology_mgr *mst_mgr;
>   struct drm_dp_mst_port *mst_port;
> - struct drm_dp_mst_topology_state *mst_state;
>   enum dc_color_depth color_depth;
>   int clock, bpp = 0;
>   bool is_y420 = false;
> @@ -6562,13 +6561,6 @@ static int dm_encoder_helper_atomic_check(struct 
> drm_encoder *encoder,
>   if (!crtc_state->connectors_changed && !crtc_state->mode_changed)
>   return 0;
>  
> - mst_state = drm_atomic_get_mst_topology_state(state, mst_mgr);
> - if (IS_ERR(mst_state))
> - return PTR_ERR(mst_state);
> -
> - if (!mst_state->pbn_div)
> - mst_state->pbn_div = 
> dm_mst_get_pbn_divider(aconnector->mst_port->dc_link);
> -
>   if (!state->duplicated) {
>   int max_bpc = conn_state->max_requested_bpc;
>   is_y420 = drm_mode_is_420_also(&connector->display_info, 
> adjusted_mode) &&
> @@ -6580,10 +6572,11 @@ static int dm_encoder_helper_atomic_check(struct 
> drm_encoder *encoder,
>   clock = adjusted_mode->clock;
>   dm_new_connector_state->pbn = drm_dp_calc_pbn_mode(clock, bpp, 
> false);
>   }
> -
> - dm_new_connector_state->vcpi_slots =
> - drm_dp_atomic_find_time_slots(state, mst_mgr, mst_port,
> -   dm_new_connector_state->pbn);
> + dm_new_connector_state->vcpi_slots = 
> drm_dp_atomic_find_time_slots(state,
> +
> mst_mgr,
> +
> mst_port,
> +
> dm_new_connector_state->pbn,
> +
> dm_mst_get_pbn_divider(aconnector->dc_link));
>   if (dm_new_connector_state->vcpi_slots < 0) {
>   DRM_DEBUG_ATOMIC("failed finding vcpi slots: %d\n", 
> (int)dm_new_connector_state->vcpi_slots);
>   return dm_new_connector_state->vcpi_slots;
> @@ -6654,14 +6647,17 @@ static int dm_update_mst_vcpi_slots_for_dsc(struct 
> drm_atomic_state *state,
>   dm_conn_state->vcpi_slots = slot_num;
>  
>   ret = drm_dp_mst_atomic_enable_dsc(state, 
> aconnector->port,
> -dm_conn_state->pbn, 
> false);
> +dm_conn_state->pbn, 
> 0, false);
>   if (ret < 0)
>   return ret;
>  
>   continue;
>   }
>  
> - vcpi = drm_dp_mst_atomic_enable_dsc(state, aconnector->port, 
> pbn, true);
> + vcpi = drm_dp_mst_atomic_enable_dsc(state,
> +

[PATCH v2 2/3] drm/fb-helper: Set framebuffer for vga-switcheroo clients

2023-01-12 Thread Thomas Zimmermann
Set the framebuffer info for drivers that support VGA switcheroo. Only
affects the amdgpu and nouveau drivers, which use VGA switcheroo and
generic fbdev emulation. For other drivers, this does nothing.

This fixes a potential regression in the console code. Both amdgpu and
nouveau invoked vga_switcheroo_client_fb_set() from their internal fbdev
code, but the call got lost when the drivers switched to the generic
emulation.

Fixes: 087451f372bf ("drm/amdgpu: use generic fb helpers instead of setting up AMD own's.")
Fixes: 4a16dd9d18a0 ("drm/nouveau/kms: switch to drm fbdev helpers")
Signed-off-by: Thomas Zimmermann 
Reviewed-by: Daniel Vetter 
Reviewed-by: Alex Deucher 
Cc: Ben Skeggs 
Cc: Karol Herbst 
Cc: Lyude Paul 
Cc: Thomas Zimmermann 
Cc: Javier Martinez Canillas 
Cc: Laurent Pinchart 
Cc: Jani Nikula 
Cc: Dave Airlie 
Cc: Evan Quan 
Cc: Christian König 
Cc: Alex Deucher 
Cc: Hawking Zhang 
Cc: Likun Gao 
Cc: "Christian König" 
Cc: Stanley Yang 
Cc: "Tianci.Yin" 
Cc: Xiaojian Du 
Cc: Andrey Grodzovsky 
Cc: YiPeng Chai 
Cc: Somalapuram Amaranath 
Cc: Bokun Zhang 
Cc: Guchun Chen 
Cc: Hamza Mahfooz 
Cc: Aurabindo Pillai 
Cc: Mario Limonciello 
Cc: Solomon Chiu 
Cc: Kai-Heng Feng 
Cc: Felix Kuehling 
Cc: Daniel Vetter 
Cc: "Marek Olšák" 
Cc: Sam Ravnborg 
Cc: Hans de Goede 
Cc: "Ville Syrjälä" 
Cc: dri-de...@lists.freedesktop.org
Cc: nouv...@lists.freedesktop.org
Cc:  # v5.17+
---
 drivers/gpu/drm/drm_fb_helper.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c
index 427631706128..5e445c61252d 100644
--- a/drivers/gpu/drm/drm_fb_helper.c
+++ b/drivers/gpu/drm/drm_fb_helper.c
@@ -30,7 +30,9 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include 
+#include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1940,6 +1942,7 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
 int preferred_bpp)
 {
struct drm_client_dev *client = &fb_helper->client;
+   struct drm_device *dev = fb_helper->dev;
struct drm_fb_helper_surface_size sizes;
int ret;
 
@@ -1961,6 +1964,11 @@ static int drm_fb_helper_single_fb_probe(struct drm_fb_helper *fb_helper,
return ret;
 
strcpy(fb_helper->fb->comm, "[fbcon]");
+
+   /* Set the fb info for vgaswitcheroo clients. Does nothing otherwise. */
+   if (dev_is_pci(dev->dev))
+   vga_switcheroo_client_fb_set(to_pci_dev(dev->dev), fb_helper->info);
+
return 0;
 }
 
-- 
2.39.0



[PATCH v2 1/3] drm/i915: Allow switching away via vga-switcheroo if uninitialized

2023-01-12 Thread Thomas Zimmermann
Always allow switching away via vga-switcheroo if the display is
uninitialized. Instead prevent switching to i915 if the device has
not been initialized.

This issue was introduced by commit 5df7bd130818 ("drm/i915: skip
display initialization when there is no display"), which protects
code paths from being executed on uninitialized devices.
In the case of vga-switcheroo, we want to allow a switch away from
i915's device. So run vga_switcheroo_process_delayed_switch() and
test in the switcheroo callbacks whether the i915 device is available.

Fixes: 5df7bd130818 ("drm/i915: skip display initialization when there is no display")
Signed-off-by: Thomas Zimmermann 
Cc: Radhakrishna Sripada 
Cc: Lucas De Marchi 
Cc: José Roberto de Souza 
Cc: Jani Nikula 
Cc: Ville Syrjälä 
Cc: Jani Nikula 
Cc: Joonas Lahtinen 
Cc: Rodrigo Vivi 
Cc: Tvrtko Ursulin 
Cc: "Ville Syrjälä" 
Cc: Manasi Navare 
Cc: Stanislav Lisovskiy 
Cc: Imre Deak 
Cc: "Jouni Högander" 
Cc: Uma Shankar 
Cc: Ankit Nautiyal 
Cc: "Jason A. Donenfeld" 
Cc: Matt Roper 
Cc: Ramalingam C 
Cc: Thomas Zimmermann 
Cc: Andi Shyti 
Cc: Andrzej Hajda 
Cc: "José Roberto de Souza" 
Cc: Julia Lawall 
Cc: intel-...@lists.freedesktop.org
Cc:  # v5.14+
---
 drivers/gpu/drm/i915/i915_driver.c | 3 +--
 drivers/gpu/drm/i915/i915_switcheroo.c | 6 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index c1e427ba57ae..33e231b120c1 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -1075,8 +1075,7 @@ static void i915_driver_lastclose(struct drm_device *dev)
 
intel_fbdev_restore_mode(dev);
 
-   if (HAS_DISPLAY(i915))
-   vga_switcheroo_process_delayed_switch();
+   vga_switcheroo_process_delayed_switch();
 }
 
 static void i915_driver_postclose(struct drm_device *dev, struct drm_file *file)
diff --git a/drivers/gpu/drm/i915/i915_switcheroo.c b/drivers/gpu/drm/i915/i915_switcheroo.c
index 23777d500cdf..f45bd6b6cede 100644
--- a/drivers/gpu/drm/i915/i915_switcheroo.c
+++ b/drivers/gpu/drm/i915/i915_switcheroo.c
@@ -19,6 +19,10 @@ static void i915_switcheroo_set_state(struct pci_dev *pdev,
dev_err(&pdev->dev, "DRM not initialized, aborting switch.\n");
return;
}
+   if (!HAS_DISPLAY(i915)) {
+   dev_err(&pdev->dev, "Device state not initialized, aborting switch.\n");
+   return;
+   }
 
if (state == VGA_SWITCHEROO_ON) {
drm_info(&i915->drm, "switched on\n");
@@ -44,7 +48,7 @@ static bool i915_switcheroo_can_switch(struct pci_dev *pdev)
 * locking inversion with the driver load path. And the access here is
 * completely racy anyway. So don't bother with locking for now.
 */
-   return i915 && atomic_read(&i915->drm.open_count) == 0;
+   return i915 && HAS_DISPLAY(i915) && atomic_read(&i915->drm.open_count) == 0;
 }
 
 static const struct vga_switcheroo_client_ops i915_switcheroo_ops = {
-- 
2.39.0



[PATCH v2 3/3] drm: Call vga_switcheroo_process_delayed_switch() in drm_lastclose

2023-01-12 Thread Thomas Zimmermann
Several lastclose helpers call vga_switcheroo_process_delayed_switch().
It's better to call the helper from drm_lastclose() after the kernel
client's screen has been restored. This way, all drivers can benefit
without having to implement their own lastclose helper. For drivers
without vga-switcheroo, vga_switcheroo_process_delayed_switch() does
nothing.

There was an earlier patchset to do something similar. [1]

v2:
* handle vga_switcheroo_client_fb_set() in a separate patch
* also update i915, nouveau and radeon
* remove unnecessary include statements
* update vga-switcheroo docs

Suggested-by: Alexander Deucher 
Signed-off-by: Thomas Zimmermann 
Reviewed-by: Daniel Vetter 
Reviewed-by: Alex Deucher 
Link: 
https://lore.kernel.org/amd-gfx/20221020143603.563929-1-alexander.deuc...@amd.com/
 # 1
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 13 -
 drivers/gpu/drm/drm_file.c  |  3 +++
 drivers/gpu/drm/i915/i915_driver.c  | 25 ++---
 drivers/gpu/drm/nouveau/nouveau_drm.c   |  1 -
 drivers/gpu/drm/nouveau/nouveau_vga.c   |  7 ---
 drivers/gpu/drm/nouveau/nouveau_vga.h   |  1 -
 drivers/gpu/drm/radeon/radeon_drv.c |  2 +-
 drivers/gpu/drm/radeon/radeon_drv.h |  1 -
 drivers/gpu/drm/radeon/radeon_kms.c | 18 --
 drivers/gpu/vga/vga_switcheroo.c|  4 ++--
 12 files changed, 8 insertions(+), 70 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 63c921c55fb9..7120b9b6e580 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1330,7 +1330,6 @@ extern const int amdgpu_max_kms_ioctl;
 
 int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags);
 void amdgpu_driver_unload_kms(struct drm_device *dev);
-void amdgpu_driver_lastclose_kms(struct drm_device *dev);
 int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv);
 void amdgpu_driver_postclose_kms(struct drm_device *dev,
 struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1353ffd08988..783c1e284a22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -34,7 +34,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -2785,7 +2784,6 @@ static const struct drm_driver amdgpu_kms_driver = {
DRIVER_SYNCOBJ_TIMELINE,
.open = amdgpu_driver_open_kms,
.postclose = amdgpu_driver_postclose_kms,
-   .lastclose = amdgpu_driver_lastclose_kms,
.ioctls = amdgpu_ioctls_kms,
.num_ioctls = ARRAY_SIZE(amdgpu_ioctls_kms),
.dumb_create = amdgpu_mode_dumb_create,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 7aa7e52ca784..a37be02fb2fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -34,7 +34,6 @@
 #include "amdgpu_vce.h"
 #include "atom.h"
 
-#include 
 #include 
 #include 
 #include 
@@ -1104,18 +1103,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 /*
  * Outdated mess for old drm with Xorg being in charge (void function now).
  */
-/**
- * amdgpu_driver_lastclose_kms - drm callback for last close
- *
- * @dev: drm dev pointer
- *
- * Switch vga_switcheroo state after last close (all asics).
- */
-void amdgpu_driver_lastclose_kms(struct drm_device *dev)
-{
-   drm_fb_helper_lastclose(dev);
-   vga_switcheroo_process_delayed_switch();
-}
 
 /**
  * amdgpu_driver_open_kms - drm callback for open
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index a51ff8cee049..314c309db9a3 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -460,6 +461,8 @@ void drm_lastclose(struct drm_device * dev)
drm_legacy_dev_reinit(dev);
 
drm_client_dev_restore(dev);
+
+   vga_switcheroo_process_delayed_switch();
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index 33e231b120c1..bf6ad8620970 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -29,6 +29,7 @@
 
 #include 
 #include 
+#include  /* for FBINFO_STATE_ */
 #include 
 #include 
 #include 
@@ -37,7 +38,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include 
@@ -1057,27 +1057,6 @@ static int i915_driver_open(struct drm_device *dev, struct drm_file *file)
return 0;
 }
 
-/**
- * i915_driver_lastclose - clean up after all DRM clients have exited
- * @dev: DRM device
- *
- * Take care of cleaning up after all DRM clients have exited.  In the
- * mod

[PATCH v2 0/3] drm: Generic fbdev and vga-switcheroo

2023-01-12 Thread Thomas Zimmermann
(was: drm/fb-helper: Set framebuffer for vga-switcheroo clients)

This patch has now turned into a little series. The first two patches
are bug fixes for the existing code. The third patch cleans up the
drivers.

Patch 1 fixes i915 to do the correct thing if the device has not been
initialized yet. Switching to the device is only possible after the
initialization, but switching away is always possible.

Patch 2 is the original patch without the amdgpu changes. It installs
the fbdev framebuffer in vga-switcheroo for the PCI device, and does
nothing for drivers without vga-switcheroo.

Patch 3 cleans up vga_switcheroo_process_delayed_switch() in amdgpu
and the other related drivers (i.e., i915, nouveau and radeon). The
call is now located at the end of drm_lastclose() and drivers do not
need their own lastclose helpers any longer.

I kept the r-bs from v1, but patch 1 is entirely new and patch 3 has
significantly grown in size.

Thomas Zimmermann (3):
  drm/i915: Allow switching away via vga-switcheroo if uninitialized
  drm/fb-helper: Set framebuffer for vga-switcheroo clients
  drm: Call vga_switcheroo_process_delayed_switch() in drm_lastclose

 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 13 -
 drivers/gpu/drm/drm_fb_helper.c |  8 
 drivers/gpu/drm/drm_file.c  |  3 +++
 drivers/gpu/drm/i915/i915_driver.c  | 26 ++---
 drivers/gpu/drm/i915/i915_switcheroo.c  |  6 +-
 drivers/gpu/drm/nouveau/nouveau_drm.c   |  1 -
 drivers/gpu/drm/nouveau/nouveau_vga.c   |  7 ---
 drivers/gpu/drm/nouveau/nouveau_vga.h   |  1 -
 drivers/gpu/drm/radeon/radeon_drv.c |  2 +-
 drivers/gpu/drm/radeon/radeon_drv.h |  1 -
 drivers/gpu/drm/radeon/radeon_kms.c | 18 -
 drivers/gpu/vga/vga_switcheroo.c|  4 ++--
 14 files changed, 21 insertions(+), 72 deletions(-)


base-commit: ac04152253dccfb02dcedfa0c57443122cf79314
-- 
2.39.0



Re: [PATCH] drm/amdgpu: Skip specific mmhub and sdma registers accessing under sriov

2023-01-12 Thread Alex Deucher
On Wed, Jan 11, 2023 at 2:53 AM Yifan Zha  wrote:
>
> [Why]
> SDMA0_CNTL and MMHUB system aperture related registers are blocked by the
> L1 policy. Therefore, they cannot be accessed by the VF, and such accesses
> are logged as violations.
>
> [How]
> The MMHUB registers will be programmed by the PF, so the VF skips
> programming them in mmhub_v3_0.
> SDMA0_CNTL is a PF-only register, so the VF doesn't need to program it in
> sdma_v6_0.
>
> Signed-off-by: Yifan Zha 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c | 34 -
>  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c  | 10 +---
>  2 files changed, 23 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c 
> b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c
> index e9dcd6fcde7f..ae9cd1a4cfee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v3_0.c
> @@ -169,23 +169,23 @@ static void mmhub_v3_0_init_system_aperture_regs(struct 
> amdgpu_device *adev)
> uint64_t value;
> uint32_t tmp;
>
> -   if (!amdgpu_sriov_vf(adev)) {
> -   /*
> -* the new L1 policy will block SRIOV guest from writing
> -* these regs, and they will be programed at host.
> -* so skip programing these regs.
> -*/
> -   /* Disable AGP. */
> -   WREG32_SOC15(MMHUB, 0, regMMMC_VM_AGP_BASE, 0);
> -   WREG32_SOC15(MMHUB, 0, regMMMC_VM_AGP_TOP, 0);
> -   WREG32_SOC15(MMHUB, 0, regMMMC_VM_AGP_BOT, 0x00FF);
> -
> -   /* Program the system aperture low logical page number. */
> -   WREG32_SOC15(MMHUB, 0, regMMMC_VM_SYSTEM_APERTURE_LOW_ADDR,
> -adev->gmc.vram_start >> 18);
> -   WREG32_SOC15(MMHUB, 0, regMMMC_VM_SYSTEM_APERTURE_HIGH_ADDR,
> -adev->gmc.vram_end >> 18);
> -   }
> +   if (amdgpu_sriov_vf(adev))
> +   return;
> +
> +   /*
> +* the new L1 policy will block SRIOV guest from writing
> +* these regs, and they will be programed at host.
> +* so skip programing these regs.
> +*/
> +   /* Disable AGP. */
> +   WREG32_SOC15(MMHUB, 0, regMMMC_VM_AGP_BASE, 0);
> +   WREG32_SOC15(MMHUB, 0, regMMMC_VM_AGP_TOP, 0);
> +   WREG32_SOC15(MMHUB, 0, regMMMC_VM_AGP_BOT, 0x00FF);
> +   /* Program the system aperture low logical page number. */
> +   WREG32_SOC15(MMHUB, 0, regMMMC_VM_SYSTEM_APERTURE_LOW_ADDR,
> +adev->gmc.vram_start >> 18);
> +   WREG32_SOC15(MMHUB, 0, regMMMC_VM_SYSTEM_APERTURE_HIGH_ADDR,
> +adev->gmc.vram_end >> 18);
>
> /* Set default page address. */
> value = adev->mem_scratch.gpu_addr - adev->gmc.vram_start +
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> index bf1fa5e8d2f9..6fe292a2486b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> @@ -1403,10 +1403,12 @@ static int sdma_v6_0_set_trap_irq_state(struct 
> amdgpu_device *adev,
>
> u32 reg_offset = sdma_v6_0_get_reg_offset(adev, type, regSDMA0_CNTL);
>
> -   sdma_cntl = RREG32(reg_offset);
> -   sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA0_CNTL, TRAP_ENABLE,
> -  state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
> -   WREG32(reg_offset, sdma_cntl);
> +   if (!amdgpu_sriov_vf(adev)) {
> +   sdma_cntl = RREG32(reg_offset);
> +   sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA0_CNTL, TRAP_ENABLE,
> +   state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0);
> +   WREG32(reg_offset, sdma_cntl);
> +   }
>
> return 0;
>  }
> --
> 2.25.1
>


Re: [PATCH] drm/amd/display: Conversion to bool not necessary

2023-01-12 Thread Alex Deucher
Applied.  Thanks!

Alex

On Thu, Jan 12, 2023 at 8:51 AM Deepak R Varma  wrote:
>
> A logical evaluation already results in bool. There is no need for a
> ternary-operator-based evaluation and a bool conversion of the outcome.
> The issue was identified using the boolconv.cocci Coccinelle semantic
> patch and was also reported by the Kernel Test Robot; hence the Fixes
> tag.
> Fixes: 473683a03495 ("drm/amd/display: Create a file dedicated for CRTC")
>
> Reported-by: kernel test robot 
> Signed-off-by: Deepak R Varma 
> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
> index 22125daf9dcf..1e39d0939700 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
> @@ -105,8 +105,7 @@ static void vblank_control_worker(struct work_struct 
> *work)
> else if (dm->active_vblank_irq_count)
> dm->active_vblank_irq_count--;
>
> -   dc_allow_idle_optimizations(
> -   dm->dc, dm->active_vblank_irq_count == 0 ? true : false);
> +   dc_allow_idle_optimizations(dm->dc, dm->active_vblank_irq_count == 0);
>
> DRM_DEBUG_KMS("Allow idle optimizations (MALL): %d\n", 
> dm->active_vblank_irq_count == 0);
>
> --
> 2.34.1
>
>
>


Re: [PATCH] drm/amd/display: Remove useless else if

2023-01-12 Thread Alex Deucher
Applied.  Thanks!

On Wed, Jan 11, 2023 at 10:21 PM Jiapeng Chong
 wrote:
>
> The assignments in the if and else branches are the same, so the if/else
> here is redundant; remove it.
>
> ./drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c:1951:2-4: WARNING: 
> possible condition with no effect (if == else).
>
> Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3719
> Reported-by: Abaci Robot 
> Signed-off-by: Jiapeng Chong 
> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 4300ce98ce8d..aa3024e58d12 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -1948,10 +1948,7 @@ static int dm_dmub_sw_init(struct amdgpu_device *adev)
> dmub_asic = DMUB_ASIC_DCN21;
> break;
> case IP_VERSION(3, 0, 0):
> -   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 0))
> -   dmub_asic = DMUB_ASIC_DCN30;
> -   else
> -   dmub_asic = DMUB_ASIC_DCN30;
> +   dmub_asic = DMUB_ASIC_DCN30;
> break;
> case IP_VERSION(3, 0, 1):
> dmub_asic = DMUB_ASIC_DCN301;
> --
> 2.20.1.7.g153144c
>


Re: [PATCH] Revert "drm/display/dp_mst: Move all payload info into the atomic state"

2023-01-12 Thread Limonciello, Mario

On 1/12/2023 02:50, Wayne Lin wrote:

This reverts commit 4d07b0bc403403438d9cf88450506240c5faf92f.

[Why]
Changes cause regression on amdgpu mst.
E.g.
In fill_dc_mst_payload_table_from_drm(), amdgpu expects to add/remove
payloads one by one and calls fill_dc_mst_payload_table_from_drm() to
update the HW maintained payload table. But the previous change goes
through all the payloads in mst_state and updates the amdgpu HW maintained
table at once, every time the driver only tries to add/remove a single
payload stream. The new design conflicts with the current implementation
in amdgpu.

[How]
Revert this patch first. After addressing all regression problems caused by
this patch, we will add it back and adjust it.

Signed-off-by: Wayne Lin 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2171
Cc: sta...@vger.kernel.org # 6.1
Cc: Lyude Paul 
Cc: Harry Wentland 
Cc: Mario Limonciello 
Cc: Ville Syrjälä 
Cc: Ben Skeggs 
Cc: Stanislav Lisovskiy 
Cc: Fangzhi Zuo 
---
  .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  53 +-
  .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 106 ++-
  .../display/amdgpu_dm/amdgpu_dm_mst_types.c   |  87 ++-
  .../amd/display/include/link_service_types.h  |   3 -
  drivers/gpu/drm/display/drm_dp_mst_topology.c | 724 --
  drivers/gpu/drm/i915/display/intel_dp_mst.c   |  67 +-
  drivers/gpu/drm/i915/display/intel_hdcp.c |  24 +-
  drivers/gpu/drm/nouveau/dispnv50/disp.c   | 167 ++--
  include/drm/display/drm_dp_mst_helper.h   | 177 +++--
  9 files changed, 878 insertions(+), 530 deletions(-)


Hi Wayne,

What branch is this intended to apply against?  I shared that it existed 
to reporters in #2171 and they said they couldn't apply it against 
drm-next (03a0a1040), v6.2-rc3 or v6.1.5.


I guess it's unclear to me where this is supposed to land first.
Should we be reverting in drm-fixes, drm-next, or directly in 6.2-rc?

Thanks,



diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 77277d90b6e2..674f5dc1102b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6548,7 +6548,6 @@ static int dm_encoder_helper_atomic_check(struct 
drm_encoder *encoder,
const struct drm_display_mode *adjusted_mode = 
&crtc_state->adjusted_mode;
struct drm_dp_mst_topology_mgr *mst_mgr;
struct drm_dp_mst_port *mst_port;
-   struct drm_dp_mst_topology_state *mst_state;
enum dc_color_depth color_depth;
int clock, bpp = 0;
bool is_y420 = false;
@@ -6562,13 +6561,6 @@ static int dm_encoder_helper_atomic_check(struct 
drm_encoder *encoder,
if (!crtc_state->connectors_changed && !crtc_state->mode_changed)
return 0;
  
-	mst_state = drm_atomic_get_mst_topology_state(state, mst_mgr);

-   if (IS_ERR(mst_state))
-   return PTR_ERR(mst_state);
-
-   if (!mst_state->pbn_div)
-   mst_state->pbn_div = 
dm_mst_get_pbn_divider(aconnector->mst_port->dc_link);
-
if (!state->duplicated) {
int max_bpc = conn_state->max_requested_bpc;
is_y420 = drm_mode_is_420_also(&connector->display_info, adjusted_mode) 
&&
@@ -6580,10 +6572,11 @@ static int dm_encoder_helper_atomic_check(struct 
drm_encoder *encoder,
clock = adjusted_mode->clock;
dm_new_connector_state->pbn = drm_dp_calc_pbn_mode(clock, bpp, 
false);
}
-
-   dm_new_connector_state->vcpi_slots =
-   drm_dp_atomic_find_time_slots(state, mst_mgr, mst_port,
- dm_new_connector_state->pbn);
+   dm_new_connector_state->vcpi_slots = 
drm_dp_atomic_find_time_slots(state,
+  
mst_mgr,
+  
mst_port,
+  
dm_new_connector_state->pbn,
+  
dm_mst_get_pbn_divider(aconnector->dc_link));
if (dm_new_connector_state->vcpi_slots < 0) {
DRM_DEBUG_ATOMIC("failed finding vcpi slots: %d\n", 
(int)dm_new_connector_state->vcpi_slots);
return dm_new_connector_state->vcpi_slots;
@@ -6654,14 +6647,17 @@ static int dm_update_mst_vcpi_slots_for_dsc(struct 
drm_atomic_state *state,
dm_conn_state->vcpi_slots = slot_num;
  
  			ret = drm_dp_mst_atomic_enable_dsc(state, aconnector->port,

-  dm_conn_state->pbn, 
false);
+  dm_conn_state->pbn, 
0, false);
if (ret < 0)
return ret;
  
  			continue;

}
  
-		vcpi = drm_dp_mst_atomic_enable_dsc(state, aco

Re: [PATCH 6.0 108/148] drm/amdgpu: Fix size validation for non-exclusive domains (v4)

2023-01-12 Thread Greg Kroah-Hartman
On Thu, Jan 12, 2023 at 11:59:06AM -0500, Luben Tuikov wrote:
> On 2023-01-12 11:49, Greg Kroah-Hartman wrote:
> > On Thu, Jan 12, 2023 at 11:25:08AM -0500, Luben Tuikov wrote:
> >> Hi Greg,
> >>
> >> The patch in the link is a Fixes patch of the quoted patch, and should 
> >> also go in:
> >>
> >> https://lore.kernel.org/all/20230104221935.113400-1-luben.tui...@amd.com/
> > 
> > Is that in Linus's tree already?  if so, what is the git commit id?
> 
> I just checked, and not yet. Just wanted to give a heads up.
> 
> It does have a Fixes tag, and I hope it will be picked up automatically
> when it lands in Linus' tree.

That does not always happen if it does not have a "cc: stable@..." tag.
So when it does land in Linus's tree, please let us know the id so we
are sure to pick it up.

thanks,

greg k-h


[PATCH] drm/amdgpu: print bo inode number instead of ptr

2023-01-12 Thread Pierre-Eric Pelloux-Prayer
This allows to correlate the infos printed by
/sys/kernel/debug/dri/n/amdgpu_gem_info to the ones found
in /proc/.../fdinfo and /sys/kernel/debug/dma_buf/bufinfo.
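
For reference, userspace can recover the matching inode number from a
dma-buf fd with a plain fstat(), which is what makes the correlation
work; a minimal sketch:

#include <stdio.h>
#include <sys/stat.h>

/* Prints the same inode number that amdgpu_gem_info now shows as
 * "ino:%lu" and that /sys/kernel/debug/dma_buf/bufinfo lists.
 */
static void print_dmabuf_ino(int dmabuf_fd)
{
	struct stat st;

	if (fstat(dmabuf_fd, &st) == 0)
		printf("dma-buf ino:%lu\n", (unsigned long)st.st_ino);
}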

Signed-off-by: Pierre-Eric Pelloux-Prayer 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 90eb07106609..2b076ed46e78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1572,9 +1572,9 @@ u64 amdgpu_bo_print_info(int id, struct amdgpu_bo *bo, struct seq_file *m)
attachment = READ_ONCE(bo->tbo.base.import_attach);
 
if (attachment)
-   seq_printf(m, " imported from %p", dma_buf);
+   seq_printf(m, " imported from ino:%lu", 
file_inode(dma_buf->file)->i_ino);
else if (dma_buf)
-   seq_printf(m, " exported as %p", dma_buf);
+   seq_printf(m, " exported as ino:%lu", 
file_inode(dma_buf->file)->i_ino);
 
amdgpu_bo_print_flag(m, bo, CPU_ACCESS_REQUIRED);
amdgpu_bo_print_flag(m, bo, NO_CPU_ACCESS);
-- 
2.39.0



Re: [PATCH 6.0 108/148] drm/amdgpu: Fix size validation for non-exclusive domains (v4)

2023-01-12 Thread Luben Tuikov
On 2023-01-12 11:49, Greg Kroah-Hartman wrote:
> On Thu, Jan 12, 2023 at 11:25:08AM -0500, Luben Tuikov wrote:
>> Hi Greg,
>>
>> The patch in the link is a Fixes patch of the quoted patch, and should also 
>> go in:
>>
>> https://lore.kernel.org/all/20230104221935.113400-1-luben.tui...@amd.com/
> 
> Is that in Linus's tree already?  if so, what is the git commit id?

I just checked, and not yet. Just wanted to give a heads up.

It does have a Fixes tag, and I hope it will be picked up automatically
when it lands in Linus' tree.
-- 
Regards,
Luben


Re: [PATCH 1/2] drm/amdgpu/smu12: fix power reporting on CZN/BCL

2023-01-12 Thread Alex Deucher
On Thu, Jan 12, 2023 at 11:25 AM Alex Deucher  wrote:
>
> The metrics interface exposes the socket power in W, but
> apparently RN systems exposed the power as mW.  See
> commit 137aac26a2ed ("drm/amdgpu/smu12: fix power reporting on renoir").
> So leave RN as mW and use W for CZN/BCL.

Just saw that Jesse is working on a proper patch that fixes this for
real based on the firmware versions.  So let's take that instead.

Alex

>
> Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2321
> Fixes: 137aac26a2ed ("drm/amdgpu/smu12: fix power reporting on renoir")
> Cc: jesse.zh...@amd.com
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
> index 85e22210963f..77308d481c54 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
> @@ -1196,7 +1196,10 @@ static int renoir_get_smu_metrics_data(struct 
> smu_context *smu,
> *value = metrics->AverageUvdActivity / 100;
> break;
> case METRICS_AVERAGE_SOCKETPOWER:
> -   *value = (metrics->CurrentSocketPower << 8) / 1000;
> +   if (smu->adev->ip_versions[MP1_HWIP][0] == IP_VERSION(12, 0, 
> 0))
> +   *value = (metrics->CurrentSocketPower << 8) / 1000; 
> /* mW */
> +   else
> +   *value = metrics->CurrentSocketPower << 8; /* W */
> break;
> case METRICS_TEMPERATURE_EDGE:
> *value = (metrics->GfxTemperature / 100) *
> --
> 2.39.0
>


Re: [PATCH 6.0 108/148] drm/amdgpu: Fix size validation for non-exclusive domains (v4)

2023-01-12 Thread Greg Kroah-Hartman
On Thu, Jan 12, 2023 at 11:25:08AM -0500, Luben Tuikov wrote:
> Hi Greg,
> 
> The patch in the link is a Fixes patch of the quoted patch, and should also 
> go in:
> 
> https://lore.kernel.org/all/20230104221935.113400-1-luben.tui...@amd.com/

Is that in Linus's tree already?  if so, what is the git commit id?

thanks,

greg k-h


Re: [PATCH 5.10 1/1] drm/amdkfd: Check for null pointer after calling kmemdup

2023-01-12 Thread Greg KH
On Thu, Jan 12, 2023 at 04:26:45PM +0100, Daniel Vetter wrote:
> On Thu, 12 Jan 2023 at 13:47, Greg KH  wrote:
> > On Wed, Jan 04, 2023 at 07:56:33PM +0200, Dragos-Marian Panait wrote:
> > > From: Jiasheng Jiang 
> > >
> > > [ Upstream commit abfaf0eee97925905e742aa3b0b72e04a918fa9e ]
> > >
> > > As the possible failure of the allocation, kmemdup() may return NULL
> > > pointer.
> > > Therefore, it should be better to check the 'props2' in order to prevent
> > > the dereference of NULL pointer.
> > >
> > > Fixes: 3a87177eb141 ("drm/amdkfd: Add topology support for dGPUs")
> > > Signed-off-by: Jiasheng Jiang 
> > > Reviewed-by: Felix Kuehling 
> > > Signed-off-by: Felix Kuehling 
> > > Signed-off-by: Alex Deucher 
> > > Signed-off-by: Dragos-Marian Panait 
> > > ---
> > >  drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 3 +++
> > >  1 file changed, 3 insertions(+)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
> > > b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > > index 86b4dadf772e..02e3c650ed1c 100644
> > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > > @@ -408,6 +408,9 @@ static int kfd_parse_subtype_iolink(struct 
> > > crat_subtype_iolink *iolink,
> > >   return -ENODEV;
> > >   /* same everything but the other direction */
> > >   props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
> > > + if (!props2)
> > > + return -ENOMEM;
> >
> > Not going to queue this up as this is a bogus CVE.
> 
> Are we at the point where CVE presence actually contraindicates
> backporting?

Some would say that that point passed a long time ago :)

> At least I'm getting a bit of a feeling there's a surge of
> automated (security) fixes that just don't hold up to any scrutiny.

That has been happening a lot more in the past 6-8 months than in years
past with the introduction of more automated tools being present.

> Last week I had to toss out an fbdev locking patch due to static
> checker that has no clue at all how refcounting works, and so
> complained that things need more locking ... (that was -fixes, but
> would probably have gone to stable too if I didn't catch it).
> 
> Simple bugfixes from random people were nice when it was checkpatch
> stuff and I was fairly happy to take these aggressively in drm. But my
> gut feeling says things seem to be shifting towards more advanced
> tooling, but without more advanced understanding by submitters. Does
> that holder in other areas too?

Again, yes, I have seen that a lot recently, especially with regards to
patches that purport to fix bugs yet obviously were never tested.

That being said, there are a few developers who are doing great things
with fault-injection testing and providing good patches for that.  So we
can't just say that everyone using these tools has no clue.

thanks,

greg k-h


Re: [PATCH 1/2] drm/amdgpu: return the PCIe gen and lanes from the INFO

2023-01-12 Thread Alex Deucher
On Thu, Jan 12, 2023 at 6:50 AM Christian König
 wrote:
>
> Am 11.01.23 um 21:48 schrieb Alex Deucher:
> > On Wed, Jan 4, 2023 at 3:17 PM Marek Olšák  wrote:
> >> Yes, it's meant to be like a spec sheet. We are not interested in the 
> >> current bandwidth utilization.
> > After chatting with Marek on IRC and thinking about this more, I think
> > this patch is fine.  It's not really meant for bandwidth per se, but
> > rather as a limit to determine what the driver should do in certain
> > cases (i.e., when does it make sense to copy to vram vs not).  It's
> > not straightforward for userspace to parse the full topology to
> > determine what links may be slow.  I guess one potential pitfall would
> > be that if you pass the device into a VM, the driver may report the
> > wrong values.  Generally in a VM the VM doesn't get the full view up
> > to the root port.  I don't know if the hypervisors report properly for
> > pcie_bandwidth_available() in a VM or if it just shows the info about
> > the endpoint in the VM.
>
> So this basically doesn't return the gen and lanes of the device, but
> rather what was negotiated between the device and the upstream root port?

Correct. It exposes the max gen and lanes of the slowest link between
the device and the root port.
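
For reference, the 4.0 GB/s figure quoted below follows directly from
those two values; a minimal sketch of the arithmetic (encoding factors
per the PCIe spec, the helper itself is illustrative):

/* Per-lane throughput in MB/s after encoding overhead:
 * gen1/2 use 8b/10b, gen3 and later use 128b/130b.
 */
static unsigned int pcie_mbps_per_lane(unsigned int gen)
{
	switch (gen) {
	case 1: return 250;	/*  2.5 GT/s * 8/10    */
	case 2: return 500;	/*  5.0 GT/s * 8/10    */
	case 3: return 985;	/*  8.0 GT/s * 128/130 */
	case 4: return 1969;	/* 16.0 GT/s * 128/130 */
	default: return 0;
	}
}

/* pcie_gen = 1, pcie_num_lanes = 16 -> 250 * 16 = 4000 MB/s, i.e. the
 * 4.0 GB/s theoretical figure the 3.5 GB/s DMA test was compared to.
 */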

>
> If I got that correctly then we should probably document that cause
> otherwise somebody will try to "fix" it at some time.

Good point.

Alex

>
> Christian.
>
> >
> > Reviewed-by: Alex Deucher 
> >
> > Alex
> >
> >> Marek
> >>
> >> On Wed, Jan 4, 2023 at 10:33 AM Lazar, Lijo  wrote:
> >>> [AMD Official Use Only - General]
> >>>
> >>>
> >>> To clarify, with DPM in place, the current bandwidth will be changing 
> >>> based on the load.
> >>>
> >>> If apps/umd already has a way to know the current bandwidth utilisation, 
> >>> then possible maximum also could be part of the same API. Otherwise, this 
> >>> only looks like duplicate information. We have the same information in 
> >>> sysfs DPM nodes.
> >>>
> >>> BTW, I don't know to what extent app/umd really makes use of this. Take 
> >>> that memory frequency as an example (I'm reading it as 16GHz). It only 
> >>> looks like a spec sheet.
> >>>
> >>> Thanks,
> >>> Lijo
> >>> 
> >>> From: Marek Olšák 
> >>> Sent: Wednesday, January 4, 2023 8:40:00 PM
> >>> To: Lazar, Lijo 
> >>> Cc: amd-gfx@lists.freedesktop.org 
> >>> Subject: Re: [PATCH 1/2] drm/amdgpu: return the PCIe gen and lanes from 
> >>> the INFO
> >>>
> >>> On Wed, Jan 4, 2023 at 9:19 AM Lazar, Lijo  wrote:
> >>>
> >>>
> >>>
> >>> On 1/4/2023 7:43 PM, Marek Olšák wrote:
>  On Wed, Jan 4, 2023 at 6:50 AM Lazar, Lijo   > wrote:
> 
> 
> 
>   On 1/4/2023 4:11 AM, Marek Olšák wrote:
>    > I see. Well, those sysfs files are not usable, and I don't think 
>  it
>    > would be important even if they were usable, but for 
>  completeness:
>    >
>    > The ioctl returns:
>    >  pcie_gen = 1
>    >  pcie_num_lanes = 16
>    >
>    > Theoretical bandwidth from those values: 4.0 GB/s
>    > My DMA test shows this write bandwidth: 3.5 GB/s
>    > It matches the expectation.
>    >
>    > Let's see the devices (there is only 1 GPU Navi21 in the system):
>    > $ lspci |egrep '(PCI|VGA).*Navi'
>    > 0a:00.0 PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Navi
>   10 XL
>    > Upstream Port of PCI Express Switch (rev c3)
>    > 0b:00.0 PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Navi
>   10 XL
>    > Downstream Port of PCI Express Switch
>    > 0c:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
>    > [AMD/ATI] Navi 21 [Radeon RX 6800/6800 XT / 6900 XT] (rev c3)
>    >
>    > Let's read sysfs:
>    >
>    > $ cat /sys/bus/pci/devices/:0a:00.0/current_link_width
>    > 16
>    > $ cat /sys/bus/pci/devices/:0b:00.0/current_link_width
>    > 16
>    > $ cat /sys/bus/pci/devices/:0c:00.0/current_link_width
>    > 16
>    > $ cat /sys/bus/pci/devices/:0a:00.0/current_link_speed
>    > 2.5 GT/s PCIe
>    > $ cat /sys/bus/pci/devices/:0b:00.0/current_link_speed
>    > 16.0 GT/s PCIe
>    > $ cat /sys/bus/pci/devices/:0c:00.0/current_link_speed
>    > 16.0 GT/s PCIe
>    >
>    > Problem 1: None of the speed numbers match 4 GB/s.
> 
>   US bridge = 2.5GT/s means operating at PCIe Gen 1 speed. Total
>   theoretical bandwidth is then derived based on encoding and total
>   number
>   of lanes.
> 
>    > Problem 2: Userspace doesn't know the bus index of the bridges,
>   and it's
>    > not clear which bridge should be used.
> 
>   In general

Re: [PATCH 6.0 108/148] drm/amdgpu: Fix size validation for non-exclusive domains (v4)

2023-01-12 Thread Luben Tuikov
Hi Greg,

The patch in the link is a Fixes patch of the quoted patch, and should also go 
in:

https://lore.kernel.org/all/20230104221935.113400-1-luben.tui...@amd.com/

Regards,
Luben

On 2023-01-10 13:03, Greg Kroah-Hartman wrote:
> From: Luben Tuikov 
> 
> [ Upstream commit 7554886daa31eacc8e7fac9e15bbce67d10b8f1f ]
> 
> Fix amdgpu_bo_validate_size() to check whether the TTM domain manager for the
> requested memory exists, else we get a kernel oops when dereferencing "man".
> 
> v2: Make the patch standalone, i.e. not dependent on local patches.
> v3: Preserve old behaviour and just check that the manager pointer is not
> NULL.
> v4: Complain if GTT domain requested and it is uninitialized--most likely a
> bug.
> 
> Cc: Alex Deucher 
> Cc: Christian König 
> Cc: AMD Graphics 
> Signed-off-by: Luben Tuikov 
> Reviewed-by: Christian König 
> Signed-off-by: Alex Deucher 
> Signed-off-by: Sasha Levin 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index bfe0fc258fc1..60ab2d952d5c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -446,27 +446,24 @@ static bool amdgpu_bo_validate_size(struct 
> amdgpu_device *adev,
>  
>   /*
>* If GTT is part of requested domains the check must succeed to
> -  * allow fall back to GTT
> +  * allow fall back to GTT.
>*/
>   if (domain & AMDGPU_GEM_DOMAIN_GTT) {
>   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>  
> - if (size < man->size)
> + if (man && size < man->size)
>   return true;
> - else
> - goto fail;
> - }
> -
> - if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
> + else if (!man)
> + WARN_ON_ONCE("GTT domain requested but GTT mem manager 
> uninitialized");
> + goto fail;
> + } else if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
>   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
>  
> - if (size < man->size)
> + if (man && size < man->size)
>   return true;
> - else
> - goto fail;
> + goto fail;
>   }
>  
> -
>   /* TODO add more domains checks, such as AMDGPU_GEM_DOMAIN_CPU */
>   return true;
>  



[PATCH 1/2] drm/amdgpu/smu12: fix power reporting on CZN/BCL

2023-01-12 Thread Alex Deucher
The metrics interface exposes the socket power in W, but
apparently RN systems exposed the power as mW.  See
commit 137aac26a2ed ("drm/amdgpu/smu12: fix power reporting on renoir").
So leave RN as mW and use W for CZN/BCL.
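
For reference, the shifted value is effectively Q24.8 fixed point (low
8 bits fractional, value / 256 = Watts); a worked example with
illustrative numbers:

/* RN firmware reports mW:  15000 mW -> (15000 << 8) / 1000 = 3840,
 * i.e. 3840 / 256 = 15.0 W in Q24.8.
 * CZN/BCL firmware reports W:  15 W -> 15 << 8 = 3840, the same Q24.8
 * value, so consumers of the metric see consistent units either way.
 */
unsigned int rn_q8  = (15000u << 8) / 1000;	/* 3840 */
unsigned int czn_q8 = 15u << 8;			/* 3840 */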

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2321
Fixes: 137aac26a2ed ("drm/amdgpu/smu12: fix power reporting on renoir")
Cc: jesse.zh...@amd.com
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
index 85e22210963f..77308d481c54 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c
@@ -1196,7 +1196,10 @@ static int renoir_get_smu_metrics_data(struct smu_context *smu,
*value = metrics->AverageUvdActivity / 100;
break;
case METRICS_AVERAGE_SOCKETPOWER:
-   *value = (metrics->CurrentSocketPower << 8) / 1000;
+   if (smu->adev->ip_versions[MP1_HWIP][0] == IP_VERSION(12, 0, 0))
+   *value = (metrics->CurrentSocketPower << 8) / 1000; /* mW */
+   else
+   *value = metrics->CurrentSocketPower << 8; /* W */
break;
case METRICS_TEMPERATURE_EDGE:
*value = (metrics->GfxTemperature / 100) *
-- 
2.39.0



[PATCH 2/2] drm/amdgpu/pm: update hwmon power documentation

2023-01-12 Thread Alex Deucher
Power reporting is socket power.  On APUs this includes
the CPU.  Update the documentation to clarify this.
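
For reference, a minimal sketch of consuming this interface from
userspace; the hwmon index varies per system, so the path below is an
assumption:

#include <stdio.h>

int main(void)
{
	unsigned long uw;	/* power1_average is in microWatts */
	FILE *f = fopen("/sys/class/hwmon/hwmon0/power1_average", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu", &uw) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);
	/* On APUs this is the whole SoC, CPU included. */
	printf("socket power: %lu.%06lu W\n", uw / 1000000, uw % 1000000);
	return 0;
}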

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 236657eece47..76b9ec64ca50 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -3059,7 +3059,7 @@ static ssize_t amdgpu_hwmon_show_mclk_label(struct device *dev,
  *
  * hwmon interfaces for GPU power:
  *
- * - power1_average: average power used by the GPU in microWatts
+ * - power1_average: average power used by the SoC in microWatts.  On APUs this includes the CPU.
  *
  * - power1_cap_min: minimum cap supported in microWatts
  *
-- 
2.39.0



Re: [PATCH AUTOSEL 6.1 5/7] drm/amdgpu: Fix size validation for non-exclusive domains (v4)

2023-01-12 Thread Luben Tuikov
Hi Sasha,

The patch in the link is a Fixes patch of the quoted patch, and should also go 
in:

https://lore.kernel.org/all/20230104221935.113400-1-luben.tui...@amd.com/

Regards,
Luben

On 2022-12-31 15:04, Sasha Levin wrote:
> From: Luben Tuikov 
> 
> [ Upstream commit 7554886daa31eacc8e7fac9e15bbce67d10b8f1f ]
> 
> Fix amdgpu_bo_validate_size() to check whether the TTM domain manager for the
> requested memory exists, else we get a kernel oops when dereferencing "man".
> 
> v2: Make the patch standalone, i.e. not dependent on local patches.
> v3: Preserve old behaviour and just check that the manager pointer is not
> NULL.
> v4: Complain if GTT domain requested and it is uninitialized--most likely a
> bug.
> 
> Cc: Alex Deucher 
> Cc: Christian König 
> Cc: AMD Graphics 
> Signed-off-by: Luben Tuikov 
> Reviewed-by: Christian König 
> Signed-off-by: Alex Deucher 
> Signed-off-by: Sasha Levin 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> index 2e8f6cd7a729..33e266433e9b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> @@ -446,27 +446,24 @@ static bool amdgpu_bo_validate_size(struct 
> amdgpu_device *adev,
>  
>   /*
>* If GTT is part of requested domains the check must succeed to
> -  * allow fall back to GTT
> +  * allow fall back to GTT.
>*/
>   if (domain & AMDGPU_GEM_DOMAIN_GTT) {
>   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>  
> - if (size < man->size)
> + if (man && size < man->size)
>   return true;
> - else
> - goto fail;
> - }
> -
> - if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
> + else if (!man)
> + WARN_ON_ONCE("GTT domain requested but GTT mem manager 
> uninitialized");
> + goto fail;
> + } else if (domain & AMDGPU_GEM_DOMAIN_VRAM) {
>   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
>  
> - if (size < man->size)
> + if (man && size < man->size)
>   return true;
> - else
> - goto fail;
> + goto fail;
>   }
>  
> -
>   /* TODO add more domains checks, such as AMDGPU_GEM_DOMAIN_CPU */
>   return true;
>  



Re: [PATCH] drm/amd: Avoid ASSERT for some message failures

2023-01-12 Thread Harry Wentland
On 1/11/23 16:52, Mario Limonciello wrote:
> On DCN314 when resuming from s0i3 an ASSERT is shown indicating that
> `VBIOSSMC_MSG_SetHardMinDcfclkByFreq` returned `VBIOSSMC_Result_Failed`.
> 
> This isn't a driver bug; it's a BIOS/configuration bug. To make this
> easier to triage, add an explicit warning when this issue happens.
> 
> This matches the behavior utilized for failures with
> `VBIOSSMC_MSG_TransferTableDram2Smu` configuration.
> 
> Signed-off-by: Mario Limonciello 

Reviewed-by: Harry Wentland 

Harry

> ---
>  drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c
> index f47cfe6b42bd2..0765334f08259 100644
> --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c
> +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn314/dcn314_smu.c
> @@ -146,6 +146,9 @@ static int dcn314_smu_send_msg_with_param(struct clk_mgr_internal *clk_mgr,
>   if (msg_id == VBIOSSMC_MSG_TransferTableDram2Smu &&
>   param == TABLE_WATERMARKS)
>   DC_LOG_WARNING("Watermarks table not configured 
> properly by SMU");
> + else if (msg_id == VBIOSSMC_MSG_SetHardMinDcfclkByFreq ||
> +  msg_id == VBIOSSMC_MSG_SetMinDeepSleepDcfclk)
> + DC_LOG_WARNING("DCFCLK_DPM is not enabled by BIOS");
>   else
>   ASSERT(0);
>   REG_WRITE(MP1_SMN_C2PMSG_91, VBIOSSMC_Result_OK);



Re: [PATCH] drm/amd/display: Fix set scaling doesn't work

2023-01-12 Thread Rodrigo Siqueira Jordao




On 1/11/23 10:19, Harry Wentland wrote:

On 1/10/23 10:58, Rodrigo Siqueira Jordao wrote:



On 11/22/22 06:20, hongao wrote:

[Why]
Setting scaling does not correctly update the CRTC state. As a result,
the dc stream state's src (composition area) and dest (addressable area)
are not calculated as expected. This causes setting scaling to not work.

[How]
Correctly update CRTC state when setting scaling property.

Signed-off-by: hongao 

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 3e1ecca72430..a88a6f758748 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -9386,8 +9386,8 @@ static int amdgpu_dm_atomic_check(struct drm_device *dev,
 goto fail;
 }
 
-   if (dm_old_con_state->abm_level !=
-   dm_new_con_state->abm_level)
+   if (dm_old_con_state->abm_level != dm_new_con_state->abm_level ||
+   dm_old_con_state->scaling != dm_new_con_state->scaling)
 new_crtc_state->connectors_changed = true;
 }
   


Hi,

This change LGTM, and I also ran it in our CI; from an IGT perspective, we
are good.

Harry, do you have any comment about this change?



LGTM

Reviewed-by: Harry Wentland 

Harry


Thanks
Siqueira




Thanks, patch applied to amd-staging-drm-next.




Re: [PATCH] drm: amd: display: Fix memory leakage

2023-01-12 Thread Rodrigo Siqueira Jordao




On 11/29/22 21:50, Konstantin Meskhidze wrote:

This commit fixes a memory leak in the dc_construct_ctx() function.

Signed-off-by: Konstantin Meskhidze 
---
  drivers/gpu/drm/amd/display/dc/core/dc.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 997ab031f816..359e28d3567e 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -878,6 +878,7 @@ static bool dc_construct_ctx(struct dc *dc,
  
 	dc_ctx->perf_trace = dc_perf_trace_create();
 	if (!dc_ctx->perf_trace) {
+		kfree(dc_ctx);
 		ASSERT_CRITICAL(false);
 		return false;
 	}


Reviewed-by: Rodrigo Siqueira 

And applied to amd-staging-drm-next.

Thanks
Siqueira


Re: [PATCH] drm/amdgpu: fix amdgpu_job_free_resources

2023-01-12 Thread Christian König

Sorry about that, I only quickly hacked this together without testing.

Good to know that it solves the issue.

Thanks,
Christian.

Am 12.01.23 um 15:37 schrieb Thong Thai:


The patch solves the UVD issue.

By the way, I had to change one of the "->"'s to a "." to compile:

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c: In function 
‘amdgpu_job_free_resources’:
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c:159:61: error: invalid type argument of 
‘->’ (have ‘struct dma_fence’)
   159 | if (job->base.s_fence && job->base.s_fence->finished->ops)
   | ^~

Thanks,

Thong Thai

On 2023-01-12 09:05, Alex Deucher wrote:

On Thu, Jan 12, 2023 at 8:48 AM Christian König
  wrote:

It can be that neither fence was initialized when we run out of UVD
streams, for example.

Signed-off-by: Christian König

Bug:https://gitlab.freedesktop.org/drm/amd/-/issues/2324

Reviewed-by: Alex Deucher



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 9e549923622b..28929c6219a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -161,8 +161,14 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
 struct dma_fence *f;
 unsigned i;

-   /* use sched fence if available */
-   f = job->base.s_fence ? &job->base.s_fence->finished :  &job->hw_fence;
+   /* Check if any fences were initialized */
+   if (job->base.s_fence && job->base.s_fence->finished->ops)
+   f = &job->base.s_fence->finished;
+   else if (job->hw_fence.ops)
+   f = &job->hw_fence;
+   else
+   f = NULL;
+
 for (i = 0; i < job->num_ibs; ++i)
 amdgpu_ib_free(ring->adev, &job->ibs[i], f);
  }
--
2.25.1



Re: [PATCH 5.10 1/1] drm/amdkfd: Check for null pointer after calling kmemdup

2023-01-12 Thread Daniel Vetter
On Thu, 12 Jan 2023 at 13:47, Greg KH  wrote:
> On Wed, Jan 04, 2023 at 07:56:33PM +0200, Dragos-Marian Panait wrote:
> > From: Jiasheng Jiang 
> >
> > [ Upstream commit abfaf0eee97925905e742aa3b0b72e04a918fa9e ]
> >
> > As the possible failure of the allocation, kmemdup() may return NULL
> > pointer.
> > Therefore, it should be better to check the 'props2' in order to prevent
> > the dereference of NULL pointer.
> >
> > Fixes: 3a87177eb141 ("drm/amdkfd: Add topology support for dGPUs")
> > Signed-off-by: Jiasheng Jiang 
> > Reviewed-by: Felix Kuehling 
> > Signed-off-by: Felix Kuehling 
> > Signed-off-by: Alex Deucher 
> > Signed-off-by: Dragos-Marian Panait 
> > ---
> >  drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 3 +++
> >  1 file changed, 3 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
> > b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > index 86b4dadf772e..02e3c650ed1c 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> > @@ -408,6 +408,9 @@ static int kfd_parse_subtype_iolink(struct 
> > crat_subtype_iolink *iolink,
> >   return -ENODEV;
> >   /* same everything but the other direction */
> >   props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
> > + if (!props2)
> > + return -ENOMEM;
>
> Not going to queue this up as this is a bogus CVE.

Are we at the point where CVE presence actually contraindicates
backporting? At least I'm getting a bit of a feeling there's a surge of
automated (security) fixes that just don't hold up to any scrutiny.
Last week I had to toss out an fbdev locking patch due to static
checker that has no clue at all how refcounting works, and so
complained that things need more locking ... (that was -fixes, but
would probably have gone to stable too if I didn't catch it).

Simple bugfixes from random people were nice when it was checkpatch
stuff and I was fairly happy to take these aggressively in drm. But my
gut feeling says things seem to be shifting towards more advanced
tooling, but without more advanced understanding by submitters. Does
that hold in other areas too?
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: [PATCH] drm/amdgpu: fix amdgpu_job_free_resources

2023-01-12 Thread Thong Thai

The patch solves the UVD issue.

By the way, I had to change one of the "->"'s to a "." to compile:

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c: In function 
‘amdgpu_job_free_resources’:

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c:159:61: error: invalid type argument of 
‘->’ (have ‘struct dma_fence’)

  159 | if (job->base.s_fence && job->base.s_fence->finished->ops)

  | ^~
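
For anyone skimming: "finished" is a struct dma_fence embedded by value
in struct drm_sched_fence, not a pointer, hence the "."; roughly:

/* Abridged from include/drm/gpu_scheduler.h: */
struct drm_sched_fence {
	struct dma_fence	scheduled;
	struct dma_fence	finished;	/* embedded, not a pointer */
	/* ... */
};

/* so the check has to read: */
if (job->base.s_fence && job->base.s_fence->finished.ops)
	f = &job->base.s_fence->finished;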

Thanks,

Thong Thai

On 2023-01-12 09:05, Alex Deucher wrote:

On Thu, Jan 12, 2023 at 8:48 AM Christian König
  wrote:

It can be that neither fence was initialized when we run out of UVD
streams, for example.

Signed-off-by: Christian König

Bug:https://gitlab.freedesktop.org/drm/amd/-/issues/2324

Reviewed-by: Alex Deucher



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 9e549923622b..28929c6219a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -161,8 +161,14 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
 struct dma_fence *f;
 unsigned i;

-   /* use sched fence if available */
-   f = job->base.s_fence ? &job->base.s_fence->finished :  &job->hw_fence;
+   /* Check if any fences were initialized */
+   if (job->base.s_fence && job->base.s_fence->finished->ops)
+   f = &job->base.s_fence->finished;
+   else if (job->hw_fence.ops)
+   f = &job->hw_fence;
+   else
+   f = NULL;
+
 for (i = 0; i < job->num_ibs; ++i)
 amdgpu_ib_free(ring->adev, &job->ibs[i], f);
  }
--
2.25.1


[PATCH] drm/amd/display: Conversion to bool not necessary

2023-01-12 Thread Deepak R Varma
A logical comparison already evaluates to bool. There is no need for a
ternary-operator based evaluation and bool conversion of the outcome.
Issue identified using the boolconv.cocci Coccinelle semantic patch.
This was also reported by the Kernel Test Robot, hence the Reported-by tag.
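
For illustration, the flagged pattern and its simplification (hypothetical
helper, sketch only):

        static bool allow_idle(int active_irq_count)
        {
                return (active_irq_count == 0) ? true : false; /* flagged */
        }

        static bool allow_idle_simplified(int active_irq_count)
        {
                return active_irq_count == 0; /* equivalent */
        }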

Fixes: 473683a03495 ("drm/amd/display: Create a file dedicated for CRTC")

Reported-by: kernel test robot 
Signed-off-by: Deepak R Varma 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
index 22125daf9dcf..1e39d0939700 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c
@@ -105,8 +105,7 @@ static void vblank_control_worker(struct work_struct *work)
else if (dm->active_vblank_irq_count)
dm->active_vblank_irq_count--;
 
-   dc_allow_idle_optimizations(
-   dm->dc, dm->active_vblank_irq_count == 0 ? true : false);
+   dc_allow_idle_optimizations(dm->dc, dm->active_vblank_irq_count == 0);
 
DRM_DEBUG_KMS("Allow idle optimizations (MALL): %d\n", 
dm->active_vblank_irq_count == 0);
 
-- 
2.34.1


Re: [PATCH] drm/amdgpu: fix amdgpu_job_free_resources

2023-01-12 Thread Alex Deucher
On Thu, Jan 12, 2023 at 8:48 AM Christian König
 wrote:
>
> It can happen that neither fence was initialized, for example when we
> run out of UVD streams.
>
> Signed-off-by: Christian König 

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2324

Reviewed-by: Alex Deucher 


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 9e549923622b..28929c6219a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -161,8 +161,14 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
> struct dma_fence *f;
> unsigned i;
>
> -   /* use sched fence if available */
> -   f = job->base.s_fence ? &job->base.s_fence->finished :  
> &job->hw_fence;
> +   /* Check if any fences were initialized */
> +   if (job->base.s_fence && job->base.s_fence->finished->ops)
> +   f = &job->base.s_fence->finished;
> +   else if (job->hw_fence.ops)
> +   f = &job->hw_fence;
> +   else
> +   f = NULL;
> +
> for (i = 0; i < job->num_ibs; ++i)
> amdgpu_ib_free(ring->adev, &job->ibs[i], f);
>  }
> --
> 2.25.1
>


[PATCH] drm/amdgpu: fix amdgpu_job_free_resources

2023-01-12 Thread Christian König
It can happen that neither fence was initialized, for example when we
run out of UVD streams.

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 9e549923622b..28929c6219a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -161,8 +161,14 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
struct dma_fence *f;
unsigned i;
 
-   /* use sched fence if available */
-   f = job->base.s_fence ? &job->base.s_fence->finished :  &job->hw_fence;
+   /* Check if any fences were initialized */
+   if (job->base.s_fence && job->base.s_fence->finished->ops)
+   f = &job->base.s_fence->finished;
+   else if (job->hw_fence.ops)
+   f = &job->hw_fence;
+   else
+   f = NULL;
+
for (i = 0; i < job->num_ibs; ++i)
amdgpu_ib_free(ring->adev, &job->ibs[i], f);
 }
-- 
2.25.1



[PATCH] drm/amd/display: drop unnecessary NULL check in dce60_should_enable_fbc()

2023-01-12 Thread Alexey Kodanev
The pipe_ctx pointer cannot be NULL here: it is assigned the address of
an element of the pipe_ctx array, which is never NULL.
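
For illustration (sketch only, using the names from the code below):
taking the address of an array element involves no dereference, so the
result can never be NULL:

        /* &res_ctx->pipe_ctx[i] is just res_ctx->pipe_ctx + i;
         * nothing is dereferenced here, so pipe_ctx is never NULL */
        struct pipe_ctx *pipe_ctx = &res_ctx->pipe_ctx[i];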

Detected using the static analysis tool - Svace.
Signed-off-by: Alexey Kodanev 
---
 drivers/gpu/drm/amd/display/dc/dce60/dce60_hw_sequencer.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dce60/dce60_hw_sequencer.c 
b/drivers/gpu/drm/amd/display/dc/dce60/dce60_hw_sequencer.c
index 920c7ae29d53..f0c002a6a538 100644
--- a/drivers/gpu/drm/amd/display/dc/dce60/dce60_hw_sequencer.c
+++ b/drivers/gpu/drm/amd/display/dc/dce60/dce60_hw_sequencer.c
@@ -72,9 +72,6 @@ static bool dce60_should_enable_fbc(struct dc *dc,
 
pipe_ctx = &res_ctx->pipe_ctx[i];
 
-   if (!pipe_ctx)
-   continue;
-
/* fbc not applicable on underlay pipe */
if (pipe_ctx->pipe_idx != underlay_idx) {
*pipe_idx = i;
-- 
2.25.1



Re: [PATCH 5.10 1/1] drm/amdkfd: Check for null pointer after calling kmemdup

2023-01-12 Thread Greg KH
On Wed, Jan 04, 2023 at 07:56:33PM +0200, Dragos-Marian Panait wrote:
> From: Jiasheng Jiang 
> 
> [ Upstream commit abfaf0eee97925905e742aa3b0b72e04a918fa9e ]
> 
> Since the allocation can fail, kmemdup() may return a NULL
> pointer.
> Therefore, it should be better to check the 'props2' in order to prevent
> the dereference of a NULL pointer.
> 
> Fixes: 3a87177eb141 ("drm/amdkfd: Add topology support for dGPUs")
> Signed-off-by: Jiasheng Jiang 
> Reviewed-by: Felix Kuehling 
> Signed-off-by: Felix Kuehling 
> Signed-off-by: Alex Deucher 
> Signed-off-by: Dragos-Marian Panait 
> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 86b4dadf772e..02e3c650ed1c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -408,6 +408,9 @@ static int kfd_parse_subtype_iolink(struct 
> crat_subtype_iolink *iolink,
>   return -ENODEV;
>   /* same everything but the other direction */
>   props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
> + if (!props2)
> + return -ENOMEM;

Not going to queue this up as this is a bogus CVE.


Re: [PATCH] drm/amdgpu: grab extra fence reference for drm_sched_job_add_dependency

2023-01-12 Thread Christian König

On 10.01.23 at 19:21, Mikhail Gavrilov wrote:

On Mon, Jan 9, 2023 at 6:40 PM Christian König
 wrote:

That looks like an out-of-memory situation that is not handled gracefully.

In other words we have a missing NULL check in drm_sched_job_cleanup().

Going to take a look.

Very strange because it just reproduced again. Almost no memory leaked.

❯ free
               total        used        free      shared  buff/cache   available
Mem:        65589600    34060388     1520668     3033284    30008544    27767260
Swap:       75497464      994560    74502904


Mhm, our UVD guys reported similar problems when they open up too many
concurrent streams.


Most likely some random issue caused by one of the gang submit patches.

Could you try to reproduce this more reliably? If we can reproduce it
reliably, compiling the kernel with KASAN might help figure out where
exactly things go wrong.
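
For reference, generic KASAN is enabled with the following kernel config
options (plus a rebuild):

        CONFIG_KASAN=y
        CONFIG_KASAN_GENERIC=y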


Christian.

--
Best Regards,
Mike Gavrilov.




[PATCH v2] drm: Optimize drm buddy top-down allocation method

2023-01-12 Thread Arunpravin Paneer Selvam
We are observing a performance drop in many use cases, including games
and 3D benchmark applications. To solve this problem, we strictly do not
allow allocations with the top-down flag enabled to steal memory space
from the CPU-visible region.

The idea is to keep the entries of each order list sorted in ascending
offset order, compare the last entry of each order list in the freelist,
and return the max block.

This patch improves the 3D benchmark scores and solves fragmentation
issues.

All drm buddy selftests are verified:
drm_buddy: pass:6 fail:0 skip:0 total:6

Signed-off-by: Arunpravin Paneer Selvam 
Acked-by: Christian König 
Acked-by: Alex Deucher 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/drm_buddy.c | 81 -
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/drm_buddy.c b/drivers/gpu/drm/drm_buddy.c
index 11bb59399471..3d1f50f481cf 100644
--- a/drivers/gpu/drm/drm_buddy.c
+++ b/drivers/gpu/drm/drm_buddy.c
@@ -38,6 +38,25 @@ static void drm_block_free(struct drm_buddy *mm,
kmem_cache_free(slab_blocks, block);
 }
 
+static void list_insert_sorted(struct drm_buddy *mm,
+  struct drm_buddy_block *block)
+{
+   struct drm_buddy_block *node;
+   struct list_head *head;
+
+   head = &mm->free_list[drm_buddy_block_order(block)];
+   if (list_empty(head)) {
+   list_add(&block->link, head);
+   return;
+   }
+
+   list_for_each_entry(node, head, link)
+   if (drm_buddy_block_offset(block) < 
drm_buddy_block_offset(node))
+   break;
+
+   __list_add(&block->link, node->link.prev, &node->link);
+}
+
 static void mark_allocated(struct drm_buddy_block *block)
 {
block->header &= ~DRM_BUDDY_HEADER_STATE;
@@ -52,8 +71,7 @@ static void mark_free(struct drm_buddy *mm,
block->header &= ~DRM_BUDDY_HEADER_STATE;
block->header |= DRM_BUDDY_FREE;
 
-   list_add(&block->link,
-&mm->free_list[drm_buddy_block_order(block)]);
+   list_insert_sorted(mm, block);
 }
 
 static void mark_split(struct drm_buddy_block *block)
@@ -387,20 +405,26 @@ alloc_range_bias(struct drm_buddy *mm,
 }
 
 static struct drm_buddy_block *
-get_maxblock(struct list_head *head)
+get_maxblock(struct drm_buddy *mm, unsigned int order)
 {
struct drm_buddy_block *max_block = NULL, *node;
+   unsigned int i;
 
-   max_block = list_first_entry_or_null(head,
-struct drm_buddy_block,
-link);
-   if (!max_block)
-   return NULL;
+   for (i = order; i <= mm->max_order; ++i) {
+   if (!list_empty(&mm->free_list[i])) {
+   node = list_last_entry(&mm->free_list[i],
+  struct drm_buddy_block,
+  link);
+   if (!max_block) {
+   max_block = node;
+   continue;
+   }
 
-   list_for_each_entry(node, head, link) {
-   if (drm_buddy_block_offset(node) >
-   drm_buddy_block_offset(max_block))
-   max_block = node;
+   if (drm_buddy_block_offset(node) >
+   drm_buddy_block_offset(max_block)) {
+   max_block = node;
+   }
+   }
}
 
return max_block;
@@ -412,20 +436,23 @@ alloc_from_freelist(struct drm_buddy *mm,
unsigned long flags)
 {
struct drm_buddy_block *block = NULL;
-   unsigned int i;
+   unsigned int tmp;
int err;
 
-   for (i = order; i <= mm->max_order; ++i) {
-   if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) {
-   block = get_maxblock(&mm->free_list[i]);
-   if (block)
-   break;
-   } else {
-   block = list_first_entry_or_null(&mm->free_list[i],
-struct drm_buddy_block,
-link);
-   if (block)
-   break;
+   if (flags & DRM_BUDDY_TOPDOWN_ALLOCATION) {
+   block = get_maxblock(mm, order);
+   if (block)
+   /* Store the obtained block order */
+   tmp = drm_buddy_block_order(block);
+   } else {
+   for (tmp = order; tmp <= mm->max_order; ++tmp) {
+   if (!list_empty(&mm->free_list[tmp])) {
+   block = list_last_entry(&mm->free_list[tmp],
+   struct drm_buddy_block,
+   link);
+

Re: [PATCH 1/2] drm/amdgpu: return the PCIe gen and lanes from the INFO

2023-01-12 Thread Christian König

On 11.01.23 at 21:48, Alex Deucher wrote:

On Wed, Jan 4, 2023 at 3:17 PM Marek Olšák  wrote:

Yes, it's meant to be like a spec sheet. We are not interested in the current 
bandwidth utilization.

After chatting with Marek on IRC and thinking about this more, I think
this patch is fine.  It's not really meant for bandwidth per se, but
rather as a limit to determine what the driver should do in certain
cases (i.e., when does it make sense to copy to vram vs not).  It's
not straightforward for userspace to parse the full topology to
determine what links may be slow.  I guess one potential pitfall would
be that if you pass the device into a VM, the driver may report the
wrong values.  Generally in a VM the VM doesn't get the full view up
to the root port.  I don't know if the hypervisors report properly for
pcie_bandwidth_available() in a VM or if it just shows the info about
the endpoint in the VM.
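
For reference, a minimal sketch of querying the negotiated link via the
PCI core helper mentioned above (adev->pdev being the device's struct
pci_dev; the returned bandwidth in MB/s is ignored here):

        enum pci_bus_speed speed;
        enum pcie_link_width width;

        /* walks up from the endpoint and reports the limiting link */
        pcie_bandwidth_available(adev->pdev, NULL, &speed, &width);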


So this basically doesn't return the gen and lanes of the device, but 
rather what was negotiated between the device and the upstream root port?


If I got that correctly, then we should probably document it, because
otherwise somebody will try to "fix" it at some point.


Christian.



Reviewed-by: Alex Deucher 

Alex


Marek

On Wed, Jan 4, 2023 at 10:33 AM Lazar, Lijo  wrote:


To clarify, with DPM in place, the current bandwidth will be changing
based on the load.

If apps/UMDs already have a way to know the current bandwidth
utilisation, then the possible maximum could also be part of the same
API. Otherwise, this only looks like duplicate information; we have the
same information in the sysfs DPM nodes.

BTW, I don't know to what extent apps/UMDs really make use of this. Take
the memory frequency as an example (I'm reading it as 16GHz): it only
looks like a spec sheet.

Thanks,
Lijo

From: Marek Olšák 
Sent: Wednesday, January 4, 2023 8:40:00 PM
To: Lazar, Lijo 
Cc: amd-gfx@lists.freedesktop.org 
Subject: Re: [PATCH 1/2] drm/amdgpu: return the PCIe gen and lanes from the INFO

On Wed, Jan 4, 2023 at 9:19 AM Lazar, Lijo  wrote:



On 1/4/2023 7:43 PM, Marek Olšák wrote:

On Wed, Jan 4, 2023 at 6:50 AM Lazar, Lijo <lijo.la...@amd.com> wrote:



 On 1/4/2023 4:11 AM, Marek Olšák wrote:
  > I see. Well, those sysfs files are not usable, and I don't think it
  > would be important even if they were usable, but for completeness:
  >
  > The ioctl returns:
  >  pcie_gen = 1
  >  pcie_num_lanes = 16
  >
  > Theoretical bandwidth from those values: 4.0 GB/s
  > My DMA test shows this write bandwidth: 3.5 GB/s
  > It matches the expectation.
  >
  > Let's see the devices (there is only 1 GPU Navi21 in the system):
  > $ lspci |egrep '(PCI|VGA).*Navi'
  > 0a:00.0 PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Navi
 10 XL
  > Upstream Port of PCI Express Switch (rev c3)
  > 0b:00.0 PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Navi
 10 XL
  > Downstream Port of PCI Express Switch
  > 0c:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
  > [AMD/ATI] Navi 21 [Radeon RX 6800/6800 XT / 6900 XT] (rev c3)
  >
  > Let's read sysfs:
  >
  > $ cat /sys/bus/pci/devices/:0a:00.0/current_link_width
  > 16
  > $ cat /sys/bus/pci/devices/:0b:00.0/current_link_width
  > 16
  > $ cat /sys/bus/pci/devices/:0c:00.0/current_link_width
  > 16
  > $ cat /sys/bus/pci/devices/:0a:00.0/current_link_speed
  > 2.5 GT/s PCIe
  > $ cat /sys/bus/pci/devices/:0b:00.0/current_link_speed
  > 16.0 GT/s PCIe
  > $ cat /sys/bus/pci/devices/:0c:00.0/current_link_speed
  > 16.0 GT/s PCIe
  >
  > Problem 1: None of the speed numbers match 4 GB/s.

 US bridge = 2.5GT/s means operating at PCIe Gen 1 speed. Total
 theoretical bandwidth is then derived based on encoding and total
 number
 of lanes.

  > Problem 2: Userspace doesn't know the bus index of the bridges,
 and it's
  > not clear which bridge should be used.

 In general, modern ones have this arch= US->DS->EP. US is the one
 connected to physical link.

  > Problem 3: The PCIe gen number is missing.

 Current link speed is based on whether it's Gen1/2/3/4/5.

 BTW, your patch makes use of capabilities flags which gives the maximum
 supported speed/width by the device. It may not necessarily reflect the
 current speed/width negotiated. I guess in NV, this info is already
 obtained from PMFW and made available through metrics table.


It computes the minimum of the device PCIe gen and the motherboard/slot
PCIe gen to get the final value. These 2 lines do that. The low 16 bits
of the mask contain the device PCIe gen mask. The high 16 bits of the
mask contain the slot PCIe gen mask.
+ pcie_gen_mask = adev->pm.pcie_gen_mask & (adev->pm.pcie_gen_mask >> 16);
+ dev_info->p
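
For illustration, the trick in isolation (hypothetical standalone sketch;
the low 16 bits hold the gens the device supports, the high 16 bits the
gens the slot supports):

        /* keep only the gens supported by both the device and the slot */
        u32 common_gens = mask & (mask >> 16);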

Re: [PATCH 1/2] drm/amdgpu: return the PCIe gen and lanes from the INFO

2023-01-12 Thread Christian König

On 11.01.23 at 21:50, Alex Deucher wrote:

On Wed, Jan 11, 2023 at 3:48 PM Alex Deucher  wrote:

On Wed, Jan 4, 2023 at 3:17 PM Marek Olšák  wrote:

Yes, it's meant to be like a spec sheet. We are not interested in the current 
bandwidth utilization.

After chatting with Marek on IRC and thinking about this more, I think
this patch is fine.  It's not really meant for bandwidth per se, but
rather as a limit to determine what the driver should do in certain
cases (i.e., when does it make sense to copy to vram vs not).  It's
not straightforward for userspace to parse the full topology to
determine what links may be slow.  I guess one potential pitfall would
be that if you pass the device into a VM, the driver may report the
wrong values.  Generally in a VM the VM doesn't get the full view up
to the root port.  I don't know if the hypervisors report properly for
pcie_bandwidth_available() in a VM or if it just shows the info about
the endpoint in the VM.

Reviewed-by: Alex Deucher 

Actually:

diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index fe7f871e3080..f7fc7325f17f 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -1053,7 +1053,7 @@ struct drm_amdgpu_info_device {
  __u32 enabled_rb_pipes_mask;
  __u32 num_rb_pipes;
  __u32 num_hw_gfx_contexts;
-__u32 _pad;
+__u32 pcie_gen;
  __u64 ids_flags;
  /** Starting virtual address for UMDs. */
  __u64 virtual_address_offset;
@@ -1109,6 +1109,7 @@ struct drm_amdgpu_info_device {
  __u64 high_va_max;
  /* gfx10 pa_sc_tile_steering_override */
  __u32 pa_sc_tile_steering_override;
+__u32 pcie_num_lanes;
  /* disabled TCCs */
  __u64 tcc_disabled_mask;
  __u64 min_engine_clock;

Doesn't that last one need to be added to the end of the structure?


I think the structure has padding here because the next member is a u64
again.


But probably better to double check using pahole.
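
For illustration, the layout rule in question (hypothetical struct;
running pahole on the built object would confirm the offsets and holes):

        struct example {
                __u32 a;        /* offset 0 */
                __u32 b;        /* offset 4: fills the would-be 4-byte hole */
                __u64 c;        /* offset 8: u64 needs 8-byte alignment */
        };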

Christian.



Alex


Alex


Marek

On Wed, Jan 4, 2023 at 10:33 AM Lazar, Lijo  wrote:


To clarify, with DPM in place, the current bandwidth will be changing
based on the load.

If apps/UMDs already have a way to know the current bandwidth
utilisation, then the possible maximum could also be part of the same
API. Otherwise, this only looks like duplicate information; we have the
same information in the sysfs DPM nodes.

BTW, I don't know to what extent apps/UMDs really make use of this. Take
the memory frequency as an example (I'm reading it as 16GHz): it only
looks like a spec sheet.

Thanks,
Lijo

From: Marek Olšák 
Sent: Wednesday, January 4, 2023 8:40:00 PM
To: Lazar, Lijo 
Cc: amd-gfx@lists.freedesktop.org 
Subject: Re: [PATCH 1/2] drm/amdgpu: return the PCIe gen and lanes from the INFO

On Wed, Jan 4, 2023 at 9:19 AM Lazar, Lijo  wrote:



On 1/4/2023 7:43 PM, Marek Olšák wrote:

On Wed, Jan 4, 2023 at 6:50 AM Lazar, Lijo <lijo.la...@amd.com> wrote:



 On 1/4/2023 4:11 AM, Marek Olšák wrote:
  > I see. Well, those sysfs files are not usable, and I don't think it
  > would be important even if they were usable, but for completeness:
  >
  > The ioctl returns:
  >  pcie_gen = 1
  >  pcie_num_lanes = 16
  >
  > Theoretical bandwidth from those values: 4.0 GB/s
  > My DMA test shows this write bandwidth: 3.5 GB/s
  > It matches the expectation.
  >
  > Let's see the devices (there is only 1 GPU Navi21 in the system):
  > $ lspci |egrep '(PCI|VGA).*Navi'
  > 0a:00.0 PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Navi
 10 XL
  > Upstream Port of PCI Express Switch (rev c3)
  > 0b:00.0 PCI bridge: Advanced Micro Devices, Inc. [AMD/ATI] Navi
 10 XL
  > Downstream Port of PCI Express Switch
  > 0c:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
  > [AMD/ATI] Navi 21 [Radeon RX 6800/6800 XT / 6900 XT] (rev c3)
  >
  > Let's read sysfs:
  >
  > $ cat /sys/bus/pci/devices/:0a:00.0/current_link_width
  > 16
  > $ cat /sys/bus/pci/devices/:0b:00.0/current_link_width
  > 16
  > $ cat /sys/bus/pci/devices/:0c:00.0/current_link_width
  > 16
  > $ cat /sys/bus/pci/devices/:0a:00.0/current_link_speed
  > 2.5 GT/s PCIe
  > $ cat /sys/bus/pci/devices/:0b:00.0/current_link_speed
  > 16.0 GT/s PCIe
  > $ cat /sys/bus/pci/devices/:0c:00.0/current_link_speed
  > 16.0 GT/s PCIe
  >
  > Problem 1: None of the speed numbers match 4 GB/s.

 US bridge = 2.5GT/s means operating at PCIe Gen 1 speed. Total
 theoretical bandwidth is then derived based on encoding and total
 number
 of lanes.

  > Problem 2: Userspace doesn't know the bus index of the bridges,
 and it's
  > not clear which bridge should be used.

 In general, modern ones have this arch= US->DS->EP. US is the one
 connected 

Re: [PATCH] drm/amdgpu: fix pipeline sync v2

2023-01-12 Thread Luben Tuikov
Acked-by: Luben Tuikov 

Regards,
Luben

On 2023-01-09 08:01, Christian König wrote:
> This fixes a potential memory leak of dma_fence objects in the CS code
> as well as glitches in firefox because of missing pipeline sync.
> 
> v2: use the scheduler instead of the fence context for the test
> 
> Signed-off-by: Christian König 
> Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2323
> Fixes: 1b2d5eda5ad7 ("drm/amdgpu: move explicit sync check into the CS")
> Tested-by: Michal Kubecek 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 46 +-
>  1 file changed, 30 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 47763ac0d14a..7b5ce00f0602 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -61,6 +61,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
>   amdgpu_ctx_put(p->ctx);
>   return -ECANCELED;
>   }
> +
> + amdgpu_sync_create(&p->sync);
>   return 0;
>  }
>  
> @@ -452,18 +454,6 @@ static int amdgpu_syncobj_lookup_and_add(struct 
> amdgpu_cs_parser *p,
>   }
>  
>   r = amdgpu_sync_fence(&p->sync, fence);
> - if (r)
> - goto error;
> -
> - /*
> -  * When we have an explicit dependency it might be necessary to insert a
> -  * pipeline sync to make sure that all caches etc are flushed and the
> -  * next job actually sees the results from the previous one.
> -  */
> - if (fence->context == p->gang_leader->base.entity->fence_context)
> - r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence);
> -
> -error:
>   dma_fence_put(fence);
>   return r;
>  }
> @@ -1188,10 +1178,19 @@ static int amdgpu_cs_vm_handling(struct 
> amdgpu_cs_parser *p)
>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>  {
>   struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> + struct drm_gpu_scheduler *sched;
>   struct amdgpu_bo_list_entry *e;
> + struct dma_fence *fence;
>   unsigned int i;
>   int r;
>  
> + r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_leader_idx]);
> + if (r) {
> + if (r != -ERESTARTSYS)
> + DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> + return r;
> + }
> +
>   list_for_each_entry(e, &p->validated, tv.head) {
>   struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
>   struct dma_resv *resv = bo->tbo.base.resv;
> @@ -1211,10 +1210,24 @@ static int amdgpu_cs_sync_rings(struct 
> amdgpu_cs_parser *p)
>   return r;
>   }
>  
> - r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_leader_idx]);
> - if (r && r != -ERESTARTSYS)
> - DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> - return r;
> + sched = p->gang_leader->base.entity->rq->sched;
> + while ((fence = amdgpu_sync_get_fence(&p->sync))) {
> + struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
> +
> + /*
> +  * When we have an dependency it might be necessary to insert a
> +  * pipeline sync to make sure that all caches etc are flushed 
> and the
> +  * next job actually sees the results from the previous one
> +  * before we start executing on the same scheduler ring.
> +  */
> + if (!s_fence || s_fence->sched != sched)
> + continue;
> +
> + r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence);
> + if (r)
> + return r;
> + }
> + return 0;
>  }
>  
>  static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
> @@ -1347,6 +1360,7 @@ static void amdgpu_cs_parser_fini(struct 
> amdgpu_cs_parser *parser)
>  {
>   unsigned i;
>  
> + amdgpu_sync_free(&parser->sync);
>   for (i = 0; i < parser->num_post_deps; i++) {
>   drm_syncobj_put(parser->post_deps[i].syncobj);
>   kfree(parser->post_deps[i].chain);



[PATCH] Revert "drm/display/dp_mst: Move all payload info into the atomic state"

2023-01-12 Thread Wayne Lin
This reverts commit 4d07b0bc403403438d9cf88450506240c5faf92f.

[Why]
The changes cause regressions on amdgpu MST.
E.g.
In fill_dc_mst_payload_table_from_drm(), amdgpu expects to add/remove
payloads one by one and to update the HW-maintained payload table via
fill_dc_mst_payload_table_from_drm(). But the previous change goes through
all the payloads in mst_state and updates the amdgpu HW-maintained table
at once, even when the driver only tries to add/remove one specific
payload stream. The new design conflicts with the current implementation
in amdgpu.

[How]
Revert this patch first. After addressing all regressions caused by it,
we will add it back with adjustments.

Signed-off-by: Wayne Lin 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2171
Cc: sta...@vger.kernel.org # 6.1
Cc: Lyude Paul 
Cc: Harry Wentland 
Cc: Mario Limonciello 
Cc: Ville Syrjälä 
Cc: Ben Skeggs 
Cc: Stanislav Lisovskiy 
Cc: Fangzhi Zuo 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c |  53 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 106 ++-
 .../display/amdgpu_dm/amdgpu_dm_mst_types.c   |  87 ++-
 .../amd/display/include/link_service_types.h  |   3 -
 drivers/gpu/drm/display/drm_dp_mst_topology.c | 724 --
 drivers/gpu/drm/i915/display/intel_dp_mst.c   |  67 +-
 drivers/gpu/drm/i915/display/intel_hdcp.c |  24 +-
 drivers/gpu/drm/nouveau/dispnv50/disp.c   | 167 ++--
 include/drm/display/drm_dp_mst_helper.h   | 177 +++--
 9 files changed, 878 insertions(+), 530 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index 77277d90b6e2..674f5dc1102b 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -6548,7 +6548,6 @@ static int dm_encoder_helper_atomic_check(struct 
drm_encoder *encoder,
const struct drm_display_mode *adjusted_mode = 
&crtc_state->adjusted_mode;
struct drm_dp_mst_topology_mgr *mst_mgr;
struct drm_dp_mst_port *mst_port;
-   struct drm_dp_mst_topology_state *mst_state;
enum dc_color_depth color_depth;
int clock, bpp = 0;
bool is_y420 = false;
@@ -6562,13 +6561,6 @@ static int dm_encoder_helper_atomic_check(struct 
drm_encoder *encoder,
if (!crtc_state->connectors_changed && !crtc_state->mode_changed)
return 0;
 
-   mst_state = drm_atomic_get_mst_topology_state(state, mst_mgr);
-   if (IS_ERR(mst_state))
-   return PTR_ERR(mst_state);
-
-   if (!mst_state->pbn_div)
-   mst_state->pbn_div = 
dm_mst_get_pbn_divider(aconnector->mst_port->dc_link);
-
if (!state->duplicated) {
int max_bpc = conn_state->max_requested_bpc;
is_y420 = drm_mode_is_420_also(&connector->display_info, 
adjusted_mode) &&
@@ -6580,10 +6572,11 @@ static int dm_encoder_helper_atomic_check(struct 
drm_encoder *encoder,
clock = adjusted_mode->clock;
dm_new_connector_state->pbn = drm_dp_calc_pbn_mode(clock, bpp, 
false);
}
-
-   dm_new_connector_state->vcpi_slots =
-   drm_dp_atomic_find_time_slots(state, mst_mgr, mst_port,
- dm_new_connector_state->pbn);
+   dm_new_connector_state->vcpi_slots = 
drm_dp_atomic_find_time_slots(state,
+  
mst_mgr,
+  
mst_port,
+  
dm_new_connector_state->pbn,
+  
dm_mst_get_pbn_divider(aconnector->dc_link));
if (dm_new_connector_state->vcpi_slots < 0) {
DRM_DEBUG_ATOMIC("failed finding vcpi slots: %d\n", 
(int)dm_new_connector_state->vcpi_slots);
return dm_new_connector_state->vcpi_slots;
@@ -6654,14 +6647,17 @@ static int dm_update_mst_vcpi_slots_for_dsc(struct 
drm_atomic_state *state,
dm_conn_state->vcpi_slots = slot_num;
 
ret = drm_dp_mst_atomic_enable_dsc(state, 
aconnector->port,
-  dm_conn_state->pbn, 
false);
+  dm_conn_state->pbn, 
0, false);
if (ret < 0)
return ret;
 
continue;
}
 
-   vcpi = drm_dp_mst_atomic_enable_dsc(state, aconnector->port, 
pbn, true);
+   vcpi = drm_dp_mst_atomic_enable_dsc(state,
+   aconnector->port,
+   pbn, pbn_div,
+   true);
if (vcpi < 0)
return vcpi;