[PATCH] drm/amd/display: Prevent potential buffer overflow in map_hw_resources

2024-02-20 Thread Srinivasan Shanmugam
Adds a check in the map_hw_resources function to prevent a
potential buffer overflow. The function was accessing arrays using an
index that could potentially be greater than the size of the arrays,
leading to a buffer overflow.

Adds a check to ensure that the index is within the bounds of
the arrays. If the index is out of bounds, an error message is printed
and the function returns early to prevent the buffer overflow.

Reported by smatch:
drivers/gpu/drm/amd/amdgpu/../display/dc/dml2/dml2_wrapper.c:79 
map_hw_resources() error: buffer overflow 
'dml2->v20.scratch.dml_to_dc_pipe_mapping.disp_cfg_to_stream_id' 6 <= 7
drivers/gpu/drm/amd/amdgpu/../display/dc/dml2/dml2_wrapper.c:81 
map_hw_resources() error: buffer overflow 
'dml2->v20.scratch.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id' 6 <= 7

Fixes: 482ce89eec1b ("drm/amd/display: Introduce DML2")
Cc: Rodrigo Siqueira 
Cc: Roman Li 
Cc: Qingqing Zhuo 
Cc: Aurabindo Pillai 
Cc: Tom Chung 
Signed-off-by: Srinivasan Shanmugam 
---
 drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c 
b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c
index 26307e599614..0531d54b3d68 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c
+++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c
@@ -76,6 +76,12 @@ static void map_hw_resources(struct dml2_context *dml2,
in_out_display_cfg->hw.DLGRefClkFreqMHz = 50;
}
for (j = 0; j < mode_support_info->DPPPerSurface[i]; j++) {
+   if (i >= __DML2_WRAPPER_MAX_STREAMS_PLANES__) {
+   dml_print("DML::%s: Index out of bounds: i=%d,
+ 
__DML2_WRAPPER_MAX_STREAMS_PLANES__=%d\n",
+ __func__, i, 
__DML2_WRAPPER_MAX_STREAMS_PLANES__);
+   return;
+   }

dml2->v20.scratch.dml_to_dc_pipe_mapping.dml_pipe_idx_to_stream_id[num_pipes] = 
dml2->v20.scratch.dml_to_dc_pipe_mapping.disp_cfg_to_stream_id[i];

dml2->v20.scratch.dml_to_dc_pipe_mapping.dml_pipe_idx_to_stream_id_valid[num_pipes]
 = true;

dml2->v20.scratch.dml_to_dc_pipe_mapping.dml_pipe_idx_to_plane_id[num_pipes] = 
dml2->v20.scratch.dml_to_dc_pipe_mapping.disp_cfg_to_plane_id[i];
-- 
2.34.1



[PATCH] drm/amdgpu/jpeg: add support for jpeg multi instance

2024-02-20 Thread Alex Deucher
From: Saleemkhan Jamadar 

Enable support for multi instance on JPEG 4.0.6.

v2: squash in fixes (Alex)

Signed-off-by: Saleemkhan Jamadar 
Reviewed-by: Veerabadhran Gopalakrishnan 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c | 177 ---
 1 file changed, 123 insertions(+), 54 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c 
b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c
index 3602738874ee..521af589ffcd 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c
@@ -53,6 +53,11 @@ static int jpeg_v4_0_5_set_powergating_state(void *handle,
 
 static void jpeg_v4_0_5_dec_ring_set_wptr(struct amdgpu_ring *ring);
 
+static int amdgpu_ih_clientid_jpeg[] = {
+   SOC15_IH_CLIENTID_VCN,
+   SOC15_IH_CLIENTID_VCN1
+};
+
 /**
  * jpeg_v4_0_5_early_init - set function pointers
  *
@@ -64,8 +69,20 @@ static int jpeg_v4_0_5_early_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   switch (amdgpu_ip_version(adev, UVD_HWIP, 0)) {
+   case IP_VERSION(4, 0, 5):
+   adev->jpeg.num_jpeg_inst = 1;
+   break;
+   case IP_VERSION(4, 0, 6):
+   adev->jpeg.num_jpeg_inst = 2;
+   break;
+   default:
+   DRM_DEV_ERROR(adev->dev,
+   "Failed to init vcn ip block(UVD_HWIP:0x%x)\n",
+   amdgpu_ip_version(adev, UVD_HWIP, 0));
+   return -EINVAL;
+   }
 
-   adev->jpeg.num_jpeg_inst = 1;
adev->jpeg.num_jpeg_rings = 1;
 
jpeg_v4_0_5_set_dec_ring_funcs(adev);
@@ -85,25 +102,30 @@ static int jpeg_v4_0_5_sw_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
struct amdgpu_ring *ring;
-   int r;
+   int r, i;
 
-   /* JPEG TRAP */
-   r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
-   VCN_4_0__SRCID__JPEG_DECODE, >jpeg.inst->irq);
-   if (r)
-   return r;
+   for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) {
+   if (adev->jpeg.harvest_config & (1 << i))
+   continue;
 
-   /* JPEG DJPEG POISON EVENT */
-   r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
-   VCN_4_0__SRCID_DJPEG0_POISON, >jpeg.inst->irq);
-   if (r)
-   return r;
+   /* JPEG TRAP */
+   r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_jpeg[i],
+   VCN_4_0__SRCID__JPEG_DECODE, 
>jpeg.inst[i].irq);
+   if (r)
+   return r;
 
-   /* JPEG EJPEG POISON EVENT */
-   r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_VCN,
-   VCN_4_0__SRCID_EJPEG0_POISON, >jpeg.inst->irq);
-   if (r)
-   return r;
+   /* JPEG DJPEG POISON EVENT */
+   r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_jpeg[i],
+   VCN_4_0__SRCID_DJPEG0_POISON, >jpeg.inst[i].irq);
+   if (r)
+   return r;
+
+   /* JPEG EJPEG POISON EVENT */
+   r = amdgpu_irq_add_id(adev, amdgpu_ih_clientid_jpeg[i],
+   VCN_4_0__SRCID_EJPEG0_POISON, >jpeg.inst[i].irq);
+   if (r)
+   return r;
+   }
 
r = amdgpu_jpeg_sw_init(adev);
if (r)
@@ -113,21 +135,23 @@ static int jpeg_v4_0_5_sw_init(void *handle)
if (r)
return r;
 
-   ring = adev->jpeg.inst->ring_dec;
-   ring->use_doorbell = true;
-   ring->doorbell_index = amdgpu_sriov_vf(adev) ?
-   (((adev->doorbell_index.vcn.vcn_ring0_1) << 1) 
+ 4) :
-   ((adev->doorbell_index.vcn.vcn_ring0_1 << 1) + 
1);
-   ring->vm_hub = AMDGPU_MMHUB0(0);
+   for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) {
+   if (adev->jpeg.harvest_config & (1 << i))
+   continue;
 
-   sprintf(ring->name, "jpeg_dec");
-   r = amdgpu_ring_init(adev, ring, 512, >jpeg.inst->irq, 0,
-AMDGPU_RING_PRIO_DEFAULT, NULL);
-   if (r)
-   return r;
+   ring = adev->jpeg.inst[i].ring_dec;
+   ring->use_doorbell = true;
+   ring->vm_hub = AMDGPU_MMHUB0(0);
+   ring->doorbell_index = (adev->doorbell_index.vcn.vcn_ring0_1 << 
1) + 1 + 8 * i;
+   sprintf(ring->name, "jpeg_dec_%d", i);
+   r = amdgpu_ring_init(adev, ring, 512, >jpeg.inst[i].irq,
+0, AMDGPU_RING_PRIO_DEFAULT, NULL);
+   if (r)
+   return r;
 
-   adev->jpeg.internal.jpeg_pitch[0] = regUVD_JPEG_PITCH_INTERNAL_OFFSET;
-   adev->jpeg.inst->external.jpeg_pitch[0] = SOC15_REG_OFFSET(JPEG, 0, 
regUVD_JPEG_PITCH);
+   adev->jpeg.internal.jpeg_pitch[0] = 

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Alex Deucher
On Tue, Feb 20, 2024 at 2:41 PM Romano  wrote:
>
> If the increased low range is allowed via boot option, like in proposed
> patch, user clearly made an intentional decision. Undefined, but won't
> fry his hardware for sure. Undefined is also overclocking in that
> matter. You can go out of range with ratio of voltage vs frequency(still
> within vendor's limits) for example and crash the system.

This whole thing reminds me of this:
https://xkcd.com/1172/
The problem is another module parameter is another interface to
maintain and validate.  Moreover, we've had a number of cases in the
past where users have under or overclocked and reported bugs or
stability issues and it did not come to light that they were doing
that until we'd already spent a good deal of time trying to debug the
issue.  This obviously can still happen if you allow any sort of over
or underclocking, but at least if you stick to the limits you are
staying within the bounding box of the design.

Alex

>
>
>
> On 2/20/24 19:09, Alex Deucher wrote:
> > On Tue, Feb 20, 2024 at 11:46 AM Romano  wrote:
> >> For Windows, apps like MSI Afterburner is the one to try and what most
> >> people go for. Using it in the past myself, I would be surprised if it
> >> adhered to such a high min power cap. But even if it did, why would we
> >> have to.
> >>
> >> Relying on vendors cap in this case has already proven wrong because
> >> things worked for quite some time already and people reported saving
> >> significant amount of watts, in my case 90W(!) for <10% perf.
> >>
> >> Therefore this talk about safety seems rather strange to me and
> >> especially so when we are talking about min_cap. Or name me a single
> >> case where someone fried his card due to "too low power" set in said
> >> variable. Now there was a report, where by going way too low, driver
> >> goes opposite into max power. That's it. That can be easily
> >> detected(vents going crazy etc.) and reverted. It is a max_cap that
> >> protect HW(also above scenario), not a min_cap. Feel free to adhere to
> >> safety standards with that one.
> > Because operation outside of the design bounding box is undefined.  It
> > might work for some boards but not others.  It's possible some of the
> > logic in the firmware or some of the components used on the board may
> > not work correctly below a certain limit, or the voltage regulators
> > used on a specific board have a minimum requirement that would not be
> > an issue if you stick the bounding box.
> >
> > Alex
> >
> >> As for solution, what some suggested already exist - a patch posted by
> >> fililip on gitlab is probably the way most of you would agree. It
> >> introduce a variable that can be set during boot to override min_cap.
> >> But he did not pull requested it, so please, if any one of you who have
> >> access to code and merge kernel would be kind enough to implement it.
> >>
> >>
> >>
> >> On 2/20/24 16:46, Alex Deucher wrote:
> >>> On Tue, Feb 20, 2024 at 10:42 AM Linux regression tracking (Thorsten
> >>> Leemhuis)  wrote:
> 
>  On 20.02.24 16:27, Hans de Goede wrote:
> > Hi,
> >
> > On 2/20/24 16:15, Alex Deucher wrote:
> >> On Tue, Feb 20, 2024 at 10:03 AM Linux regression tracking (Thorsten
> >> Leemhuis)  wrote:
> >>> On 20.02.24 15:45, Alex Deucher wrote:
>  On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
>  Leemhuis)  wrote:
> > On 17.02.24 14:30, Greg KH wrote:
> >> On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
> >>> Minimum power limit on latest(6.7+) kernels is 190W for my GPU 
> >>> (RX 6700XT,
> >>> mesa, archlinux) and I cannot get power cap as low as before(to 
> >>> 115W),
> >>> neither with Corectrl, LACT or TuxClocker and /sys have a 
> >>> variable read-only
> >>> even for root. This is not of above apps issue but of the kernel, 
> >>> I read
> >>> similar issues from other bug reports of above apps. I downgraded 
> >>> to v6.6.10
> >>> kernel and my 115W(under power)cap work again as before.
> > For the record and everyone that lands here: the cause is known now
> > (it's 1958946858a62b ("drm/amd/pm: Support for getting 
> > power1_cap_min
> > value") [v6.7-rc1]) and the issue afaics tracked here:
> >
> > https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> >
> > Other mentions:
> > https://gitlab.freedesktop.org/drm/amd/-/issues/3137
> > https://gitlab.freedesktop.org/drm/amd/-/issues/2992
> >
> > Haven't seen any statement from the amdgpu developers (now CCed) 
> > yet on
> > this there (but might have missed something!). From what I can see I
> > assume this will likely be somewhat tricky to handle, as a revert
> > overall might be a bad idea here. We'll see I guess.
>  The change aligns the 

Re: [PATCH 1/2] drm/amdkfd: Document and define SVM event tracing macro

2024-02-20 Thread Philip Yang

  


On 2024-02-16 15:16, Felix Kuehling
  wrote:


  
  On 2024-02-15 10:18, Philip Yang wrote:
  
  Document how to use SMI system management
interface to receive SVM

events.


Define SVM events message string format macro that could use by
user

mode for sscanf to parse the event. Add it to uAPI header file
to make

it obvious that is changing uAPI in future.


No functional changes.


Signed-off-by: Philip Yang 

---

  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 51
+++---

  include/uapi/linux/kfd_ioctl.h  | 77
-

  2 files changed, 102 insertions(+), 26 deletions(-)


diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c

index d9953c2b2661..85465eb303a9 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c

@@ -225,15 +225,16 @@ void kfd_smi_event_update_gpu_reset(struct
kfd_node *dev, bool post_reset)

  event = KFD_SMI_EVENT_GPU_PRE_RESET;

  ++(dev->reset_seq_num);

  }

-    kfd_smi_event_add(0, dev, event, "%x\n",
dev->reset_seq_num);

+    kfd_smi_event_add(0, dev, event,

+ 
KFD_EVENT_FMT_UPDATE_GPU_RESET(dev->reset_seq_num));

  }

    void kfd_smi_event_update_thermal_throttling(struct kfd_node
*dev,

   uint64_t throttle_bitmask)

  {

-    kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE,
"%llx:%llx\n",

-  throttle_bitmask,

- 
amdgpu_dpm_get_thermal_throttling_counter(dev->adev));

+    kfd_smi_event_add(0, dev, KFD_SMI_EVENT_THERMAL_THROTTLE,

+ 
KFD_EVENT_FMT_UPDATE_THERMAL_THROTTLING(throttle_bitmask,

+ 
amdgpu_dpm_get_thermal_throttling_counter(dev->adev)));

  }

    void kfd_smi_event_update_vmfault(struct kfd_node *dev,
uint16_t pasid)

@@ -246,8 +247,8 @@ void kfd_smi_event_update_vmfault(struct
kfd_node *dev, uint16_t pasid)

  if (!task_info.pid)

  return;

  -    kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,
"%x:%s\n",

-  task_info.pid, task_info.task_name);

+    kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT,

+  KFD_EVENT_FMT_VMFAULT(task_info.pid,
task_info.task_name));

  }

    void kfd_smi_event_page_fault_start(struct kfd_node *node,
pid_t pid,

@@ -255,16 +256,16 @@ void kfd_smi_event_page_fault_start(struct
kfd_node *node, pid_t pid,

  ktime_t ts)

  {

  kfd_smi_event_add(pid, node,
KFD_SMI_EVENT_PAGE_FAULT_START,

-  "%lld -%d @%lx(%x) %c\n", ktime_to_ns(ts), pid,

-  address, node->id, write_fault ? 'W' : 'R');

+  KFD_EVENT_FMT_PAGEFAULT_START(ktime_to_ns(ts),
pid,

+  address, node->id, write_fault ? 'W' : 'R'));

  }

    void kfd_smi_event_page_fault_end(struct kfd_node *node,
pid_t pid,

    unsigned long address, bool migration)

  {

  kfd_smi_event_add(pid, node, KFD_SMI_EVENT_PAGE_FAULT_END,

-  "%lld -%d @%lx(%x) %c\n",
ktime_get_boottime_ns(),

-  pid, address, node->id, migration ? 'M' :
'U');

+ 
KFD_EVENT_FMT_PAGEFAULT_END(ktime_get_boottime_ns(),

+  pid, address, node->id, migration ? 'M' :
'U'));

  }

    void kfd_smi_event_migration_start(struct kfd_node *node,
pid_t pid,

@@ -274,9 +275,9 @@ void kfd_smi_event_migration_start(struct
kfd_node *node, pid_t pid,

 uint32_t trigger)

  {

  kfd_smi_event_add(pid, node, KFD_SMI_EVENT_MIGRATE_START,

-  "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n",

-  

[PATCH] drm/amd: Update atomfirmware.h for DCN401

2024-02-20 Thread Aurabindo Pillai
Add new firmware header definitions reqiured for DCN401

Signed-off-by: Aurabindo Pillai 
---
 drivers/gpu/drm/amd/include/atomfirmware.h | 33 ++
 1 file changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/atomfirmware.h 
b/drivers/gpu/drm/amd/include/atomfirmware.h
index fa7d6ced786f..206c8a025f9e 100644
--- a/drivers/gpu/drm/amd/include/atomfirmware.h
+++ b/drivers/gpu/drm/amd/include/atomfirmware.h
@@ -610,6 +610,39 @@ struct atom_firmware_info_v3_4 {
 uint32_t reserved[2];
 };
 
+struct atom_firmware_info_v3_5
+{
+  struct atom_common_table_header table_header;
+  uint32_t firmware_revision;
+  uint32_t bootup_clk_reserved[2];
+  uint32_t firmware_capability; // enum 
atombios_firmware_capability
+  uint32_t fw_protect_region_size_in_kb;/* FW allocate a write protect 
region at top of FB. */
+  uint32_t bios_scratch_reg_startaddr;  // 1st bios scratch register dword 
address
+  uint32_t bootup_voltage_reserved[2];
+  uint8_t  mem_module_id;
+  uint8_t  coolingsolution_id;  /*0: Air cooling; 1: Liquid 
cooling ... */
+  uint8_t  hw_blt_mode; //0:HW_BLT_DMA_PIO_MODE; 
1:HW_BLT_LITE_SDMA_MODE; 2:HW_BLT_PCI_IO_MODE
+  uint8_t  reserved1;
+  uint32_t mc_baseaddr_high;
+  uint32_t mc_baseaddr_low;
+  uint8_t  board_i2c_feature_id;// enum of 
atom_board_i2c_feature_id_def
+  uint8_t  board_i2c_feature_gpio_id;   // i2c id find in gpio_lut data 
table gpio_id
+  uint8_t  board_i2c_feature_slave_addr;
+  uint8_t  ras_rom_i2c_slave_addr;
+  uint32_t bootup_voltage_reserved1;
+  uint32_t zfb_reserved;
+  // if pplib_pptable_id!=0, pplib get powerplay table inside driver instead 
of from VBIOS
+  uint32_t pplib_pptable_id;
+  uint32_t hw_voltage_reserved[3];
+  uint32_t maco_pwrlimit_mw;// bomaco mode power limit in unit 
of m-watt
+  uint32_t usb_pwrlimit_mw; // power limit when USB is enable 
in unit of m-watt
+  uint32_t fw_reserved_size_in_kb;  // VBIOS reserved extra fw size in 
unit of kb.
+  uint32_t pspbl_init_reserved[3];
+  uint32_t spi_rom_size;// GPU spi rom size
+  uint16_t support_dev_in_objinfo;
+  uint16_t disp_phy_tunning_size;
+  uint32_t reserved[16];
+};
 /* 
   ***
 Data Table lcd_info  structure
-- 
2.43.0



Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Alex Deucher
On Tue, Feb 20, 2024 at 10:27 AM Hans de Goede  wrote:
>
> Hi,
>
> On 2/20/24 16:15, Alex Deucher wrote:
> > On Tue, Feb 20, 2024 at 10:03 AM Linux regression tracking (Thorsten
> > Leemhuis)  wrote:
> >>
> >> On 20.02.24 15:45, Alex Deucher wrote:
> >>> On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
> >>> Leemhuis)  wrote:
> 
>  On 17.02.24 14:30, Greg KH wrote:
> > On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
> >> Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 
> >> 6700XT,
> >> mesa, archlinux) and I cannot get power cap as low as before(to 115W),
> >> neither with Corectrl, LACT or TuxClocker and /sys have a variable 
> >> read-only
> >> even for root. This is not of above apps issue but of the kernel, I 
> >> read
> >> similar issues from other bug reports of above apps. I downgraded to 
> >> v6.6.10
> >> kernel and my 115W(under power)cap work again as before.
> >
>  For the record and everyone that lands here: the cause is known now
>  (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
>  value") [v6.7-rc1]) and the issue afaics tracked here:
> 
>  https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> 
>  Other mentions:
>  https://gitlab.freedesktop.org/drm/amd/-/issues/3137
>  https://gitlab.freedesktop.org/drm/amd/-/issues/2992
> 
>  Haven't seen any statement from the amdgpu developers (now CCed) yet on
>  this there (but might have missed something!). From what I can see I
>  assume this will likely be somewhat tricky to handle, as a revert
>  overall might be a bad idea here. We'll see I guess.
> >>>
> >>> The change aligns the driver what has been validated on each board
> >>> design.  Windows uses the same limits.  Using values lower than the
> >>> validated range can lead to undefined behavior and could potentially
> >>> damage your hardware.
> >>
> >> Thx for the reply! Yeah, I was expecting something along those lines.
> >>
> >> Nevertheless it afaics still is a regression in the eyes of many users.
> >> I'm not sure how Linus feels about this, but I wonder if we can find
> >> some solution here so that users that really want to, can continue to do
> >> what was possible out-of-the box before. Is that possible to realize or
> >> even supported already?
> >>
> >> And sure, those users would be running their hardware outside of its
> >> specifications. But is that different from overclocking (which the
> >> driver allows, doesn't it? If not by all means please correct me!)?
> >
> > Sure.  The driver has always had upper bound limits for overclocking,
> > this change adds lower bounds checking for underclocking as well.
> > When the silicon validation teams set the bounding box for a device,
> > they set a range of values where it's reasonable to operate based on
> > the characteristics of the design.
> >
> > If we did want to allow extended underclocking, we need a big warning
> > in the logs at the very least.
>
> Requiring a module-option to be set to allow this, as well as a big
> warning in the logs sounds like a good solution to me.

I dunno.  I kind of go back and forth with it.  It's yet another knob
to maintain and when we've done things like this in the past, we get
lots of bug reports or angry users because the kernel is sending
warnings when they set it.

Alex

>
> Regards,
>
> Hans
>
>
>
>
>
>  Roman posted something that apparently was meant to go to the list, so
>  let me put it here:
> 
>  """
>  UPDATE: User fililip already posted patch, but it need to be merged,
>  discussion is on gitlab link below.
> 
>  (PS: I hope I am replying correctly to "all" now? - using original addr.)
> 
> 
> > it seems that commit was already found(see user's 'fililip' comment):
> >
> > https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> > commit 1958946858a62b6b5392ed075aa219d199bcae39
> > Author: Ma Jun 
> > Date:   Thu Oct 12 09:33:45 2023 +0800
> >
> > drm/amd/pm: Support for getting power1_cap_min value
> >
> > Support for getting power1_cap_min value on smu13 and smu11.
> > For other Asics, we still use 0 as the default value.
> >
> > Signed-off-by: Ma Jun 
> > Reviewed-by: Kenneth Feng 
> > Signed-off-by: Alex Deucher 
> >
> > However, this is not good as it remove under-powering range too far. I
>  was getting only about 7% less performance but 90W(!) less consumption
>  when set to my 115W before. Also I wonder if we as a OS of options and
>  freedom have to stick to such very high reference for min values without
>  ability to override them through some sys ctrls. Commit was done by amd
>  guy and I wonder if because of maybe this post that I made few months
>  ago(business strategy?):
> >
> >
>  

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Alex Deucher
On Tue, Feb 20, 2024 at 11:46 AM Romano  wrote:
>
> For Windows, apps like MSI Afterburner is the one to try and what most
> people go for. Using it in the past myself, I would be surprised if it
> adhered to such a high min power cap. But even if it did, why would we
> have to.
>
> Relying on vendors cap in this case has already proven wrong because
> things worked for quite some time already and people reported saving
> significant amount of watts, in my case 90W(!) for <10% perf.
>
> Therefore this talk about safety seems rather strange to me and
> especially so when we are talking about min_cap. Or name me a single
> case where someone fried his card due to "too low power" set in said
> variable. Now there was a report, where by going way too low, driver
> goes opposite into max power. That's it. That can be easily
> detected(vents going crazy etc.) and reverted. It is a max_cap that
> protect HW(also above scenario), not a min_cap. Feel free to adhere to
> safety standards with that one.

Because operation outside of the design bounding box is undefined.  It
might work for some boards but not others.  It's possible some of the
logic in the firmware or some of the components used on the board may
not work correctly below a certain limit, or the voltage regulators
used on a specific board have a minimum requirement that would not be
an issue if you stick the bounding box.

Alex

>
> As for solution, what some suggested already exist - a patch posted by
> fililip on gitlab is probably the way most of you would agree. It
> introduce a variable that can be set during boot to override min_cap.
> But he did not pull requested it, so please, if any one of you who have
> access to code and merge kernel would be kind enough to implement it.
>
>
>
> On 2/20/24 16:46, Alex Deucher wrote:
> > On Tue, Feb 20, 2024 at 10:42 AM Linux regression tracking (Thorsten
> > Leemhuis)  wrote:
> >>
> >>
> >> On 20.02.24 16:27, Hans de Goede wrote:
> >>> Hi,
> >>>
> >>> On 2/20/24 16:15, Alex Deucher wrote:
>  On Tue, Feb 20, 2024 at 10:03 AM Linux regression tracking (Thorsten
>  Leemhuis)  wrote:
> > On 20.02.24 15:45, Alex Deucher wrote:
> >> On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
> >> Leemhuis)  wrote:
> >>> On 17.02.24 14:30, Greg KH wrote:
>  On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
> > Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 
> > 6700XT,
> > mesa, archlinux) and I cannot get power cap as low as before(to 
> > 115W),
> > neither with Corectrl, LACT or TuxClocker and /sys have a variable 
> > read-only
> > even for root. This is not of above apps issue but of the kernel, I 
> > read
> > similar issues from other bug reports of above apps. I downgraded 
> > to v6.6.10
> > kernel and my 115W(under power)cap work again as before.
> >>> For the record and everyone that lands here: the cause is known now
> >>> (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
> >>> value") [v6.7-rc1]) and the issue afaics tracked here:
> >>>
> >>> https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> >>>
> >>> Other mentions:
> >>> https://gitlab.freedesktop.org/drm/amd/-/issues/3137
> >>> https://gitlab.freedesktop.org/drm/amd/-/issues/2992
> >>>
> >>> Haven't seen any statement from the amdgpu developers (now CCed) yet 
> >>> on
> >>> this there (but might have missed something!). From what I can see I
> >>> assume this will likely be somewhat tricky to handle, as a revert
> >>> overall might be a bad idea here. We'll see I guess.
> >> The change aligns the driver what has been validated on each board
> >> design.  Windows uses the same limits.  Using values lower than the
> >> validated range can lead to undefined behavior and could potentially
> >> damage your hardware.
> > Thx for the reply! Yeah, I was expecting something along those lines.
> >
> > Nevertheless it afaics still is a regression in the eyes of many users.
> > I'm not sure how Linus feels about this, but I wonder if we can find
> > some solution here so that users that really want to, can continue to do
> > what was possible out-of-the box before. Is that possible to realize or
> > even supported already?
> >
> > And sure, those users would be running their hardware outside of its
> > specifications. But is that different from overclocking (which the
> > driver allows, doesn't it? If not by all means please correct me!)?
>  Sure.  The driver has always had upper bound limits for overclocking,
>  this change adds lower bounds checking for underclocking as well.
>  When the silicon validation teams set the bounding box for a device,
>  they set a range of values where it's reasonable to operate based on
>  the characteristics 

RE: [PATCH 4/4] drm/amdgpu: Do not program VM_L2_CNTL under SRIOV

2024-02-20 Thread Dhume, Samir
[AMD Official Use Only - General]

Reviewed-by: Samir Dhume 

-Original Message-
From: amd-gfx  On Behalf Of Victor Lu
Sent: Tuesday, January 2, 2024 12:30 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chander, Vignesh ; Lu, Victor Cheng Chi (Victor) 

Subject: [PATCH 4/4] drm/amdgpu: Do not program VM_L2_CNTL under SRIOV

VM_L2_CNTL* should not be programmed on driver unload under SRIOV.
These regs are skipped during SRIOV driver init.

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c 
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
index 55423ff1bb49..20e800bc0b68 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_2.c
@@ -454,10 +454,12 @@ static void gfxhub_v1_2_xcc_gart_disable(struct 
amdgpu_device *adev,
WREG32_SOC15_RLC(GC, GET_INST(GC, j), regMC_VM_MX_L1_TLB_CNTL, 
tmp);

/* Setup L2 cache */
-   tmp = RREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL);
-   tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
-   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL, tmp);
-   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL3, 0);
+   if (!amdgpu_sriov_vf(adev)) {
+   tmp = RREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL);
+   tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 
0);
+   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL, tmp);
+   WREG32_SOC15(GC, GET_INST(GC, j), regVM_L2_CNTL3, 0);
+   }
}
 }

--
2.34.1



RE: [PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV

2024-02-20 Thread Dhume, Samir
[AMD Official Use Only - General]

Reviewed-by: Samir Dhume 

-Original Message-
From: amd-gfx  On Behalf Of Victor Lu
Sent: Tuesday, January 2, 2024 12:30 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chander, Vignesh ; Lu, Victor Cheng Chi (Victor) 

Subject: [PATCH 2/4] drm/amdgpu: Do not program SQ_TIMEOUT_CONFIG in SRIOV

VF should not program this register.

Signed-off-by: Victor Lu 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 00b21ece081f..30cc155f20d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -3888,6 +3888,9 @@ static void gfx_v9_4_3_inst_enable_watchdog_timer(struct 
amdgpu_device *adev,
uint32_t i;
uint32_t data;

+   if (amdgpu_sriov_vf(adev))
+   return;
+
data = RREG32_SOC15(GC, GET_INST(GC, 0), regSQ_TIMEOUT_CONFIG);
data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
 amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : 
0);
--
2.34.1



Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Alex Deucher
On Tue, Feb 20, 2024 at 10:42 AM Linux regression tracking (Thorsten
Leemhuis)  wrote:
>
>
>
> On 20.02.24 16:27, Hans de Goede wrote:
> > Hi,
> >
> > On 2/20/24 16:15, Alex Deucher wrote:
> >> On Tue, Feb 20, 2024 at 10:03 AM Linux regression tracking (Thorsten
> >> Leemhuis)  wrote:
> >>>
> >>> On 20.02.24 15:45, Alex Deucher wrote:
>  On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
>  Leemhuis)  wrote:
> >
> > On 17.02.24 14:30, Greg KH wrote:
> >> On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
> >>> Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 
> >>> 6700XT,
> >>> mesa, archlinux) and I cannot get power cap as low as before(to 115W),
> >>> neither with Corectrl, LACT or TuxClocker and /sys have a variable 
> >>> read-only
> >>> even for root. This is not of above apps issue but of the kernel, I 
> >>> read
> >>> similar issues from other bug reports of above apps. I downgraded to 
> >>> v6.6.10
> >>> kernel and my 115W(under power)cap work again as before.
> >>
> > For the record and everyone that lands here: the cause is known now
> > (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
> > value") [v6.7-rc1]) and the issue afaics tracked here:
> >
> > https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> >
> > Other mentions:
> > https://gitlab.freedesktop.org/drm/amd/-/issues/3137
> > https://gitlab.freedesktop.org/drm/amd/-/issues/2992
> >
> > Haven't seen any statement from the amdgpu developers (now CCed) yet on
> > this there (but might have missed something!). From what I can see I
> > assume this will likely be somewhat tricky to handle, as a revert
> > overall might be a bad idea here. We'll see I guess.
> 
>  The change aligns the driver what has been validated on each board
>  design.  Windows uses the same limits.  Using values lower than the
>  validated range can lead to undefined behavior and could potentially
>  damage your hardware.
> >>>
> >>> Thx for the reply! Yeah, I was expecting something along those lines.
> >>>
> >>> Nevertheless it afaics still is a regression in the eyes of many users.
> >>> I'm not sure how Linus feels about this, but I wonder if we can find
> >>> some solution here so that users that really want to, can continue to do
> >>> what was possible out-of-the box before. Is that possible to realize or
> >>> even supported already?
> >>>
> >>> And sure, those users would be running their hardware outside of its
> >>> specifications. But is that different from overclocking (which the
> >>> driver allows, doesn't it? If not by all means please correct me!)?
> >>
> >> Sure.  The driver has always had upper bound limits for overclocking,
> >> this change adds lower bounds checking for underclocking as well.
> >> When the silicon validation teams set the bounding box for a device,
> >> they set a range of values where it's reasonable to operate based on
> >> the characteristics of the design.
> >>
> >> If we did want to allow extended underclocking, we need a big warning
> >> in the logs at the very least.
> >
> > Requiring a module-option to be set to allow this, as well as a big
> > warning in the logs sounds like a good solution to me.
>
> Yeah, especially as it sounds from some of the reports as if some
> vendors did a really bad job when it came to setting the proper
> lower-bound limits are now adhered -- and thus higher then what we used
> out-of-the box before 1958946858a62b was applied.
>
> Side note: I assume those "lower bounds checking" is done round about
> the same way by the Windows driver? Does that one allow users to go
> lower somehow? Say after modifying the registry or something like that?
> Or through external tools?

Windows uses the same limit.  I'm not aware of any way to override the
limit on windows off hand.

Alex


>
> Ciao, Thorsten
>
> > Roman posted something that apparently was meant to go to the list, so
> > let me put it here:
> >
> > """
> > UPDATE: User fililip already posted patch, but it need to be merged,
> > discussion is on gitlab link below.
> >
> > (PS: I hope I am replying correctly to "all" now? - using original 
> > addr.)
> >
> >
> >> it seems that commit was already found(see user's 'fililip' comment):
> >>
> >> https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> >> commit 1958946858a62b6b5392ed075aa219d199bcae39
> >> Author: Ma Jun 
> >> Date:   Thu Oct 12 09:33:45 2023 +0800
> >>
> >> drm/amd/pm: Support for getting power1_cap_min value
> >>
> >> Support for getting power1_cap_min value on smu13 and smu11.
> >> For other Asics, we still use 0 as the default value.
> >>
> >> Signed-off-by: Ma Jun 
> >> Reviewed-by: Kenneth Feng 
> >> Signed-off-by: Alex Deucher 
> >>
> >> 

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Christian König

Am 20.02.24 um 16:15 schrieb Alex Deucher:

On Tue, Feb 20, 2024 at 10:03 AM Linux regression tracking (Thorsten
Leemhuis)  wrote:

On 20.02.24 15:45, Alex Deucher wrote:

On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
Leemhuis)  wrote:

On 17.02.24 14:30, Greg KH wrote:

On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:

Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT,
mesa, archlinux) and I cannot get power cap as low as before(to 115W),
neither with Corectrl, LACT or TuxClocker and /sys have a variable read-only
even for root. This is not of above apps issue but of the kernel, I read
similar issues from other bug reports of above apps. I downgraded to v6.6.10
kernel and my 115W(under power)cap work again as before.

For the record and everyone that lands here: the cause is known now
(it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
value") [v6.7-rc1]) and the issue afaics tracked here:

https://gitlab.freedesktop.org/drm/amd/-/issues/3183

Other mentions:
https://gitlab.freedesktop.org/drm/amd/-/issues/3137
https://gitlab.freedesktop.org/drm/amd/-/issues/2992

Haven't seen any statement from the amdgpu developers (now CCed) yet on
this there (but might have missed something!). From what I can see I
assume this will likely be somewhat tricky to handle, as a revert
overall might be a bad idea here. We'll see I guess.

The change aligns the driver what has been validated on each board
design.  Windows uses the same limits.  Using values lower than the
validated range can lead to undefined behavior and could potentially
damage your hardware.

Thx for the reply! Yeah, I was expecting something along those lines.

Nevertheless it afaics still is a regression in the eyes of many users.
I'm not sure how Linus feels about this, but I wonder if we can find
some solution here so that users that really want to, can continue to do
what was possible out-of-the box before. Is that possible to realize or
even supported already?

And sure, those users would be running their hardware outside of its
specifications. But is that different from overclocking (which the
driver allows, doesn't it? If not by all means please correct me!)?

Sure.  The driver has always had upper bound limits for overclocking,
this change adds lower bounds checking for underclocking as well.
When the silicon validation teams set the bounding box for a device,
they set a range of values where it's reasonable to operate based on
the characteristics of the design.

If we did want to allow extended underclocking, we need a big warning
in the logs at the very least.


Yeah, I mean we had a similar outcry when we started to apply the limits 
for the display PLLs as well.


It's just that we have to stay inside certain parameters to be allowed 
as hardware vendor to sell the stuff in most countries because of public 
regulations.


I mean you can in theory program the ASIC so that it starts sucking more 
power than allowed through the PCIe lanes which could start a fire. 
Because of that certain settings are protected by signed firmware images.


Undervolting is not that problematic than overclocking or overvolting, 
but you can still do stuff which is outside the hardware specification 
with that.


Regards,
Christian.



Alex


Ciao, Thorsten


Roman posted something that apparently was meant to go to the list, so
let me put it here:

"""
UPDATE: User fililip already posted patch, but it need to be merged,
discussion is on gitlab link below.

(PS: I hope I am replying correctly to "all" now? - using original addr.)



it seems that commit was already found(see user's 'fililip' comment):

https://gitlab.freedesktop.org/drm/amd/-/issues/3183
commit 1958946858a62b6b5392ed075aa219d199bcae39
Author: Ma Jun 
Date:   Thu Oct 12 09:33:45 2023 +0800

 drm/amd/pm: Support for getting power1_cap_min value

 Support for getting power1_cap_min value on smu13 and smu11.
 For other Asics, we still use 0 as the default value.

 Signed-off-by: Ma Jun 
 Reviewed-by: Kenneth Feng 
 Signed-off-by: Alex Deucher 

However, this is not good as it remove under-powering range too far. I

was getting only about 7% less performance but 90W(!) less consumption
when set to my 115W before. Also I wonder if we as a OS of options and
freedom have to stick to such very high reference for min values without
ability to override them through some sys ctrls. Commit was done by amd
guy and I wonder if because of maybe this post that I made few months
ago(business strategy?):



https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/

This is not a dangerous OC upwards where I can understand desire to

protect HW, it is downward, having min cap at 190W when card pull on
115W almost same speed is IMO crazy to deny. We don't talk about default
or reference values here either, just a move to lower the range of
options for whatever reason.

I don't know 

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Alex Deucher
On Tue, Feb 20, 2024 at 10:03 AM Linux regression tracking (Thorsten
Leemhuis)  wrote:
>
> On 20.02.24 15:45, Alex Deucher wrote:
> > On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
> > Leemhuis)  wrote:
> >>
> >> On 17.02.24 14:30, Greg KH wrote:
> >>> On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
>  Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 
>  6700XT,
>  mesa, archlinux) and I cannot get power cap as low as before(to 115W),
>  neither with Corectrl, LACT or TuxClocker and /sys have a variable 
>  read-only
>  even for root. This is not of above apps issue but of the kernel, I read
>  similar issues from other bug reports of above apps. I downgraded to 
>  v6.6.10
>  kernel and my 115W(under power)cap work again as before.
> >>>
> >> For the record and everyone that lands here: the cause is known now
> >> (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
> >> value") [v6.7-rc1]) and the issue afaics tracked here:
> >>
> >> https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> >>
> >> Other mentions:
> >> https://gitlab.freedesktop.org/drm/amd/-/issues/3137
> >> https://gitlab.freedesktop.org/drm/amd/-/issues/2992
> >>
> >> Haven't seen any statement from the amdgpu developers (now CCed) yet on
> >> this there (but might have missed something!). From what I can see I
> >> assume this will likely be somewhat tricky to handle, as a revert
> >> overall might be a bad idea here. We'll see I guess.
> >
> > The change aligns the driver what has been validated on each board
> > design.  Windows uses the same limits.  Using values lower than the
> > validated range can lead to undefined behavior and could potentially
> > damage your hardware.
>
> Thx for the reply! Yeah, I was expecting something along those lines.
>
> Nevertheless it afaics still is a regression in the eyes of many users.
> I'm not sure how Linus feels about this, but I wonder if we can find
> some solution here so that users that really want to, can continue to do
> what was possible out-of-the box before. Is that possible to realize or
> even supported already?
>
> And sure, those users would be running their hardware outside of its
> specifications. But is that different from overclocking (which the
> driver allows, doesn't it? If not by all means please correct me!)?

Sure.  The driver has always had upper bound limits for overclocking,
this change adds lower bounds checking for underclocking as well.
When the silicon validation teams set the bounding box for a device,
they set a range of values where it's reasonable to operate based on
the characteristics of the design.

If we did want to allow extended underclocking, we need a big warning
in the logs at the very least.

Alex

>
> Ciao, Thorsten
>
> >> Roman posted something that apparently was meant to go to the list, so
> >> let me put it here:
> >>
> >> """
> >> UPDATE: User fililip already posted patch, but it need to be merged,
> >> discussion is on gitlab link below.
> >>
> >> (PS: I hope I am replying correctly to "all" now? - using original addr.)
> >>
> >>
> >>> it seems that commit was already found(see user's 'fililip' comment):
> >>>
> >>> https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> >>> commit 1958946858a62b6b5392ed075aa219d199bcae39
> >>> Author: Ma Jun 
> >>> Date:   Thu Oct 12 09:33:45 2023 +0800
> >>>
> >>> drm/amd/pm: Support for getting power1_cap_min value
> >>>
> >>> Support for getting power1_cap_min value on smu13 and smu11.
> >>> For other Asics, we still use 0 as the default value.
> >>>
> >>> Signed-off-by: Ma Jun 
> >>> Reviewed-by: Kenneth Feng 
> >>> Signed-off-by: Alex Deucher 
> >>>
> >>> However, this is not good as it remove under-powering range too far. I
> >> was getting only about 7% less performance but 90W(!) less consumption
> >> when set to my 115W before. Also I wonder if we as a OS of options and
> >> freedom have to stick to such very high reference for min values without
> >> ability to override them through some sys ctrls. Commit was done by amd
> >> guy and I wonder if because of maybe this post that I made few months
> >> ago(business strategy?):
> >>>
> >>>
> >> https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/
> >>>
> >>> This is not a dangerous OC upwards where I can understand desire to
> >> protect HW, it is downward, having min cap at 190W when card pull on
> >> 115W almost same speed is IMO crazy to deny. We don't talk about default
> >> or reference values here either, just a move to lower the range of
> >> options for whatever reason.
> >>>
> >>> I don't know how much power you guys have over them, but please
> >> consider either reverting this change, or give us an option to set
> >> min_cap through say /sys (right now param is readonly, even for root).
> >>>
> >>>
> >>> Thank you in advance for looking into this, with regards:  Romano
> >> 

Re: Kernel 6.7+ broke under-powering of my RX 6700XT. (Archlinux, mesa/amdgpu)

2024-02-20 Thread Alex Deucher
On Mon, Feb 19, 2024 at 9:47 AM Linux regression tracking (Thorsten
Leemhuis)  wrote:
>
> On 17.02.24 14:30, Greg KH wrote:
> > On Sat, Feb 17, 2024 at 02:01:54PM +0100, Roman Benes wrote:
> >> Minimum power limit on latest(6.7+) kernels is 190W for my GPU (RX 6700XT,
> >> mesa, archlinux) and I cannot get power cap as low as before(to 115W),
> >> neither with Corectrl, LACT or TuxClocker and /sys have a variable 
> >> read-only
> >> even for root. This is not of above apps issue but of the kernel, I read
> >> similar issues from other bug reports of above apps. I downgraded to 
> >> v6.6.10
> >> kernel and my 115W(under power)cap work again as before.
> >
> > Any chance you can use 'git bisect' to figure out the offending change?
>
> For the record and everyone that lands here: the cause is known now
> (it's 1958946858a62b ("drm/amd/pm: Support for getting power1_cap_min
> value") [v6.7-rc1]) and the issue afaics tracked here:
>
> https://gitlab.freedesktop.org/drm/amd/-/issues/3183
>
> Other mentions:
> https://gitlab.freedesktop.org/drm/amd/-/issues/3137
> https://gitlab.freedesktop.org/drm/amd/-/issues/2992
>
> Haven't seen any statement from the amdgpu developers (now CCed) yet on
> this there (but might have missed something!). From what I can see I
> assume this will likely be somewhat tricky to handle, as a revert
> overall might be a bad idea here. We'll see I guess.

The change aligns the driver what has been validated on each board
design.  Windows uses the same limits.  Using values lower than the
validated range can lead to undefined behavior and could potentially
damage your hardware.

Alex

>
> Roman posted something that apparently was meant to go to the list, so
> let me put it here:
>
> """
> UPDATE: User fililip already posted patch, but it need to be merged,
> discussion is on gitlab link below.
>
> (PS: I hope I am replying correctly to "all" now? - using original addr.)
>
>
> > it seems that commit was already found(see user's 'fililip' comment):
> >
> > https://gitlab.freedesktop.org/drm/amd/-/issues/3183
> > commit 1958946858a62b6b5392ed075aa219d199bcae39
> > Author: Ma Jun 
> > Date:   Thu Oct 12 09:33:45 2023 +0800
> >
> > drm/amd/pm: Support for getting power1_cap_min value
> >
> > Support for getting power1_cap_min value on smu13 and smu11.
> > For other Asics, we still use 0 as the default value.
> >
> > Signed-off-by: Ma Jun 
> > Reviewed-by: Kenneth Feng 
> > Signed-off-by: Alex Deucher 
> >
> > However, this is not good as it remove under-powering range too far. I
> was getting only about 7% less performance but 90W(!) less consumption
> when set to my 115W before. Also I wonder if we as a OS of options and
> freedom have to stick to such very high reference for min values without
> ability to override them through some sys ctrls. Commit was done by amd
> guy and I wonder if because of maybe this post that I made few months
> ago(business strategy?):
> >
> >
> https://www.reddit.com/r/Amd/comments/183gye7/rx_6700xt_from_230w_to_capped_115w_at_only_10/
> >
> > This is not a dangerous OC upwards where I can understand desire to
> protect HW, it is downward, having min cap at 190W when card pull on
> 115W almost same speed is IMO crazy to deny. We don't talk about default
> or reference values here either, just a move to lower the range of
> options for whatever reason.
> >
> > I don't know how much power you guys have over them, but please
> consider either reverting this change, or give us an option to set
> min_cap through say /sys (right now param is readonly, even for root).
> >
> >
> > Thank you in advance for looking into this, with regards:  Romano
> """
>
> And while at it, let me add this issue to the tracking as well
>
> [TLDR: I'm adding this report to the list of tracked Linux kernel
> regressions; the text you find below is based on a few templates
> paragraphs you might have encountered already in similar form.
> See link in footer if these mails annoy you.]
>
> Thanks for the report. To be sure the issue doesn't fall through the
> cracks unnoticed, I'm adding it to regzbot, the Linux kernel regression
> tracking bot:
>
> #regzbot introduced 1958946858a62b /
> #regzbot title drm: amdgpu: under-powering broke
>
> Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
> --
> Everything you wanna know about Linux kernel regression tracking:
> https://linux-regtracking.leemhuis.info/about/#tldr
> That page also explains what to do if mails like this annoy you.


Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-20 Thread Lazar, Lijo



On 2/20/2024 7:52 PM, Christian König wrote:
> Am 20.02.24 um 07:32 schrieb Lazar, Lijo:
>> On 2/16/2024 8:43 PM, Alex Deucher wrote:
>>> Use the new reset critical section accessors for debugfs, sysfs,
>>> and the INFO IOCTL to provide proper mutual exclusivity
>>> to hardware with respect the GPU resets.
>>>
>> This looks more like a priority inversion. When the device needs reset,
>> it doesn't make sense for reset handling to wait on these. Instead,
>> reset should be done at the earliest.
> 
> Nope that doesn't make any sense.
> 
> We block out operations which are not allowed to run concurrently with
> resets here.
> 
> So the reset absolutely has to wait for those operations to finish.
> 

Not at all. The first thing is to make sure that in_reset is set at the
earliest so that anything like these don't create further outstanding
transactions on the device. This patch prevents that.

The operations done/queried on the device on a hung state doesn't make
sense. The more the opportunities for such outstanding transactions, the
further it creates issues during the actual reset process.

Thanks,
Lijo

> Regards,
> Christian.
> 
>>
>> Thanks,
>> Lijo
>>
>>> Signed-off-by: Alex Deucher 
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
>>>   drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
>>>   drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
>>>   4 files changed, 235 insertions(+), 101 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 72eceb7d6667..d0e4a8729703 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct
>>> seq_file *m, void *unused)
>>>   }
>>>     /* Avoid accidently unparking the sched thread during GPU
>>> reset */
>>> -    r = down_write_killable(>reset_domain->sem);
>>> +    r = amdgpu_reset_domain_access_write_start(adev);
>>>   if (r)
>>>   return r;
>>>   @@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct
>>> seq_file *m, void *unused)
>>>   kthread_unpark(ring->sched.thread);
>>>   }
>>>   -    up_write(>reset_domain->sem);
>>> +    amdgpu_reset_domain_access_write_end(adev);
>>>     pm_runtime_mark_last_busy(dev->dev);
>>>   pm_runtime_put_autosuspend(dev->dev);
>>> @@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void
>>> *data, u64 val)
>>>   return -ENOMEM;
>>>     /* Avoid accidently unparking the sched thread during GPU
>>> reset */
>>> -    r = down_read_killable(>reset_domain->sem);
>>> +    r = amdgpu_reset_domain_access_read_start(adev);
>>>   if (r)
>>>   goto pro_end;
>>>   @@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void
>>> *data, u64 val)
>>>   /* restart the scheduler */
>>>   kthread_unpark(ring->sched.thread);
>>>   -    up_read(>reset_domain->sem);
>>> +    amdgpu_reset_domain_access_read_end(adev);
>>>     pro_end:
>>>   kfree(fences);
>>> @@ -2031,23 +2031,23 @@ static ssize_t
>>> amdgpu_reset_dump_register_list_read(struct file *f,
>>>   return 0;
>>>     memset(reg_offset, 0, 12);
>>> -    ret = down_read_killable(>reset_domain->sem);
>>> +    ret = amdgpu_reset_domain_access_read_start(adev);
>>>   if (ret)
>>>   return ret;
>>>     for (i = 0; i < adev->reset_info.num_regs; i++) {
>>>   sprintf(reg_offset, "0x%x\n",
>>> adev->reset_info.reset_dump_reg_list[i]);
>>> -    up_read(>reset_domain->sem);
>>> +    amdgpu_reset_domain_access_read_end(adev);
>>>   if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
>>>   return -EFAULT;
>>>     len += strlen(reg_offset);
>>> -    ret = down_read_killable(>reset_domain->sem);
>>> +    ret = amdgpu_reset_domain_access_read_start(adev);
>>>   if (ret)
>>>   return ret;
>>>   }
>>>   -    up_read(>reset_domain->sem);
>>> +    amdgpu_reset_domain_access_read_end(adev);
>>>   *pos += len;
>>>     return len;
>>> @@ -2089,14 +2089,14 @@ static ssize_t
>>> amdgpu_reset_dump_register_list_write(struct file *f,
>>>   ret = -ENOMEM;
>>>   goto error_free;
>>>   }
>>> -    ret = down_write_killable(>reset_domain->sem);
>>> +    ret = amdgpu_reset_domain_access_write_start(adev);
>>>   if (ret)
>>>   goto error_free;
>>>     swap(adev->reset_info.reset_dump_reg_list, tmp);
>>>   swap(adev->reset_info.reset_dump_reg_value, new);
>>>   adev->reset_info.num_regs = i;
>>> -    up_write(>reset_domain->sem);
>>> +    amdgpu_reset_domain_access_write_end(adev);
>>>   ret = size;
>>>     error_free:
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index 

Re: [PATCH 2/2] drm/amdgpu: use new reset-affected accessors for userspace interfaces

2024-02-20 Thread Christian König

Am 20.02.24 um 07:32 schrieb Lazar, Lijo:

On 2/16/2024 8:43 PM, Alex Deucher wrote:

Use the new reset critical section accessors for debugfs, sysfs,
and the INFO IOCTL to provide proper mutual exclusivity
to hardware with respect the GPU resets.


This looks more like a priority inversion. When the device needs reset,
it doesn't make sense for reset handling to wait on these. Instead,
reset should be done at the earliest.


Nope that doesn't make any sense.

We block out operations which are not allowed to run concurrently with 
resets here.


So the reset absolutely has to wait for those operations to finish.

Regards,
Christian.



Thanks,
Lijo


Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  20 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  10 +
  drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 215 
  drivers/gpu/drm/amd/pm/amdgpu_pm.c  |  91 -
  4 files changed, 235 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 72eceb7d6667..d0e4a8729703 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1670,7 +1670,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file 
*m, void *unused)
}
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_write_killable(>reset_domain->sem);
+   r = amdgpu_reset_domain_access_write_start(adev);
if (r)
return r;
  
@@ -1699,7 +1699,7 @@ static int amdgpu_debugfs_test_ib_show(struct seq_file *m, void *unused)

kthread_unpark(ring->sched.thread);
}
  
-	up_write(>reset_domain->sem);

+   amdgpu_reset_domain_access_write_end(adev);
  
  	pm_runtime_mark_last_busy(dev->dev);

pm_runtime_put_autosuspend(dev->dev);
@@ -1929,7 +1929,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
  
  	/* Avoid accidently unparking the sched thread during GPU reset */

-   r = down_read_killable(>reset_domain->sem);
+   r = amdgpu_reset_domain_access_read_start(adev);
if (r)
goto pro_end;
  
@@ -1970,7 +1970,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)

/* restart the scheduler */
kthread_unpark(ring->sched.thread);
  
-	up_read(>reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
  
  pro_end:

kfree(fences);
@@ -2031,23 +2031,23 @@ static ssize_t 
amdgpu_reset_dump_register_list_read(struct file *f,
return 0;
  
  	memset(reg_offset, 0, 12);

-   ret = down_read_killable(>reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
  
  	for (i = 0; i < adev->reset_info.num_regs; i++) {

sprintf(reg_offset, "0x%x\n", 
adev->reset_info.reset_dump_reg_list[i]);
-   up_read(>reset_domain->sem);
+   amdgpu_reset_domain_access_read_end(adev);
if (copy_to_user(buf + len, reg_offset, strlen(reg_offset)))
return -EFAULT;
  
  		len += strlen(reg_offset);

-   ret = down_read_killable(>reset_domain->sem);
+   ret = amdgpu_reset_domain_access_read_start(adev);
if (ret)
return ret;
}
  
-	up_read(>reset_domain->sem);

+   amdgpu_reset_domain_access_read_end(adev);
*pos += len;
  
  	return len;

@@ -2089,14 +2089,14 @@ static ssize_t 
amdgpu_reset_dump_register_list_write(struct file *f,
ret = -ENOMEM;
goto error_free;
}
-   ret = down_write_killable(>reset_domain->sem);
+   ret = amdgpu_reset_domain_access_write_start(adev);
if (ret)
goto error_free;
  
  	swap(adev->reset_info.reset_dump_reg_list, tmp);

swap(adev->reset_info.reset_dump_reg_value, new);
adev->reset_info.num_regs = i;
-   up_write(>reset_domain->sem);
+   amdgpu_reset_domain_access_write_end(adev);
ret = size;
  
  error_free:

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index a2df3025a754..4efb44a964ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,7 @@
  #include "amdgpu_gem.h"
  #include "amdgpu_display.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
  #include "amd_pcie.h"
  
  void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)

@@ -704,7 +705,11 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
return copy_to_user(out, , min(size, 4u)) ? -EFAULT : 0;
}
case AMDGPU_INFO_TIMESTAMP:
+   ret = amdgpu_reset_domain_access_read_start(adev);
+   if (ret)
+   return -EFAULT;
   

Re: [PATCH 5/5] drm/amdgpu: skip GFX FED error in page fault handling

2024-02-20 Thread Christian König

Am 19.02.24 um 09:15 schrieb Tao Zhou:

Let kfd interrupt handler process it.

Signed-off-by: Tao Zhou 
---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 773725a92cf1..70defc394b7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -552,7 +552,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
  {
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
-   uint32_t status = 0, cid = 0, rw = 0;
+   uint32_t status = 0, cid = 0, rw = 0, fed = 0;
struct amdgpu_task_info task_info;
struct amdgpu_vmhub *hub;
const char *mmhub_cid;
@@ -663,6 +663,14 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device 
*adev,
status = RREG32(hub->vm_l2_pro_fault_status);
cid = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, CID);
rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
+   fed = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, FED);
+
+   /* for gfx fed error, kfd will handle it, return directly */
+   if (fed && amdgpu_ras_is_poison_mode_supported(adev) &&
+   amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(9, 4, 2) &&
+   !strcmp(hub_name, "gfxhub0"))


Please never ever use strcmp() to make a decision like that, 
*especially* not in an interrupt handler.


Regards,
Christian.


+   return 1;
+
WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
  #ifdef HAVE_STRUCT_XARRAY
amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);




RE: [PATCH] drm/amd: Only allow one entity to control ABM

2024-02-20 Thread Deucher, Alexander
[Public]

> -Original Message-
> From: amd-gfx  On Behalf Of
> Christian König
> Sent: Tuesday, February 20, 2024 9:10 AM
> To: Alex Deucher 
> Cc: Limonciello, Mario ; Wentland, Harry
> ; amd-gfx@lists.freedesktop.org; dri-
> de...@lists.freedesktop.org; Mahfooz, Hamza ;
> Li, Sun peng (Leo) 
> Subject: Re: [PATCH] drm/amd: Only allow one entity to control ABM
>
> Am 19.02.24 um 16:28 schrieb Alex Deucher:
> > On Mon, Feb 19, 2024 at 10:19 AM Christian König
> >  wrote:
> >> Am 16.02.24 um 19:37 schrieb Alex Deucher:
> >>> On Fri, Feb 16, 2024 at 10:42 AM Christian König
> >>>  wrote:
>  Am 16.02.24 um 16:12 schrieb Mario Limonciello:
> > On 2/16/2024 09:05, Harry Wentland wrote:
> >> On 2024-02-16 09:47, Christian König wrote:
> >>> Am 16.02.24 um 15:42 schrieb Mario Limonciello:
>  On 2/16/2024 08:38, Christian König wrote:
> > Am 16.02.24 um 15:07 schrieb Mario Limonciello:
> >> By exporting ABM to sysfs it's possible that DRM master and
> >> software controlling the sysfs file fight over the value programmed
> for ABM.
> >>
> >> Adjust the module parameter behavior to control who control
> ABM:
> >> -2: DRM
> >> -1: sysfs (IE via software like power-profiles-daemon)
> > Well that sounds extremely awkward. Why should a
> > power-profiles-deamon has control over the panel power saving
> > features?
> >
> > I mean we are talking about things like reducing backlight
> > level when the is inactivity, don't we?
>  We're talking about activating the ABM algorithm when the
>  system is in power saving mode; not from inactivity.  This
>  allows the user to squeeze out some extra power "just" in that
> situation.
> 
>  But given the comments on the other patch, I tend to agree with
>  Harry's proposal instead that we just drop the DRM property
>  entirely as there are no consumers of it.
> >>> Yeah, but even then the design to let this be controlled by an
> >>> userspace deamon is questionable. Stuff like that is handled
> >>> inside the kernel and not exposed to userspace usually.
> >>>
> > Regarding the "how" and "why" of PPD; besides this panel power
> > savings sysfs file there are two other things that are nominally 
> > changed.
> >
> > ACPI platform profile:
> > https://www.kernel.org/doc/html/latest/userspace-api/sysfs-platfor
> > m_profile.html
> >
> > AMD-Pstate EPP value:
> > https://www.kernel.org/doc/html//latest/admin-guide/pm/amd-
> pstate.
> > html
> >
> > When a user goes into "power saving" mode both of those are tweaked.
> > Before we introduced the EPP tweaking in PPD we did discuss a
> > callback within the kernel so that userspace could change "just"
> > the ACPI platform profile and everything else would react.  There
> > was pushback on this, and so instead knobs are offered for things
> > that should be tweaked and the userspace daemon can set up policy
> > for what to do when a a user uses a userspace client (such as
> > GNOME or KDE) to change the desired system profile.
>  Ok, well who came up with the idea of the userspace deamon? Cause I
>  think there will be even more push back on this approach.
> 
>  Basically when we go from AC to battery (or whatever) the drivers
>  usually handle that all inside the kernel today. Involving
>  userspace is only done when there is a need for that, e.g.
>  inactivity detection or similar.
> >>> Well, we don't want policy in the kernel unless it's a platform or
> >>> hardware requirement.  Kernel should provide the knobs and then
> >>> userspace can set them however they want depending on user preference.
> >> Well, you not have the policy itself but usually the handling inside
> >> the kernel.
> >>
> >> In other words when I connect/disconnect AC from my laptop I can hear
> >> the fan changing, which is a switch in power state. Only the beep
> >> which comes out of the speakers as conformation is handled in userspace I
> think.
> >>
> >> And IIRC changing background light is also handled completely inside
> >> the kernel and when I close the lid the display turns off on its own
> >> and not because of some userspace deamon.
> >>
> >> So why is for this suddenly a userspace deamon involved?
> > It's a user preference.  Some people won't like ABM, some will.  They
> > set the policy from user space.  It's similar to the backlight level.
> > Some users always prefer a bright backlight regardless of AC/DC state,
> > others want the backlight to get brighter when on AC power.  The
> > kernel provides the knobs to set the ABM level and then user space can
> > specify the level and also device when they want it enabled (never,
> > only on DC, etc.).  The kernel driver for the backlight doesn't change
> > the backlight at AC/DC switch, 

Re: [PATCH] drm/amd: Only allow one entity to control ABM

2024-02-20 Thread Christian König

Am 19.02.24 um 16:28 schrieb Alex Deucher:

On Mon, Feb 19, 2024 at 10:19 AM Christian König
 wrote:

Am 16.02.24 um 19:37 schrieb Alex Deucher:

On Fri, Feb 16, 2024 at 10:42 AM Christian König
 wrote:

Am 16.02.24 um 16:12 schrieb Mario Limonciello:

On 2/16/2024 09:05, Harry Wentland wrote:

On 2024-02-16 09:47, Christian König wrote:

Am 16.02.24 um 15:42 schrieb Mario Limonciello:

On 2/16/2024 08:38, Christian König wrote:

Am 16.02.24 um 15:07 schrieb Mario Limonciello:

By exporting ABM to sysfs it's possible that DRM master and software
controlling the sysfs file fight over the value programmed for ABM.

Adjust the module parameter behavior to control who control ABM:
-2: DRM
-1: sysfs (IE via software like power-profiles-daemon)

Well that sounds extremely awkward. Why should a
power-profiles-deamon has control over the panel power saving
features?

I mean we are talking about things like reducing backlight level
when the is inactivity, don't we?

We're talking about activating the ABM algorithm when the system is
in power saving mode; not from inactivity.  This allows the user to
squeeze out some extra power "just" in that situation.

But given the comments on the other patch, I tend to agree with
Harry's proposal instead that we just drop the DRM property
entirely as there are no consumers of it.

Yeah, but even then the design to let this be controlled by an
userspace deamon is questionable. Stuff like that is handled inside
the kernel and not exposed to userspace usually.


Regarding the "how" and "why" of PPD; besides this panel power savings
sysfs file there are two other things that are nominally changed.

ACPI platform profile:
https://www.kernel.org/doc/html/latest/userspace-api/sysfs-platform_profile.html

AMD-Pstate EPP value:
https://www.kernel.org/doc/html//latest/admin-guide/pm/amd-pstate.html

When a user goes into "power saving" mode both of those are tweaked.
Before we introduced the EPP tweaking in PPD we did discuss a callback
within the kernel so that userspace could change "just" the ACPI
platform profile and everything else would react.  There was pushback
on this, and so instead knobs are offered for things that should be
tweaked and the userspace daemon can set up policy for what to do when
a a user uses a userspace client (such as GNOME or KDE) to change the
desired system profile.

Ok, well who came up with the idea of the userspace deamon? Cause I
think there will be even more push back on this approach.

Basically when we go from AC to battery (or whatever) the drivers
usually handle that all inside the kernel today. Involving userspace is
only done when there is a need for that, e.g. inactivity detection or
similar.

Well, we don't want policy in the kernel unless it's a platform or
hardware requirement.  Kernel should provide the knobs and then
userspace can set them however they want depending on user preference.

Well, you not have the policy itself but usually the handling inside the
kernel.

In other words when I connect/disconnect AC from my laptop I can hear
the fan changing, which is a switch in power state. Only the beep which
comes out of the speakers as conformation is handled in userspace I think.

And IIRC changing background light is also handled completely inside the
kernel and when I close the lid the display turns off on its own and not
because of some userspace deamon.

So why is for this suddenly a userspace deamon involved?

It's a user preference.  Some people won't like ABM, some will.  They
set the policy from user space.  It's similar to the backlight level.
Some users always prefer a bright backlight regardless of AC/DC state,
others want the backlight to get brighter when on AC power.  The
kernel provides the knobs to set the ABM level and then user space can
specify the level and also device when they want it enabled (never,
only on DC, etc.).  The kernel driver for the backlight doesn't change
the backlight at AC/DC switch, userspace gets an event when the power
source changes and it then talks to the kernel to change the backlight
level.  I think lid is handled the same way.  Userspace gets a lid
event and it turns off the displays and maybe enters suspend or maybe
not depending on what the user wants.


Ok, well that comes as a surprise. Which userspace component is 
responsible for this?


Christian.



Alex


Christian.


Alex



I think we'll need a bit in our kernel docs describing ABM.
Assumptions around what it is and does seem to lead to confusion.

The thing is that ABM has a visual impact that not all users like. It
would make sense for users to be able to change the ABM level as
desired, and/or configure their power profiles to select a certain
ABM level.

ABM will reduce the backlight and compensate by adjusting brightness
and contrast of the image. It has 5 levels: 0, 1, 2, 3, 4. 0 means
off. 4 means maximum backlight reduction. IMO, 1 and 2 look okay. 3
and 4 can be quite impactful, both to power and visual fidelity.



Re: [PATCH v2] drm/amd/display: Fix potential null pointer dereference in dc_dmub_srv

2024-02-20 Thread Chung, ChiaHsuan (Tom)

Reviewed-by: Tom Chung 

On 2/20/2024 5:06 PM, Srinivasan Shanmugam wrote:

Fixes potential null pointer dereference warnings in the
dc_dmub_srv_cmd_list_queue_execute() and dc_dmub_srv_is_hw_pwr_up()
functions.

In both functions, the 'dc_dmub_srv' variable was being dereferenced
before it was checked for null. This could lead to a null pointer
dereference if 'dc_dmub_srv' is null. The fix is to check if
'dc_dmub_srv' is null before dereferencing it.

Thus moving the null checks for 'dc_dmub_srv' to the beginning of the
functions to ensure that 'dc_dmub_srv' is not null when it is
dereferenced.

Found by smatch & thus fixing the below:
drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:133 
dc_dmub_srv_cmd_list_queue_execute() warn: variable dereferenced before check 
'dc_dmub_srv' (see line 128)
drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:1167 
dc_dmub_srv_is_hw_pwr_up() warn: variable dereferenced before check 
'dc_dmub_srv' (see line 1164)

Fixes: 01fbdc34c687 ("drm/amd/display: decouple dmcub execution to reduce lock 
granularity")
Fixes: 65138eb72e1f ("drm/amd/display: Add DCN35 DMUB")
Cc: JinZe.Xu 
Cc: Hersen Wu 
Cc: Josip Pavic 
Cc: Roman Li 
Cc: Qingqing Zhuo 
Cc: Harry Wentland 
Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Cc: Tom Chung 
Signed-off-by: Srinivasan Shanmugam 
---
v2:
  - For dc_dmub_srv_is_hw_pwr_up() move 'dc_ctx = dc_dmub_srv->ctx;'
below 'if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation)' (Tom)

  drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c 
b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
index 0bc32537e2eb..a115e1170ef5 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
@@ -128,7 +128,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv 
*dc_dmub_srv,
unsigned int count,
union dmub_rb_cmd *cmd_list)
  {
-   struct dc_context *dc_ctx = dc_dmub_srv->ctx;
+   struct dc_context *dc_ctx;
struct dmub_srv *dmub;
enum dmub_status status;
int i;
@@ -136,6 +136,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv 
*dc_dmub_srv,
if (!dc_dmub_srv || !dc_dmub_srv->dmub)
return false;
  
+	dc_ctx = dc_dmub_srv->ctx;

dmub = dc_dmub_srv->dmub;
  
  	for (i = 0 ; i < count; i++) {

@@ -1169,7 +1170,7 @@ void dc_dmub_srv_subvp_save_surf_addr(const struct 
dc_dmub_srv *dc_dmub_srv, con
  
  bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv *dc_dmub_srv, bool wait)

  {
-   struct dc_context *dc_ctx = dc_dmub_srv->ctx;
+   struct dc_context *dc_ctx;
enum dmub_status status;
  
  	if (!dc_dmub_srv || !dc_dmub_srv->dmub)

@@ -1177,6 +1178,8 @@ bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv 
*dc_dmub_srv, bool wait)
  
  	if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation)

return true;
+
+   dc_ctx = dc_dmub_srv->ctx;
  
  	if (wait) {

if (dc_dmub_srv->ctx->dc->debug.disable_timeout) {


[PATCH v2] drm/amd/display: Fix potential null pointer dereference in dc_dmub_srv

2024-02-20 Thread Srinivasan Shanmugam
Fixes potential null pointer dereference warnings in the
dc_dmub_srv_cmd_list_queue_execute() and dc_dmub_srv_is_hw_pwr_up()
functions.

In both functions, the 'dc_dmub_srv' variable was being dereferenced
before it was checked for null. This could lead to a null pointer
dereference if 'dc_dmub_srv' is null. The fix is to check if
'dc_dmub_srv' is null before dereferencing it.

Thus moving the null checks for 'dc_dmub_srv' to the beginning of the
functions to ensure that 'dc_dmub_srv' is not null when it is
dereferenced.

Found by smatch & thus fixing the below:
drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:133 
dc_dmub_srv_cmd_list_queue_execute() warn: variable dereferenced before check 
'dc_dmub_srv' (see line 128)
drivers/gpu/drm/amd/amdgpu/../display/dc/dc_dmub_srv.c:1167 
dc_dmub_srv_is_hw_pwr_up() warn: variable dereferenced before check 
'dc_dmub_srv' (see line 1164)

Fixes: 01fbdc34c687 ("drm/amd/display: decouple dmcub execution to reduce lock 
granularity")
Fixes: 65138eb72e1f ("drm/amd/display: Add DCN35 DMUB")
Cc: JinZe.Xu 
Cc: Hersen Wu 
Cc: Josip Pavic 
Cc: Roman Li 
Cc: Qingqing Zhuo 
Cc: Harry Wentland 
Cc: Rodrigo Siqueira 
Cc: Aurabindo Pillai 
Cc: Tom Chung 
Signed-off-by: Srinivasan Shanmugam 
---
v2:
 - For dc_dmub_srv_is_hw_pwr_up() move 'dc_ctx = dc_dmub_srv->ctx;'
   below 'if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation)' (Tom) 

 drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c 
b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
index 0bc32537e2eb..a115e1170ef5 100644
--- a/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
+++ b/drivers/gpu/drm/amd/display/dc/dc_dmub_srv.c
@@ -128,7 +128,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv 
*dc_dmub_srv,
unsigned int count,
union dmub_rb_cmd *cmd_list)
 {
-   struct dc_context *dc_ctx = dc_dmub_srv->ctx;
+   struct dc_context *dc_ctx;
struct dmub_srv *dmub;
enum dmub_status status;
int i;
@@ -136,6 +136,7 @@ bool dc_dmub_srv_cmd_list_queue_execute(struct dc_dmub_srv 
*dc_dmub_srv,
if (!dc_dmub_srv || !dc_dmub_srv->dmub)
return false;
 
+   dc_ctx = dc_dmub_srv->ctx;
dmub = dc_dmub_srv->dmub;
 
for (i = 0 ; i < count; i++) {
@@ -1169,7 +1170,7 @@ void dc_dmub_srv_subvp_save_surf_addr(const struct 
dc_dmub_srv *dc_dmub_srv, con
 
 bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv *dc_dmub_srv, bool wait)
 {
-   struct dc_context *dc_ctx = dc_dmub_srv->ctx;
+   struct dc_context *dc_ctx;
enum dmub_status status;
 
if (!dc_dmub_srv || !dc_dmub_srv->dmub)
@@ -1177,6 +1178,8 @@ bool dc_dmub_srv_is_hw_pwr_up(struct dc_dmub_srv 
*dc_dmub_srv, bool wait)
 
if (dc_dmub_srv->ctx->dc->debug.dmcub_emulation)
return true;
+
+   dc_ctx = dc_dmub_srv->ctx;
 
if (wait) {
if (dc_dmub_srv->ctx->dc->debug.disable_timeout) {
-- 
2.34.1



Re: [PATCH] drm/radeon: Call mmiowb() at the end of radeon_ring_commit()

2024-02-20 Thread Christian König

Am 20.02.24 um 08:49 schrieb Huacai Chen:

Commit fb24ea52f78e0d595852e ("drivers: Remove explicit invocations of
mmiowb()") remove all mmiowb() in drivers, but it says:

"NOTE: mmiowb() has only ever guaranteed ordering in conjunction with
spin_unlock(). However, pairing each mmiowb() removal in this patch with
the corresponding call to spin_unlock() is not at all trivial, so there
is a small chance that this change may regress any drivers incorrectly
relying on mmiowb() to order MMIO writes between CPUs using lock-free
synchronisation."

The mmio in radeon_ring_commit() is protected by a mutex rather than a
spinlock, but in the mutex fastpath it behaves similar to spinlock and
need a mmiowb() to make sure the wptr is up-to-date for hardware.


Well, if your hw platform can't guarantee that MMIO writes are ordered 
then I would say you can't use radeon in the first place since this is a 
mandatory prerequisite for correct hw behavior.


Doing this here as a workaround is just the tip of the iceberg and 
doesn't really fix the underlying problem.


I strongly suggest to change your writel() implementation to include an 
mmiowb() instead. If that is to heavy weight than at least the mutex 
handling should be changed instead of adding platform specific 
workarounds to a platform independent driver.


Regards,
Christian.



Without this, we get such an error when run 'glxgears' on weak ordering
architectures such as LoongArch:

radeon :04:00.0: ring 0 stalled for more than 10324msec
radeon :04:00.0: ring 3 stalled for more than 10240msec
radeon :04:00.0: GPU lockup (current fence id 0x0001f412 last fence 
id 0x0001f414 on ring 3)
radeon :04:00.0: GPU lockup (current fence id 0xf940 last fence 
id 0xf941 on ring 0)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)

Cc: sta...@vger.kernel.org
Signed-off-by: Tianyang Zhang 
Signed-off-by: Huacai Chen 
---
  drivers/gpu/drm/radeon/radeon_ring.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/radeon/radeon_ring.c 
b/drivers/gpu/drm/radeon/radeon_ring.c
index 38048593bb4a..d461dc85d820 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -183,6 +183,7 @@ void radeon_ring_commit(struct radeon_device *rdev, struct 
radeon_ring *ring,
if (hdp_flush && rdev->asic->mmio_hdp_flush)
rdev->asic->mmio_hdp_flush(rdev);
radeon_ring_set_wptr(rdev, ring);
+   mmiowb(); /* Make sure wptr is up-to-date for hw */
  }
  
  /**




[PATCH] drm/radeon: Call mmiowb() at the end of radeon_ring_commit()

2024-02-20 Thread Huacai Chen
Commit fb24ea52f78e0d595852e ("drivers: Remove explicit invocations of
mmiowb()") remove all mmiowb() in drivers, but it says:

"NOTE: mmiowb() has only ever guaranteed ordering in conjunction with
spin_unlock(). However, pairing each mmiowb() removal in this patch with
the corresponding call to spin_unlock() is not at all trivial, so there
is a small chance that this change may regress any drivers incorrectly
relying on mmiowb() to order MMIO writes between CPUs using lock-free
synchronisation."

The mmio in radeon_ring_commit() is protected by a mutex rather than a
spinlock, but in the mutex fastpath it behaves similar to spinlock and
need a mmiowb() to make sure the wptr is up-to-date for hardware.

Without this, we get such an error when run 'glxgears' on weak ordering
architectures such as LoongArch:

radeon :04:00.0: ring 0 stalled for more than 10324msec
radeon :04:00.0: ring 3 stalled for more than 10240msec
radeon :04:00.0: GPU lockup (current fence id 0x0001f412 last fence 
id 0x0001f414 on ring 3)
radeon :04:00.0: GPU lockup (current fence id 0xf940 last fence 
id 0xf941 on ring 0)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)
radeon :04:00.0: scheduling IB failed (-35).
[drm:radeon_gem_va_ioctl [radeon]] *ERROR* Couldn't update BO_VA (-35)

Cc: sta...@vger.kernel.org
Signed-off-by: Tianyang Zhang 
Signed-off-by: Huacai Chen 
---
 drivers/gpu/drm/radeon/radeon_ring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/radeon/radeon_ring.c 
b/drivers/gpu/drm/radeon/radeon_ring.c
index 38048593bb4a..d461dc85d820 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -183,6 +183,7 @@ void radeon_ring_commit(struct radeon_device *rdev, struct 
radeon_ring *ring,
if (hdp_flush && rdev->asic->mmio_hdp_flush)
rdev->asic->mmio_hdp_flush(rdev);
radeon_ring_set_wptr(rdev, ring);
+   mmiowb(); /* Make sure wptr is up-to-date for hw */
 }
 
 /**
-- 
2.43.0