On Fri, Aug 23, 2024 at 10:13 AM Mario Limonciello
<[email protected]> wrote:
>
> On 8/23/2024 09:09, Alex Deucher wrote:
> > On Mon, Aug 19, 2024 at 10:30 PM Mario Limonciello <[email protected]> 
> > wrote:
> >>
> >> From: Mario Limonciello <[email protected]>
> >>
> >> If the dGPU is off, then reading the sysfs files with a sensor monitoring
> >> application will wake it. Change the behavior to return an error when the
> >> dGPU is in D3cold.
> >
> > I'm a little concerned that this will generate a flurry of bug reports
> > if this now reports an error.  One more comment below.
> >
>
> Do you have a particular app you're worried about, or just a general
> worry?  I've had a lot of people reach out to me complaining about
> battery life on A+A systems, and it comes down to the use of sensor
> monitoring software waking the dGPU which people don't seem to expect.

Nothing in particular.  Just expecting reports of "I checked my GPU
temperature and it returned an error.  It was working with the last
kernel."

>
> I did double check that software like 'sensors', 'mission center' and
> 'nvtop' don't freak out from this change.

Do we know what other devices do?  I wonder if it would make more
sense to have the userspace tools check the runpm state before trying
to access the device.  Of course there are a lot of them and fixing
them all is non-trivial...

Alex

>
> Here is what 'sensors' shows on my local workstation with this change.
>
> amdgpu-pci-6100
> Adapter: PCI adapter
> vddgfx:           N/A
> ERROR: Can't get value of subfeature fan1_min: Can't read
> ERROR: Can't get value of subfeature fan1_max: Can't read
> fan1:             N/A  (min =    0 RPM, max =    0 RPM)
> edge:             N/A  (crit = +97.0°C, hyst = -273.1°C)
> ERROR: Can't get value of subfeature power1_cap: Can't read
> PPT:              N/A  (cap =   0.00 W)
>
> >>
> >> Signed-off-by: Mario Limonciello <[email protected]>
> >> ---
> >>   drivers/gpu/drm/amd/pm/amdgpu_pm.c | 90 +++++++++++++++---------------
> >>   1 file changed, 45 insertions(+), 45 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
> >> b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> >> index c11952a4389bc..d6e38466fbb82 100644
> >> --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> >> +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
> >> @@ -142,7 +142,7 @@ static ssize_t amdgpu_get_power_dpm_state(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -173,7 +173,7 @@ static ssize_t amdgpu_set_power_dpm_state(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (strncmp("battery", buf, strlen("battery")) == 0)
> >> @@ -270,7 +270,7 @@ static ssize_t 
> >> amdgpu_get_power_dpm_force_performance_level(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -309,7 +309,7 @@ static ssize_t 
> >> amdgpu_set_power_dpm_force_performance_level(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (strncmp("low", buf, strlen("low")) == 0) {
> >> @@ -371,7 +371,7 @@ static ssize_t amdgpu_get_pp_num_states(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -409,7 +409,7 @@ static ssize_t amdgpu_get_pp_cur_state(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -448,7 +448,7 @@ static ssize_t amdgpu_get_pp_force_state(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (adev->pm.pp_force_state_enabled)
> >> @@ -471,7 +471,7 @@ static ssize_t amdgpu_set_pp_force_state(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          adev->pm.pp_force_state_enabled = false;
> >> @@ -541,7 +541,7 @@ static ssize_t amdgpu_get_pp_table(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -577,7 +577,7 @@ static ssize_t amdgpu_set_pp_table(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -760,7 +760,7 @@ static ssize_t amdgpu_set_pp_od_clk_voltage(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (count > 127 || count == 0)
> >> @@ -862,7 +862,7 @@ static ssize_t amdgpu_get_pp_od_clk_voltage(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -922,7 +922,7 @@ static ssize_t amdgpu_set_pp_features(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtou64(buf, 0, &featuremask);
> >> @@ -957,7 +957,7 @@ static ssize_t amdgpu_get_pp_features(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1026,7 +1026,7 @@ static ssize_t amdgpu_get_pp_dpm_clock(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1095,7 +1095,7 @@ static ssize_t amdgpu_set_pp_dpm_clock(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = amdgpu_read_mask(buf, count, &mask);
> >> @@ -1280,7 +1280,7 @@ static ssize_t amdgpu_get_pp_sclk_od(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1309,7 +1309,7 @@ static ssize_t amdgpu_set_pp_sclk_od(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtol(buf, 0, &value);
> >> @@ -1342,7 +1342,7 @@ static ssize_t amdgpu_get_pp_mclk_od(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1371,7 +1371,7 @@ static ssize_t amdgpu_set_pp_mclk_od(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtol(buf, 0, &value);
> >> @@ -1424,7 +1424,7 @@ static ssize_t 
> >> amdgpu_get_pp_power_profile_mode(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1463,7 +1463,7 @@ static ssize_t 
> >> amdgpu_set_pp_power_profile_mode(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          tmp[0] = *(buf);
> >> @@ -1517,7 +1517,7 @@ static int amdgpu_hwmon_get_sensor_generic(struct 
> >> amdgpu_device *adev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -1630,7 +1630,7 @@ static ssize_t amdgpu_get_pcie_bw(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (adev->flags & AMD_IS_APU)
> >> @@ -1673,7 +1673,7 @@ static ssize_t amdgpu_get_unique_id(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (adev->unique_id)
> >> @@ -1846,7 +1846,7 @@ static ssize_t amdgpu_get_pm_metrics(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -1887,7 +1887,7 @@ static ssize_t amdgpu_get_gpu_metrics(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(ddev->dev);
> >> @@ -2005,7 +2005,7 @@ static ssize_t amdgpu_set_smartshift_bias(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          r = pm_runtime_get_sync(ddev->dev);
> >> @@ -2227,7 +2227,7 @@ static ssize_t amdgpu_get_xgmi_plpd_policy(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          mode = amdgpu_dpm_get_xgmi_plpd_mode(adev, &mode_desc);
> >> @@ -2250,7 +2250,7 @@ static ssize_t amdgpu_set_xgmi_plpd_policy(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = kstrtos32(buf, 0, &mode);
> >> @@ -2652,7 +2652,7 @@ static ssize_t amdgpu_hwmon_get_pwm1_enable(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2684,7 +2684,7 @@ static ssize_t amdgpu_hwmon_set_pwm1_enable(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtoint(buf, 10, &value);
> >> @@ -2742,7 +2742,7 @@ static ssize_t amdgpu_hwmon_set_pwm1(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtou32(buf, 10, &value);
> >> @@ -2787,7 +2787,7 @@ static ssize_t amdgpu_hwmon_get_pwm1(struct device 
> >> *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2817,7 +2817,7 @@ static ssize_t amdgpu_hwmon_get_fan1_input(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2881,7 +2881,7 @@ static ssize_t amdgpu_hwmon_get_fan1_target(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2912,7 +2912,7 @@ static ssize_t amdgpu_hwmon_set_fan1_target(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtou32(buf, 10, &value);
> >> @@ -2956,7 +2956,7 @@ static ssize_t amdgpu_hwmon_get_fan1_enable(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -2988,7 +2988,7 @@ static ssize_t amdgpu_hwmon_set_fan1_enable(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          err = kstrtoint(buf, 10, &value);
> >> @@ -3128,7 +3128,7 @@ static ssize_t 
> >> amdgpu_hwmon_show_power_cap_generic(struct device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
> >> @@ -3209,7 +3209,7 @@ static ssize_t amdgpu_hwmon_set_power_cap(struct 
> >> device *dev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          if (amdgpu_sriov_vf(adev))
> >> @@ -3663,7 +3663,7 @@ static int amdgpu_retrieve_od_settings(struct 
> >> amdgpu_device *adev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = pm_runtime_get_sync(adev->dev);
> >> @@ -3747,7 +3747,7 @@ amdgpu_distribute_custom_od_settings(struct 
> >> amdgpu_device *adev,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = parse_input_od_command_lines(in_buf,
> >> @@ -4626,7 +4626,7 @@ static int amdgpu_debugfs_pm_info_show(struct 
> >> seq_file *m, void *unused)
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >
> > I'd prefer to keep the current behavior for debugfs.
>
> OK.  I'll exclude it for debugfs in the next spin.
>
> >
> > Alex
> >
> >>
> >>          r = pm_runtime_get_sync(dev->dev);
> >> @@ -4671,7 +4671,7 @@ static ssize_t amdgpu_pm_prv_buffer_read(struct file 
> >> *f, char __user *buf,
> >>
> >>          if (amdgpu_in_reset(adev))
> >>                  return -EPERM;
> >> -       if (adev->in_suspend && !adev->in_runpm)
> >> +       if (adev->in_suspend || adev->in_runpm)
> >>                  return -EPERM;
> >>
> >>          ret = amdgpu_dpm_get_smu_prv_buf_details(adev, &smu_prv_buf, 
> >> &smu_prv_buf_size);
> >> --
> >> 2.43.0
> >>
>

Reply via email to