Re: [Nouveau] [PATCH 03/32] therm: Split return code and value in nvkm_get_temp

2017-11-22 Thread Karol Herbst
On Wed, Nov 22, 2017 at 9:29 AM, Martin Peres  wrote:
> On 22/11/17 03:42, Karol Herbst wrote:
>> On Wed, Nov 22, 2017 at 1:32 AM, Martin Peres  wrote:
>>> On 17/11/17 02:04, Karol Herbst wrote:
 The current hwmon code doesn't check if the returned value was actually an
 error.

 Since Kepler temperature sensors are able to report negative values. Those
 negative values are not for error reporting, but rather when you buried
 your GPU in snow somewhere in Antarctica and still want a valid
 temperature to be reported (unverified).

 Since Pascal (and maybe earlier) we have sensors with improved precision.

 Adjust the nvkm_get_temp method to be able to deal with those changes
 and let hwmon return an error properly.
>>>
>>> Where did you get this information? And if it is the case, then we will
>>> royally screw up the value because we don't even know on how many bits
>>> 0x20400 uses...
>>>
>>> If you are indeed right, I would like to see g84.c's temp_get().
>>>
>>
>> Nvidia uses 0x020460 on Pascal and I thought we had that in envytools 
>> already?
>
> Nothing in nvbios.
>
>>
>> [0] 228.412618 MMIO32 R 0x020460 0x20002f20 PTHERM.I2C_SLAVE+0x60 => 
>> 0x20002f20
>> [0] 229.413039 MMIO32 R 0x020460 0x20002f10 PTHERM.I2C_SLAVE+0x60 => 
>> 0x20002f10
>> [0] 230.416976 MMIO32 R 0x020460 0x20002f68 PTHERM.I2C_SLAVE+0x60 => 
>> 0x20002f68
>> [0] 231.417407 MMIO32 R 0x020460 0x20002fe8 PTHERM.I2C_SLAVE+0x60 => 
>> 0x20002fe8
>> [0] 232.417742 MMIO32 R 0x020460 0x20002ff0 PTHERM.I2C_SLAVE+0x60 => 
>> 0x20002ff0
>> [0] 233.417742 MMIO32 R 0x020460 0x20003020 PTHERM.I2C_SLAVE+0x60 => 
>> 0x20003020
>>
>> or nvapeek 0x20400 && nvapeek 0x020460;
>> 00020400: 002c
>> 00020460: 20002c60
>
> Cool that they increased the accuracy, but where do you get that the
> value can be negative? And if it is, how many bits are used? 16 or more?
> What are those high bits?
>

I was asking rhyskidd to add stuff to rnndb, because he wrote the
nouveau patch for those sensors, check out for more details:
https://github.com/skeggsb/nouveau/blob/master/drm/nouveau/nvkm/subdev/therm/gp100.c#L27

regarding negative values, we have 0x02044c + 0x020450 for that and I
could imagine that 0x020460 can display negative values as well...
nobody tried for sure, but it seems plausible.

Anyway, moving from u8 to int should make life much easier for us if
we decide to really care about the higher precision. Currently our
code can't handle that for multiple reasons, but this solves one of
those at least.

>>

 Signed-off-by: Karol Herbst 
 ---
  drm/nouveau/include/nvkm/subdev/clk.h   |  4 ++--
  drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
  drm/nouveau/nouveau_hwmon.c | 15 +--
  drm/nouveau/nvkm/subdev/clk/base.c  |  2 +-
  drm/nouveau/nvkm/subdev/therm/base.c| 19 ++-
  drm/nouveau/nvkm/subdev/therm/g84.c | 13 -
  drm/nouveau/nvkm/subdev/therm/gp100.c   |  9 +
  drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++--
  drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++--
  drm/nouveau/nvkm/subdev/therm/priv.h|  4 ++--
  drm/nouveau/nvkm/subdev/therm/temp.c| 16 
  11 files changed, 60 insertions(+), 42 deletions(-)

 diff --git a/drm/nouveau/include/nvkm/subdev/clk.h 
 b/drm/nouveau/include/nvkm/subdev/clk.h
 index e5275f74..506f8cc6 100644
 --- a/drm/nouveau/include/nvkm/subdev/clk.h
 +++ b/drm/nouveau/include/nvkm/subdev/clk.h
 @@ -100,7 +100,7 @@ struct nvkm_clk {
   int ustate_dc; /* user-requested (-1 disabled, -2 perfmon) */
   int astate; /* perfmon adjustment (base) */
   int dstate; /* display adjustment (min+) */
 - u8  temp;
 + int temp;

   bool allow_reclock;
  #define NVKM_CLK_BOOST_NONE 0x0
 @@ -122,7 +122,7 @@ int nvkm_clk_read(struct nvkm_clk *, enum nv_clk_src);
  int nvkm_clk_ustate(struct nvkm_clk *, int req, int pwr);
  int nvkm_clk_astate(struct nvkm_clk *, int req, int rel, bool wait);
  int nvkm_clk_dstate(struct nvkm_clk *, int req, int rel);
 -int nvkm_clk_tstate(struct nvkm_clk *, u8 temperature);
 +int nvkm_clk_tstate(struct nvkm_clk *, int temperature);

  int nv04_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
  int nv40_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
 diff --git a/drm/nouveau/include/nvkm/subdev/therm.h 
 b/drm/nouveau/include/nvkm/subdev/therm.h
 index 9841f076..8c84017f 100644
 --- a/drm/nouveau/include/nvkm/subdev/therm.h
 +++ b/drm/nouveau/include/nvkm/subdev/therm.h
 @@ -86,7 +86,7 @@ struct nvkm_therm {
   int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
  };

 -int nvkm_therm_temp_get(struct nvkm_therm *);
 +int 

Re: [Nouveau] [PATCH 03/32] therm: Split return code and value in nvkm_get_temp

2017-11-22 Thread Martin Peres
On 22/11/17 03:42, Karol Herbst wrote:
> On Wed, Nov 22, 2017 at 1:32 AM, Martin Peres  wrote:
>> On 17/11/17 02:04, Karol Herbst wrote:
>>> The current hwmon code doesn't check if the returned value was actually an
>>> error.
>>>
>>> Since Kepler temperature sensors are able to report negative values. Those
>>> negative values are not for error reporting, but rather when you buried
>>> your GPU in snow somewhere in Antarctica and still want a valid
>>> temperature to be reported (unverified).
>>>
>>> Since Pascal (and maybe earlier) we have sensors with improved precision.
>>>
>>> Adjust the nvkm_get_temp method to be able to deal with those changes
>>> and let hwmon return an error properly.
>>
>> Where did you get this information? And if it is the case, then we will
>> royally screw up the value because we don't even know on how many bits
>> 0x20400 uses...
>>
>> If you are indeed right, I would like to see g84.c's temp_get().
>>
> 
> Nvidia uses 0x020460 on Pascal and I thought we had that in envytools already?

Nothing in nvbios.

> 
> [0] 228.412618 MMIO32 R 0x020460 0x20002f20 PTHERM.I2C_SLAVE+0x60 => 
> 0x20002f20
> [0] 229.413039 MMIO32 R 0x020460 0x20002f10 PTHERM.I2C_SLAVE+0x60 => 
> 0x20002f10
> [0] 230.416976 MMIO32 R 0x020460 0x20002f68 PTHERM.I2C_SLAVE+0x60 => 
> 0x20002f68
> [0] 231.417407 MMIO32 R 0x020460 0x20002fe8 PTHERM.I2C_SLAVE+0x60 => 
> 0x20002fe8
> [0] 232.417742 MMIO32 R 0x020460 0x20002ff0 PTHERM.I2C_SLAVE+0x60 => 
> 0x20002ff0
> [0] 233.417742 MMIO32 R 0x020460 0x20003020 PTHERM.I2C_SLAVE+0x60 => 
> 0x20003020
> 
> or nvapeek 0x20400 && nvapeek 0x020460;
> 00020400: 002c
> 00020460: 20002c60

Cool that they increased the accuracy, but where do you get that the
value can be negative? And if it is, how many bits are used? 16 or more?
What are those high bits?

> 
>>>
>>> Signed-off-by: Karol Herbst 
>>> ---
>>>  drm/nouveau/include/nvkm/subdev/clk.h   |  4 ++--
>>>  drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
>>>  drm/nouveau/nouveau_hwmon.c | 15 +--
>>>  drm/nouveau/nvkm/subdev/clk/base.c  |  2 +-
>>>  drm/nouveau/nvkm/subdev/therm/base.c| 19 ++-
>>>  drm/nouveau/nvkm/subdev/therm/g84.c | 13 -
>>>  drm/nouveau/nvkm/subdev/therm/gp100.c   |  9 +
>>>  drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++--
>>>  drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++--
>>>  drm/nouveau/nvkm/subdev/therm/priv.h|  4 ++--
>>>  drm/nouveau/nvkm/subdev/therm/temp.c| 16 
>>>  11 files changed, 60 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/drm/nouveau/include/nvkm/subdev/clk.h 
>>> b/drm/nouveau/include/nvkm/subdev/clk.h
>>> index e5275f74..506f8cc6 100644
>>> --- a/drm/nouveau/include/nvkm/subdev/clk.h
>>> +++ b/drm/nouveau/include/nvkm/subdev/clk.h
>>> @@ -100,7 +100,7 @@ struct nvkm_clk {
>>>   int ustate_dc; /* user-requested (-1 disabled, -2 perfmon) */
>>>   int astate; /* perfmon adjustment (base) */
>>>   int dstate; /* display adjustment (min+) */
>>> - u8  temp;
>>> + int temp;
>>>
>>>   bool allow_reclock;
>>>  #define NVKM_CLK_BOOST_NONE 0x0
>>> @@ -122,7 +122,7 @@ int nvkm_clk_read(struct nvkm_clk *, enum nv_clk_src);
>>>  int nvkm_clk_ustate(struct nvkm_clk *, int req, int pwr);
>>>  int nvkm_clk_astate(struct nvkm_clk *, int req, int rel, bool wait);
>>>  int nvkm_clk_dstate(struct nvkm_clk *, int req, int rel);
>>> -int nvkm_clk_tstate(struct nvkm_clk *, u8 temperature);
>>> +int nvkm_clk_tstate(struct nvkm_clk *, int temperature);
>>>
>>>  int nv04_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>>>  int nv40_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>>> diff --git a/drm/nouveau/include/nvkm/subdev/therm.h 
>>> b/drm/nouveau/include/nvkm/subdev/therm.h
>>> index 9841f076..8c84017f 100644
>>> --- a/drm/nouveau/include/nvkm/subdev/therm.h
>>> +++ b/drm/nouveau/include/nvkm/subdev/therm.h
>>> @@ -86,7 +86,7 @@ struct nvkm_therm {
>>>   int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
>>>  };
>>>
>>> -int nvkm_therm_temp_get(struct nvkm_therm *);
>>> +int nvkm_therm_temp_get(struct nvkm_therm *, int *);
>>>  int nvkm_therm_fan_sense(struct nvkm_therm *);
>>>  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
>>>
>>> diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
>>> index 7c965648..7486e4af 100644
>>> --- a/drm/nouveau/nouveau_hwmon.c
>>> +++ b/drm/nouveau/nouveau_hwmon.c
>>> @@ -326,8 +326,9 @@ nouveau_temp_is_visible(const void *data, u32 attr, int 
>>> channel)
>>>  {
>>>   struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
>>>   struct nvkm_therm *therm = nvxx_therm(>client.device);
>>> + int val;
>>>
>>> - if (therm && therm->attr_get && nvkm_therm_temp_get(therm) < 0)
>>> + if (therm && therm->attr_get && nvkm_therm_temp_get(therm, ))
>>>   return 0;
>>>
>>>   switch 

Re: [Nouveau] [PATCH 03/32] therm: Split return code and value in nvkm_get_temp

2017-11-21 Thread Karol Herbst
On Wed, Nov 22, 2017 at 1:32 AM, Martin Peres  wrote:
> On 17/11/17 02:04, Karol Herbst wrote:
>> The current hwmon code doesn't check if the returned value was actually an
>> error.
>>
>> Since Kepler temperature sensors are able to report negative values. Those
>> negative values are not for error reporting, but rather when you buried
>> your GPU in snow somewhere in Antarctica and still want a valid
>> temperature to be reported (unverified).
>>
>> Since Pascal (and maybe earlier) we have sensors with improved precision.
>>
>> Adjust the nvkm_get_temp method to be able to deal with those changes
>> and let hwmon return an error properly.
>
> Where did you get this information? And if it is the case, then we will
> royally screw up the value because we don't even know on how many bits
> 0x20400 uses...
>
> If you are indeed right, I would like to see g84.c's temp_get().
>

Nvidia uses 0x020460 on Pascal and I thought we had that in envytools already?

[0] 228.412618 MMIO32 R 0x020460 0x20002f20 PTHERM.I2C_SLAVE+0x60 => 0x20002f20
[0] 229.413039 MMIO32 R 0x020460 0x20002f10 PTHERM.I2C_SLAVE+0x60 => 0x20002f10
[0] 230.416976 MMIO32 R 0x020460 0x20002f68 PTHERM.I2C_SLAVE+0x60 => 0x20002f68
[0] 231.417407 MMIO32 R 0x020460 0x20002fe8 PTHERM.I2C_SLAVE+0x60 => 0x20002fe8
[0] 232.417742 MMIO32 R 0x020460 0x20002ff0 PTHERM.I2C_SLAVE+0x60 => 0x20002ff0
[0] 233.417742 MMIO32 R 0x020460 0x20003020 PTHERM.I2C_SLAVE+0x60 => 0x20003020

or nvapeek 0x20400 && nvapeek 0x020460;
00020400: 002c
00020460: 20002c60

>>
>> Signed-off-by: Karol Herbst 
>> ---
>>  drm/nouveau/include/nvkm/subdev/clk.h   |  4 ++--
>>  drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
>>  drm/nouveau/nouveau_hwmon.c | 15 +--
>>  drm/nouveau/nvkm/subdev/clk/base.c  |  2 +-
>>  drm/nouveau/nvkm/subdev/therm/base.c| 19 ++-
>>  drm/nouveau/nvkm/subdev/therm/g84.c | 13 -
>>  drm/nouveau/nvkm/subdev/therm/gp100.c   |  9 +
>>  drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++--
>>  drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++--
>>  drm/nouveau/nvkm/subdev/therm/priv.h|  4 ++--
>>  drm/nouveau/nvkm/subdev/therm/temp.c| 16 
>>  11 files changed, 60 insertions(+), 42 deletions(-)
>>
>> diff --git a/drm/nouveau/include/nvkm/subdev/clk.h 
>> b/drm/nouveau/include/nvkm/subdev/clk.h
>> index e5275f74..506f8cc6 100644
>> --- a/drm/nouveau/include/nvkm/subdev/clk.h
>> +++ b/drm/nouveau/include/nvkm/subdev/clk.h
>> @@ -100,7 +100,7 @@ struct nvkm_clk {
>>   int ustate_dc; /* user-requested (-1 disabled, -2 perfmon) */
>>   int astate; /* perfmon adjustment (base) */
>>   int dstate; /* display adjustment (min+) */
>> - u8  temp;
>> + int temp;
>>
>>   bool allow_reclock;
>>  #define NVKM_CLK_BOOST_NONE 0x0
>> @@ -122,7 +122,7 @@ int nvkm_clk_read(struct nvkm_clk *, enum nv_clk_src);
>>  int nvkm_clk_ustate(struct nvkm_clk *, int req, int pwr);
>>  int nvkm_clk_astate(struct nvkm_clk *, int req, int rel, bool wait);
>>  int nvkm_clk_dstate(struct nvkm_clk *, int req, int rel);
>> -int nvkm_clk_tstate(struct nvkm_clk *, u8 temperature);
>> +int nvkm_clk_tstate(struct nvkm_clk *, int temperature);
>>
>>  int nv04_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>>  int nv40_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>> diff --git a/drm/nouveau/include/nvkm/subdev/therm.h 
>> b/drm/nouveau/include/nvkm/subdev/therm.h
>> index 9841f076..8c84017f 100644
>> --- a/drm/nouveau/include/nvkm/subdev/therm.h
>> +++ b/drm/nouveau/include/nvkm/subdev/therm.h
>> @@ -86,7 +86,7 @@ struct nvkm_therm {
>>   int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
>>  };
>>
>> -int nvkm_therm_temp_get(struct nvkm_therm *);
>> +int nvkm_therm_temp_get(struct nvkm_therm *, int *);
>>  int nvkm_therm_fan_sense(struct nvkm_therm *);
>>  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
>>
>> diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
>> index 7c965648..7486e4af 100644
>> --- a/drm/nouveau/nouveau_hwmon.c
>> +++ b/drm/nouveau/nouveau_hwmon.c
>> @@ -326,8 +326,9 @@ nouveau_temp_is_visible(const void *data, u32 attr, int 
>> channel)
>>  {
>>   struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
>>   struct nvkm_therm *therm = nvxx_therm(>client.device);
>> + int val;
>>
>> - if (therm && therm->attr_get && nvkm_therm_temp_get(therm) < 0)
>> + if (therm && therm->attr_get && nvkm_therm_temp_get(therm, ))
>>   return 0;
>>
>>   switch (attr) {
>> @@ -421,15 +422,16 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
>> channel, long *val)
>>   struct drm_device *drm_dev = dev_get_drvdata(dev);
>>   struct nouveau_drm *drm = nouveau_drm(drm_dev);
>>   struct nvkm_therm *therm = nvxx_therm(>client.device);
>> - int ret;
>> + int ret = 0;
>> + int temp;
>>
>>   if 

Re: [Nouveau] [PATCH 03/32] therm: Split return code and value in nvkm_get_temp

2017-11-21 Thread Martin Peres
On 17/11/17 02:04, Karol Herbst wrote:
> The current hwmon code doesn't check if the returned value was actually an
> error.
> 
> Since Kepler temperature sensors are able to report negative values. Those
> negative values are not for error reporting, but rather when you buried
> your GPU in snow somewhere in Antarctica and still want a valid
> temperature to be reported (unverified).
> 
> Since Pascal (and maybe earlier) we have sensors with improved precision.
> 
> Adjust the nvkm_get_temp method to be able to deal with those changes
> and let hwmon return an error properly.

Where did you get this information? And if it is the case, then we will
royally screw up the value because we don't even know on how many bits
0x20400 uses...

If you are indeed right, I would like to see g84.c's temp_get().

> 
> Signed-off-by: Karol Herbst 
> ---
>  drm/nouveau/include/nvkm/subdev/clk.h   |  4 ++--
>  drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
>  drm/nouveau/nouveau_hwmon.c | 15 +--
>  drm/nouveau/nvkm/subdev/clk/base.c  |  2 +-
>  drm/nouveau/nvkm/subdev/therm/base.c| 19 ++-
>  drm/nouveau/nvkm/subdev/therm/g84.c | 13 -
>  drm/nouveau/nvkm/subdev/therm/gp100.c   |  9 +
>  drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++--
>  drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++--
>  drm/nouveau/nvkm/subdev/therm/priv.h|  4 ++--
>  drm/nouveau/nvkm/subdev/therm/temp.c| 16 
>  11 files changed, 60 insertions(+), 42 deletions(-)
> 
> diff --git a/drm/nouveau/include/nvkm/subdev/clk.h 
> b/drm/nouveau/include/nvkm/subdev/clk.h
> index e5275f74..506f8cc6 100644
> --- a/drm/nouveau/include/nvkm/subdev/clk.h
> +++ b/drm/nouveau/include/nvkm/subdev/clk.h
> @@ -100,7 +100,7 @@ struct nvkm_clk {
>   int ustate_dc; /* user-requested (-1 disabled, -2 perfmon) */
>   int astate; /* perfmon adjustment (base) */
>   int dstate; /* display adjustment (min+) */
> - u8  temp;
> + int temp;
>  
>   bool allow_reclock;
>  #define NVKM_CLK_BOOST_NONE 0x0
> @@ -122,7 +122,7 @@ int nvkm_clk_read(struct nvkm_clk *, enum nv_clk_src);
>  int nvkm_clk_ustate(struct nvkm_clk *, int req, int pwr);
>  int nvkm_clk_astate(struct nvkm_clk *, int req, int rel, bool wait);
>  int nvkm_clk_dstate(struct nvkm_clk *, int req, int rel);
> -int nvkm_clk_tstate(struct nvkm_clk *, u8 temperature);
> +int nvkm_clk_tstate(struct nvkm_clk *, int temperature);
>  
>  int nv04_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
>  int nv40_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
> diff --git a/drm/nouveau/include/nvkm/subdev/therm.h 
> b/drm/nouveau/include/nvkm/subdev/therm.h
> index 9841f076..8c84017f 100644
> --- a/drm/nouveau/include/nvkm/subdev/therm.h
> +++ b/drm/nouveau/include/nvkm/subdev/therm.h
> @@ -86,7 +86,7 @@ struct nvkm_therm {
>   int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
>  };
>  
> -int nvkm_therm_temp_get(struct nvkm_therm *);
> +int nvkm_therm_temp_get(struct nvkm_therm *, int *);
>  int nvkm_therm_fan_sense(struct nvkm_therm *);
>  int nvkm_therm_cstate(struct nvkm_therm *, int, int);
>  
> diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
> index 7c965648..7486e4af 100644
> --- a/drm/nouveau/nouveau_hwmon.c
> +++ b/drm/nouveau/nouveau_hwmon.c
> @@ -326,8 +326,9 @@ nouveau_temp_is_visible(const void *data, u32 attr, int 
> channel)
>  {
>   struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
>   struct nvkm_therm *therm = nvxx_therm(>client.device);
> + int val;
>  
> - if (therm && therm->attr_get && nvkm_therm_temp_get(therm) < 0)
> + if (therm && therm->attr_get && nvkm_therm_temp_get(therm, ))
>   return 0;
>  
>   switch (attr) {
> @@ -421,15 +422,16 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
> channel, long *val)
>   struct drm_device *drm_dev = dev_get_drvdata(dev);
>   struct nouveau_drm *drm = nouveau_drm(drm_dev);
>   struct nvkm_therm *therm = nvxx_therm(>client.device);
> - int ret;
> + int ret = 0;
> + int temp;
>  
>   if (!therm || !therm->attr_get)
>   return -EOPNOTSUPP;
>  
>   switch (attr) {
>   case hwmon_temp_input:
> - ret = nvkm_therm_temp_get(therm);
> - *val = ret < 0 ? ret : (ret * 1000);
> + ret = nvkm_therm_temp_get(therm, );
> + *val = temp * 1000;
>   break;
>   case hwmon_temp_max:
>   *val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
> @@ -459,7 +461,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
> channel, long *val)
>   return -EOPNOTSUPP;
>   }
>  
> - return 0;
> + return ret;
>  }
>  
>  static int
> @@ -713,6 +715,7 @@ nouveau_hwmon_init(struct drm_device *dev)
>   struct device *hwmon_dev;
>   int ret = 0;
>   int i = 0;
> + int 

[Nouveau] [PATCH 03/32] therm: Split return code and value in nvkm_get_temp

2017-11-16 Thread Karol Herbst
The current hwmon code doesn't check if the returned value was actually an
error.

Since Kepler temperature sensors are able to report negative values. Those
negative values are not for error reporting, but rather when you buried
your GPU in snow somewhere in Antarctica and still want a valid
temperature to be reported (unverified).

Since Pascal (and maybe earlier) we have sensors with improved precision.

Adjust the nvkm_get_temp method to be able to deal with those changes
and let hwmon return an error properly.

Signed-off-by: Karol Herbst 
---
 drm/nouveau/include/nvkm/subdev/clk.h   |  4 ++--
 drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
 drm/nouveau/nouveau_hwmon.c | 15 +--
 drm/nouveau/nvkm/subdev/clk/base.c  |  2 +-
 drm/nouveau/nvkm/subdev/therm/base.c| 19 ++-
 drm/nouveau/nvkm/subdev/therm/g84.c | 13 -
 drm/nouveau/nvkm/subdev/therm/gp100.c   |  9 +
 drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++--
 drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++--
 drm/nouveau/nvkm/subdev/therm/priv.h|  4 ++--
 drm/nouveau/nvkm/subdev/therm/temp.c| 16 
 11 files changed, 60 insertions(+), 42 deletions(-)

diff --git a/drm/nouveau/include/nvkm/subdev/clk.h 
b/drm/nouveau/include/nvkm/subdev/clk.h
index e5275f74..506f8cc6 100644
--- a/drm/nouveau/include/nvkm/subdev/clk.h
+++ b/drm/nouveau/include/nvkm/subdev/clk.h
@@ -100,7 +100,7 @@ struct nvkm_clk {
int ustate_dc; /* user-requested (-1 disabled, -2 perfmon) */
int astate; /* perfmon adjustment (base) */
int dstate; /* display adjustment (min+) */
-   u8  temp;
+   int temp;
 
bool allow_reclock;
 #define NVKM_CLK_BOOST_NONE 0x0
@@ -122,7 +122,7 @@ int nvkm_clk_read(struct nvkm_clk *, enum nv_clk_src);
 int nvkm_clk_ustate(struct nvkm_clk *, int req, int pwr);
 int nvkm_clk_astate(struct nvkm_clk *, int req, int rel, bool wait);
 int nvkm_clk_dstate(struct nvkm_clk *, int req, int rel);
-int nvkm_clk_tstate(struct nvkm_clk *, u8 temperature);
+int nvkm_clk_tstate(struct nvkm_clk *, int temperature);
 
 int nv04_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
 int nv40_clk_new(struct nvkm_device *, int, struct nvkm_clk **);
diff --git a/drm/nouveau/include/nvkm/subdev/therm.h 
b/drm/nouveau/include/nvkm/subdev/therm.h
index 9841f076..8c84017f 100644
--- a/drm/nouveau/include/nvkm/subdev/therm.h
+++ b/drm/nouveau/include/nvkm/subdev/therm.h
@@ -86,7 +86,7 @@ struct nvkm_therm {
int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
 };
 
-int nvkm_therm_temp_get(struct nvkm_therm *);
+int nvkm_therm_temp_get(struct nvkm_therm *, int *);
 int nvkm_therm_fan_sense(struct nvkm_therm *);
 int nvkm_therm_cstate(struct nvkm_therm *, int, int);
 
diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
index 7c965648..7486e4af 100644
--- a/drm/nouveau/nouveau_hwmon.c
+++ b/drm/nouveau/nouveau_hwmon.c
@@ -326,8 +326,9 @@ nouveau_temp_is_visible(const void *data, u32 attr, int 
channel)
 {
struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
struct nvkm_therm *therm = nvxx_therm(>client.device);
+   int val;
 
-   if (therm && therm->attr_get && nvkm_therm_temp_get(therm) < 0)
+   if (therm && therm->attr_get && nvkm_therm_temp_get(therm, ))
return 0;
 
switch (attr) {
@@ -421,15 +422,16 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
channel, long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_therm *therm = nvxx_therm(>client.device);
-   int ret;
+   int ret = 0;
+   int temp;
 
if (!therm || !therm->attr_get)
return -EOPNOTSUPP;
 
switch (attr) {
case hwmon_temp_input:
-   ret = nvkm_therm_temp_get(therm);
-   *val = ret < 0 ? ret : (ret * 1000);
+   ret = nvkm_therm_temp_get(therm, );
+   *val = temp * 1000;
break;
case hwmon_temp_max:
*val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
@@ -459,7 +461,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
channel, long *val)
return -EOPNOTSUPP;
}
 
-   return 0;
+   return ret;
 }
 
 static int
@@ -713,6 +715,7 @@ nouveau_hwmon_init(struct drm_device *dev)
struct device *hwmon_dev;
int ret = 0;
int i = 0;
+   int val;
 
hwmon = drm->hwmon = kzalloc(sizeof(*hwmon), GFP_KERNEL);
if (!hwmon)
@@ -720,7 +723,7 @@ nouveau_hwmon_init(struct drm_device *dev)
hwmon->dev = dev;
 
if (therm && therm->attr_get && therm->attr_set) {
-   if (nvkm_therm_temp_get(therm) >= 0)
+   if (!nvkm_therm_temp_get(therm, ))
special_groups[i++] = _auto_point_sensor_group;