[Nouveau] [PATCH 0/3] Several hwmon fixes

2017-09-02 Thread Karol Herbst
We should simply return errors while the GPU is turned off, because the sensors
aren't accessible and setting any kind of value doesn't make any sense. This
fixes the sensor values reported by "sensors".

Before:
nouveau-pci-0100
Adapter: PCI adapter
GPU core: +0.60 V  (min =  +0.60 V, max =  +1.20 V)
temp1: -0.0°C  (high = +95.0°C, hyst =  +3.0°C)
   (crit = +105.0°C, hyst =  +5.0°C)
   (emerg = +135.0°C, hyst =  +5.0°C)
power1:  -22.00 uW

After:
nouveau-pci-0100
Adapter: PCI adapter
GPU core: N/A  (min =  +0.60 V, max =  +1.20 V)
temp1:N/A  (high = +95.0°C, hyst =  +3.0°C)
   (crit = +105.0°C, hyst =  +5.0°C)
   (emerg = +135.0°C, hyst =  +5.0°C)
power1:   N/A

Karol Herbst (3):
  therm: split return code and value in nvkm_get_temp
  hwmon: properly check for errors
  subdev/volt/gk104: return error when read fails

 drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
 drm/nouveau/nouveau_hwmon.c | 48 -
 drm/nouveau/nvkm/subdev/therm/base.c| 19 +
 drm/nouveau/nvkm/subdev/therm/g84.c | 11 
 drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++
 drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++
 drm/nouveau/nvkm/subdev/therm/priv.h|  4 +--
 drm/nouveau/nvkm/subdev/therm/temp.c| 16 ---
 drm/nouveau/nvkm/subdev/volt/gk104.c|  7 -
 9 files changed, 82 insertions(+), 43 deletions(-)

-- 
2.14.1

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH 1/3] therm: split return code and value in nvkm_get_temp

2017-09-02 Thread Karol Herbst
The current hwmon code doesn't check if the returned value was actually an
error.

Since Kepler, temperature sensors are able to report negative values.
Since Pascal (and maybe earlier), we have sensors with improved precision.

Adjust the nvkm_get_temp method to be able to deal with those changes
and let hwmon return an error properly.

Signed-off-by: Karol Herbst 
---
 drm/nouveau/include/nvkm/subdev/therm.h |  2 +-
 drm/nouveau/nouveau_hwmon.c | 15 +--
 drm/nouveau/nvkm/subdev/therm/base.c| 19 ++-
 drm/nouveau/nvkm/subdev/therm/g84.c | 11 ++-
 drm/nouveau/nvkm/subdev/therm/nv40.c|  9 +++--
 drm/nouveau/nvkm/subdev/therm/nv50.c|  9 +++--
 drm/nouveau/nvkm/subdev/therm/priv.h|  4 ++--
 drm/nouveau/nvkm/subdev/therm/temp.c| 16 
 8 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/drm/nouveau/include/nvkm/subdev/therm.h 
b/drm/nouveau/include/nvkm/subdev/therm.h
index 1bfd93b8..192169f2 100644
--- a/drm/nouveau/include/nvkm/subdev/therm.h
+++ b/drm/nouveau/include/nvkm/subdev/therm.h
@@ -86,7 +86,7 @@ struct nvkm_therm {
int (*attr_set)(struct nvkm_therm *, enum nvkm_therm_attr_type, int);
 };
 
-int nvkm_therm_temp_get(struct nvkm_therm *);
+int nvkm_therm_temp_get(struct nvkm_therm *, int *);
 int nvkm_therm_fan_sense(struct nvkm_therm *);
 int nvkm_therm_cstate(struct nvkm_therm *, int, int);
 
diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
index 7c965648..f1914d9a 100644
--- a/drm/nouveau/nouveau_hwmon.c
+++ b/drm/nouveau/nouveau_hwmon.c
@@ -326,8 +326,9 @@ nouveau_temp_is_visible(const void *data, u32 attr, int 
channel)
 {
struct nouveau_drm *drm = nouveau_drm((struct drm_device *)data);
struct nvkm_therm *therm = nvxx_therm(>client.device);
+   int val;
 
-   if (therm && therm->attr_get && nvkm_therm_temp_get(therm) < 0)
+   if (therm && therm->attr_get && nvkm_therm_temp_get(therm, ) < 0)
return 0;
 
switch (attr) {
@@ -421,15 +422,16 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
channel, long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_therm *therm = nvxx_therm(>client.device);
-   int ret;
+   int ret = 0;
+   int temp;
 
if (!therm || !therm->attr_get)
return -EOPNOTSUPP;
 
switch (attr) {
case hwmon_temp_input:
-   ret = nvkm_therm_temp_get(therm);
-   *val = ret < 0 ? ret : (ret * 1000);
+   ret = nvkm_therm_temp_get(therm, );
+   *val = temp * 1000;
break;
case hwmon_temp_max:
*val = therm->attr_get(therm, NVKM_THERM_ATTR_THRS_DOWN_CLK)
@@ -459,7 +461,7 @@ nouveau_temp_read(struct device *dev, u32 attr, int 
channel, long *val)
return -EOPNOTSUPP;
}
 
-   return 0;
+   return ret;
 }
 
 static int
@@ -713,6 +715,7 @@ nouveau_hwmon_init(struct drm_device *dev)
struct device *hwmon_dev;
int ret = 0;
int i = 0;
+   int val;
 
hwmon = drm->hwmon = kzalloc(sizeof(*hwmon), GFP_KERNEL);
if (!hwmon)
@@ -720,7 +723,7 @@ nouveau_hwmon_init(struct drm_device *dev)
hwmon->dev = dev;
 
if (therm && therm->attr_get && therm->attr_set) {
-   if (nvkm_therm_temp_get(therm) >= 0)
+   if (nvkm_therm_temp_get(therm, ) >= 0)
special_groups[i++] = _auto_point_sensor_group;
if (therm->fan_get && therm->fan_get(therm) >= 0)
special_groups[i++] = _fan_sensor_group;
diff --git a/drm/nouveau/nvkm/subdev/therm/base.c 
b/drm/nouveau/nvkm/subdev/therm/base.c
index 952a7cb0..27a03c50 100644
--- a/drm/nouveau/nvkm/subdev/therm/base.c
+++ b/drm/nouveau/nvkm/subdev/therm/base.c
@@ -24,22 +24,26 @@
 #include "priv.h"
 
 int
-nvkm_therm_temp_get(struct nvkm_therm *therm)
+nvkm_therm_temp_get(struct nvkm_therm *therm, int *val)
 {
if (therm->func->temp_get)
-   return therm->func->temp_get(therm);
+   return therm->func->temp_get(therm, val);
return -ENODEV;
 }
 
 static int
 nvkm_therm_update_trip(struct nvkm_therm *therm)
 {
+   int temp, ret;
struct nvbios_therm_trip_point *trip = therm->fan->bios.trip,
   *cur_trip = NULL,
   *last_trip = therm->last_trip;
-   u8  temp = therm->func->temp_get(therm);
u16 duty, i;
 
+   ret = therm->func->temp_get(therm, );
+   if (ret < 0)
+   return ret;
+
/* look for the trip point corresponding to the current temperature */
cur_trip = NULL;
for (i = 0; i < therm->fan->bios.nr_fan_trip; i++) {
@@ -67,9 +71,13 @@ static int
 nvkm_therm_compute_linear_duty(struct nvkm_therm *therm, u8 

[Nouveau] [PATCH 2/3] hwmon: properly check for errors

2017-09-02 Thread Karol Herbst
Otherwise hwmon interprets error codes as real values.

Signed-off-by: Karol Herbst 
---
 drm/nouveau/nouveau_hwmon.c | 33 ++---
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/drm/nouveau/nouveau_hwmon.c b/drm/nouveau/nouveau_hwmon.c
index f1914d9a..0d75c14d 100644
--- a/drm/nouveau/nouveau_hwmon.c
+++ b/drm/nouveau/nouveau_hwmon.c
@@ -470,18 +470,23 @@ nouveau_fan_read(struct device *dev, u32 attr, int 
channel, long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_therm *therm = nvxx_therm(>client.device);
+   int ret;
 
if (!therm)
return -EOPNOTSUPP;
 
switch (attr) {
case hwmon_fan_input:
-   *val = nvkm_therm_fan_sense(therm);
+   ret = nvkm_therm_fan_sense(therm);
break;
default:
return -EOPNOTSUPP;
}
 
+   if (ret < 0)
+   return ret;
+
+   *val = ret;
return 0;
 }
 
@@ -491,7 +496,7 @@ nouveau_in_read(struct device *dev, u32 attr, int channel, 
long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_volt *volt = nvxx_volt(>client.device);
-   int ret;
+   int ret = 0;
 
if (!volt)
return -EOPNOTSUPP;
@@ -499,7 +504,8 @@ nouveau_in_read(struct device *dev, u32 attr, int channel, 
long *val)
switch (attr) {
case hwmon_in_input:
ret = nvkm_volt_get(volt);
-   *val = ret < 0 ? ret : (ret / 1000);
+   if (ret >= 0)
+   *val = ret / 1000;
break;
case hwmon_in_min:
*val = volt->min_uv > 0 ? (volt->min_uv / 1000) : -ENODEV;
@@ -511,7 +517,7 @@ nouveau_in_read(struct device *dev, u32 attr, int channel, 
long *val)
return -EOPNOTSUPP;
}
 
-   return 0;
+   return ret;
 }
 
 static int
@@ -520,21 +526,26 @@ nouveau_pwm_read(struct device *dev, u32 attr, int 
channel, long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_therm *therm = nvxx_therm(>client.device);
+   int ret;
 
if (!therm || !therm->attr_get || !therm->fan_get)
return -EOPNOTSUPP;
 
switch (attr) {
case hwmon_pwm_enable:
-   *val = therm->attr_get(therm, NVKM_THERM_ATTR_FAN_MODE);
+   ret = therm->attr_get(therm, NVKM_THERM_ATTR_FAN_MODE);
break;
case hwmon_pwm_input:
-   *val = therm->fan_get(therm);
+   ret = therm->fan_get(therm);
break;
default:
return -EOPNOTSUPP;
}
 
+   if (ret < 0)
+   return ret;
+
+   *val = ret;
return 0;
 }
 
@@ -544,18 +555,26 @@ nouveau_power_read(struct device *dev, u32 attr, int 
channel, long *val)
struct drm_device *drm_dev = dev_get_drvdata(dev);
struct nouveau_drm *drm = nouveau_drm(drm_dev);
struct nvkm_iccsense *iccsense = nvxx_iccsense(>client.device);
+   int ret;
 
if (!iccsense)
return -EOPNOTSUPP;
 
switch (attr) {
case hwmon_power_input:
-   *val = nvkm_iccsense_read_all(iccsense);
+   ret = nvkm_iccsense_read_all(iccsense);
+   if (ret < 0)
+   return ret;
+   *val = ret;
break;
case hwmon_power_max:
+   if (iccsense->power_w_max <= 0)
+   return -ENODEV;
*val = iccsense->power_w_max;
break;
case hwmon_power_crit:
+   if (iccsense->power_w_crit <= 0)
+   return -ENODEV;
*val = iccsense->power_w_crit;
break;
default:
-- 
2.14.1

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [PATCH 3/3] subdev/volt/gk104: return error when read fails

2017-09-02 Thread Karol Herbst
While my gpu was powered off, hwmon returned 0.6V as the current voltage.
If nvkm_rd32 fails for any reason, return the error.

With that, sensors will display "N/A" instead of 0.6V.

Signed-off-by: Karol Herbst 
---
 drm/nouveau/nvkm/subdev/volt/gk104.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drm/nouveau/nvkm/subdev/volt/gk104.c 
b/drm/nouveau/nvkm/subdev/volt/gk104.c
index 1c744e02..53a7af9d 100644
--- a/drm/nouveau/nvkm/subdev/volt/gk104.c
+++ b/drm/nouveau/nvkm/subdev/volt/gk104.c
@@ -40,10 +40,15 @@ gk104_volt_get(struct nvkm_volt *base)
 {
struct nvbios_volt *bios = _volt(base)->bios;
struct nvkm_device *device = base->subdev.device;
-   u32 div, duty;
+   int div, duty;
 
div  = nvkm_rd32(device, 0x20340);
+   if (div < 0)
+   return div;
+
duty = nvkm_rd32(device, 0x20344);
+   if (duty < 0)
+   return duty;
 
return bios->base + bios->pwm_range * duty / div;
 }
-- 
2.14.1

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] [Bug 93629] [NVE6] complete system freeze, PGRAPH engine fault on channel 2, SCHED_ERROR [ CTXSW_TIMEOUT ]

2017-09-02 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93629

--- Comment #38 from Robb Ebright  ---
I ran into a similar bug with a recent Debian install. It occurs only when I
plug a monitor into the DVI port. It works fine with the HDMI port, which I'm
using to type this, but with DVI it freezes with the following error.

I'm installing the proprietary drivers, but I wanted to provide this
confirmation/documentation in case it is helpful to anyone trying to solve
this. In an ideal world I would use the free software drivers as well, but there
is evidently some issue with the software that is causing it to crash.


Sep  1 23:07:09 dragonpunk /usr/lib/gdm3/gdm-x-session[787]: (II) systemd-logind:
got pause for 226:0
Sep  1 23:07:09 dragonpunk /usr/lib/gdm3/gdm-x-session[787]: (II)
systemd-logind: got pause for 13:67
Sep  1 23:07:09 dragonpunk /usr/lib/gdm3/gdm-x-session[787]: (II)
systemd-logind: got pause for 13:66
Sep  1 23:07:09 dragonpunk /usr/lib/gdm3/gdm-x-session[787]: (II)
systemd-logind: got pause for 13:64
Sep  1 23:07:09 dragonpunk /usr/lib/gdm3/gdm-x-session[787]: (II)
systemd-logind: got pause for 13:65
Sep  1 23:07:09 dragonpunk /usr/lib/gdm3/gdm-x-session[787]: (II)
systemd-logind: got pause for 13:68
Sep  1 23:07:09 dragonpunk kernel: [74586.320157] nouveau :01:00.0: fifo:
write fault at 31d000 engine 05 [BAR3] client 08 [BAR_WRITE] reason 02
[PAGE_NOT_PRESENT] on channel -1 [003ff35000 unknown]
Sep  1 23:07:09 dragonpunk kernel: [74586.320169] nouveau :01:00.0: fifo:
INTR 0800
Sep  1 23:07:09 dragonpunk kernel: [74586.825813] nouveau :01:00.0: gr:
TRAP ch 2 [003fd3 systemd-logind[448]]
Sep  1 23:07:09 dragonpunk kernel: [74586.825826] nouveau :01:00.0: gr:
GPC0/TPC0/TEX: 8041
Sep  1 23:07:09 dragonpunk kernel: [74586.825834] nouveau :01:00.0: gr:
GPC0/TPC1/TEX: 8041
Sep  1 23:07:09 dragonpunk kernel: [74586.825846] nouveau :01:00.0: gr:
GPC1/TPC0/TEX: 8041
Sep  1 23:07:09 dragonpunk kernel: [74586.825854] nouveau :01:00.0: gr:
GPC1/TPC2/TEX: 8041
Sep  1 23:07:09 dragonpunk kernel: [74586.825866] nouveau :01:00.0: fifo:
read fault at 0002f61000 engine 00 [PGRAPH] client 01 [GPC1/TEX] reason 02
[PAGE_NOT_PRESENT] on channel 2 [003fd3 systemd-logind[448]]
Sep  1 23:07:09 dragonpunk kernel: [74586.825868] nouveau :01:00.0: fifo:
gr engine fault on channel 2, recovering...
Sep  1 23:07:09 dragonpunk kernel: [74586.881742] nouveau :01:00.0: fifo:
write fault at 278000 engine 00 [PGRAPH] client 0f [GPC0/PROP] reason 02
[PAGE_NOT_PRESENT] on channel 3 [003fc83000 Xwayland[602]]
Sep  1 23:07:09 dragonpunk kernel: [74586.881746] nouveau :01:00.0: fifo:
gr engine fault on channel 3, recovering...
Sep  1 23:07:48 dragonpunk gnome-shell[566]: Failed to apply DRM plane
transform 0: Invalid argument
Sep  1 23:08:06 dragonpunk gnome-shell[566]: Failed to apply DRM plane
transform 0: Invalid argument
Sep  1 23:08:06 dragonpunk gnome-shell[566]: Failed to apply DRM plane
transform 0: Invalid argument
Sep  1 23:08:19 dragonpunk gnome-shell[566]: Failed to apply DRM plane
transform 0: Invalid argument

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau