Re: [Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c

2023-10-11 Thread Jani Nikula
On Wed, 11 Oct 2023, John Harrison wrote:
> On 10/11/2023 09:38, Jani Nikula wrote:
>> Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt
>> conditional compilation, as i915_gpu_error.c is only built with
>> DRM_I915_CAPTURE_ERROR=y.
>>
>> With this, we can also make i915_first_error_state() static.
>>
>> Signed-off-by: Jani Nikula 
>> ---
>>   drivers/gpu/drm/i915/i915_gpu_error.c | 75 -
>>   drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++---
>>   drivers/gpu/drm/i915/i915_sysfs.c | 79 +--
>>   3 files changed, 86 insertions(+), 85 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
>> b/drivers/gpu/drm/i915/i915_gpu_error.c
>> index b4c8459deb7b..f9e750217f18 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
>> @@ -57,6 +57,7 @@
>>   #include "i915_memcpy.h"
>>   #include "i915_reg.h"
>>   #include "i915_scatterlist.h"
>> +#include "i915_sysfs.h"
>>   #include "i915_utils.h"
>>   
>>   #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | 
>> __GFP_NOWARN)
>> @@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt,
>>  i915_gpu_coredump_put(error);
>>   }
>>   
>> -struct i915_gpu_coredump *
>> +static struct i915_gpu_coredump *
>>   i915_first_error_state(struct drm_i915_private *i915)
>>   {
>>  struct i915_gpu_coredump *error;
>> @@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct 
>> drm_i915_private *i915)
>>  debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915,
>>  &i915_gpu_info_fops);
>>   }
>> +
>> +static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
>> +struct bin_attribute *attr, char *buf,
>> +loff_t off, size_t count)
>> +{
>> +
>> +struct device *kdev = kobj_to_dev(kobj);
>> +struct drm_i915_private *i915 = kdev_minor_to_i915(kdev);
>> +struct i915_gpu_coredump *gpu;
>> +ssize_t ret = 0;
>> +
>> +/*
>> + * FIXME: Concurrent clients triggering resets and reading + clearing
>> + * dumps can cause inconsistent sysfs reads when a user calls in with a
>> + * non-zero offset to complete a prior partial read but the
>> + * gpu_coredump has been cleared or replaced.
>> + */
>> +
>> +gpu = i915_first_error_state(i915);
>> +if (IS_ERR(gpu)) {
>> +ret = PTR_ERR(gpu);
>> +} else if (gpu) {
>> +ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count);
>> +i915_gpu_coredump_put(gpu);
>> +} else {
>> +const char *str = "No error state collected\n";
>> +size_t len = strlen(str);
>> +
>> +if (off < len) {
>> +ret = min_t(size_t, count, len - off);
>> +memcpy(buf, str + off, ret);
>> +}
>> +}
> Can this and the debugfs equivalent not be common code? It seems like 
> the implementations are conceptually the same even if the code currently 
> looks quite different.

They probably can, but this is just the code movement part. I initially
sent a bigger refactoring series [1], but decided to chop it up and send
it in smaller pieces, to not burden the reviewers. The first part [2]
has already been merged, and this is a follow-up.

BR,
Jani.


[1] https://lore.kernel.org/r/cover.1695924021.git.jani.nik...@intel.com
[2] https://lore.kernel.org/r/cover.1696236329.git.jani.nik...@intel.com

>
> John.
>
>> +
>> +return ret;
>> +}
>> +
>> +static ssize_t error_state_write(struct file *file, struct kobject *kobj,
>> + struct bin_attribute *attr, char *buf,
>> + loff_t off, size_t count)
>> +{
>> +struct device *kdev = kobj_to_dev(kobj);
>> +struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
>> +
>> +drm_dbg(&dev_priv->drm, "Resetting error state\n");
>> +i915_reset_error_state(dev_priv);
>> +
>> +return count;
>> +}
>> +
>> +static const struct bin_attribute error_state_attr = {
>> +.attr.name = "error",
>> +.attr.mode = S_IRUSR | S_IWUSR,
>> +.size = 0,
>> +.read = error_state_read,
>> +.write = error_state_write,
>> +};
>> +
>> +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
>> +{
>> +struct device *kdev = i915->drm.primary->kdev;
>> +
>> +if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
>> +drm_err(&i915->drm, "error_state sysfs setup failed\n");
>> +}
>> +
>> +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
>> +{
>> +struct device *kdev = i915->drm.primary->kdev;
>> +
>> +sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
>> +}
>> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
>> b/drivers/gpu/drm/i915/i915_gpu_error.h
>> index a6f2a7518cf0..68c964d6720a 100644
>> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
>> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
>> @@ -323,11 +323,12 

Re: [Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c

2023-10-11 Thread John Harrison




On 10/11/2023 09:38, Jani Nikula wrote:

Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt
conditional compilation, as i915_gpu_error.c is only built with
DRM_I915_CAPTURE_ERROR=y.

With this, we can also make i915_first_error_state() static.

Signed-off-by: Jani Nikula 
---
  drivers/gpu/drm/i915/i915_gpu_error.c | 75 -
  drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++---
  drivers/gpu/drm/i915/i915_sysfs.c | 79 +--
  3 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index b4c8459deb7b..f9e750217f18 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -57,6 +57,7 @@
  #include "i915_memcpy.h"
  #include "i915_reg.h"
  #include "i915_scatterlist.h"
+#include "i915_sysfs.h"
  #include "i915_utils.h"
  
  #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)

@@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt,
i915_gpu_coredump_put(error);
  }
  
-struct i915_gpu_coredump *

+static struct i915_gpu_coredump *
  i915_first_error_state(struct drm_i915_private *i915)
  {
struct i915_gpu_coredump *error;
@@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct 
drm_i915_private *i915)
debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915,
&i915_gpu_info_fops);
  }
+
+static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *attr, char *buf,
+   loff_t off, size_t count)
+{
+
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *i915 = kdev_minor_to_i915(kdev);
+   struct i915_gpu_coredump *gpu;
+   ssize_t ret = 0;
+
+   /*
+* FIXME: Concurrent clients triggering resets and reading + clearing
+* dumps can cause inconsistent sysfs reads when a user calls in with a
+* non-zero offset to complete a prior partial read but the
+* gpu_coredump has been cleared or replaced.
+*/
+
+   gpu = i915_first_error_state(i915);
+   if (IS_ERR(gpu)) {
+   ret = PTR_ERR(gpu);
+   } else if (gpu) {
+   ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count);
+   i915_gpu_coredump_put(gpu);
+   } else {
+   const char *str = "No error state collected\n";
+   size_t len = strlen(str);
+
+   if (off < len) {
+   ret = min_t(size_t, count, len - off);
+   memcpy(buf, str + off, ret);
+   }
+   }
Can this and the debugfs equivalent not be common code? It seems like 
the implementations are conceptually the same even if the code currently 
looks quite different.


John.


+
+   return ret;
+}
+
+static ssize_t error_state_write(struct file *file, struct kobject *kobj,
+struct bin_attribute *attr, char *buf,
+loff_t off, size_t count)
+{
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
+
+   drm_dbg(&dev_priv->drm, "Resetting error state\n");
+   i915_reset_error_state(dev_priv);
+
+   return count;
+}
+
+static const struct bin_attribute error_state_attr = {
+   .attr.name = "error",
+   .attr.mode = S_IRUSR | S_IWUSR,
+   .size = 0,
+   .read = error_state_read,
+   .write = error_state_write,
+};
+
+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
+   drm_err(&i915->drm, "error_state sysfs setup failed\n");
+}
+
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+}
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index a6f2a7518cf0..68c964d6720a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -323,11 +323,12 @@ static inline void i915_gpu_coredump_put(struct 
i915_gpu_coredump *gpu)
kref_put(&gpu->ref, __i915_gpu_coredump_free);
  }
  
-struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915);

  void i915_reset_error_state(struct drm_i915_private *i915);
  void i915_disable_error_state(struct drm_i915_private *i915, int err);
  
  void i915_gpu_error_debugfs_register(struct drm_i915_private *i915);

+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915);
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915);
  
  #else
  
@@ -396,12 +397,6 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu)

  {
  

[Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c

2023-10-11 Thread Jani Nikula
Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt
conditional compilation, as i915_gpu_error.c is only built with
DRM_I915_CAPTURE_ERROR=y.

With this, we can also make i915_first_error_state() static.

Signed-off-by: Jani Nikula 
---
 drivers/gpu/drm/i915/i915_gpu_error.c | 75 -
 drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++---
 drivers/gpu/drm/i915/i915_sysfs.c | 79 +--
 3 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c 
b/drivers/gpu/drm/i915/i915_gpu_error.c
index b4c8459deb7b..f9e750217f18 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -57,6 +57,7 @@
 #include "i915_memcpy.h"
 #include "i915_reg.h"
 #include "i915_scatterlist.h"
+#include "i915_sysfs.h"
 #include "i915_utils.h"
 
 #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
@@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt,
i915_gpu_coredump_put(error);
 }
 
-struct i915_gpu_coredump *
+static struct i915_gpu_coredump *
 i915_first_error_state(struct drm_i915_private *i915)
 {
struct i915_gpu_coredump *error;
@@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct 
drm_i915_private *i915)
debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915,
&i915_gpu_info_fops);
 }
+
+static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *attr, char *buf,
+   loff_t off, size_t count)
+{
+
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *i915 = kdev_minor_to_i915(kdev);
+   struct i915_gpu_coredump *gpu;
+   ssize_t ret = 0;
+
+   /*
+* FIXME: Concurrent clients triggering resets and reading + clearing
+* dumps can cause inconsistent sysfs reads when a user calls in with a
+* non-zero offset to complete a prior partial read but the
+* gpu_coredump has been cleared or replaced.
+*/
+
+   gpu = i915_first_error_state(i915);
+   if (IS_ERR(gpu)) {
+   ret = PTR_ERR(gpu);
+   } else if (gpu) {
+   ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count);
+   i915_gpu_coredump_put(gpu);
+   } else {
+   const char *str = "No error state collected\n";
+   size_t len = strlen(str);
+
+   if (off < len) {
+   ret = min_t(size_t, count, len - off);
+   memcpy(buf, str + off, ret);
+   }
+   }
+
+   return ret;
+}
+
+static ssize_t error_state_write(struct file *file, struct kobject *kobj,
+struct bin_attribute *attr, char *buf,
+loff_t off, size_t count)
+{
+   struct device *kdev = kobj_to_dev(kobj);
+   struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev);
+
+   drm_dbg(&dev_priv->drm, "Resetting error state\n");
+   i915_reset_error_state(dev_priv);
+
+   return count;
+}
+
+static const struct bin_attribute error_state_attr = {
+   .attr.name = "error",
+   .attr.mode = S_IRUSR | S_IWUSR,
+   .size = 0,
+   .read = error_state_read,
+   .write = error_state_write,
+};
+
+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr))
+   drm_err(&i915->drm, "error_state sysfs setup failed\n");
+}
+
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915)
+{
+   struct device *kdev = i915->drm.primary->kdev;
+
+   sysfs_remove_bin_file(&kdev->kobj, &error_state_attr);
+}
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h 
b/drivers/gpu/drm/i915/i915_gpu_error.h
index a6f2a7518cf0..68c964d6720a 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -323,11 +323,12 @@ static inline void i915_gpu_coredump_put(struct 
i915_gpu_coredump *gpu)
kref_put(&gpu->ref, __i915_gpu_coredump_free);
 }
 
-struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private 
*i915);
 void i915_reset_error_state(struct drm_i915_private *i915);
 void i915_disable_error_state(struct drm_i915_private *i915, int err);
 
 void i915_gpu_error_debugfs_register(struct drm_i915_private *i915);
+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915);
+void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915);
 
 #else
 
@@ -396,12 +397,6 @@ static inline void i915_gpu_coredump_put(struct 
i915_gpu_coredump *gpu)
 {
 }
 
-static inline struct i915_gpu_coredump *
-i915_first_error_state(struct drm_i915_private *i915)
-{
-   return ERR_PTR(-ENODEV);
-}
-
 static inline void i915_reset_error_state(struct drm_i915_private *i915)
 {
 }
@@ -415,6 +410,14 @@ static