Re: [Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c
On Wed, 11 Oct 2023, John Harrison wrote: > On 10/11/2023 09:38, Jani Nikula wrote: >> Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt >> conditional compilation, as i915_gpu_error.c is only built with >> DRM_I915_CAPTURE_ERROR=y. >> >> With this, we can also make i915_first_error_state() static. >> >> Signed-off-by: Jani Nikula >> --- >> drivers/gpu/drm/i915/i915_gpu_error.c | 75 - >> drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++--- >> drivers/gpu/drm/i915/i915_sysfs.c | 79 +-- >> 3 files changed, 86 insertions(+), 85 deletions(-) >> >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c >> b/drivers/gpu/drm/i915/i915_gpu_error.c >> index b4c8459deb7b..f9e750217f18 100644 >> --- a/drivers/gpu/drm/i915/i915_gpu_error.c >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c >> @@ -57,6 +57,7 @@ >> #include "i915_memcpy.h" >> #include "i915_reg.h" >> #include "i915_scatterlist.h" >> +#include "i915_sysfs.h" >> #include "i915_utils.h" >> >> #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | >> __GFP_NOWARN) >> @@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt, >> i915_gpu_coredump_put(error); >> } >> >> -struct i915_gpu_coredump * >> +static struct i915_gpu_coredump * >> i915_first_error_state(struct drm_i915_private *i915) >> { >> struct i915_gpu_coredump *error; >> @@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct >> drm_i915_private *i915) >> debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, >> &i915_gpu_info_fops); >> } >> + >> +static ssize_t error_state_read(struct file *filp, struct kobject *kobj, >> +struct bin_attribute *attr, char *buf, >> +loff_t off, size_t count) >> +{ >> + >> +struct device *kdev = kobj_to_dev(kobj); >> +struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); >> +struct i915_gpu_coredump *gpu; >> +ssize_t ret = 0; >> + >> +/* >> + * FIXME: Concurrent clients triggering resets and reading + clearing >> + * dumps can cause inconsistent sysfs reads when a 
user calls in with a >> + * non-zero offset to complete a prior partial read but the >> + * gpu_coredump has been cleared or replaced. >> + */ >> + >> +gpu = i915_first_error_state(i915); >> +if (IS_ERR(gpu)) { >> +ret = PTR_ERR(gpu); >> +} else if (gpu) { >> +ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count); >> +i915_gpu_coredump_put(gpu); >> +} else { >> +const char *str = "No error state collected\n"; >> +size_t len = strlen(str); >> + >> +if (off < len) { >> +ret = min_t(size_t, count, len - off); >> +memcpy(buf, str + off, ret); >> +} >> +} > Can this and the debugfs equivalent not be common code? It seems like > the implementations are conceptually the same even if the code currently > looks quite different. They probably can, but this is just the code movement part. I initially sent a bigger refactoring series [1], but decided to chop it up and send it in smaller pieces, to not burden the reviewers. The first part [2] has already been merged, and this is follow-up. BR, Jani. [1] https://lore.kernel.org/r/cover.1695924021.git.jani.nik...@intel.com [2] https://lore.kernel.org/r/cover.1696236329.git.jani.nik...@intel.com > > John. 
> >> + >> +return ret; >> +} >> + >> +static ssize_t error_state_write(struct file *file, struct kobject *kobj, >> + struct bin_attribute *attr, char *buf, >> + loff_t off, size_t count) >> +{ >> +struct device *kdev = kobj_to_dev(kobj); >> +struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); >> + >> +drm_dbg(&dev_priv->drm, "Resetting error state\n"); >> +i915_reset_error_state(dev_priv); >> + >> +return count; >> +} >> + >> +static const struct bin_attribute error_state_attr = { >> +.attr.name = "error", >> +.attr.mode = S_IRUSR | S_IWUSR, >> +.size = 0, >> +.read = error_state_read, >> +.write = error_state_write, >> +}; >> + >> +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915) >> +{ >> +struct device *kdev = i915->drm.primary->kdev; >> + >> +if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr)) >> +drm_err(&i915->drm, "error_state sysfs setup failed\n"); >> +} >> + >> +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915) >> +{ >> +struct device *kdev = i915->drm.primary->kdev; >> + >> +sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); >> +} >> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h >> b/drivers/gpu/drm/i915/i915_gpu_error.h >> index a6f2a7518cf0..68c964d6720a 100644 >> --- a/drivers/gpu/drm/i915/i915_gpu_error.h >> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h >> @@ -323,11 +323,12
Re: [Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c
On 10/11/2023 09:38, Jani Nikula wrote: Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt conditional compilation, as i915_gpu_error.c is only built with DRM_I915_CAPTURE_ERROR=y. With this, we can also make i915_first_error_state() static. Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gpu_error.c | 75 - drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++--- drivers/gpu/drm/i915/i915_sysfs.c | 79 +-- 3 files changed, 86 insertions(+), 85 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index b4c8459deb7b..f9e750217f18 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -57,6 +57,7 @@ #include "i915_memcpy.h" #include "i915_reg.h" #include "i915_scatterlist.h" +#include "i915_sysfs.h" #include "i915_utils.h" #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) @@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt, i915_gpu_coredump_put(error); } -struct i915_gpu_coredump * +static struct i915_gpu_coredump * i915_first_error_state(struct drm_i915_private *i915) { struct i915_gpu_coredump *error; @@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, &i915_gpu_info_fops); } + +static ssize_t error_state_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); + struct i915_gpu_coredump *gpu; + ssize_t ret = 0; + + /* +* FIXME: Concurrent clients triggering resets and reading + clearing +* dumps can cause inconsistent sysfs reads when a user calls in with a +* non-zero offset to complete a prior partial read but the +* gpu_coredump has been cleared or replaced. 
+*/ + + gpu = i915_first_error_state(i915); + if (IS_ERR(gpu)) { + ret = PTR_ERR(gpu); + } else if (gpu) { + ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count); + i915_gpu_coredump_put(gpu); + } else { + const char *str = "No error state collected\n"; + size_t len = strlen(str); + + if (off < len) { + ret = min_t(size_t, count, len - off); + memcpy(buf, str + off, ret); + } + } Can this and the debugfs equivalent not be common code? It seems like the implementations are conceptually the same even if the code currently looks quite different. John. + + return ret; +} + +static ssize_t error_state_write(struct file *file, struct kobject *kobj, +struct bin_attribute *attr, char *buf, +loff_t off, size_t count) +{ + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); + + drm_dbg(&dev_priv->drm, "Resetting error state\n"); + i915_reset_error_state(dev_priv); + + return count; +} + +static const struct bin_attribute error_state_attr = { + .attr.name = "error", + .attr.mode = S_IRUSR | S_IWUSR, + .size = 0, + .read = error_state_read, + .write = error_state_write, +}; + +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr)) + drm_err(&i915->drm, "error_state sysfs setup failed\n"); +} + +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); +} diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index a6f2a7518cf0..68c964d6720a 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -323,11 +323,12 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu) kref_put(&gpu->ref, __i915_gpu_coredump_free); } -struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915); void 
i915_reset_error_state(struct drm_i915_private *i915); void i915_disable_error_state(struct drm_i915_private *i915, int err); void i915_gpu_error_debugfs_register(struct drm_i915_private *i915); +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915); +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915); #else @@ -396,12 +397,6 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu) {
[Intel-gfx] [PATCH 3/3] drm/i915: move gpu error sysfs to i915_gpu_error.c
Hide gpu error specifics in i915_gpu_error.c. This is also cleaner wrt conditional compilation, as i915_gpu_error.c is only built with DRM_I915_CAPTURE_ERROR=y. With this, we can also make i915_first_error_state() static. Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_gpu_error.c | 75 - drivers/gpu/drm/i915/i915_gpu_error.h | 17 +++--- drivers/gpu/drm/i915/i915_sysfs.c | 79 +-- 3 files changed, 86 insertions(+), 85 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index b4c8459deb7b..f9e750217f18 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -57,6 +57,7 @@ #include "i915_memcpy.h" #include "i915_reg.h" #include "i915_scatterlist.h" +#include "i915_sysfs.h" #include "i915_utils.h" #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) @@ -2208,7 +2209,7 @@ void i915_capture_error_state(struct intel_gt *gt, i915_gpu_coredump_put(error); } -struct i915_gpu_coredump * +static struct i915_gpu_coredump * i915_first_error_state(struct drm_i915_private *i915) { struct i915_gpu_coredump *error; @@ -2484,3 +2485,75 @@ void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, &i915_gpu_info_fops); } + +static ssize_t error_state_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) +{ + + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); + struct i915_gpu_coredump *gpu; + ssize_t ret = 0; + + /* +* FIXME: Concurrent clients triggering resets and reading + clearing +* dumps can cause inconsistent sysfs reads when a user calls in with a +* non-zero offset to complete a prior partial read but the +* gpu_coredump has been cleared or replaced. 
+*/ + + gpu = i915_first_error_state(i915); + if (IS_ERR(gpu)) { + ret = PTR_ERR(gpu); + } else if (gpu) { + ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count); + i915_gpu_coredump_put(gpu); + } else { + const char *str = "No error state collected\n"; + size_t len = strlen(str); + + if (off < len) { + ret = min_t(size_t, count, len - off); + memcpy(buf, str + off, ret); + } + } + + return ret; +} + +static ssize_t error_state_write(struct file *file, struct kobject *kobj, +struct bin_attribute *attr, char *buf, +loff_t off, size_t count) +{ + struct device *kdev = kobj_to_dev(kobj); + struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); + + drm_dbg(&dev_priv->drm, "Resetting error state\n"); + i915_reset_error_state(dev_priv); + + return count; +} + +static const struct bin_attribute error_state_attr = { + .attr.name = "error", + .attr.mode = S_IRUSR | S_IWUSR, + .size = 0, + .read = error_state_read, + .write = error_state_write, +}; + +void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr)) + drm_err(&i915->drm, "error_state sysfs setup failed\n"); +} + +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915) +{ + struct device *kdev = i915->drm.primary->kdev; + + sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); +} diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h index a6f2a7518cf0..68c964d6720a 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.h +++ b/drivers/gpu/drm/i915/i915_gpu_error.h @@ -323,11 +323,12 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu) kref_put(&gpu->ref, __i915_gpu_coredump_free); } -struct i915_gpu_coredump *i915_first_error_state(struct drm_i915_private *i915); void i915_reset_error_state(struct drm_i915_private *i915); void i915_disable_error_state(struct drm_i915_private *i915, int err); void i915_gpu_error_debugfs_register(struct drm_i915_private *i915); 
+void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915); +void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915); #else @@ -396,12 +397,6 @@ static inline void i915_gpu_coredump_put(struct i915_gpu_coredump *gpu) { } -static inline struct i915_gpu_coredump * -i915_first_error_state(struct drm_i915_private *i915) -{ - return ERR_PTR(-ENODEV); -} - static inline void i915_reset_error_state(struct drm_i915_private *i915) { } @@ -415,6 +410,14 @@ static