Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling
On Fri, 27 Apr 2012 17:40:18 -0700 Ben Widawsky b...@bwidawsk.net wrote: + +/** + * ivybridge_parity_work - Workqueue called when a parity error interrupt + * occurred. + * + * Doesn't actually do anything except notify userspace so that userspace may + * disable things later on. kdoc style comment but missing the @work: parameter. Might be a good place to describe what userspace can do in response (i.e. duplicate some of your commit message here). + */ +static void ivybridge_parity_work(struct work_struct *work) +{ + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, + parity_error_work); + Stray newline. + u32 error_status, row, bank, subbank; + char *parity_event[5]; + uint32_t misccpctl; + unsigned long flags; + + /* We must turn off DOP level clock gating to access the L3 registers. + * In order to prevent a get/put style interface, acquire struct mutex + * any time we access those registers. + */ + mutex_lock(dev_priv-dev-struct_mutex); + + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl ~GEN7_DOP_CLOCK_GATE_ENABLE); DOP clock gating should be unconditionally disabled, you can move this to the clock gating routine. + POSTING_READ(GEN7_MISCCPCTL); + + error_status = I915_READ(GEN7_L3CDERRST1); + row = GEN7_PARITY_ERROR_ROW(error_status); + bank = GEN7_PARITY_ERROR_BANK(error_status); + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); + + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | + GEN7_L3CDERRST1_ENABLE); + POSTING_READ(GEN7_L3CDERRST1); + + I915_WRITE(GEN7_MISCCPCTL, misccpctl); + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); Is there a debug register we can read to see if we missed any events at this point? A followon patch might be to see when the last event occured, and if we get two or more in rapid succession (like right when we re-enable interrupts) the kernel could automatically do the remapping. But that's totally optional; I doubt we'll see that in practice except on machines we've specially broken in the lab. :) + + mutex_unlock(dev_priv-dev-struct_mutex); + + parity_event[0] = L3_PARITY_ERROR=1; + parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row); + parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank); + parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank); + parity_event[4] = NULL; + + kobject_uevent_env(dev_priv-dev-primary-kdev.kobj, +KOBJ_CHANGE, parity_event); You have a DRM_INFO when the interrupt comes in, may as well spit out the row/bank/subbank info here too. Though I'd make them both DEBUG. + + kfree(parity_event[3]); + kfree(parity_event[2]); + kfree(parity_event[1]); +} Hm we really need a man page for our events and such, but that's a separate patch. + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private; + unsigned long flags; + + if (WARN_ON(IS_GEN6(dev))) + return; What's this for? The user won't be able to do anything at this point except get scared... I'd either just make this return if GEN6 or add a GEN7 check before we branch here in the first place from the interrupt handler. + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); + + queue_work(dev_priv-wq, dev_priv-parity_error_work); + DRM_INFO(Parity error interrupt. Scheduling work\n); +} + static void snb_gt_irq_handler(struct drm_device *dev, struct drm_i915_private *dev_priv, u32 gt_iir) @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev, DRM_ERROR(GT error interrupt 0x%08x\n, gt_iir); i915_handle_error(dev, false); } + + if (gt_iir GT_GEN7_L3_PARITY_ERROR_INTERRUPT) + ivybridge_handle_parity_error(dev); } static void gen6_queue_rps_work(struct drm_i915_private *dev_priv, @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev) if (IS_GEN6(dev) || IS_IVYBRIDGE(dev)) INIT_WORK(dev_priv-rps_work, gen6_pm_rps_work); + if (IS_IVYBRIDGE(dev)) + INIT_WORK(dev_priv-parity_error_work, ivybridge_parity_work); Looks good otherwise, so with the above fixed up or denied: Reviewed-by: Jesse Barnes jbar...@virtuousgeek.org -- Jesse Barnes, Intel Open Source
Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling
On Fri, 25 May 2012 10:34:58 -0700 Jesse Barnes jbar...@virtuousgeek.org wrote: + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl ~GEN7_DOP_CLOCK_GATE_ENABLE); DOP clock gating should be unconditionally disabled, you can move this to the clock gating routine. Ok I dug into this at your prodding and take it back. We can leave DOP clock gating enabled; only VLV needs the unconditional disable, so we don't need to worry about this if/until we add DPF there. -- Jesse Barnes, Intel Open Source Technology Center ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling
On Fri, 25 May 2012 10:34:58 -0700 Jesse Barnes jbar...@virtuousgeek.org wrote: On Fri, 27 Apr 2012 17:40:18 -0700 Ben Widawsky b...@bwidawsk.net wrote: + +/** + * ivybridge_parity_work - Workqueue called when a parity error interrupt + * occurred. + * + * Doesn't actually do anything except notify userspace so that userspace may + * disable things later on. kdoc style comment but missing the @work: parameter. Might be a good place to describe what userspace can do in response (i.e. duplicate some of your commit message here). Got it, Thanks. + */ +static void ivybridge_parity_work(struct work_struct *work) +{ + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, + parity_error_work); + Stray newline. Got it, Thanks. + u32 error_status, row, bank, subbank; + char *parity_event[5]; + uint32_t misccpctl; + unsigned long flags; + + /* We must turn off DOP level clock gating to access the L3 registers. +* In order to prevent a get/put style interface, acquire struct mutex +* any time we access those registers. +*/ + mutex_lock(dev_priv-dev-struct_mutex); + + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl ~GEN7_DOP_CLOCK_GATE_ENABLE); DOP clock gating should be unconditionally disabled, you can move this to the clock gating routine. + POSTING_READ(GEN7_MISCCPCTL); + + error_status = I915_READ(GEN7_L3CDERRST1); + row = GEN7_PARITY_ERROR_ROW(error_status); + bank = GEN7_PARITY_ERROR_BANK(error_status); + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); + + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | + GEN7_L3CDERRST1_ENABLE); + POSTING_READ(GEN7_L3CDERRST1); + + I915_WRITE(GEN7_MISCCPCTL, misccpctl); + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); Is there a debug register we can read to see if we missed any events at this point? A followon patch might be to see when the last event occured, and if we get two or more in rapid succession (like right when we re-enable interrupts) the kernel could automatically do the remapping. But that's totally optional; I doubt we'll see that in practice except on machines we've specially broken in the lab. :) I've come up with some similarly complex ideas in previous versions of the patch, and Daniel seemed to be not so interested because of the expectedly infrequent nature of the events. If Daniel is willing to accept the patch with your suggestion, I would be happy to implement it. Of course the metric for rapid succession would need to be agreed upon. I suggestion. + + mutex_unlock(dev_priv-dev-struct_mutex); + + parity_event[0] = L3_PARITY_ERROR=1; +parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row); + parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank); + parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank); + parity_event[4] = NULL; + + kobject_uevent_env(dev_priv-dev-primary-kdev.kobj, + KOBJ_CHANGE, parity_event); You have a DRM_INFO when the interrupt comes in, may as well spit out the row/bank/subbank info here too. Though I'd make them both DEBUG. Good point. I had that info in an earlier version of the patch. Not sure when it got dropped. Also noted the DRM_DEBUG suggestion, thanks. + + kfree(parity_event[3]); + kfree(parity_event[2]); + kfree(parity_event[1]); +} Hm we really need a man page for our events and such, but that's a separate patch. + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private; + unsigned long flags; + +if (WARN_ON(IS_GEN6(dev))) + return; What's this for? The user won't be able to do anything at this point except get scared... I'd either just make this return if GEN6 or add a GEN7 check before we branch here in the first place from the interrupt handler. It is really just to prevent developers from enabling this interrupt on GEN6, or VLV... It really should have been if (!IS_IVYBRIDGE). I'll drop the WARN_ON and just return. + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); + + queue_work(dev_priv-wq, dev_priv-parity_error_work); + DRM_INFO(Parity error interrupt. Scheduling work\n); +} + static void snb_gt_irq_handler(struct drm_device *dev, struct drm_i915_private *dev_priv, u32 gt_iir) @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev, DRM_ERROR(GT error interrupt 0x%08x\n, gt_iir);
Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling
On Fri, Apr 27, 2012 at 05:40:18PM -0700, Ben Widawsky wrote: On IVB hardware we are given an interrupt whenever a L3 parity error occurs in the L3 cache. The L3 cache is used by internal GPU clients only. This is a very rare occurrence (in fact to test this I need to use specially instrumented silicon). When a row in the L3 cache detects a parity error the HW generates an interrupt. The interrupt is masked in GTIMR until we get a chance to read some registers and alert userspace via a uevent. With this information userspace can use a sysfs interface (follow-up patch) to remap those rows. Way above my level of understanding, but if a given row fails, it is statistically more likely to fail again than a row which has not failed. Therefore it is desirable for an operating system to maintain a lifelong list of failing rows and always remap any bad rows on driver load. Hardware limits the number of rows that are remappable per bank/subbank, and should more than that many rows detect parity errors, software should maintain a list of the most frequent errors, and remap those rows. Signed-off-by: Ben Widawsky b...@bwidawsk.net --- drivers/gpu/drm/i915/i915_drv.h |2 + drivers/gpu/drm/i915/i915_irq.c | 83 +++ drivers/gpu/drm/i915/i915_reg.h | 17 3 files changed, 102 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 69e1539..9505fc0 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -804,6 +804,8 @@ typedef struct drm_i915_private { struct drm_property *broadcast_rgb_property; struct drm_property *force_audio_property; + + struct work_struct parity_error_work; } drm_i915_private_t; enum hdmi_force_audio { diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index ab023ca..81e5a7d 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work) mutex_unlock(dev_priv-dev-struct_mutex); } + +/** + * ivybridge_parity_work - Workqueue called when a parity error interrupt + * occurred. + * + * Doesn't actually do anything except notify userspace so that userspace may + * disable things later on. + */ +static void ivybridge_parity_work(struct work_struct *work) +{ + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, + parity_error_work); + + u32 error_status, row, bank, subbank; + char *parity_event[5]; + uint32_t misccpctl; + unsigned long flags; + + /* We must turn off DOP level clock gating to access the L3 registers. + * In order to prevent a get/put style interface, acquire struct mutex + * any time we access those registers. + */ + mutex_lock(dev_priv-dev-struct_mutex); + + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl ~GEN7_DOP_CLOCK_GATE_ENABLE); + POSTING_READ(GEN7_MISCCPCTL); + + error_status = I915_READ(GEN7_L3CDERRST1); + row = GEN7_PARITY_ERROR_ROW(error_status); + bank = GEN7_PARITY_ERROR_BANK(error_status); + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); + + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | + GEN7_L3CDERRST1_ENABLE); + POSTING_READ(GEN7_L3CDERRST1); + + I915_WRITE(GEN7_MISCCPCTL, misccpctl); + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); + + mutex_unlock(dev_priv-dev-struct_mutex); + + parity_event[0] = L3_PARITY_ERROR=1; + parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row); + parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank); + parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank); + parity_event[4] = NULL; + + kobject_uevent_env(dev_priv-dev-primary-kdev.kobj, +KOBJ_CHANGE, parity_event); + + kfree(parity_event[3]); + kfree(parity_event[2]); + kfree(parity_event[1]); +} + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private; + unsigned long flags; + + if (WARN_ON(IS_GEN6(dev))) + return; + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); + + queue_work(dev_priv-wq, dev_priv-parity_error_work); + DRM_INFO(Parity error interrupt. Scheduling work\n); +} + static void snb_gt_irq_handler(struct drm_device *dev,
[Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling
On IVB hardware we are given an interrupt whenever a L3 parity error occurs in the L3 cache. The L3 cache is used by internal GPU clients only. This is a very rare occurrence (in fact to test this I need to use specially instrumented silicon). When a row in the L3 cache detects a parity error the HW generates an interrupt. The interrupt is masked in GTIMR until we get a chance to read some registers and alert userspace via a uevent. With this information userspace can use a sysfs interface (follow-up patch) to remap those rows. Way above my level of understanding, but if a given row fails, it is statistically more likely to fail again than a row which has not failed. Therefore it is desirable for an operating system to maintain a lifelong list of failing rows and always remap any bad rows on driver load. Hardware limits the number of rows that are remappable per bank/subbank, and should more than that many rows detect parity errors, software should maintain a list of the most frequent errors, and remap those rows. Signed-off-by: Ben Widawsky b...@bwidawsk.net --- drivers/gpu/drm/i915/i915_drv.h |2 + drivers/gpu/drm/i915/i915_irq.c | 83 +++ drivers/gpu/drm/i915/i915_reg.h | 17 3 files changed, 102 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 69e1539..9505fc0 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -804,6 +804,8 @@ typedef struct drm_i915_private { struct drm_property *broadcast_rgb_property; struct drm_property *force_audio_property; + + struct work_struct parity_error_work; } drm_i915_private_t; enum hdmi_force_audio { diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index ab023ca..81e5a7d 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work) mutex_unlock(dev_priv-dev-struct_mutex); } + +/** + * ivybridge_parity_work - Workqueue called when a parity error interrupt + * occurred. + * + * Doesn't actually do anything except notify userspace so that userspace may + * disable things later on. + */ +static void ivybridge_parity_work(struct work_struct *work) +{ + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t, + parity_error_work); + + u32 error_status, row, bank, subbank; + char *parity_event[5]; + uint32_t misccpctl; + unsigned long flags; + + /* We must turn off DOP level clock gating to access the L3 registers. +* In order to prevent a get/put style interface, acquire struct mutex +* any time we access those registers. +*/ + mutex_lock(dev_priv-dev-struct_mutex); + + misccpctl = I915_READ(GEN7_MISCCPCTL); + I915_WRITE(GEN7_MISCCPCTL, misccpctl ~GEN7_DOP_CLOCK_GATE_ENABLE); + POSTING_READ(GEN7_MISCCPCTL); + + error_status = I915_READ(GEN7_L3CDERRST1); + row = GEN7_PARITY_ERROR_ROW(error_status); + bank = GEN7_PARITY_ERROR_BANK(error_status); + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status); + + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID | + GEN7_L3CDERRST1_ENABLE); + POSTING_READ(GEN7_L3CDERRST1); + + I915_WRITE(GEN7_MISCCPCTL, misccpctl); + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); + + mutex_unlock(dev_priv-dev-struct_mutex); + + parity_event[0] = L3_PARITY_ERROR=1; + parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row); + parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank); + parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank); + parity_event[4] = NULL; + + kobject_uevent_env(dev_priv-dev-primary-kdev.kobj, + KOBJ_CHANGE, parity_event); + + kfree(parity_event[3]); + kfree(parity_event[2]); + kfree(parity_event[1]); +} + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private; + unsigned long flags; + + if (WARN_ON(IS_GEN6(dev))) + return; + + spin_lock_irqsave(dev_priv-irq_lock, flags); + dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; + I915_WRITE(GTIMR, dev_priv-gt_irq_mask); + spin_unlock_irqrestore(dev_priv-irq_lock, flags); + + queue_work(dev_priv-wq, dev_priv-parity_error_work); + DRM_INFO(Parity error interrupt. Scheduling work\n); +} + static void snb_gt_irq_handler(struct drm_device *dev, struct drm_i915_private *dev_priv, u32