i915: Dynamic Parity Detection handling

Daniel Vetter Tue, 01 May 2012 11:04:38 -0700

On Fri, Apr 27, 2012 at 05:40:18PM -0700, Ben Widawsky wrote:
> On IVB hardware we are given an interrupt whenever a L3 parity error
> occurs in the L3 cache. The L3 cache is used by internal GPU clients
> only.  This is a very rare occurrence (in fact to test this I need to
> use specially instrumented silicon).
> 
> When a row in the L3 cache detects a parity error the HW generates an
> interrupt. The interrupt is masked in GTIMR until we get a chance to
> read some registers and alert userspace via a uevent. With this
> information userspace can use a sysfs interface (follow-up patch) to
> remap those rows.
> 
> Way above my level of understanding, but if a given row fails, it is
> statistically more likely to fail again than a row which has not failed.
> Therefore it is desirable for an operating system to maintain a lifelong
> list of failing rows and always remap any bad rows on driver load.
> Hardware limits the number of rows that are remappable per bank/subbank,
> and should more than that many rows detect parity errors, software
> should maintain a list of the most frequent errors, and remap those
> rows.
> 
> Signed-off-by: Ben Widawsky <[email protected]>
> ---
>  drivers/gpu/drm/i915/i915_drv.h |    2 +
>  drivers/gpu/drm/i915/i915_irq.c |   83 
> +++++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_reg.h |   17 ++++++++
>  3 files changed, 102 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 69e1539..9505fc0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -804,6 +804,8 @@ typedef struct drm_i915_private {
>  
>       struct drm_property *broadcast_rgb_property;
>       struct drm_property *force_audio_property;
> +
> +     struct work_struct parity_error_work;
>  } drm_i915_private_t;
>  
>  enum hdmi_force_audio {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ab023ca..81e5a7d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work)
>       mutex_unlock(&dev_priv->dev->struct_mutex);
>  }
>  
> +
> +/**
> + * ivybridge_parity_work - Workqueue called when a parity error interrupt
> + * occurred.
> + *
> + * Doesn't actually do anything except notify userspace so that userspace may
> + * disable things later on.
> + */
> +static void ivybridge_parity_work(struct work_struct *work)
> +{
> +     drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
> +                                                 parity_error_work);
> +
> +     u32 error_status, row, bank, subbank;
> +     char *parity_event[5];
> +     uint32_t misccpctl;
> +     unsigned long flags;
> +
> +     /* We must turn off DOP level clock gating to access the L3 registers.
> +      * In order to prevent a get/put style interface, acquire struct mutex
> +      * any time we access those registers.
> +      */
> +     mutex_lock(&dev_priv->dev->struct_mutex);
> +
> +     misccpctl = I915_READ(GEN7_MISCCPCTL);
> +     I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE);
> +     POSTING_READ(GEN7_MISCCPCTL);
> +
> +     error_status = I915_READ(GEN7_L3CDERRST1);
> +     row = GEN7_PARITY_ERROR_ROW(error_status);
> +     bank = GEN7_PARITY_ERROR_BANK(error_status);
> +     subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
> +
> +     I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
> +                                 GEN7_L3CDERRST1_ENABLE);
> +     POSTING_READ(GEN7_L3CDERRST1);
> +
> +     I915_WRITE(GEN7_MISCCPCTL, misccpctl);
> +
> +     spin_lock_irqsave(&dev_priv->irq_lock, flags);
> +     dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
> +     I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
> +     spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
> +
> +     mutex_unlock(&dev_priv->dev->struct_mutex);
> +
> +     parity_event[0] = "L3_PARITY_ERROR=1";
> +     parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row);
> +     parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank);
> +     parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank);
> +     parity_event[4] = NULL;
> +
> +     kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj,
> +                        KOBJ_CHANGE, parity_event);
> +
> +     kfree(parity_event[3]);
> +     kfree(parity_event[2]);
> +     kfree(parity_event[1]);
> +}
> +
> +void ivybridge_handle_parity_error(struct drm_device *dev)
> +{
> +     drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
> +     unsigned long flags;
> +
> +     if (WARN_ON(IS_GEN6(dev)))
> +             return;
> +
> +     spin_lock_irqsave(&dev_priv->irq_lock, flags);
> +     dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
> +     I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
> +     spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
> +
> +     queue_work(dev_priv->wq, &dev_priv->parity_error_work);
> +     DRM_INFO("Parity error interrupt. Scheduling work\n");
> +}
> +
>  static void snb_gt_irq_handler(struct drm_device *dev,
>                              struct drm_i915_private *dev_priv,
>                              u32 gt_iir)
> @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>               DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
>               i915_handle_error(dev, false);
>       }
> +
> +     if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT)
> +             ivybridge_handle_parity_error(dev);
>  }
>  
>  static void gen6_queue_rps_work(struct drm_i915_private *dev_priv,
> @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device 
> *dev)
>       if (IS_GEN6(dev) || IS_IVYBRIDGE(dev))
>               INIT_WORK(&dev_priv->rps_work, gen6_pm_rps_work);
>  
> +     if (IS_IVYBRIDGE(dev))
> +             INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work);
> +


work init has moved to intel_irq_init in dinq, and for good reasons as
I've figured out after merging the patch: _preinstall is also called on
resume, and if we're unlucky we have a work item outstanding from before
the suspend, so that we re-init a live work item. The core work queue
code doesn't approve of that, resulting in decent hilarity (NULL deref
after suspend).
-Daniel

>       I915_WRITE(HWSTAM, 0xeffe);
>  
>       /* XXX hotplug from PCH */
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 5ac9837..72db6a9 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -4030,6 +4030,23 @@
>  #define   GEN6_RC6                   3
>  #define   GEN6_RC7                   4
>  
> +#define GEN7_MISCCPCTL                       (0x9424)
> +#define   GEN7_DOP_CLOCK_GATE_ENABLE (1<<0)
> +
> +/* IVYBRIDGE DPF */
> +#define GEN7_L3CDERRST1                      0xB008 /* L3CD Error Status 1 */
> +#define   GEN7_L3CDERRST1_ROW_MASK   (0x7ff<<14)
> +#define   GEN7_PARITY_ERROR_VALID    (1<<13)
> +#define   GEN7_L3CDERRST1_BANK_MASK  (3<<11)
> +#define   GEN7_L3CDERRST1_SUBBANK_MASK       (7<<8)
> +#define GEN7_PARITY_ERROR_ROW(reg) \
> +             ((reg & GEN7_L3CDERRST1_ROW_MASK) >> 14)
> +#define GEN7_PARITY_ERROR_BANK(reg) \
> +             ((reg & GEN7_L3CDERRST1_BANK_MASK) >> 11)
> +#define GEN7_PARITY_ERROR_SUBBANK(reg) \
> +             ((reg & GEN7_L3CDERRST1_SUBBANK_MASK) >> 8)
> +#define   GEN7_L3CDERRST1_ENABLE     (1<<7)
> +
>  #define G4X_AUD_VID_DID                      0x62020
>  #define INTEL_AUDIO_DEVCL            0x808629FB
>  #define INTEL_AUDIO_DEVBLC           0x80862801
> -- 
> 1.7.10
> 
> _______________________________________________
> Intel-gfx mailing list
> [email protected]
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Mail: [email protected]
Mobile: +41 (0)79 365 57 48
_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

Reply via email to