Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

2012-05-25 Thread Jesse Barnes
On Fri, 27 Apr 2012 17:40:18 -0700
Ben Widawsky b...@bwidawsk.net wrote:
 +
 +/**
 + * ivybridge_parity_work - Workqueue called when a parity error interrupt
 + * occurred.
 + *
 + * Doesn't actually do anything except notify userspace so that userspace may
 + * disable things later on.

kdoc style comment but missing the @work: parameter.

Might be a good place to describe what userspace can do in response
(i.e. duplicate some of your commit message here).

 + */
 +static void ivybridge_parity_work(struct work_struct *work)
 +{
 + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
 + parity_error_work);
 +

Stray newline.

 + u32 error_status, row, bank, subbank;
 + char *parity_event[5];
 + uint32_t misccpctl;
 + unsigned long flags;
 +
 + /* We must turn off DOP level clock gating to access the L3 registers.
 +  * In order to prevent a get/put style interface, acquire struct mutex
 +  * any time we access those registers.
 +  */
 + mutex_lock(dev_priv-dev-struct_mutex);
 +
 + misccpctl = I915_READ(GEN7_MISCCPCTL);
 + I915_WRITE(GEN7_MISCCPCTL, misccpctl  ~GEN7_DOP_CLOCK_GATE_ENABLE);

DOP clock gating should be unconditionally disabled, you can move this
to the clock gating routine.

 + POSTING_READ(GEN7_MISCCPCTL);
 +
 + error_status = I915_READ(GEN7_L3CDERRST1);
 + row = GEN7_PARITY_ERROR_ROW(error_status);
 + bank = GEN7_PARITY_ERROR_BANK(error_status);
 + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
 +
 + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
 + GEN7_L3CDERRST1_ENABLE);
 + POSTING_READ(GEN7_L3CDERRST1);
 +
 + I915_WRITE(GEN7_MISCCPCTL, misccpctl);
 +
 + spin_lock_irqsave(dev_priv-irq_lock, flags);
 + dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
 + I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
 + spin_unlock_irqrestore(dev_priv-irq_lock, flags);

Is there a debug register we can read to see if we missed any events at
this point?  A followon patch might be to see when the last event
occured, and if we get two or more in rapid succession (like right when
we re-enable interrupts) the kernel could automatically do the
remapping.  But that's totally optional; I doubt we'll see that in
practice except on machines we've specially broken in the lab. :)

 +
 + mutex_unlock(dev_priv-dev-struct_mutex);
 +
 + parity_event[0] = L3_PARITY_ERROR=1;
 + parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row);
 + parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank);
 + parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank);
 + parity_event[4] = NULL;
 +
 + kobject_uevent_env(dev_priv-dev-primary-kdev.kobj,
 +KOBJ_CHANGE, parity_event);

You have a DRM_INFO when the interrupt comes in, may as well spit out
the row/bank/subbank info here too.  Though I'd make them both DEBUG.

 +
 + kfree(parity_event[3]);
 + kfree(parity_event[2]);
 + kfree(parity_event[1]);
 +}

Hm we really need a man page for our events and such, but that's a
separate patch.

 +
 +void ivybridge_handle_parity_error(struct drm_device *dev)
 +{
 + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private;
 + unsigned long flags;
 +
 + if (WARN_ON(IS_GEN6(dev)))
 + return;

What's this for?  The user won't be able to do anything at this
point except get scared...  I'd either just make this return if GEN6 or
add a GEN7 check before we branch here in the first place from the
interrupt handler.

 +
 + spin_lock_irqsave(dev_priv-irq_lock, flags);
 + dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
 + I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
 + spin_unlock_irqrestore(dev_priv-irq_lock, flags);
 +
 + queue_work(dev_priv-wq, dev_priv-parity_error_work);
 + DRM_INFO(Parity error interrupt. Scheduling work\n);
 +}
 +
  static void snb_gt_irq_handler(struct drm_device *dev,
  struct drm_i915_private *dev_priv,
  u32 gt_iir)
 @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev,
   DRM_ERROR(GT error interrupt 0x%08x\n, gt_iir);
   i915_handle_error(dev, false);
   }
 +
 + if (gt_iir  GT_GEN7_L3_PARITY_ERROR_INTERRUPT)
 + ivybridge_handle_parity_error(dev);
  }
  
  static void gen6_queue_rps_work(struct drm_i915_private *dev_priv,
 @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device 
 *dev)
   if (IS_GEN6(dev) || IS_IVYBRIDGE(dev))
   INIT_WORK(dev_priv-rps_work, gen6_pm_rps_work);
  
 + if (IS_IVYBRIDGE(dev))
 + INIT_WORK(dev_priv-parity_error_work, ivybridge_parity_work);


Looks good otherwise, so with the above fixed up or denied:

Reviewed-by: Jesse Barnes jbar...@virtuousgeek.org

-- 
Jesse Barnes, Intel Open Source 

Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

2012-05-25 Thread Jesse Barnes
On Fri, 25 May 2012 10:34:58 -0700
Jesse Barnes jbar...@virtuousgeek.org wrote:
  +   misccpctl = I915_READ(GEN7_MISCCPCTL);
  +   I915_WRITE(GEN7_MISCCPCTL, misccpctl  ~GEN7_DOP_CLOCK_GATE_ENABLE);
 
 DOP clock gating should be unconditionally disabled, you can move this
 to the clock gating routine.

Ok I dug into this at your prodding and take it back.  We can leave DOP
clock gating enabled; only VLV needs the unconditional disable, so we
don't need to worry about this if/until we add DPF there.

-- 
Jesse Barnes, Intel Open Source Technology Center
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

2012-05-25 Thread Ben Widawsky
On Fri, 25 May 2012 10:34:58 -0700
Jesse Barnes jbar...@virtuousgeek.org wrote:

 On Fri, 27 Apr 2012 17:40:18 -0700
 Ben Widawsky b...@bwidawsk.net wrote:
  +
  +/**
  + * ivybridge_parity_work - Workqueue called when a parity error interrupt
  + * occurred.
  + *
  + * Doesn't actually do anything except notify userspace so that userspace 
  may
  + * disable things later on.
 
 kdoc style comment but missing the @work: parameter.
 
 Might be a good place to describe what userspace can do in response
 (i.e. duplicate some of your commit message here).
 

Got it, Thanks.

  + */
  +static void ivybridge_parity_work(struct work_struct *work)
  +{
  +   drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
  +   parity_error_work);
  +
 
 Stray newline.

Got it, Thanks.

 
  +   u32 error_status, row, bank, subbank;
  +   char *parity_event[5];
  +   uint32_t misccpctl;
  +   unsigned long flags;
  +
  +   /* We must turn off DOP level clock gating to access the L3 registers.
  +* In order to prevent a get/put style interface, acquire struct mutex
  +* any time we access those registers.
  +*/
  +   mutex_lock(dev_priv-dev-struct_mutex);
  +
  +   misccpctl = I915_READ(GEN7_MISCCPCTL);
  +   I915_WRITE(GEN7_MISCCPCTL, misccpctl  ~GEN7_DOP_CLOCK_GATE_ENABLE);
 
 DOP clock gating should be unconditionally disabled, you can move this
 to the clock gating routine.
 
  +   POSTING_READ(GEN7_MISCCPCTL);
  +
  +   error_status = I915_READ(GEN7_L3CDERRST1);
  +   row = GEN7_PARITY_ERROR_ROW(error_status);
  +   bank = GEN7_PARITY_ERROR_BANK(error_status);
  +   subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
  +
  +   I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
  +   GEN7_L3CDERRST1_ENABLE);
  +   POSTING_READ(GEN7_L3CDERRST1);
  +
  +   I915_WRITE(GEN7_MISCCPCTL, misccpctl);
  +
  +   spin_lock_irqsave(dev_priv-irq_lock, flags);
  +   dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
  +   I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
  +   spin_unlock_irqrestore(dev_priv-irq_lock, flags);
 
 Is there a debug register we can read to see if we missed any events at
 this point?  A followon patch might be to see when the last event
 occured, and if we get two or more in rapid succession (like right when
 we re-enable interrupts) the kernel could automatically do the
 remapping.  But that's totally optional; I doubt we'll see that in
 practice except on machines we've specially broken in the lab. :)
 

I've come up with some similarly complex ideas in previous versions of
the patch, and Daniel seemed to be not so interested because of the
expectedly infrequent nature of the events. 

If Daniel is willing to accept the patch with your suggestion, I would
be happy to implement it. Of course the metric for rapid succession
would need to be agreed upon. I suggestion.

  + + mutex_unlock(dev_priv-dev-struct_mutex); + + parity_event[0]
  = L3_PARITY_ERROR=1; +parity_event[1] = kasprintf(GFP_KERNEL,
  ROW=%d, row); +   parity_event[2] = kasprintf(GFP_KERNEL,
  BANK=%d, bank); + parity_event[3] = kasprintf(GFP_KERNEL,
  SUBBANK=%d, subbank); +   parity_event[4] = NULL; + +
  kobject_uevent_env(dev_priv-dev-primary-kdev.kobj, +
  KOBJ_CHANGE, parity_event);
 
 You have a DRM_INFO when the interrupt comes in, may as well spit out
 the row/bank/subbank info here too.  Though I'd make them both DEBUG.

Good point. I had that info in an earlier version of the patch. Not sure
when it got dropped. Also noted the DRM_DEBUG suggestion, thanks.

 
  + + kfree(parity_event[3]); +   kfree(parity_event[2]); +
  kfree(parity_event[1]); +}
 
 Hm we really need a man page for our events and such, but that's a
 separate patch.
 
  + +void ivybridge_handle_parity_error(struct drm_device *dev) +{ +
  drm_i915_private_t *dev_priv = (drm_i915_private_t *)
  dev-dev_private; + unsigned long flags; + +if
  (WARN_ON(IS_GEN6(dev))) +   return;
 
 What's this for?  The user won't be able to do anything at this point
 except get scared...  I'd either just make this return if GEN6 or add
 a GEN7 check before we branch here in the first place from the
 interrupt handler.

It is really just to prevent developers from enabling this interrupt on
GEN6, or VLV... It really should have been if (!IS_IVYBRIDGE). I'll drop
the WARN_ON and just return.

 
  + + spin_lock_irqsave(dev_priv-irq_lock, flags); +
  dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT; +
  I915_WRITE(GTIMR, dev_priv-gt_irq_mask); +
  spin_unlock_irqrestore(dev_priv-irq_lock, flags); + +
  queue_work(dev_priv-wq, dev_priv-parity_error_work); +
  DRM_INFO(Parity error interrupt. Scheduling work\n); +} + static
  void snb_gt_irq_handler(struct drm_device *dev, struct
  drm_i915_private *dev_priv, u32 gt_iir) @@ -449,6 +526,9 @@ static
  void snb_gt_irq_handler(struct drm_device *dev, DRM_ERROR(GT error
  interrupt 0x%08x\n, gt_iir); 

Re: [Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

2012-05-01 Thread Daniel Vetter
On Fri, Apr 27, 2012 at 05:40:18PM -0700, Ben Widawsky wrote:
 On IVB hardware we are given an interrupt whenever a L3 parity error
 occurs in the L3 cache. The L3 cache is used by internal GPU clients
 only.  This is a very rare occurrence (in fact to test this I need to
 use specially instrumented silicon).
 
 When a row in the L3 cache detects a parity error the HW generates an
 interrupt. The interrupt is masked in GTIMR until we get a chance to
 read some registers and alert userspace via a uevent. With this
 information userspace can use a sysfs interface (follow-up patch) to
 remap those rows.
 
 Way above my level of understanding, but if a given row fails, it is
 statistically more likely to fail again than a row which has not failed.
 Therefore it is desirable for an operating system to maintain a lifelong
 list of failing rows and always remap any bad rows on driver load.
 Hardware limits the number of rows that are remappable per bank/subbank,
 and should more than that many rows detect parity errors, software
 should maintain a list of the most frequent errors, and remap those
 rows.
 
 Signed-off-by: Ben Widawsky b...@bwidawsk.net
 ---
  drivers/gpu/drm/i915/i915_drv.h |2 +
  drivers/gpu/drm/i915/i915_irq.c |   83 
 +++
  drivers/gpu/drm/i915/i915_reg.h |   17 
  3 files changed, 102 insertions(+)
 
 diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
 index 69e1539..9505fc0 100644
 --- a/drivers/gpu/drm/i915/i915_drv.h
 +++ b/drivers/gpu/drm/i915/i915_drv.h
 @@ -804,6 +804,8 @@ typedef struct drm_i915_private {
  
   struct drm_property *broadcast_rgb_property;
   struct drm_property *force_audio_property;
 +
 + struct work_struct parity_error_work;
  } drm_i915_private_t;
  
  enum hdmi_force_audio {
 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
 index ab023ca..81e5a7d 100644
 --- a/drivers/gpu/drm/i915/i915_irq.c
 +++ b/drivers/gpu/drm/i915/i915_irq.c
 @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work)
   mutex_unlock(dev_priv-dev-struct_mutex);
  }
  
 +
 +/**
 + * ivybridge_parity_work - Workqueue called when a parity error interrupt
 + * occurred.
 + *
 + * Doesn't actually do anything except notify userspace so that userspace may
 + * disable things later on.
 + */
 +static void ivybridge_parity_work(struct work_struct *work)
 +{
 + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
 + parity_error_work);
 +
 + u32 error_status, row, bank, subbank;
 + char *parity_event[5];
 + uint32_t misccpctl;
 + unsigned long flags;
 +
 + /* We must turn off DOP level clock gating to access the L3 registers.
 +  * In order to prevent a get/put style interface, acquire struct mutex
 +  * any time we access those registers.
 +  */
 + mutex_lock(dev_priv-dev-struct_mutex);
 +
 + misccpctl = I915_READ(GEN7_MISCCPCTL);
 + I915_WRITE(GEN7_MISCCPCTL, misccpctl  ~GEN7_DOP_CLOCK_GATE_ENABLE);
 + POSTING_READ(GEN7_MISCCPCTL);
 +
 + error_status = I915_READ(GEN7_L3CDERRST1);
 + row = GEN7_PARITY_ERROR_ROW(error_status);
 + bank = GEN7_PARITY_ERROR_BANK(error_status);
 + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
 +
 + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
 + GEN7_L3CDERRST1_ENABLE);
 + POSTING_READ(GEN7_L3CDERRST1);
 +
 + I915_WRITE(GEN7_MISCCPCTL, misccpctl);
 +
 + spin_lock_irqsave(dev_priv-irq_lock, flags);
 + dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
 + I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
 + spin_unlock_irqrestore(dev_priv-irq_lock, flags);
 +
 + mutex_unlock(dev_priv-dev-struct_mutex);
 +
 + parity_event[0] = L3_PARITY_ERROR=1;
 + parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row);
 + parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank);
 + parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank);
 + parity_event[4] = NULL;
 +
 + kobject_uevent_env(dev_priv-dev-primary-kdev.kobj,
 +KOBJ_CHANGE, parity_event);
 +
 + kfree(parity_event[3]);
 + kfree(parity_event[2]);
 + kfree(parity_event[1]);
 +}
 +
 +void ivybridge_handle_parity_error(struct drm_device *dev)
 +{
 + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private;
 + unsigned long flags;
 +
 + if (WARN_ON(IS_GEN6(dev)))
 + return;
 +
 + spin_lock_irqsave(dev_priv-irq_lock, flags);
 + dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
 + I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
 + spin_unlock_irqrestore(dev_priv-irq_lock, flags);
 +
 + queue_work(dev_priv-wq, dev_priv-parity_error_work);
 + DRM_INFO(Parity error interrupt. Scheduling work\n);
 +}
 +
  static void snb_gt_irq_handler(struct drm_device *dev,

[Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

2012-04-27 Thread Ben Widawsky
On IVB hardware we are given an interrupt whenever a L3 parity error
occurs in the L3 cache. The L3 cache is used by internal GPU clients
only.  This is a very rare occurrence (in fact to test this I need to
use specially instrumented silicon).

When a row in the L3 cache detects a parity error the HW generates an
interrupt. The interrupt is masked in GTIMR until we get a chance to
read some registers and alert userspace via a uevent. With this
information userspace can use a sysfs interface (follow-up patch) to
remap those rows.

Way above my level of understanding, but if a given row fails, it is
statistically more likely to fail again than a row which has not failed.
Therefore it is desirable for an operating system to maintain a lifelong
list of failing rows and always remap any bad rows on driver load.
Hardware limits the number of rows that are remappable per bank/subbank,
and should more than that many rows detect parity errors, software
should maintain a list of the most frequent errors, and remap those
rows.

Signed-off-by: Ben Widawsky b...@bwidawsk.net
---
 drivers/gpu/drm/i915/i915_drv.h |2 +
 drivers/gpu/drm/i915/i915_irq.c |   83 +++
 drivers/gpu/drm/i915/i915_reg.h |   17 
 3 files changed, 102 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 69e1539..9505fc0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -804,6 +804,8 @@ typedef struct drm_i915_private {
 
struct drm_property *broadcast_rgb_property;
struct drm_property *force_audio_property;
+
+   struct work_struct parity_error_work;
 } drm_i915_private_t;
 
 enum hdmi_force_audio {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ab023ca..81e5a7d 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work)
mutex_unlock(dev_priv-dev-struct_mutex);
 }
 
+
+/**
+ * ivybridge_parity_work - Workqueue called when a parity error interrupt
+ * occurred.
+ *
+ * Doesn't actually do anything except notify userspace so that userspace may
+ * disable things later on.
+ */
+static void ivybridge_parity_work(struct work_struct *work)
+{
+   drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
+   parity_error_work);
+
+   u32 error_status, row, bank, subbank;
+   char *parity_event[5];
+   uint32_t misccpctl;
+   unsigned long flags;
+
+   /* We must turn off DOP level clock gating to access the L3 registers.
+* In order to prevent a get/put style interface, acquire struct mutex
+* any time we access those registers.
+*/
+   mutex_lock(dev_priv-dev-struct_mutex);
+
+   misccpctl = I915_READ(GEN7_MISCCPCTL);
+   I915_WRITE(GEN7_MISCCPCTL, misccpctl  ~GEN7_DOP_CLOCK_GATE_ENABLE);
+   POSTING_READ(GEN7_MISCCPCTL);
+
+   error_status = I915_READ(GEN7_L3CDERRST1);
+   row = GEN7_PARITY_ERROR_ROW(error_status);
+   bank = GEN7_PARITY_ERROR_BANK(error_status);
+   subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
+
+   I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
+   GEN7_L3CDERRST1_ENABLE);
+   POSTING_READ(GEN7_L3CDERRST1);
+
+   I915_WRITE(GEN7_MISCCPCTL, misccpctl);
+
+   spin_lock_irqsave(dev_priv-irq_lock, flags);
+   dev_priv-gt_irq_mask = ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
+   I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
+   spin_unlock_irqrestore(dev_priv-irq_lock, flags);
+
+   mutex_unlock(dev_priv-dev-struct_mutex);
+
+   parity_event[0] = L3_PARITY_ERROR=1;
+   parity_event[1] = kasprintf(GFP_KERNEL, ROW=%d, row);
+   parity_event[2] = kasprintf(GFP_KERNEL, BANK=%d, bank);
+   parity_event[3] = kasprintf(GFP_KERNEL, SUBBANK=%d, subbank);
+   parity_event[4] = NULL;
+
+   kobject_uevent_env(dev_priv-dev-primary-kdev.kobj,
+  KOBJ_CHANGE, parity_event);
+
+   kfree(parity_event[3]);
+   kfree(parity_event[2]);
+   kfree(parity_event[1]);
+}
+
+void ivybridge_handle_parity_error(struct drm_device *dev)
+{
+   drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev-dev_private;
+   unsigned long flags;
+
+   if (WARN_ON(IS_GEN6(dev)))
+   return;
+
+   spin_lock_irqsave(dev_priv-irq_lock, flags);
+   dev_priv-gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
+   I915_WRITE(GTIMR, dev_priv-gt_irq_mask);
+   spin_unlock_irqrestore(dev_priv-irq_lock, flags);
+
+   queue_work(dev_priv-wq, dev_priv-parity_error_work);
+   DRM_INFO(Parity error interrupt. Scheduling work\n);
+}
+
 static void snb_gt_irq_handler(struct drm_device *dev,
   struct drm_i915_private *dev_priv,
   u32