On Sat, 2012-09-08 at 18:05 +0300, Dimitar Dimitrov wrote:
> Very rare kernel crashes are reported on a custom OMAP4 board. Kernel
> panics due to corrupted completion structure while executing
> dispc_irq_wait_handler(). Excerpt from kernel log:
> 
>   Internal error: Oops - undefined instruction: 0 [#1] PREEMPT SMP
>   Unable to handle kernel paging request at virtual address 00400130
>   ...
>   PC is at 0xebf205bc
>   LR is at __wake_up_common+0x54/0x94
>   ...
>   (__wake_up_common+0x0/0x94)
>   (complete+0x0/0x60)
>   (dispc_irq_wait_handler.36902+0x0/0x14)
>   (omap_dispc_irq_handler+0x0/0x354)
>   (handle_irq_event_percpu+0x0/0x188)
>   (handle_irq_event+0x0/0x64)
>   (handle_fasteoi_irq+0x0/0x10c)
>   (generic_handle_irq+0x0/0x48)
>   (asm_do_IRQ+0x0/0xc0)
> 
> DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> unregister_isr() and DISPC IRQ might be running in parallel on different
> CPUs. So there is a chance that a callback is executed even though it
> has been unregistered. As omap_dispc_wait_for_irq_timeout() declares a
> completion on stack, the dispc_irq_wait_handler() callback might try to
> access a completion structure that is invalid. This leads to crashes and
> hangs.
> 
> Solution is to divide unregister calls into two sets:
>   1. Non-strict unregistering of callbacks. Callbacks could safely be
>      executed after unregistering them. This is the case with unregister
>      calls from the IRQ handler itself.
>   2. Strict (synchronized) unregistering. Callbacks are not allowed
>      after unregistering. This is the case with completion waiting.
> 
> The above solution should satisfy one of the original intentions of the
> driver: callbacks should be able to unregister themselves.
> 
> Also fix DSI IRQ unregister which has similar logic to DISPC IRQ handling.
> 
> Signed-off-by: Dimitar Dimitrov <dinu...@gmail.com>
> ---
> 
> WARNING: This bug is quite old. The patch has been tested on v3.0. No testing
> has been done after rebasing to v3.6. Hence the RFC tag. Hopefully someone
> will beat me in testing with latest linux-omap/master.
> 
> Changes since v1 per Tomi Valkeinen's comments:
>   - Don't rename omap_dispc_unregister_isr, just introduce nosync variant.
>   - Apply the same fix for DSI IRQ which suffers from the same race condition.

I made a quick test and works for me, although I haven't encountered the
problem itself.

Some mostly cosmetic comments below.

This seems to apply cleanly to v3.4+ kernels, but not earlier ones. Do
you want to make the needed modifications and mail this and the modified
patches for stable kernels also? I can do that also if you don't want
to.

>  drivers/staging/omapdrm/omap_plane.c |    2 +-
>  drivers/video/omap2/dss/apply.c      |    2 +-
>  drivers/video/omap2/dss/dispc.c      |   45 
> +++++++++++++++++++++++++++++-----
>  drivers/video/omap2/dss/dsi.c        |   19 ++++++++++++++
>  include/video/omapdss.h              |    1 +
>  5 files changed, 61 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/staging/omapdrm/omap_plane.c 
> b/drivers/staging/omapdrm/omap_plane.c
> index 7997be7..8d8aa5b 100644
> --- a/drivers/staging/omapdrm/omap_plane.c
> +++ b/drivers/staging/omapdrm/omap_plane.c
> @@ -82,7 +82,7 @@ static void dispc_isr(void *arg, uint32_t mask)
>       struct omap_plane *omap_plane = to_omap_plane(plane);
>       struct omap_drm_private *priv = plane->dev->dev_private;
>  
> -     omap_dispc_unregister_isr(dispc_isr, plane,
> +     omap_dispc_unregister_isr_nosync(dispc_isr, plane,
>                       id2irq[omap_plane->ovl->id]);
>  
>       queue_work(priv->wq, &omap_plane->work);
> diff --git a/drivers/video/omap2/dss/apply.c b/drivers/video/omap2/dss/apply.c
> index 0fefc68..9386834 100644
> --- a/drivers/video/omap2/dss/apply.c
> +++ b/drivers/video/omap2/dss/apply.c
> @@ -847,7 +847,7 @@ static void dss_unregister_vsync_isr(void)
>       for (i = 0; i < num_mgrs; ++i)
>               mask |= dispc_mgr_get_framedone_irq(i);
>  
> -     r = omap_dispc_unregister_isr(dss_apply_irq_handler, NULL, mask);
> +     r = omap_dispc_unregister_isr_nosync(dss_apply_irq_handler, NULL, mask);
>       WARN_ON(r);
>  
>       dss_data.irq_enabled = false;
> diff --git a/drivers/video/omap2/dss/dispc.c b/drivers/video/omap2/dss/dispc.c
> index ee9e296..a67d92c 100644
> --- a/drivers/video/omap2/dss/dispc.c
> +++ b/drivers/video/omap2/dss/dispc.c
> @@ -2421,8 +2421,8 @@ static void dispc_mgr_enable_digit_out(bool enable)
>                                       enable ? "start" : "stop");
>       }
>  
> -     r = omap_dispc_unregister_isr(dispc_disable_isr, &frame_done_completion,
> -                     irq_mask);
> +     r = omap_dispc_unregister_isr(dispc_disable_isr,
> +                     &frame_done_completion, irq_mask);

This change is not needed.

>       if (r)
>               DSSERR("failed to unregister %x isr\n", irq_mask);
>  
> @@ -3320,7 +3320,8 @@ err:
>  }
>  EXPORT_SYMBOL(omap_dispc_register_isr);
>  
> -int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
> +/* WARNING: callback might be executed even after this function returns! */
> +int omap_dispc_unregister_isr_nosync(omap_dispc_isr_t isr, void *arg, u32 
> mask)
>  {
>       int i;
>       unsigned long flags;
> @@ -3352,7 +3353,37 @@ int omap_dispc_unregister_isr(omap_dispc_isr_t isr, 
> void *arg, u32 mask)
>  
>       return ret;
>  }
> -EXPORT_SYMBOL(omap_dispc_unregister_isr);
> +EXPORT_SYMBOL(omap_dispc_unregister_isr_nosync);
> +
> +/*
> + * Ensure that callback <isr> will NOT be executed after this function
> + * returns. Must be called from sleepable context, though!
> + */
> +int omap_dispc_unregister_isr(omap_dispc_isr_t isr, void *arg, u32 mask)
> +{
> +     int ret;
> +
> +     ret = omap_dispc_unregister_isr_nosync(isr, arg, mask);
> +
> +     /* Task context is not really needed. But if we're called from atomic
> +      * context, it is probably from DISPC IRQ, where we will deadlock.
> +      * So use might_sleep() to catch potential deadlocks.
> +      */

Use the kernel multi-line comment style, i.e.:

/*
 * foobar
 */

> +     might_sleep();
> +
> +#if defined(CONFIG_SMP)
> +     /* DISPC IRQ executes callbacks with dispc.irq_lock released. Hence
> +      * unregister_isr() and DISPC IRQ might be running in parallel on
> +      * different CPUs. So there is a chance that a callback is executed
> +      * even though it has been unregistered. Add a barrier, in order to
> +      * ensure that after returning from this function, the new DISPC IRQ
> +      * will use an updated callback array, and NOT its cached copy.
> +      */

Comment style here also.

> +     synchronize_irq(dispc.irq);
> +#endif

Why do you use #if defined(CONFIG_SMP)? synchronize_irq is defined
with !CONFIG_SMP also. In that case it becomes just barrier().

> +
> +     return ret;
> +}
>  
>  #ifdef DEBUG
>  static void print_irq_status(u32 status)
> @@ -3567,7 +3598,8 @@ int omap_dispc_wait_for_irq_timeout(u32 irqmask, 
> unsigned long timeout)
>  
>       timeout = wait_for_completion_timeout(&completion, timeout);
>  
> -     omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
> +     omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
> +                     irqmask);

Change not needed.

>  
>       if (timeout == 0)
>               return -ETIMEDOUT;
> @@ -3598,7 +3630,8 @@ int omap_dispc_wait_for_irq_interruptible_timeout(u32 
> irqmask,
>       timeout = wait_for_completion_interruptible_timeout(&completion,
>                       timeout);
>  
> -     omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
> +     omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion,
> +                     irqmask);

Change not needed.

>  
>       if (timeout == 0)
>               return -ETIMEDOUT;
> diff --git a/drivers/video/omap2/dss/dsi.c b/drivers/video/omap2/dss/dsi.c
> index b07e886..24b4a3e 100644
> --- a/drivers/video/omap2/dss/dsi.c
> +++ b/drivers/video/omap2/dss/dsi.c
> @@ -960,6 +960,13 @@ static int dsi_unregister_isr(struct platform_device 
> *dsidev,
>  
>       spin_unlock_irqrestore(&dsi->irq_lock, flags);
>  
> +     might_sleep();
> +
> +#if defined(CONFIG_SMP)
> +     /* See notes for dispc.c:omap_dispc_unregister_isr() . */
> +     synchronize_irq(dsi->irq);
> +#endif
> +

Same SMP comment as with dispc.

 Tomi

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to