Re: [Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex

2018-01-23 Thread Chris Wilson
Quoting Michal Wajdeczko (2018-01-23 16:44:33)
> On Tue, 23 Jan 2018 16:42:17 +0100, Sagar Arun Kamble  
> > @@ -605,7 +681,11 @@ int i915_guc_log_control(struct drm_i915_private  
> > *dev_priv, u64 control_val)
> >   }
> >   /* GuC logging is currently the only user of Guc2Host 
> > interrupts */
> > + mutex_lock(&dev_priv->drm.struct_mutex);
> > + intel_runtime_pm_get(dev_priv);
> >   gen9_enable_guc_interrupts(dev_priv);
> > + intel_runtime_pm_put(dev_priv);
> > + mutex_unlock(&dev_priv->drm.struct_mutex);
> 
> Maybe pm_get/lock/xxx/unlock/pm_put would be better order ?

There's no strong reason to prefer one nesting or the other. So pick a
pattern for guc_interrupts and stick to it :)
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


Re: [Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex

2018-01-23 Thread Michal Wajdeczko
On Tue, 23 Jan 2018 16:42:17 +0100, Sagar Arun Kamble  
 wrote:



This patch fixes lockdep issue due to circular locking dependency of
struct_mutex, i_mutex_key, mmap_sem, relay_channels_mutex.
For GuC log relay channel we create debugfs file that requires  
i_mutex_key

lock and we are doing that under struct_mutex. So we introduced newer
dependency as:
&dev->struct_mutex --> &sb->s_type->i_mutex_key#3 --> &mm->mmap_sem
However, there is dependency from mmap_sem to struct_mutex. Hence we
separate the relay create/destroy operation from under struct_mutex.



... 

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c  
b/drivers/gpu/drm/i915/i915_debugfs.c

index 80dc679..b45be0d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2467,7 +2467,6 @@ static int i915_guc_log_control_get(void *data,  
u64 *val)

 static int i915_guc_log_control_set(void *data, u64 val)
 {
struct drm_i915_private *dev_priv = data;
-   int ret;
if (!HAS_GUC(dev_priv))
return -ENODEV;
@@ -2475,16 +2474,7 @@ static int i915_guc_log_control_set(void *data,  
u64 val)

if (!dev_priv->guc.log.vma)
return -EINVAL;
-   ret = mutex_lock_interruptible(&dev_priv->drm.struct_mutex);
-   if (ret)
-   return ret;
-
-   intel_runtime_pm_get(dev_priv);
-   ret = i915_guc_log_control(dev_priv, val);
-   intel_runtime_pm_put(dev_priv);
-
-   mutex_unlock(&dev_priv->drm.struct_mutex);
-   return ret;
+   return i915_guc_log_control(dev_priv, val);


I hope that one day we change signature of this function to

int intel_guc_log_control(struct intel_guc *guc, u64 control);

... 

diff --git a/drivers/gpu/drm/i915/intel_guc.c  
b/drivers/gpu/drm/i915/intel_guc.c

index ea30e7c..cab3c98 100644
--- a/drivers/gpu/drm/i915/intel_guc.c
+++ b/drivers/gpu/drm/i915/intel_guc.c
@@ -66,6 +66,7 @@ void intel_guc_init_early(struct intel_guc *guc)
intel_guc_ct_init_early(&guc->ct);
mutex_init(&guc->send_mutex);
+   mutex_init(&guc->log.runtime.relay_lock);


Maybe this can be done in intel_guc_log.c (or .h) as

void intel_guc_log_init_early() { }


guc->send = intel_guc_send_nop;
guc->notify = gen8_guc_raise_irq;
 }
@@ -87,8 +88,10 @@ int intel_guc_init_wq(struct intel_guc *guc)
 */
guc->log.runtime.flush_wq = alloc_ordered_workqueue("i915-guc_log",
WQ_HIGHPRI | WQ_FREEZABLE);
-   if (!guc->log.runtime.flush_wq)
+   if (!guc->log.runtime.flush_wq) {
+   DRM_ERROR("Couldn't allocate workqueue for GuC log\n");
return -ENOMEM;
+   }
/*
 * Even though both sending GuC action, and adding a new workitem to
@@ -109,6 +112,8 @@ int intel_guc_init_wq(struct intel_guc *guc)
  WQ_HIGHPRI);
if (!guc->preempt_wq) {
destroy_workqueue(guc->log.runtime.flush_wq);
+   DRM_ERROR("Couldn't allocate workqueue for GuC "
+ "preemption\n");
return -ENOMEM;
}
}


... 



static void capture_logs_work(struct work_struct *work)
@@ -363,12 +377,12 @@ static int guc_log_runtime_create(struct intel_guc  
*guc)

 {
struct drm_i915_private *dev_priv = guc_to_i915(guc);
void *vaddr;
-   struct rchan *guc_log_relay_chan;
-   size_t n_subbufs, subbuf_size;
int ret;
lockdep_assert_held(&dev_priv->drm.struct_mutex);
+   GEM_BUG_ON(!guc_log_has_relay(guc));
+


Do we need this line?


GEM_BUG_ON(guc_log_has_runtime(guc));
ret = i915_gem_object_set_to_wc_domain(guc->log.vma->obj, true);
@@ -387,8 +401,40 @@ static int guc_log_runtime_create(struct intel_guc  
*guc)

guc->log.runtime.buf_addr = vaddr;
+   INIT_WORK(&guc->log.runtime.flush_work, capture_logs_work);


I'm not sure about other BKMs, but I prefer to not delay such  
initialization

and perform them in functions like init_early


+
+   return 0;
+}
+
+static void guc_log_runtime_destroy(struct intel_guc *guc)
+{
+   /*
+* It's possible that the runtime stuff was never allocated because
+* GuC log was disabled at the boot time.
+**/


Is this correct comment style ?


+   if (!guc_log_has_runtime(guc))
+   return;
+
+   i915_gem_object_unpin_map(guc->log.vma->obj);
+   guc->log.runtime.buf_addr = NULL;
+}
+


... 

@@ -605,7 +681,11 @@ int i915_guc_log_control(struct drm_i915_private  
*dev_priv, u64 control_val)

}
/* GuC logging is currently the only user of Guc2Host 
interrupts */
+   mutex_lock(&dev_priv->drm.struct_mutex);
+   intel_runtime_pm_get(dev_priv);
gen9_enable_guc_interrupts(dev_priv);
+   intel_runtime_pm_put(dev_priv);
+   

Re: [Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex

2018-01-23 Thread Chris Wilson
Quoting Sagar Arun Kamble (2018-01-23 15:42:17)
>  static void *guc_get_write_buffer(struct intel_guc *guc)
>  {
> -   if (!guc->log.runtime.relay_chan)
> +   if (!guc_log_has_relay(guc))
> return NULL;
>  
> /* Just get the base address of a new sub buffer and copy data into it
> @@ -266,7 +276,9 @@ static void guc_read_update_log_buffer(struct intel_guc 
> *guc)
> log_buf_state = src_data = guc->log.runtime.buf_addr;
>  
> /* Get the pointer to local buffer to store the logs */
> +   mutex_lock(&guc->log.runtime.relay_lock);
> log_buf_snapshot_state = dst_data = guc_get_write_buffer(guc);
> +   mutex_unlock(&guc->log.runtime.relay_lock);

Since dst_data/log_buf_snapshot_state are pointing into the relay_chan
they are only valid underneath the relay_lock (we don't have any
references or whatnot).

> /* Actual logs are present from the 2nd page */
> src_data += PAGE_SIZE;
> @@ -335,6 +347,7 @@ static void guc_read_update_log_buffer(struct intel_guc 
> *guc)
> dst_data += buffer_size;
> }
>  
> +   mutex_lock(&guc->log.runtime.relay_lock);
> if (log_buf_snapshot_state)
> guc_move_to_next_buf(guc);
> else {
> @@ -344,6 +357,7 @@ static void guc_read_update_log_buffer(struct intel_guc 
> *guc)
> DRM_ERROR_RATELIMITED("no sub-buffer to capture logs\n");
> guc->log.capture_miss_count++;
> }
> +   mutex_unlock(&guc->log.runtime.relay_lock);

I.e. it's no good just reacquiring the lock as the state may have
perished in between.
-Chris
___
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx


[Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex

2018-01-23 Thread Sagar Arun Kamble
This patch fixes lockdep issue due to circular locking dependency of
struct_mutex, i_mutex_key, mmap_sem, relay_channels_mutex.
For GuC log relay channel we create debugfs file that requires i_mutex_key
lock and we are doing that under struct_mutex. So we introduced newer
dependency as:
&dev->struct_mutex --> &sb->s_type->i_mutex_key#3 --> &mm->mmap_sem
However, there is dependency from mmap_sem to struct_mutex. Hence we
separate the relay create/destroy operation from under struct_mutex.

==
WARNING: possible circular locking dependency detected
4.15.0-rc6-CI-Patchwork_7614+ #1 Not tainted
--
debugfs_test/1388 is trying to acquire lock:
 (&dev->struct_mutex){+.+.}, at: [] 
i915_mutex_lock_interruptible+0x47/0x130 [i915]

but task is already holding lock:
 (&mm->mmap_sem){}, at: [<29a9c131>] __do_page_fault+0x106/0x560

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&mm->mmap_sem){}:
   _copy_to_user+0x1e/0x70
   filldir+0x8c/0xf0
   dcache_readdir+0xeb/0x160
   iterate_dir+0xdc/0x140
   SyS_getdents+0xa0/0x130
   entry_SYSCALL_64_fastpath+0x1c/0x89

-> #2 (&sb->s_type->i_mutex_key#3){}:
   start_creating+0x59/0x110
   __debugfs_create_file+0x2e/0xe0
   relay_create_buf_file+0x62/0x80
   relay_late_setup_files+0x84/0x250
   guc_log_late_setup+0x4f/0x110 [i915]
   i915_guc_log_register+0x32/0x40 [i915]
   i915_driver_load+0x7b6/0x1720 [i915]
   i915_pci_probe+0x2e/0x90 [i915]
   pci_device_probe+0x9c/0x120
   driver_probe_device+0x2a3/0x480
   __driver_attach+0xd9/0xe0
   bus_for_each_dev+0x57/0x90
   bus_add_driver+0x168/0x260
   driver_register+0x52/0xc0
   do_one_initcall+0x39/0x150
   do_init_module+0x56/0x1ef
   load_module+0x231c/0x2d70
   SyS_finit_module+0xa5/0xe0
   entry_SYSCALL_64_fastpath+0x1c/0x89

-> #1 (relay_channels_mutex){+.+.}:
   relay_open+0x12c/0x2b0
   intel_guc_log_runtime_create+0xab/0x230 [i915]
   intel_guc_init+0x81/0x120 [i915]
   intel_uc_init+0x29/0xa0 [i915]
   i915_gem_init+0x182/0x530 [i915]
   i915_driver_load+0xaa9/0x1720 [i915]
   i915_pci_probe+0x2e/0x90 [i915]
   pci_device_probe+0x9c/0x120
   driver_probe_device+0x2a3/0x480
   __driver_attach+0xd9/0xe0
   bus_for_each_dev+0x57/0x90
   bus_add_driver+0x168/0x260
   driver_register+0x52/0xc0
   do_one_initcall+0x39/0x150
   do_init_module+0x56/0x1ef
   load_module+0x231c/0x2d70
   SyS_finit_module+0xa5/0xe0
   entry_SYSCALL_64_fastpath+0x1c/0x89

-> #0 (&dev->struct_mutex){+.+.}:
   __mutex_lock+0x81/0x9b0
   i915_mutex_lock_interruptible+0x47/0x130 [i915]
   i915_gem_fault+0x201/0x790 [i915]
   __do_fault+0x15/0x70
   __handle_mm_fault+0x677/0xdc0
   handle_mm_fault+0x14f/0x2f0
   __do_page_fault+0x2d1/0x560
   page_fault+0x4c/0x60

other info that might help us debug this:

Chain exists of:
  &dev->struct_mutex --> &sb->s_type->i_mutex_key#3 --> &mm->mmap_sem

 Possible unsafe locking scenario:

   CPU0CPU1
   
   lock(&mm->mmap_sem);
    lock(&sb->s_type->i_mutex_key#3);
    lock(&mm->mmap_sem);
   lock(&dev->struct_mutex);

 *** DEADLOCK ***

1 lock held by debugfs_test/1388:
 #0:  (&mm->mmap_sem){}, at: [<29a9c131>] 
__do_page_fault+0x106/0x560

stack backtrace:
CPU: 2 PID: 1388 Comm: debugfs_test Not tainted 4.15.0-rc6-CI-Patchwork_7614+ #1
Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./J4205-ITX, BIOS 
P1.10 09/29/2016
Call Trace:
 dump_stack+0x5f/0x86
 print_circular_bug.isra.18+0x1d0/0x2c0
 __lock_acquire+0x14ae/0x1b60
 ? lock_acquire+0xaf/0x200
 lock_acquire+0xaf/0x200
 ? i915_mutex_lock_interruptible+0x47/0x130 [i915]
 __mutex_lock+0x81/0x9b0
 ? i915_mutex_lock_interruptible+0x47/0x130 [i915]
 ? i915_mutex_lock_interruptible+0x47/0x130 [i915]
 ? i915_mutex_lock_interruptible+0x47/0x130 [i915]
 i915_mutex_lock_interruptible+0x47/0x130 [i915]
 ? __pm_runtime_resume+0x4f/0x80
 i915_gem_fault+0x201/0x790 [i915]
 __do_fault+0x15/0x70
 ? _raw_spin_unlock+0x29/0x40
 __handle_mm_fault+0x677/0xdc0
 handle_mm_fault+0x14f/0x2f0
 __do_page_fault+0x2d1/0x560
 ? page_fault+0x36/0x60
 page_fault+0x4c/0x60

v2: Added lock protection to guc->log.runtime.relay_chan (Chris)
Fixed locking inside guc_flush_logs uncovered by new lockdep.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104693
Testcase: igt/debugfs_test/read_all_entries # with enable_guc=1 and 
guc_log_level=1
Signed-off-by: Sagar Arun Kamble 
Cc: Michal Wajdeczko 
Cc: Daniele Ceraolo Spurio 
Cc: Tvrtko Ursulin 
Cc: Chris Wilson 
Cc: Joonas Lahtinen