Re: [Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex
Quoting Michal Wajdeczko (2018-01-23 16:44:33) > On Tue, 23 Jan 2018 16:42:17 +0100, Sagar Arun Kamble > > @@ -605,7 +681,11 @@ int i915_guc_log_control(struct drm_i915_private > > *dev_priv, u64 control_val) > > } > > /* GuC logging is currently the only user of Guc2Host > > interrupts */ > > + mutex_lock(_priv->drm.struct_mutex); > > + intel_runtime_pm_get(dev_priv); > > gen9_enable_guc_interrupts(dev_priv); > > + intel_runtime_pm_put(dev_priv); > > + mutex_unlock(_priv->drm.struct_mutex); > > Maybe pm_get/lock/xxx/unlock/pm_put would be better order ? There's no strong reason to prefer one nesting or the other. So pick a pattern for guc_interrupts and stick to it :) -Chris ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Re: [Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex
On Tue, 23 Jan 2018 16:42:17 +0100, Sagar Arun Kamblewrote: This patch fixes lockdep issue due to circular locking dependency of struct_mutex, i_mutex_key, mmap_sem, relay_channels_mutex. For GuC log relay channel we create debugfs file that requires i_mutex_key lock and we are doing that under struct_mutex. So we introduced newer dependency as: >struct_mutex --> >s_type->i_mutex_key#3 --> >mmap_sem However, there is dependency from mmap_sem to struct_mutex. Hence we separate the relay create/destroy operation from under struct_mutex. ... diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 80dc679..b45be0d 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2467,7 +2467,6 @@ static int i915_guc_log_control_get(void *data, u64 *val) static int i915_guc_log_control_set(void *data, u64 val) { struct drm_i915_private *dev_priv = data; - int ret; if (!HAS_GUC(dev_priv)) return -ENODEV; @@ -2475,16 +2474,7 @@ static int i915_guc_log_control_set(void *data, u64 val) if (!dev_priv->guc.log.vma) return -EINVAL; - ret = mutex_lock_interruptible(_priv->drm.struct_mutex); - if (ret) - return ret; - - intel_runtime_pm_get(dev_priv); - ret = i915_guc_log_control(dev_priv, val); - intel_runtime_pm_put(dev_priv); - - mutex_unlock(_priv->drm.struct_mutex); - return ret; + return i915_guc_log_control(dev_priv, val); I hope that one day we change signature of this function to int intel_guc_log_control(struct intel_guc *guc, u64 control); ... diff --git a/drivers/gpu/drm/i915/intel_guc.c b/drivers/gpu/drm/i915/intel_guc.c index ea30e7c..cab3c98 100644 --- a/drivers/gpu/drm/i915/intel_guc.c +++ b/drivers/gpu/drm/i915/intel_guc.c @@ -66,6 +66,7 @@ void intel_guc_init_early(struct intel_guc *guc) intel_guc_ct_init_early(>ct); mutex_init(>send_mutex); + mutex_init(>log.runtime.relay_lock); Maybe this can be done in intel_guc_loc.c (or .h) as void intel_guc_log_init_early() { } guc->send = intel_guc_send_nop; guc->notify = gen8_guc_raise_irq; } @@ -87,8 +88,10 @@ int intel_guc_init_wq(struct intel_guc *guc) */ guc->log.runtime.flush_wq = alloc_ordered_workqueue("i915-guc_log", WQ_HIGHPRI | WQ_FREEZABLE); - if (!guc->log.runtime.flush_wq) + if (!guc->log.runtime.flush_wq) { + DRM_ERROR("Couldn't allocate workqueue for GuC log\n"); return -ENOMEM; + } /* * Even though both sending GuC action, and adding a new workitem to @@ -109,6 +112,8 @@ int intel_guc_init_wq(struct intel_guc *guc) WQ_HIGHPRI); if (!guc->preempt_wq) { destroy_workqueue(guc->log.runtime.flush_wq); + DRM_ERROR("Couldn't allocate workqueue for GuC " + "preemption\n"); return -ENOMEM; } } ... static void capture_logs_work(struct work_struct *work) @@ -363,12 +377,12 @@ static int guc_log_runtime_create(struct intel_guc *guc) { struct drm_i915_private *dev_priv = guc_to_i915(guc); void *vaddr; - struct rchan *guc_log_relay_chan; - size_t n_subbufs, subbuf_size; int ret; lockdep_assert_held(_priv->drm.struct_mutex); + GEM_BUG_ON(!guc_log_has_relay(guc)); + Do we need this line? GEM_BUG_ON(guc_log_has_runtime(guc)); ret = i915_gem_object_set_to_wc_domain(guc->log.vma->obj, true); @@ -387,8 +401,40 @@ static int guc_log_runtime_create(struct intel_guc *guc) guc->log.runtime.buf_addr = vaddr; + INIT_WORK(>log.runtime.flush_work, capture_logs_work); I'm not sure about other BKMs, but I prefer to not delay such initialization and perform them in functions like init_early + + return 0; +} + +static void guc_log_runtime_destroy(struct intel_guc *guc) +{ + /* +* It's possible that the runtime stuff was never allocated because +* GuC log was disabled at the boot time. +**/ Is this correct comment style ? + if (!guc_log_has_runtime(guc)) + return; + + i915_gem_object_unpin_map(guc->log.vma->obj); + guc->log.runtime.buf_addr = NULL; +} + ... @@ -605,7 +681,11 @@ int i915_guc_log_control(struct drm_i915_private *dev_priv, u64 control_val) } /* GuC logging is currently the only user of Guc2Host interrupts */ + mutex_lock(_priv->drm.struct_mutex); + intel_runtime_pm_get(dev_priv); gen9_enable_guc_interrupts(dev_priv); + intel_runtime_pm_put(dev_priv); +
Re: [Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex
Quoting Sagar Arun Kamble (2018-01-23 15:42:17) > static void *guc_get_write_buffer(struct intel_guc *guc) > { > - if (!guc->log.runtime.relay_chan) > + if (!guc_log_has_relay(guc)) > return NULL; > > /* Just get the base address of a new sub buffer and copy data into it > @@ -266,7 +276,9 @@ static void guc_read_update_log_buffer(struct intel_guc > *guc) > log_buf_state = src_data = guc->log.runtime.buf_addr; > > /* Get the pointer to local buffer to store the logs */ > + mutex_lock(>log.runtime.relay_lock); > log_buf_snapshot_state = dst_data = guc_get_write_buffer(guc); > + mutex_unlock(>log.runtime.relay_lock); Since dst_data/log_buf_snapshot_state are pointing into the relay_chan they are only valid underneath the relay_lock (we don't have any references or whatnot). > /* Actual logs are present from the 2nd page */ > src_data += PAGE_SIZE; > @@ -335,6 +347,7 @@ static void guc_read_update_log_buffer(struct intel_guc > *guc) > dst_data += buffer_size; > } > > + mutex_lock(>log.runtime.relay_lock); > if (log_buf_snapshot_state) > guc_move_to_next_buf(guc); > else { > @@ -344,6 +357,7 @@ static void guc_read_update_log_buffer(struct intel_guc > *guc) > DRM_ERROR_RATELIMITED("no sub-buffer to capture logs\n"); > guc->log.capture_miss_count++; > } > + mutex_unlock(>log.runtime.relay_lock); I.e. it's no good just reacquiring the lock as the state may have perished in between. -Chris ___ Intel-gfx mailing list Intel-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/intel-gfx
[Intel-gfx] [PATCH v2 1/4] drm/i915/guc: Fix lockdep due to log relay channel handling under struct_mutex
This patch fixes lockdep issue due to circular locking dependency of struct_mutex, i_mutex_key, mmap_sem, relay_channels_mutex. For GuC log relay channel we create debugfs file that requires i_mutex_key lock and we are doing that under struct_mutex. So we introduced newer dependency as: >struct_mutex --> >s_type->i_mutex_key#3 --> >mmap_sem However, there is dependency from mmap_sem to struct_mutex. Hence we separate the relay create/destroy operation from under struct_mutex. == WARNING: possible circular locking dependency detected 4.15.0-rc6-CI-Patchwork_7614+ #1 Not tainted -- debugfs_test/1388 is trying to acquire lock: (>struct_mutex){+.+.}, at: [] i915_mutex_lock_interruptible+0x47/0x130 [i915] but task is already holding lock: (>mmap_sem){}, at: [<29a9c131>] __do_page_fault+0x106/0x560 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (>mmap_sem){}: _copy_to_user+0x1e/0x70 filldir+0x8c/0xf0 dcache_readdir+0xeb/0x160 iterate_dir+0xdc/0x140 SyS_getdents+0xa0/0x130 entry_SYSCALL_64_fastpath+0x1c/0x89 -> #2 (>s_type->i_mutex_key#3){}: start_creating+0x59/0x110 __debugfs_create_file+0x2e/0xe0 relay_create_buf_file+0x62/0x80 relay_late_setup_files+0x84/0x250 guc_log_late_setup+0x4f/0x110 [i915] i915_guc_log_register+0x32/0x40 [i915] i915_driver_load+0x7b6/0x1720 [i915] i915_pci_probe+0x2e/0x90 [i915] pci_device_probe+0x9c/0x120 driver_probe_device+0x2a3/0x480 __driver_attach+0xd9/0xe0 bus_for_each_dev+0x57/0x90 bus_add_driver+0x168/0x260 driver_register+0x52/0xc0 do_one_initcall+0x39/0x150 do_init_module+0x56/0x1ef load_module+0x231c/0x2d70 SyS_finit_module+0xa5/0xe0 entry_SYSCALL_64_fastpath+0x1c/0x89 -> #1 (relay_channels_mutex){+.+.}: relay_open+0x12c/0x2b0 intel_guc_log_runtime_create+0xab/0x230 [i915] intel_guc_init+0x81/0x120 [i915] intel_uc_init+0x29/0xa0 [i915] i915_gem_init+0x182/0x530 [i915] i915_driver_load+0xaa9/0x1720 [i915] i915_pci_probe+0x2e/0x90 [i915] pci_device_probe+0x9c/0x120 driver_probe_device+0x2a3/0x480 __driver_attach+0xd9/0xe0 bus_for_each_dev+0x57/0x90 bus_add_driver+0x168/0x260 driver_register+0x52/0xc0 do_one_initcall+0x39/0x150 do_init_module+0x56/0x1ef load_module+0x231c/0x2d70 SyS_finit_module+0xa5/0xe0 entry_SYSCALL_64_fastpath+0x1c/0x89 -> #0 (>struct_mutex){+.+.}: __mutex_lock+0x81/0x9b0 i915_mutex_lock_interruptible+0x47/0x130 [i915] i915_gem_fault+0x201/0x790 [i915] __do_fault+0x15/0x70 __handle_mm_fault+0x677/0xdc0 handle_mm_fault+0x14f/0x2f0 __do_page_fault+0x2d1/0x560 page_fault+0x4c/0x60 other info that might help us debug this: Chain exists of: >struct_mutex --> >s_type->i_mutex_key#3 --> >mmap_sem Possible unsafe locking scenario: CPU0CPU1 lock(>mmap_sem); lock(>s_type->i_mutex_key#3); lock(>mmap_sem); lock(>struct_mutex); *** DEADLOCK *** 1 lock held by debugfs_test/1388: #0: (>mmap_sem){}, at: [<29a9c131>] __do_page_fault+0x106/0x560 stack backtrace: CPU: 2 PID: 1388 Comm: debugfs_test Not tainted 4.15.0-rc6-CI-Patchwork_7614+ #1 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./J4205-ITX, BIOS P1.10 09/29/2016 Call Trace: dump_stack+0x5f/0x86 print_circular_bug.isra.18+0x1d0/0x2c0 __lock_acquire+0x14ae/0x1b60 ? lock_acquire+0xaf/0x200 lock_acquire+0xaf/0x200 ? i915_mutex_lock_interruptible+0x47/0x130 [i915] __mutex_lock+0x81/0x9b0 ? i915_mutex_lock_interruptible+0x47/0x130 [i915] ? i915_mutex_lock_interruptible+0x47/0x130 [i915] ? i915_mutex_lock_interruptible+0x47/0x130 [i915] i915_mutex_lock_interruptible+0x47/0x130 [i915] ? __pm_runtime_resume+0x4f/0x80 i915_gem_fault+0x201/0x790 [i915] __do_fault+0x15/0x70 ? _raw_spin_unlock+0x29/0x40 __handle_mm_fault+0x677/0xdc0 handle_mm_fault+0x14f/0x2f0 __do_page_fault+0x2d1/0x560 ? page_fault+0x36/0x60 page_fault+0x4c/0x60 v2: Added lock protection to guc->log.runtime.relay_chan (Chris) Fixed locking inside guc_flush_logs uncovered by new lockdep. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104693 Testcase: igt/debugfs_test/read_all_entries # with enable_guc=1 and guc_log_level=1 Signed-off-by: Sagar Arun Kamble Cc: Michal Wajdeczko Cc: Daniele Ceraolo Spurio Cc: Tvrtko Ursulin Cc: Chris Wilson Cc: Joonas Lahtinen