Yeah, I was also looking into this problem.

Our amd_sched_main() routine needs to access the entity after amd_sched_select_entity() returns it, but by that point the entity may already have been released by the context-free path.

So the subsequent entity accesses may operate on a dangling pointer.
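For reference, here is roughly the window I mean in amd_sched_main() (paraphrased from gpu_scheduler.c; the exact code may differ):

	entity = amd_sched_select_entity(sched);
	if (!entity)
		continue;
	/* nothing pins the entity across this point: if the owning
	 * context is freed here, the access below is a use-after-free */
	sched_job = amd_sched_entity_pop_job(entity);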

I have a patch to fix that case, but I don't know how to deal with the same dangling-pointer issue in amd_sched_job_recovery(), which my GPU reset feature relies on.
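To make the recovery-side concern concrete: assuming amd_sched_job_recovery() walks ring_mirror_list and queries each job's priority (the loop body below is only a sketch, not the actual code), the pre-patch amd_sched_get_job_priority() dereferences s_entity and hits the same problem:

	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
		/* before this patch, amd_sched_get_job_priority() reads
		 * s_job->s_entity->rq; s_entity may already be freed if
		 * the owning process exited during the reset */
		enum amd_sched_priority p = amd_sched_get_job_priority(s_job);
		...
	}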

BR Monk


-----Original Message-----
From: amd-gfx [mailto:[email protected]] On Behalf Of Chunming Zhou
Sent: October 24, 2017 17:56
To: [email protected]
Cc: Zhou, David(ChunMing) <[email protected]>
Subject: [PATCH] drm/amd/scheduler: fix one use-after-free case for job->s_entity

The process that owns the s_entity may already have been closed by the time
amdgpu_job_free_cb is called, leaving s_entity a dangling pointer. Fix this by
computing the job priority once in amd_sched_job_init(), while s_entity is
still valid, and caching it in the job: sched->sched_rq is an array indexed by
priority, so the pointer difference (s_entity->rq - sched->sched_rq) yields
the priority. See the calltrace below:

[  355.616964] ==================================================================
[  355.617191] BUG: KASAN: use-after-free in amdgpu_job_free_cb+0x2f/0xc0 [amdgpu]
[  355.617197] Read of size 8 at addr ffff88039d593c40 by task kworker/9:1/100

[  355.617206] CPU: 9 PID: 100 Comm: kworker/9:1 Not tainted 4.13.0-custom #1
[  355.617208] Hardware name: Gigabyte Technology Co., Ltd. Default string/X99P-SLI-CF, BIOS F23 07/22/2016
[  355.617342] Workqueue: events amd_sched_job_finish [amdgpu]
[  355.617344] Call Trace:
[  355.617351]  dump_stack+0x63/0x8d
[  355.617356]  print_address_description+0x70/0x290
[  355.617474]  ? amdgpu_job_free_cb+0x2f/0xc0 [amdgpu]
[  355.617477]  kasan_report+0x265/0x350
[  355.617479]  __asan_load8+0x54/0x90
[  355.617603]  amdgpu_job_free_cb+0x2f/0xc0 [amdgpu]
[  355.617721]  amd_sched_job_finish+0x161/0x180 [amdgpu]
[  355.617725]  process_one_work+0x2ab/0x700
[  355.617727]  worker_thread+0x90/0x720
[  355.617731]  kthread+0x18c/0x1e0
[  355.617732]  ? process_one_work+0x700/0x700
[  355.617735]  ? kthread_create_on_node+0xb0/0xb0
[  355.617738]  ret_from_fork+0x25/0x30

[  355.617742] Allocated by task 1347:
[  355.617747]  save_stack_trace+0x1b/0x20
[  355.617749]  save_stack+0x46/0xd0
[  355.617751]  kasan_kmalloc+0xad/0xe0
[  355.617753]  kmem_cache_alloc_trace+0xef/0x200
[  355.617853]  amdgpu_driver_open_kms+0x98/0x290 [amdgpu]
[  355.617883]  drm_open+0x38c/0x6e0 [drm]
[  355.617908]  drm_stub_open+0x144/0x1b0 [drm]
[  355.617911]  chrdev_open+0x180/0x320
[  355.617913]  do_dentry_open+0x3a2/0x570
[  355.617915]  vfs_open+0x86/0xe0
[  355.617918]  path_openat+0x49e/0x1db0
[  355.617919]  do_filp_open+0x11c/0x1a0
[  355.617921]  do_sys_open+0x16f/0x2a0
[  355.617923]  SyS_open+0x1e/0x20
[  355.617926]  do_syscall_64+0xea/0x210
[  355.617928]  return_from_SYSCALL_64+0x0/0x6a

[  355.617931] Freed by task 1347:
[  355.617934]  save_stack_trace+0x1b/0x20
[  355.617936]  save_stack+0x46/0xd0
[  355.617937]  kasan_slab_free+0x70/0xc0
[  355.617939]  kfree+0x9d/0x1c0
[  355.618038]  amdgpu_driver_postclose_kms+0x1bc/0x3e0 [amdgpu]
[  355.618063]  drm_release+0x454/0x610 [drm]
[  355.618065]  __fput+0x177/0x350
[  355.618066]  ____fput+0xe/0x10
[  355.618068]  task_work_run+0xa0/0xc0
[  355.618070]  do_exit+0x456/0x1320
[  355.618072]  do_group_exit+0x86/0x130
[  355.618074]  SyS_exit_group+0x1d/0x20
[  355.618076]  do_syscall_64+0xea/0x210
[  355.618078]  return_from_SYSCALL_64+0x0/0x6a

[  355.618081] The buggy address belongs to the object at ffff88039d593b80
                which belongs to the cache kmalloc-2048 of size 2048
[  355.618085] The buggy address is located 192 bytes inside of
                2048-byte region [ffff88039d593b80, ffff88039d594380)
[  355.618087] The buggy address belongs to the page:
[  355.618091] page:ffffea000e756400 count:1 mapcount:0 mapping:          (null) index:0x0 compound_mapcount: 0
[  355.618095] flags: 0x2ffff0000008100(slab|head)
[  355.618099] raw: 02ffff0000008100 0000000000000000 0000000000000000 00000001000f000f
[  355.618103] raw: ffffea000edb0600 0000000200000002 ffff8803bfc0ea00 0000000000000000
[  355.618105] page dumped because: kasan: bad access detected

[  355.618108] Memory state around the buggy address:
[  355.618110]  ffff88039d593b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  355.618113]  ffff88039d593b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618116] >ffff88039d593c00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618117]                                            ^
[  355.618120]  ffff88039d593c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618122]  ffff88039d593d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  355.618124] ==================================================================
[  355.618126] Disabling lock debugging due to kernel taint

Change-Id: I8ff7122796b8cd16fc26e9c40e8d4c8153d67e0c
Signed-off-by: Chunming Zhou <[email protected]>
---
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c |  1 +
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.h | 27 ++++++++++++++-------------
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 007fdbd..8101ed7 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -535,6 +535,7 @@ int amd_sched_job_init(struct amd_sched_job *job,
        if (!job->s_fence)
                return -ENOMEM;
        job->id = atomic64_inc_return(&sched->job_id_count);
+       job->priority = job->s_entity->rq - job->sched->sched_rq;
 
        INIT_WORK(&job->finish_work, amd_sched_job_finish);
        INIT_LIST_HEAD(&job->node);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index e21299c..8808eb1 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -77,6 +77,18 @@ struct amd_sched_fence {
        void                            *owner;
 };
 
+enum amd_sched_priority {
+       AMD_SCHED_PRIORITY_MIN,
+       AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
+       AMD_SCHED_PRIORITY_NORMAL,
+       AMD_SCHED_PRIORITY_HIGH_SW,
+       AMD_SCHED_PRIORITY_HIGH_HW,
+       AMD_SCHED_PRIORITY_KERNEL,
+       AMD_SCHED_PRIORITY_MAX,
+       AMD_SCHED_PRIORITY_INVALID = -1,
+       AMD_SCHED_PRIORITY_UNSET = -2
+};
+
 struct amd_sched_job {
        struct amd_gpu_scheduler        *sched;
        struct amd_sched_entity         *s_entity;
@@ -87,6 +99,7 @@ struct amd_sched_job {
        struct delayed_work             work_tdr;
        uint64_t                        id;
        atomic_t karma;
+       enum amd_sched_priority         priority;
 };
 
 extern const struct dma_fence_ops amd_sched_fence_ops_scheduled;
@@ -118,18 +131,6 @@ struct amd_sched_backend_ops {
        void (*free_job)(struct amd_sched_job *sched_job);
 };
 
-enum amd_sched_priority {
-       AMD_SCHED_PRIORITY_MIN,
-       AMD_SCHED_PRIORITY_LOW = AMD_SCHED_PRIORITY_MIN,
-       AMD_SCHED_PRIORITY_NORMAL,
-       AMD_SCHED_PRIORITY_HIGH_SW,
-       AMD_SCHED_PRIORITY_HIGH_HW,
-       AMD_SCHED_PRIORITY_KERNEL,
-       AMD_SCHED_PRIORITY_MAX,
-       AMD_SCHED_PRIORITY_INVALID = -1,
-       AMD_SCHED_PRIORITY_UNSET = -2
-};
-
 /**
  * One scheduler is implemented for each hardware ring
  */
@@ -183,7 +184,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job);
 static inline enum amd_sched_priority
 amd_sched_get_job_priority(struct amd_sched_job *job)
 {
-       return (job->s_entity->rq - job->sched->sched_rq);
+       return job->priority;
 }
 
 #endif
--
2.7.4

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx