Re: [PATCH] drm/sched: fix null-ptr-deref in init entity

2024-03-15 Thread Christian König

Am 15.03.24 um 15:12 schrieb Alex Deucher:

On Fri, Mar 15, 2024 at 10:12 AM Christian König
 wrote:

Am 15.03.24 um 03:39 schrieb [email protected]:

From: Vitaly Prosyak 

The bug can be triggered by sending an amdgpu_cs_wait_ioctl
to the AMDGPU DRM driver on any ASIC with a valid context.
The bug was reported by Joonkyo Jung.
For example the following code:

  static void Syzkaller2(int fd)
  {
   union drm_amdgpu_ctx arg1;
   union drm_amdgpu_wait_cs arg2;

   arg1.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
   ret = drmIoctl(fd, 0x140106442 /* amdgpu_ctx_ioctl */, &arg1);

   arg2.in.handle = 0x0;
   arg2.in.timeout = 0x2;
   arg2.in.ip_type = AMD_IP_VPE /* 0x9 */;
   arg2.in.ip_instance = 0x0;
   arg2.in.ring = 0x0;
   arg2.in.ctx_id = arg1.out.alloc.ctx_id;

   drmIoctl(fd, 0xc0206449 /* AMDGPU_WAIT_CS */, &arg2);
  }

One might assume that calling the AMDGPU_WAIT_CS ioctl without a previously
submitted job should return an error, but the following commit
1decbf6bb0b4dc56c9da6c5e57b994ebfc2be3aa
modified the logic and allowed sched_rq to be NULL.

As a result, when there is no job, the AMDGPU_WAIT_CS ioctl returns success.
This change fixes the null-ptr-deref in init entity; the stack trace below
demonstrates the error condition:

[  +0.07] BUG: kernel NULL pointer dereference, address: 0028
[  +0.007086] #PF: supervisor read access in kernel mode
[  +0.005234] #PF: error_code(0x) - not-present page
[  +0.005232] PGD 0 P4D 0
[  +0.002501] Oops:  [#1] PREEMPT SMP KASAN NOPTI
[  +0.005034] CPU: 10 PID: 9229 Comm: amd_basic Tainted: GB   WL 
6.7.0+ #4
[  +0.007797] Hardware name: ASUS System Product Name/ROG STRIX B550-F GAMING 
(WI-FI), BIOS 1401 12/03/2020
[  +0.009798] RIP: 0010:drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
[  +0.006426] Code: 80 00 00 00 00 00 00 00 e8 1a 81 82 e0 49 89 9c 24 c0 00 00 00 4c 
89 ef e8 4a 80 82 e0 49 8b 5d 00 48 8d 7b 28 e8 3d 80 82 e0 <48> 83 7b 28 00 0f 
84 28 01 00 00 4d 8d ac 24 98 00 00 00 49 8d 5c
[  +0.019094] RSP: 0018:c90014c1fa40 EFLAGS: 00010282
[  +0.005237] RAX: 0001 RBX:  RCX: 8113f3fa
[  +0.007326] RDX: fbfff0a7889d RSI: 0008 RDI: 853c44e0
[  +0.007264] RBP: c90014c1fa80 R08: 0001 R09: fbfff0a7889c
[  +0.007266] R10: 853c44e7 R11: 0001 R12: 8881a719b010
[  +0.007263] R13: 88810d412748 R14: 0002 R15: 
[  +0.007264] FS:  77045540() GS:8883cc90() 
knlGS:
[  +0.008236] CS:  0010 DS:  ES:  CR0: 80050033
[  +0.005851] CR2: 0028 CR3: 00011912e000 CR4: 00350ef0
[  +0.007175] Call Trace:
[  +0.002561]  
[  +0.002141]  ? show_regs+0x6a/0x80
[  +0.003473]  ? __die+0x25/0x70
[  +0.003124]  ? page_fault_oops+0x214/0x720
[  +0.004179]  ? preempt_count_sub+0x18/0xc0
[  +0.004093]  ? __pfx_page_fault_oops+0x10/0x10
[  +0.004590]  ? srso_return_thunk+0x5/0x5f
[  +0.004000]  ? vprintk_default+0x1d/0x30
[  +0.004063]  ? srso_return_thunk+0x5/0x5f
[  +0.004087]  ? vprintk+0x5c/0x90
[  +0.003296]  ? drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
[  +0.005807]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? _printk+0xb3/0xe0
[  +0.003293]  ? __pfx__printk+0x10/0x10
[  +0.003735]  ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
[  +0.005482]  ? do_user_addr_fault+0x345/0x770
[  +0.004361]  ? exc_page_fault+0x64/0xf0
[  +0.003972]  ? asm_exc_page_fault+0x27/0x30
[  +0.004271]  ? add_taint+0x2a/0xa0
[  +0.003476]  ? drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
[  +0.005812]  amdgpu_ctx_get_entity+0x3f9/0x770 [amdgpu]
[  +0.009530]  ? finish_task_switch.isra.0+0x129/0x470
[  +0.005068]  ? __pfx_amdgpu_ctx_get_entity+0x10/0x10 [amdgpu]
[  +0.010063]  ? __kasan_check_write+0x14/0x20
[  +0.004356]  ? srso_return_thunk+0x5/0x5f
[  +0.004001]  ? mutex_unlock+0x81/0xd0
[  +0.003802]  ? srso_return_thunk+0x5/0x5f
[  +0.004096]  amdgpu_cs_wait_ioctl+0xf6/0x270 [amdgpu]
[  +0.009355]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
[  +0.009981]  ? srso_return_thunk+0x5/0x5f
[  +0.004089]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? __srcu_read_lock+0x20/0x50
[  +0.004096]  drm_ioctl_kernel+0x140/0x1f0 [drm]
[  +0.005080]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
[  +0.009974]  ? __pfx_drm_ioctl_kernel+0x10/0x10 [drm]
[  +0.005618]  ? srso_return_thunk+0x5/0x5f
[  +0.004088]  ? __kasan_check_write+0x14/0x20
[  +0.004357]  drm_ioctl+0x3da/0x730 [drm]
[  +0.004461]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
[  +0.009979]  ? __pfx_drm_ioctl+0x10/0x10 [drm]
[  +0.004993]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? __kasan_check_write+0x14/0x20
[  +0.004356]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? _raw_spin_lock_irqsave+0x99/0x100
[  +0.004712]  ? __pfx__raw_spin_lock_irqsave+0x10/0x10
[  +0.005063]  ? __pfx_arch_do_signal_or_restart+0x10/0x10
[  +0.005477]  

Re: [PATCH] drm/sched: fix null-ptr-deref in init entity

2024-03-15 Thread Alex Deucher
On Fri, Mar 15, 2024 at 10:12 AM Christian König
 wrote:
>
> Am 15.03.24 um 03:39 schrieb [email protected]:
> > From: Vitaly Prosyak 
> >
> > The bug can be triggered by sending an amdgpu_cs_wait_ioctl
> > to the AMDGPU DRM driver on any ASIC with a valid context.
> > The bug was reported by Joonkyo Jung.
> > For example the following code:
> >
> >  static void Syzkaller2(int fd)
> >  {
> >   union drm_amdgpu_ctx arg1;
> >   union drm_amdgpu_wait_cs arg2;
> >
> >   arg1.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
> >   ret = drmIoctl(fd, 0x140106442 /* amdgpu_ctx_ioctl */, &arg1);
> >
> >   arg2.in.handle = 0x0;
> >   arg2.in.timeout = 0x2;
> >   arg2.in.ip_type = AMD_IP_VPE /* 0x9 */;
> >   arg2.in.ip_instance = 0x0;
> >   arg2.in.ring = 0x0;
> >   arg2.in.ctx_id = arg1.out.alloc.ctx_id;
> >
> >   drmIoctl(fd, 0xc0206449 /* AMDGPU_WAIT_CS */, &arg2);
> >  }
> >
> > One might assume that calling the AMDGPU_WAIT_CS ioctl without a
> > previously submitted job should return an error, but the following commit
> > 1decbf6bb0b4dc56c9da6c5e57b994ebfc2be3aa
> > modified the logic and allowed sched_rq to be NULL.
> >
> > As a result, when there is no job, the AMDGPU_WAIT_CS ioctl returns success.
> > This change fixes the null-ptr-deref in init entity; the stack trace below
> > demonstrates the error condition:
> >
> > [  +0.07] BUG: kernel NULL pointer dereference, address: 
> > 0028
> > [  +0.007086] #PF: supervisor read access in kernel mode
> > [  +0.005234] #PF: error_code(0x) - not-present page
> > [  +0.005232] PGD 0 P4D 0
> > [  +0.002501] Oops:  [#1] PREEMPT SMP KASAN NOPTI
> > [  +0.005034] CPU: 10 PID: 9229 Comm: amd_basic Tainted: GB   WL
> >  6.7.0+ #4
> > [  +0.007797] Hardware name: ASUS System Product Name/ROG STRIX B550-F 
> > GAMING (WI-FI), BIOS 1401 12/03/2020
> > [  +0.009798] RIP: 0010:drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
> > [  +0.006426] Code: 80 00 00 00 00 00 00 00 e8 1a 81 82 e0 49 89 9c 24 c0 
> > 00 00 00 4c 89 ef e8 4a 80 82 e0 49 8b 5d 00 48 8d 7b 28 e8 3d 80 82 e0 
> > <48> 83 7b 28 00 0f 84 28 01 00 00 4d 8d ac 24 98 00 00 00 49 8d 5c
> > [  +0.019094] RSP: 0018:c90014c1fa40 EFLAGS: 00010282
> > [  +0.005237] RAX: 0001 RBX:  RCX: 
> > 8113f3fa
> > [  +0.007326] RDX: fbfff0a7889d RSI: 0008 RDI: 
> > 853c44e0
> > [  +0.007264] RBP: c90014c1fa80 R08: 0001 R09: 
> > fbfff0a7889c
> > [  +0.007266] R10: 853c44e7 R11: 0001 R12: 
> > 8881a719b010
> > [  +0.007263] R13: 88810d412748 R14: 0002 R15: 
> > 
> > [  +0.007264] FS:  77045540() GS:8883cc90() 
> > knlGS:
> > [  +0.008236] CS:  0010 DS:  ES:  CR0: 80050033
> > [  +0.005851] CR2: 0028 CR3: 00011912e000 CR4: 
> > 00350ef0
> > [  +0.007175] Call Trace:
> > [  +0.002561]  
> > [  +0.002141]  ? show_regs+0x6a/0x80
> > [  +0.003473]  ? __die+0x25/0x70
> > [  +0.003124]  ? page_fault_oops+0x214/0x720
> > [  +0.004179]  ? preempt_count_sub+0x18/0xc0
> > [  +0.004093]  ? __pfx_page_fault_oops+0x10/0x10
> > [  +0.004590]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004000]  ? vprintk_default+0x1d/0x30
> > [  +0.004063]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004087]  ? vprintk+0x5c/0x90
> > [  +0.003296]  ? drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
> > [  +0.005807]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004090]  ? _printk+0xb3/0xe0
> > [  +0.003293]  ? __pfx__printk+0x10/0x10
> > [  +0.003735]  ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
> > [  +0.005482]  ? do_user_addr_fault+0x345/0x770
> > [  +0.004361]  ? exc_page_fault+0x64/0xf0
> > [  +0.003972]  ? asm_exc_page_fault+0x27/0x30
> > [  +0.004271]  ? add_taint+0x2a/0xa0
> > [  +0.003476]  ? drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
> > [  +0.005812]  amdgpu_ctx_get_entity+0x3f9/0x770 [amdgpu]
> > [  +0.009530]  ? finish_task_switch.isra.0+0x129/0x470
> > [  +0.005068]  ? __pfx_amdgpu_ctx_get_entity+0x10/0x10 [amdgpu]
> > [  +0.010063]  ? __kasan_check_write+0x14/0x20
> > [  +0.004356]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004001]  ? mutex_unlock+0x81/0xd0
> > [  +0.003802]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004096]  amdgpu_cs_wait_ioctl+0xf6/0x270 [amdgpu]
> > [  +0.009355]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
> > [  +0.009981]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004089]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004090]  ? __srcu_read_lock+0x20/0x50
> > [  +0.004096]  drm_ioctl_kernel+0x140/0x1f0 [drm]
> > [  +0.005080]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
> > [  +0.009974]  ? __pfx_drm_ioctl_kernel+0x10/0x10 [drm]
> > [  +0.005618]  ? srso_return_thunk+0x5/0x5f
> > [  +0.004088]  ? __kasan_check_write+0x14/0x20
> > [  +0.004357]  drm_ioctl+0x3da/0x730 [drm]
> > [  +0.004461]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgp

Re: [PATCH] drm/sched: fix null-ptr-deref in init entity

2024-03-15 Thread Christian König

Am 15.03.24 um 03:39 schrieb [email protected]:

From: Vitaly Prosyak 

The bug can be triggered by sending an amdgpu_cs_wait_ioctl
to the AMDGPU DRM driver on any ASIC with a valid context.
The bug was reported by Joonkyo Jung.
For example the following code:

 static void Syzkaller2(int fd)
 {
union drm_amdgpu_ctx arg1;
union drm_amdgpu_wait_cs arg2;

arg1.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
ret = drmIoctl(fd, 0x140106442 /* amdgpu_ctx_ioctl */, &arg1);

arg2.in.handle = 0x0;
arg2.in.timeout = 0x2;
arg2.in.ip_type = AMD_IP_VPE /* 0x9 */;
arg2.in.ip_instance = 0x0;
arg2.in.ring = 0x0;
arg2.in.ctx_id = arg1.out.alloc.ctx_id;

drmIoctl(fd, 0xc0206449 /* AMDGPU_WAIT_CS */, &arg2);
 }

One might assume that calling the AMDGPU_WAIT_CS ioctl without a previously
submitted job should return an error, but the following commit
1decbf6bb0b4dc56c9da6c5e57b994ebfc2be3aa
modified the logic and allowed sched_rq to be NULL.

As a result, when there is no job, the AMDGPU_WAIT_CS ioctl returns success.
This change fixes the null-ptr-deref in init entity; the stack trace below
demonstrates the error condition:

[  +0.07] BUG: kernel NULL pointer dereference, address: 0028
[  +0.007086] #PF: supervisor read access in kernel mode
[  +0.005234] #PF: error_code(0x) - not-present page
[  +0.005232] PGD 0 P4D 0
[  +0.002501] Oops:  [#1] PREEMPT SMP KASAN NOPTI
[  +0.005034] CPU: 10 PID: 9229 Comm: amd_basic Tainted: GB   WL 
6.7.0+ #4
[  +0.007797] Hardware name: ASUS System Product Name/ROG STRIX B550-F GAMING 
(WI-FI), BIOS 1401 12/03/2020
[  +0.009798] RIP: 0010:drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
[  +0.006426] Code: 80 00 00 00 00 00 00 00 e8 1a 81 82 e0 49 89 9c 24 c0 00 00 00 4c 
89 ef e8 4a 80 82 e0 49 8b 5d 00 48 8d 7b 28 e8 3d 80 82 e0 <48> 83 7b 28 00 0f 
84 28 01 00 00 4d 8d ac 24 98 00 00 00 49 8d 5c
[  +0.019094] RSP: 0018:c90014c1fa40 EFLAGS: 00010282
[  +0.005237] RAX: 0001 RBX:  RCX: 8113f3fa
[  +0.007326] RDX: fbfff0a7889d RSI: 0008 RDI: 853c44e0
[  +0.007264] RBP: c90014c1fa80 R08: 0001 R09: fbfff0a7889c
[  +0.007266] R10: 853c44e7 R11: 0001 R12: 8881a719b010
[  +0.007263] R13: 88810d412748 R14: 0002 R15: 
[  +0.007264] FS:  77045540() GS:8883cc90() 
knlGS:
[  +0.008236] CS:  0010 DS:  ES:  CR0: 80050033
[  +0.005851] CR2: 0028 CR3: 00011912e000 CR4: 00350ef0
[  +0.007175] Call Trace:
[  +0.002561]  
[  +0.002141]  ? show_regs+0x6a/0x80
[  +0.003473]  ? __die+0x25/0x70
[  +0.003124]  ? page_fault_oops+0x214/0x720
[  +0.004179]  ? preempt_count_sub+0x18/0xc0
[  +0.004093]  ? __pfx_page_fault_oops+0x10/0x10
[  +0.004590]  ? srso_return_thunk+0x5/0x5f
[  +0.004000]  ? vprintk_default+0x1d/0x30
[  +0.004063]  ? srso_return_thunk+0x5/0x5f
[  +0.004087]  ? vprintk+0x5c/0x90
[  +0.003296]  ? drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
[  +0.005807]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? _printk+0xb3/0xe0
[  +0.003293]  ? __pfx__printk+0x10/0x10
[  +0.003735]  ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
[  +0.005482]  ? do_user_addr_fault+0x345/0x770
[  +0.004361]  ? exc_page_fault+0x64/0xf0
[  +0.003972]  ? asm_exc_page_fault+0x27/0x30
[  +0.004271]  ? add_taint+0x2a/0xa0
[  +0.003476]  ? drm_sched_entity_init+0x2d3/0x420 [gpu_sched]
[  +0.005812]  amdgpu_ctx_get_entity+0x3f9/0x770 [amdgpu]
[  +0.009530]  ? finish_task_switch.isra.0+0x129/0x470
[  +0.005068]  ? __pfx_amdgpu_ctx_get_entity+0x10/0x10 [amdgpu]
[  +0.010063]  ? __kasan_check_write+0x14/0x20
[  +0.004356]  ? srso_return_thunk+0x5/0x5f
[  +0.004001]  ? mutex_unlock+0x81/0xd0
[  +0.003802]  ? srso_return_thunk+0x5/0x5f
[  +0.004096]  amdgpu_cs_wait_ioctl+0xf6/0x270 [amdgpu]
[  +0.009355]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
[  +0.009981]  ? srso_return_thunk+0x5/0x5f
[  +0.004089]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? __srcu_read_lock+0x20/0x50
[  +0.004096]  drm_ioctl_kernel+0x140/0x1f0 [drm]
[  +0.005080]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
[  +0.009974]  ? __pfx_drm_ioctl_kernel+0x10/0x10 [drm]
[  +0.005618]  ? srso_return_thunk+0x5/0x5f
[  +0.004088]  ? __kasan_check_write+0x14/0x20
[  +0.004357]  drm_ioctl+0x3da/0x730 [drm]
[  +0.004461]  ? __pfx_amdgpu_cs_wait_ioctl+0x10/0x10 [amdgpu]
[  +0.009979]  ? __pfx_drm_ioctl+0x10/0x10 [drm]
[  +0.004993]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? __kasan_check_write+0x14/0x20
[  +0.004356]  ? srso_return_thunk+0x5/0x5f
[  +0.004090]  ? _raw_spin_lock_irqsave+0x99/0x100
[  +0.004712]  ? __pfx__raw_spin_lock_irqsave+0x10/0x10
[  +0.005063]  ? __pfx_arch_do_signal_or_restart+0x10/0x10
[  +0.005477]  ? srso_return_thunk+0x5/0x5f
[  +0.004000]  ? preempt_count_sub+0x18/0xc0
[  +0.004237]  ? srs