[PATCH 3.16 102/129] mm: fix potential data race in SyS_swapon

2019-07-07 Thread Ben Hutchings
3.16.70-rc1 review patch.  If anyone has any objections, please let me know.

--

From: Hugh Dickins 

commit 6f179af88f60b32c2855e7f3e16ea8e336a7043f upstream.

While running KernelThreadSanitizer (ktsan) on upstream kernel with
trinity, we got a few reports from SyS_swapon, here is one of them:

Read of size 8 by thread T307 (K7621):
 [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
 [] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
 [] ia32_do_call+0x1b/0x25

Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401: q->swap_file may be reset to
NULL by another thread before it is dereferenced for f_mapping.

But why is that iteration needed at all?  Doesn't the claim_swapfile()
which follows do all that is needed to check for a duplicate entry -
FMODE_EXCL on a bdev, testing IS_SWAPFILE under i_mutex on a regfile?

Well, not quite: bd_may_claim() allows the same "holder" to claim the
bdev again, so we do need to use a different holder than "sys_swapon";
and we should not replace appropriate -EBUSY by inappropriate -EINVAL.

Index i was reused in a cpu loop further down: renamed cpu there.

Reported-by: Andrey Konovalov 
Signed-off-by: Hugh Dickins 
Signed-off-by: Al Viro 
Signed-off-by: Ben Hutchings 
---
 mm/swapfile.c | 25 +++--
 1 file changed, 7 insertions(+), 18 deletions(-)

--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2144,11 +2144,10 @@ static int claim_swapfile(struct swap_in
if (S_ISBLK(inode->i_mode)) {
p->bdev = bdgrab(I_BDEV(inode));
error = blkdev_get(p->bdev,
-  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-  sys_swapon);
+  FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
if (error < 0) {
p->bdev = NULL;
-   return -EINVAL;
+   return error;
}
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
@@ -2365,7 +2364,6 @@ SYSCALL_DEFINE2(swapon, const char __use
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
-   int i;
int prio;
int error;
union swap_header *swap_header;
@@ -2405,19 +2403,8 @@ SYSCALL_DEFINE2(swapon, const char __use
 
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
-
-   for (i = 0; i < nr_swapfiles; i++) {
-   struct swap_info_struct *q = swap_info[i];
-
-   if (q == p || !q->swap_file)
-   continue;
-   if (mapping == q->swap_file->f_mapping) {
-   error = -EBUSY;
-   goto bad_swap;
-   }
-   }
-
inode = mapping->host;
+
/* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
error = claim_swapfile(p, inode);
if (unlikely(error))
@@ -2450,6 +2437,8 @@ SYSCALL_DEFINE2(swapon, const char __use
goto bad_swap;
}
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+   int cpu;
+
p->flags |= SWP_SOLIDSTATE;
/*
 * select a random position to start with to help wear leveling
@@ -2468,9 +2457,9 @@ SYSCALL_DEFINE2(swapon, const char __use
error = -ENOMEM;
goto bad_swap;
}
-   for_each_possible_cpu(i) {
+   for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
-   cluster = per_cpu_ptr(p->percpu_cluster, i);
+   cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
}



[PATCH] mm: fix potential data race in SyS_swapon

2015-08-17 Thread Hugh Dickins
While running KernelThreadSanitizer (ktsan) on upstream kernel with
trinity, we got a few reports from SyS_swapon, here is one of them:

Read of size 8 by thread T307 (K7621):
 [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
 [] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
 [] ia32_do_call+0x1b/0x25

Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401: q->swap_file may be reset to
NULL by another thread before it is dereferenced for f_mapping.

But why is that iteration needed at all?  Doesn't the claim_swapfile()
which follows do all that is needed to check for a duplicate entry -
FMODE_EXCL on a bdev, testing IS_SWAPFILE under i_mutex on a regfile?

Well, not quite: bd_may_claim() allows the same "holder" to claim the
bdev again, so we do need to use a different holder than "sys_swapon";
and we should not replace appropriate -EBUSY by inappropriate -EINVAL.

Index i was reused in a cpu loop further down: renamed cpu there.

Reported-by: Andrey Konovalov 
Signed-off-by: Hugh Dickins 
---

 mm/swapfile.c |   25 +++--
 1 file changed, 7 insertions(+), 18 deletions(-)

--- 4.2-rc7/mm/swapfile.c   2015-07-05 19:25:02.852131158 -0700
+++ linux/mm/swapfile.c 2015-08-16 21:30:22.694123923 -0700
@@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_in
if (S_ISBLK(inode->i_mode)) {
p->bdev = bdgrab(I_BDEV(inode));
error = blkdev_get(p->bdev,
-  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-  sys_swapon);
+  FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
if (error < 0) {
p->bdev = NULL;
-   return -EINVAL;
+   return error;
}
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
@@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __use
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
-   int i;
int prio;
int error;
union swap_header *swap_header;
@@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __use
 
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
-
-   for (i = 0; i < nr_swapfiles; i++) {
-   struct swap_info_struct *q = swap_info[i];
-
-   if (q == p || !q->swap_file)
-   continue;
-   if (mapping == q->swap_file->f_mapping) {
-   error = -EBUSY;
-   goto bad_swap;
-   }
-   }
-
inode = mapping->host;
+
/* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
error = claim_swapfile(p, inode);
if (unlikely(error))
@@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __use
goto bad_swap;
}
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+   int cpu;
+
p->flags |= SWP_SOLIDSTATE;
/*
 * select a random position to start with to help wear leveling
@@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __use
error = -ENOMEM;
goto bad_swap;
}
-   for_each_possible_cpu(i) {
+   for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
-   cluster = per_cpu_ptr(p->percpu_cluster, i);
+   cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
}
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Potential data race in SyS_swapon

2015-08-17 Thread Hugh Dickins
On Fri, 7 Aug 2015, Andrey Konovalov wrote:
> 
> We are working on a dynamic data race detector for the Linux kernel
> called KernelThreadSanitizer (ktsan)
> (https://github.com/google/ktsan/wiki).
> 
> While running ktsan on the upstream revision 21bdb584af8c with trinity
> we got a few reports from SyS_swapon, here is one of them:
> 
> ==
> ThreadSanitizer: data-race in SyS_swapon
> 
> Read of size 8 by thread T307 (K7621):
>  [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
>  [] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
>  [] ia32_do_call+0x1b/0x25
> 
> Looks like the swap_lock should be taken when iterating through the
> swap_info array on lines 2392 - 2401.

Thanks for the report.  Actually, lines 2392 to 2401 just look redundant
to me: it looks as if claim_swapfile() should do all that's needed,
though in fact it doesn't quite.  I'll send akpm a patch and Cc you,
no need to retest since the offending lines just won't be there.

Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] mm: fix potential data race in SyS_swapon

2015-08-17 Thread Hugh Dickins
While running KernelThreadSanitizer (ktsan) on upstream kernel with
trinity, we got a few reports from SyS_swapon, here is one of them:

Read of size 8 by thread T307 (K7621):
 [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
 [812242c0] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
 [81e97c8a] ia32_do_call+0x1b/0x25

Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401: q->swap_file may be reset to
NULL by another thread before it is dereferenced for f_mapping.

But why is that iteration needed at all?  Doesn't the claim_swapfile()
which follows do all that is needed to check for a duplicate entry -
FMODE_EXCL on a bdev, testing IS_SWAPFILE under i_mutex on a regfile?

Well, not quite: bd_may_claim() allows the same "holder" to claim the
bdev again, so we do need to use a different holder than "sys_swapon";
and we should not replace appropriate -EBUSY by inappropriate -EINVAL.

Index i was reused in a cpu loop further down: renamed cpu there.

Reported-by: Andrey Konovalov andreyk...@google.com
Signed-off-by: Hugh Dickins hu...@google.com
---

 mm/swapfile.c |   25 +++--
 1 file changed, 7 insertions(+), 18 deletions(-)

--- 4.2-rc7/mm/swapfile.c   2015-07-05 19:25:02.852131158 -0700
+++ linux/mm/swapfile.c 2015-08-16 21:30:22.694123923 -0700
@@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_in
if (S_ISBLK(inode->i_mode)) {
p->bdev = bdgrab(I_BDEV(inode));
error = blkdev_get(p->bdev,
-  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
-  sys_swapon);
+  FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
if (error < 0) {
p->bdev = NULL;
-   return -EINVAL;
+   return error;
}
p->old_block_size = block_size(p->bdev);
error = set_blocksize(p->bdev, PAGE_SIZE);
@@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __use
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
-   int i;
int prio;
int error;
union swap_header *swap_header;
@@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __use
 
p->swap_file = swap_file;
mapping = swap_file->f_mapping;
-
-   for (i = 0; i < nr_swapfiles; i++) {
-   struct swap_info_struct *q = swap_info[i];
-
-   if (q == p || !q->swap_file)
-   continue;
-   if (mapping == q->swap_file->f_mapping) {
-   error = -EBUSY;
-   goto bad_swap;
-   }
-   }
-
inode = mapping->host;
+
/* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */
error = claim_swapfile(p, inode);
if (unlikely(error))
@@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __use
goto bad_swap;
}
if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+   int cpu;
+
p->flags |= SWP_SOLIDSTATE;
/*
 * select a random position to start with to help wear leveling
@@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __use
error = -ENOMEM;
goto bad_swap;
}
-   for_each_possible_cpu(i) {
+   for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
-   cluster = per_cpu_ptr(p->percpu_cluster, i);
+   cluster = per_cpu_ptr(p->percpu_cluster, cpu);
cluster_set_null(&cluster->index);
}
}
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Potential data race in SyS_swapon

2015-08-17 Thread Hugh Dickins
On Fri, 7 Aug 2015, Andrey Konovalov wrote:
 
> We are working on a dynamic data race detector for the Linux kernel
> called KernelThreadSanitizer (ktsan)
> (https://github.com/google/ktsan/wiki).
> 
> While running ktsan on the upstream revision 21bdb584af8c with trinity
> we got a few reports from SyS_swapon, here is one of them:
> 
> ==
> ThreadSanitizer: data-race in SyS_swapon
> 
> Read of size 8 by thread T307 (K7621):
>  [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
>  [812242c0] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
>  [81e97c8a] ia32_do_call+0x1b/0x25
> 
> Looks like the swap_lock should be taken when iterating through the
> swap_info array on lines 2392 - 2401.

Thanks for the report.  Actually, lines 2392 to 2401 just look redundant
to me: it looks as if claim_swapfile() should do all that's needed,
though in fact it doesn't quite.  I'll send akpm a patch and Cc you,
no need to retest since the offending lines just won't be there.

Hugh
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Potential data race in SyS_swapon

2015-08-07 Thread Cesar Eduardo Barros

Em 07-08-2015 13:14, Andrey Konovalov escreveu:

Hi!

We are working on a dynamic data race detector for the Linux kernel
called KernelThreadSanitizer (ktsan)
(https://github.com/google/ktsan/wiki).

While running ktsan on the upstream revision 21bdb584af8c with trinity
we got a few reports from SyS_swapon, here is one of them:


[...]


The race is happening when accessing the swap_file field of a
swap_info_struct struct.

2392 for (i = 0; i < nr_swapfiles; i++) {
2393 struct swap_info_struct *q = swap_info[i];
2394
2395 if (q == p || !q->swap_file)
2396 continue;
2397 if (mapping == q->swap_file->f_mapping) {
2398 error = -EBUSY;
2399 goto bad_swap;
2400 }
2401 }

2539 spin_lock(&swap_lock);
2540 p->swap_file = NULL;
2541 p->flags = 0;
2542 spin_unlock(&swap_lock);


There's another (more important) place which sets the swap_file field to 
NULL, it's within swapoff. It's also protected by swap_lock.



Since the swap_lock lock is not taken in the first snippet, it's
possible for q->swap_file to be assigned to NULL and reloaded between
executing lines 2395 and 2397, which might lead to a null pointer
dereference.


I agree with that analysis. It should be possible to hit by racing 
swapon of a file with swapoff of another.



Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401.


I'd take that lock a couple of lines earlier, so that every place that 
sets the swap_file field on a swap_info_struct is behind swap_lock, for 
simplicity.


--
Cesar Eduardo Barros
ces...@cesarb.eti.br
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Potential data race in SyS_swapon

2015-08-07 Thread Andrey Konovalov
Hi!

We are working on a dynamic data race detector for the Linux kernel
called KernelThreadSanitizer (ktsan)
(https://github.com/google/ktsan/wiki).

While running ktsan on the upstream revision 21bdb584af8c with trinity
we got a few reports from SyS_swapon, here is one of them:

==
ThreadSanitizer: data-race in SyS_swapon

Read of size 8 by thread T307 (K7621):
 [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
 [] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
 [] ia32_do_call+0x1b/0x25
arch/x86/entry/entry_64_compat.S:500
DBG: cpu = 88063fc9fe68
DBG: cpu id = 1

Previous write of size 8 by thread T322 (K7625):
 [< inlined>] SyS_swapon+0x809/0x1850 SYSC_swapon mm/swapfile.c:2540
 [] SyS_swapon+0x809/0x1850 mm/swapfile.c:2345
 [] entry_SYSCALL_64_fastpath+0x12/0x71
arch/x86/entry/entry_64.S:186
DBG: cpu = 0

DBG: addr: 8800bba262d8
DBG: first offset: 0, second offset: 0
DBG: T307 clock: {T307: 1942841, T322: 3661262}
DBG: T322 clock: {T322: 3661679}
==

The race is happening when accessing the swap_file field of a
swap_info_struct struct.

2392 for (i = 0; i < nr_swapfiles; i++) {
2393 struct swap_info_struct *q = swap_info[i];
2394
2395 if (q == p || !q->swap_file)
2396 continue;
2397 if (mapping == q->swap_file->f_mapping) {
2398 error = -EBUSY;
2399 goto bad_swap;
2400 }
2401 }

2539 spin_lock(&swap_lock);
2540 p->swap_file = NULL;
2541 p->flags = 0;
2542 spin_unlock(&swap_lock);

Since the swap_lock lock is not taken in the first snippet, it's
possible for q->swap_file to be assigned to NULL and reloaded between
executing lines 2395 and 2397, which might lead to a null pointer
dereference.

To confirm this I added a sleep in there:

2393 for (i = 0; i < nr_swapfiles; i++) {
2394 struct swap_info_struct *q = swap_info[i];
2395
2396 if (q == p || !q->swap_file)
2397 continue;
2398 msleep(10);
2399 if (mapping == q->swap_file->f_mapping) {
2400 error = -EBUSY;
2401 goto bad_swap;
2402 }
2403 }

And that leads to:

BUG: unable to handle kernel NULL pointer dereference at 00f8
IP: [< inlined>] SyS_swapon+0x3eb/0x1880 SYSC_swapon mm/swapfile.c:2399
IP: [] SyS_swapon+0x3eb/0x1880 mm/swapfile.c:2346
PGD 1d08db067 PUD 1d0e63067 PMD 0
Oops:  [#5] SMP
Modules linked in:
CPU: 0 PID: 7516 Comm: trinity-c7 Tainted: G  D 4.2.0-rc2-tsan #229
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: 8801d37d0040 ti: 8801cc1f4000 task.ti: 8801cc1f4000
RIP: 0010:[]  [] SyS_swapon+0x3eb/0x1880
RSP: :8801cc1f7e28  EFLAGS: 00010292
RAX: 0001 RBX: 8800bb0e9400 RCX: 0003
RDX:  RSI: 0001 RDI: 0292
RBP: 8801cc1f7f48 R08: 0001 R09: 0006
R10: 880249752820 R11: 0005 R12: 
R13:  R14: 8800bb4ca2d8 R15: 8800bb0394d8
FS:  7f080f521700() GS:88063fc0() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 02360fe8 CR3: 0001cf2f CR4: 06f0
DR0: 01f6f000 DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0600
Stack:
 0505d200 00dc1eb617ff 8801cc1f7e58 812454c8
 8801cc1f7f30 0246 8801cc1f7e78 880249752820
 880249752820 0007 0268 880249752820
Call Trace:
 [] ? kt_func_exit+0x18/0x60 mm/ktsan/func.c:14
 [] ? kt_func_exit+0x18/0x60 mm/ktsan/func.c:14
 [] entry_SYSCALL_64_fastpath+0x12/0x71
arch/x86/entry/entry_64.S:186
Code: 00 49 83 bd d8 00 00 00 00 74 2d e8 80 b8 c6 00 4c 89 ff e8 38
1b 02 00 4d 8b ad d8 00 00 00 49 8d bd f8 00 00 00 e8 25 1b 02 00 <4d>
3b b5 f8 00 00 00 0f 84 c7 03 00 00 48 c7 c7 50 10 72 82 41
RIP  [< inlined>] SyS_swapon+0x3eb/0x1880 SYSC_swapon mm/swapfile.c:2399
RIP  [] SyS_swapon+0x3eb/0x1880 mm/swapfile.c:2346
 RSP 
CR2: 00f8
---[ end trace e38cbebf888067b7 ]---

Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Potential data race in SyS_swapon

2015-08-07 Thread Cesar Eduardo Barros

Em 07-08-2015 13:14, Andrey Konovalov escreveu:

Hi!

We are working on a dynamic data race detector for the Linux kernel
called KernelThreadSanitizer (ktsan)
(https://github.com/google/ktsan/wiki).

While running ktsan on the upstream revision 21bdb584af8c with trinity
we got a few reports from SyS_swapon, here is one of them:


[...]


The race is happening when accessing the swap_file field of a
swap_info_struct struct.

2392 for (i = 0; i < nr_swapfiles; i++) {
2393 struct swap_info_struct *q = swap_info[i];
2394
2395 if (q == p || !q->swap_file)
2396 continue;
2397 if (mapping == q->swap_file->f_mapping) {
2398 error = -EBUSY;
2399 goto bad_swap;
2400 }
2401 }

2539 spin_lock(&swap_lock);
2540 p->swap_file = NULL;
2541 p->flags = 0;
2542 spin_unlock(&swap_lock);


There's another (more important) place which sets the swap_file field to 
NULL, it's within swapoff. It's also protected by swap_lock.



Since the swap_lock lock is not taken in the first snippet, it's
possible for q->swap_file to be assigned to NULL and reloaded between
executing lines 2395 and 2397, which might lead to a null pointer
dereference.


I agree with that analysis. It should be possible to hit by racing 
swapon of a file with swapoff of another.



Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401.


I'd take that lock a couple of lines earlier, so that every place that 
sets the swap_file field on a swap_info_struct is behind swap_lock, for 
simplicity.


--
Cesar Eduardo Barros
ces...@cesarb.eti.br
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Potential data race in SyS_swapon

2015-08-07 Thread Andrey Konovalov
Hi!

We are working on a dynamic data race detector for the Linux kernel
called KernelThreadSanitizer (ktsan)
(https://github.com/google/ktsan/wiki).

While running ktsan on the upstream revision 21bdb584af8c with trinity
we got a few reports from SyS_swapon, here is one of them:

==
ThreadSanitizer: data-race in SyS_swapon

Read of size 8 by thread T307 (K7621):
 [< inlined>] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395
 [812242c0] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345
 [81e97c8a] ia32_do_call+0x1b/0x25
arch/x86/entry/entry_64_compat.S:500
DBG: cpu = 88063fc9fe68
DBG: cpu id = 1

Previous write of size 8 by thread T322 (K7625):
 [< inlined>] SyS_swapon+0x809/0x1850 SYSC_swapon mm/swapfile.c:2540
 [81224709] SyS_swapon+0x809/0x1850 mm/swapfile.c:2345
 [81e957ae] entry_SYSCALL_64_fastpath+0x12/0x71
arch/x86/entry/entry_64.S:186
DBG: cpu = 0

DBG: addr: 8800bba262d8
DBG: first offset: 0, second offset: 0
DBG: T307 clock: {T307: 1942841, T322: 3661262}
DBG: T322 clock: {T322: 3661679}
==

The race is happening when accessing the swap_file field of a
swap_info_struct struct.

2392 for (i = 0; i < nr_swapfiles; i++) {
2393 struct swap_info_struct *q = swap_info[i];
2394
2395 if (q == p || !q->swap_file)
2396 continue;
2397 if (mapping == q->swap_file->f_mapping) {
2398 error = -EBUSY;
2399 goto bad_swap;
2400 }
2401 }

2539 spin_lock(&swap_lock);
2540 p->swap_file = NULL;
2541 p->flags = 0;
2542 spin_unlock(&swap_lock);

Since the swap_lock lock is not taken in the first snippet, it's
possible for q->swap_file to be assigned to NULL and reloaded between
executing lines 2395 and 2397, which might lead to a null pointer
dereference.

To confirm this I added a sleep in there:

2393 for (i = 0; i < nr_swapfiles; i++) {
2394 struct swap_info_struct *q = swap_info[i];
2395
2396 if (q == p || !q->swap_file)
2397 continue;
2398 msleep(10);
2399 if (mapping == q->swap_file->f_mapping) {
2400 error = -EBUSY;
2401 goto bad_swap;
2402 }
2403 }

And that leads to:

BUG: unable to handle kernel NULL pointer dereference at 00f8
IP: [ inlined] SyS_swapon+0x3eb/0x1880 SYSC_swapon mm/swapfile.c:2399
IP: [8122431b] SyS_swapon+0x3eb/0x1880 mm/swapfile.c:2346
PGD 1d08db067 PUD 1d0e63067 PMD 0
Oops:  [#5] SMP
Modules linked in:
CPU: 0 PID: 7516 Comm: trinity-c7 Tainted: G  D 4.2.0-rc2-tsan #229
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: 8801d37d0040 ti: 8801cc1f4000 task.ti: 8801cc1f4000
RIP: 0010:[8122431b]  [8122431b] SyS_swapon+0x3eb/0x1880
RSP: :8801cc1f7e28  EFLAGS: 00010292
RAX: 0001 RBX: 8800bb0e9400 RCX: 0003
RDX:  RSI: 0001 RDI: 0292
RBP: 8801cc1f7f48 R08: 0001 R09: 0006
R10: 880249752820 R11: 0005 R12: 
R13:  R14: 8800bb4ca2d8 R15: 8800bb0394d8
FS:  7f080f521700() GS:88063fc0() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2: 02360fe8 CR3: 0001cf2f CR4: 06f0
DR0: 01f6f000 DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0600
Stack:
 0505d200 00dc1eb617ff 8801cc1f7e58 812454c8
 8801cc1f7f30 0246 8801cc1f7e78 880249752820
 880249752820 0007 0268 880249752820
Call Trace:
 [812454c8] ? kt_func_exit+0x18/0x60 mm/ktsan/func.c:14
 [812454c8] ? kt_func_exit+0x18/0x60 mm/ktsan/func.c:14
 [81e9582e] entry_SYSCALL_64_fastpath+0x12/0x71
arch/x86/entry/entry_64.S:186
Code: 00 49 83 bd d8 00 00 00 00 74 2d e8 80 b8 c6 00 4c 89 ff e8 38
1b 02 00 4d 8b ad d8 00 00 00 49 8d bd f8 00 00 00 e8 25 1b 02 00 <4d>
3b b5 f8 00 00 00 0f 84 c7 03 00 00 48 c7 c7 50 10 72 82 41
RIP  [ inlined] SyS_swapon+0x3eb/0x1880 SYSC_swapon mm/swapfile.c:2399
RIP  [8122431b] SyS_swapon+0x3eb/0x1880 mm/swapfile.c:2346
 RSP 8801cc1f7e28
CR2: 00f8
---[ end trace e38cbebf888067b7 ]---

Looks like the swap_lock should be taken when iterating through the
swap_info array on lines 2392 - 2401.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at