Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
On 3/31/21 10:59 AM, Michael Ellerman wrote: > Christophe Leroy writes: [..] >> >>> @@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct >>> linux_binprm *bprm, int uses_int >>> * install_special_mapping or the perf counter mmap tracking code >>> * will fail to recognise it as a vDSO. >>> */ >>> - mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE; >>> + mm->context.vdso = (void __user *)vdso_base + vvar_size; >>> + >>> + vma = _install_special_mapping(mm, vdso_base, vvar_size, >>> + VM_READ | VM_MAYREAD | VM_IO | >>> + VM_DONTDUMP | VM_PFNMAP, _spec); >>> + if (IS_ERR(vma)) >>> + return PTR_ERR(vma); >>> >>> /* >>> * our vma flags don't have VM_WRITE so by default, the process isn't >> >> >> IIUC, VM_PFNMAP is for when we have a vvar_fault handler. >> Allthough we will soon have one for handle TIME_NS, at the moment >> powerpc doesn't have that handler. >> Isn't it dangerous to set VM_PFNMAP then ? I believe, it's fine, special_mapping_fault() does: : if (sm->fault) : return sm->fault(sm, vmf->vma, vmf); > Some of the other flags seem odd too. > eg. VM_IO ? VM_DONTDUMP ? Yeah, so: VM_PFNMAP | VM_IO is a protection from remote access on pages. So one can't access such page with ptrace(), /proc/$pid/mem or process_vm_write(). Otherwise, it would create COW mapping and the tracee will stop working with stale vvar. VM_DONTDUMP restricts the area from coredumping and gdb will also avoid accessing those[1][2]. I agree that VM_PFNMAP was probably excessive in this patch alone and rather synchronized code with other architectures, but it makes more sense now in the new patches set by Christophe: https://lore.kernel.org/linux-arch/cover.1617209141.git.christophe.le...@csgroup.eu/ [1] https://lore.kernel.org/lkml/550731af.6080...@redhat.com/T/ [2] https://sourceware.org/legacy-ml/gdb-patches/2015-03/msg00383.html Thanks, Dmitry
Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
On 3/30/21 11:17 AM, Christophe Leroy wrote: > > > Le 26/03/2021 à 20:17, Dmitry Safonov a écrit : [..] >> --- a/arch/powerpc/kernel/vdso.c >> +++ b/arch/powerpc/kernel/vdso.c >> @@ -55,10 +55,10 @@ static int vdso_mremap(const struct >> vm_special_mapping *sm, struct vm_area_struc >> { >> unsigned long new_size = new_vma->vm_end - new_vma->vm_start; >> - if (new_size != text_size + PAGE_SIZE) >> + if (new_size != text_size) >> return -EINVAL; > > In ARM64 you have removed the above test in commit 871402e05b24cb56 > ("mm: forbid splitting special mappings"). Do we need to keep it here ? > >> - current->mm->context.vdso = (void __user *)new_vma->vm_start + >> PAGE_SIZE; >> + current->mm->context.vdso = (void __user *)new_vma->vm_start; >> return 0; >> } > Yes, right you are, this can be dropped. Thanks, Dmitry
[PATCH] xfrm/compat: Cleanup WARN()s that can be user-triggered
Replace WARN_ONCE() that can be triggered from userspace with pr_warn_once(). Those still give user a hint what's the issue. I've left WARN()s that are not possible to trigger with current code-base and that would mean that the code has issues: - relying on current compat_msg_min[type] <= xfrm_msg_min[type] - expected 4-byte padding size difference between compat_msg_min[type] and xfrm_msg_min[type] - compat_policy[type].len <= xfrma_policy[type].len (for every type) Reported-by: syzbot+834ffd1afc7212eb8...@syzkaller.appspotmail.com Fixes: 5f3eea6b7e8f ("xfrm/compat: Attach xfrm dumps to 64=>32 bit translator") Cc: "David S. Miller" Cc: Eric Dumazet Cc: Herbert Xu Cc: Jakub Kicinski Cc: Steffen Klassert Cc: net...@vger.kernel.org Cc: sta...@vger.kernel.org Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index d8e8a11ca845..a20aec9d7393 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -216,7 +216,7 @@ static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, case XFRM_MSG_GETSADINFO: case XFRM_MSG_GETSPDINFO: default: - WARN_ONCE(1, "unsupported nlmsg_type %d", nlh_src->nlmsg_type); + pr_warn_once("unsupported nlmsg_type %d\n", nlh_src->nlmsg_type); return ERR_PTR(-EOPNOTSUPP); } @@ -277,7 +277,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) return xfrm_nla_cpy(dst, src, nla_len(src)); default: BUILD_BUG_ON(XFRMA_MAX != XFRMA_IF_ID); - WARN_ONCE(1, "unsupported nla_type %d", src->nla_type); + pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } } @@ -315,8 +315,10 @@ static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src struct sk_buff *new = NULL; int err; - if (WARN_ON_ONCE(type >= ARRAY_SIZE(xfrm_msg_min))) + if (type >= ARRAY_SIZE(xfrm_msg_min)) { + pr_warn_once("unsupported nlmsg_type %d\n", nlh_src->nlmsg_type); return 
-EOPNOTSUPP; + } if (skb_shinfo(skb)->frag_list == NULL) { new = alloc_skb(skb->len + skb_tailroom(skb), GFP_ATOMIC); @@ -378,6 +380,10 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, struct nlmsghdr *nlmsg = dst; struct nlattr *nla; + /* xfrm_user_rcv_msg_compat() relies on fact that 32-bit messages +* have the same len or shorter than 64-bit ones. +* 32-bit translation that is bigger than 64-bit original is unexpected. +*/ if (WARN_ON_ONCE(copy_len > payload)) copy_len = payload; -- 2.31.0
Re: [syzbot] WARNING in xfrm_alloc_compat (2)
On 3/29/21 9:31 PM, Eric Dumazet wrote: > > > On 3/29/21 9:57 PM, Dmitry Safonov wrote: [..] >>> [ cut here ] >>> unsupported nla_type 356 >> >> This doesn't seem to be an issue. >> Userspace sent message with nla_type 356, which is > __XFRM_MSG_MAX, so >> this warning is expected. I've added it so when a new XFRM_MSG_* will be >> added, to make sure that there will be translations to such messages in >> xfrm_compat.ko (if the translation needed). >> This is WARN_ON_ONCE(), so it can't be used as DOS. >> >> Ping me if you'd expect something else than once-a-boot warning for >> this. Not sure how-to close syzkaller bug, docs have only `invalid' tag, >> which isn't `not-a-bug'/`expected' tag: >> https://github.com/google/syzkaller/blob/master/docs/syzbot.md >> > > You should not use WARN_ON_ONCE() for this case (if user space can trigger it) > > https://lwn.net/Articles/769365/ > > > Greg Kroah-Hartman raised the problem of core kernel API code that will use > WARN_ON_ONCE() to complain about bad usage; that will not generate the > desired result if WARN_ON_ONCE() is configured to crash the machine. He was > told that the code should just call pr_warn() instead, and that the called > function should return an error in such situations. It was generally agreed > that any WARN_ON() or WARN_ON_ONCE() calls that can be triggered from user > space need to be fixed. > Yeah, fair enough, I've already thought after sending the reply that WARN_ON*() prints registers and that may be unwanted. I'll cook a patch to convert this and other WARNs in the module. I wish there was something like pr_warn_backtrace_once(), but I guess it's fine without dumpstack(), after all. Thanks, Dmitry
Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
On 3/29/21 4:14 PM, Laurent Dufour wrote: > Le 26/03/2021 à 20:17, Dmitry Safonov a écrit : >> Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") >> VVAR page is in front of the VDSO area. In result it breaks CRIU >> (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" >> from /proc/../maps points at ELF/vdso image, rather than at VVAR data >> page. >> Laurent made a patch to keep CRIU working (by reading aux vector). >> But I think it still makes sence to separate two mappings into different >> VMAs. It will also make ppc64 less "special" for userspace and as >> a side-bonus will make VVAR page un-writable by debugger (which >> previously >> would COW page and can be unexpected). >> >> I opportunistically Cc stable on it: I understand that usually such >> stuff isn't a stable material, but that will allow us in CRIU have >> one workaround less that is needed just for one release (v5.11) on >> one platform (ppc64), which we otherwise have to maintain. >> I wouldn't go as far as to say that the commit 511157ab641e is ABI >> regression as no other userspace got broken, but I'd really appreciate >> if it gets backported to v5.11 after v5.12 is released, so as not >> to complicate already non-simple CRIU-vdso code. Thanks! >> >> Cc: Andrei Vagin >> Cc: Andy Lutomirski >> Cc: Benjamin Herrenschmidt >> Cc: Christophe Leroy >> Cc: Laurent Dufour >> Cc: Michael Ellerman >> Cc: Paul Mackerras >> Cc: linuxppc-...@lists.ozlabs.org >> Cc: sta...@vger.kernel.org # v5.11 >> [1]: https://github.com/checkpoint-restore/criu/issues/1417 >> Signed-off-by: Dmitry Safonov >> Tested-by: Christophe Leroy > > I run the CRIU's test suite and except the usual suspects, all the tests > passed. > > Tested-by: Laurent Dufour Thank you, Laurent! -- Dmitry
Re: [syzbot] WARNING in xfrm_alloc_compat (2)
Hi, On 3/29/21 8:04 PM, syzbot wrote: > Hello, > > syzbot found the following issue on: > > HEAD commit:6c996e19 net: change netdev_unregister_timeout_secs min va.. > git tree: net-next > console output: https://syzkaller.appspot.com/x/log.txt?x=102e5926d0 > kernel config: https://syzkaller.appspot.com/x/.config?x=c0ac79756537274e > dashboard link: https://syzkaller.appspot.com/bug?extid=834ffd1afc7212eb8147 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10a7b1aad0 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=17ae6b7cd0 > > The issue was bisected to: > > commit 5f3eea6b7e8f58cf5c8a9d4b9679dc19e9e67ba3 > Author: Dmitry Safonov > Date: Mon Sep 21 14:36:53 2020 + > > xfrm/compat: Attach xfrm dumps to 64=>32 bit translator > > bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=10694b3ad0 > final oops: https://syzkaller.appspot.com/x/report.txt?x=12694b3ad0 > console output: https://syzkaller.appspot.com/x/log.txt?x=14694b3ad0 > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+834ffd1afc7212eb8...@syzkaller.appspotmail.com > Fixes: 5f3eea6b7e8f ("xfrm/compat: Attach xfrm dumps to 64=>32 bit > translator") > > netlink: 208 bytes leftover after parsing attributes in process > `syz-executor193'. > [ cut here ] > unsupported nla_type 356 This doesn't seem to be an issue. Userspace sent message with nla_type 356, which is > __XFRM_MSG_MAX, so this warning is expected. I've added it so when a new XFRM_MSG_* will be added, to make sure that there will be translations to such messages in xfrm_compat.ko (if the translation needed). This is WARN_ON_ONCE(), so it can't be used as DOS. Ping me if you'd expect something else than once-a-boot warning for this. 
Not sure how-to close syzkaller bug, docs have only `invalid' tag, which isn't `not-a-bug'/`expected' tag: https://github.com/google/syzkaller/blob/master/docs/syzbot.md > WARNING: CPU: 1 PID: 8423 at net/xfrm/xfrm_compat.c:280 xfrm_xlate64_attr > net/xfrm/xfrm_compat.c:280 [inline] > WARNING: CPU: 1 PID: 8423 at net/xfrm/xfrm_compat.c:280 xfrm_xlate64 > net/xfrm/xfrm_compat.c:301 [inline] > WARNING: CPU: 1 PID: 8423 at net/xfrm/xfrm_compat.c:280 > xfrm_alloc_compat+0xf39/0x10d0 net/xfrm/xfrm_compat.c:328 > Modules linked in: > CPU: 1 PID: 8423 Comm: syz-executor193 Not tainted 5.12.0-rc4-syzkaller #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 01/01/2011 > RIP: 0010:xfrm_xlate64_attr net/xfrm/xfrm_compat.c:280 [inline] > RIP: 0010:xfrm_xlate64 net/xfrm/xfrm_compat.c:301 [inline] > RIP: 0010:xfrm_alloc_compat+0xf39/0x10d0 net/xfrm/xfrm_compat.c:328 > Code: de e8 5b d7 c3 f9 84 db 0f 85 b0 f8 ff ff e8 9e d0 c3 f9 8b 74 24 08 48 > c7 c7 80 42 76 8a c6 05 39 2e 02 06 01 e8 74 b7 16 01 <0f> 0b e9 8d f8 ff ff > e8 7b d0 c3 f9 8b 14 24 48 c7 c7 40 42 76 8a > RSP: 0018:c9fff4b8 EFLAGS: 00010282 > RAX: RBX: RCX: > RDX: 88801b1554c0 RSI: 815c4d75 RDI: f520001ffe89 > RBP: 00d8 R08: R09: > R10: 815bdb0e R11: R12: ffa1 > R13: 8880248eb8f8 R14: 888013256dc0 R15: 8880132568c0 > FS: 0092f300() GS:8880b9d0() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 21e0 CR3: 23271000 CR4: 001506e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 > Call Trace: > xfrm_alloc_userspi+0x66a/0xa30 net/xfrm/xfrm_user.c:1448 > xfrm_user_rcv_msg+0x42c/0x8b0 net/xfrm/xfrm_user.c:2812 > netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2502 > xfrm_netlink_rcv+0x6b/0x90 net/xfrm/xfrm_user.c:2824 > netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] > netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 > netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 > sock_sendmsg_nosec net/socket.c:654 [inline] > sock_sendmsg+0xcf/0x120 
net/socket.c:674 > sys_sendmsg+0x6e8/0x810 net/socket.c:2350 > ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 > __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 > do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 > entry_SYSCALL_64_after_hwframe+0x44/0xae > RIP: 0033:0x43f009 > Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 > 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 > 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 > RSP: 002b:7ffe0986bcb8 EFLA
Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
Hi Christophe, On 3/27/21 5:19 PM, Christophe Leroy wrote: [..] >> I opportunistically Cc stable on it: I understand that usually such >> stuff isn't a stable material, but that will allow us in CRIU have >> one workaround less that is needed just for one release (v5.11) on >> one platform (ppc64), which we otherwise have to maintain. > > Why is that a workaround, and why for one release only ? I think the > solution proposed by Laurent to use the aux vector AT_SYSINFO_EHDR should > work with any past and future release. Yeah, I guess. Previously, (before v5.11/power) all kernels had ELF start at "[vdso]" VMA start, now we'll have to carry the offset in the VMA. Probably, not the worst thing, but as it will be only for v5.11 release it can break, so needs separate testing. Kinda life was a bit easier without this additional code. >> I wouldn't go as far as to say that the commit 511157ab641e is ABI >> regression as no other userspace got broken, but I'd really appreciate >> if it gets backported to v5.11 after v5.12 is released, so as not >> to complicate already non-simple CRIU-vdso code. Thanks! >> >> Cc: Andrei Vagin >> Cc: Andy Lutomirski >> Cc: Benjamin Herrenschmidt >> Cc: Christophe Leroy >> Cc: Laurent Dufour >> Cc: Michael Ellerman >> Cc: Paul Mackerras >> Cc: linuxppc-...@lists.ozlabs.org >> Cc: sta...@vger.kernel.org # v5.11 >> [1]: https://github.com/checkpoint-restore/criu/issues/1417 >> Signed-off-by: Dmitry Safonov >> Tested-by: Christophe Leroy > > I tested it with sigreturn_vdso selftest and it worked, because that > selftest doesn't involve VDSO data. Thanks again on helping with testing it, I appreciate it! > But if I do a mremap() on the VDSO text vma without remapping VVAR to > keep the same distance between the two vmas, gettimeofday() crashes.
The > reason is that the code obtains the address of the data by calculating a > fix difference from its own address with the below macro, the delta > being resolved at link time: > > .macro get_datapage ptr > bcl 20, 31, .+4 > 999: > mflr \ptr > #if CONFIG_PPC_PAGE_SHIFT > 14 > addis \ptr, \ptr, (_vdso_datapage - 999b)@ha > #endif > addi \ptr, \ptr, (_vdso_datapage - 999b)@l > .endm > > So the datapage needs to remain at the same distance from the code at > all time. > > Wondering how the other architectures do to have two independent VMAs > and be able to move one independently of the other. It's alright as far as I know. If userspace remaps vdso/vvar it should be aware of this (CRIU keeps this in mind, also old vdso image is dumped to compare on restore with the one that the host has). Thanks, Dmitry
[PATCH] powerpc/vdso: Separate vvar vma from vdso
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sense to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected). I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Christophe Leroy Cc: Laurent Dufour Cc: Michael Ellerman Cc: Paul Mackerras Cc: linuxppc-...@lists.ozlabs.org Cc: sta...@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov Tested-by: Christophe Leroy --- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++--- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..4bc45d3ed8b0 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { - unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE; + unsigned long vdso_base = (unsigned long)mm->context.vdso; if (start <= vdso_base && vdso_base < end) mm->context.vdso = NULL; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e839a906fdf2..b14907209822 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc { unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - if (new_size != text_size + PAGE_SIZE) + if (new_size != text_size) return -EINVAL; - current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE; + current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; } @@ -73,6 +73,10 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, _end - _start); } +static struct vm_special_mapping vvar_spec __ro_after_init = { + .name = "[vvar]", +}; + static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, @@ -89,11 
+93,11 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { */ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - struct mm_struct *mm = current->mm; + unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec; + unsigned long vvar_size = PAGE_SIZE; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_size; - unsigned long vdso_base; if (is_32bit_task()) { vdso_spec = _spec; @@ -110,8 +114,8 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vdso_base = 0; } - /* Add a page to the vdso size for the data page */ - vdso_size += PAGE_SIZE; + mappings_size = vdso_size + vvar_size; + mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK; /* * pick a base address for the vDSO in process space. We try to put it @@ -119,9 +123,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * and end up putting it elsewhere. * Add enough to the size so that the result can be aligned. */ - vdso_base = get_unmapped_area(NULL, vdso_base, - vdso_size + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), - 0, 0); + vdso_base = get_unmapped_area(NULL, vdso_base, mappings_size, 0, 0); if (IS_ERR_VALUE(vdso_
Re: [PATCH v2] prctl: PR_SET_MM - unify copying of user's auvx
Hi Cyrill, On 3/23/21 10:06 PM, Cyrill Gorcunov wrote: [..] > --- linux-tip.git.orig/kernel/sys.c > +++ linux-tip.git/kernel/sys.c > @@ -1961,6 +1961,30 @@ out: > return error; > } > > +static int copy_auxv_from_user(unsigned long *auxv, size_t auxv_size, > +const void __user *addr, size_t len) > +{ > + BUG_ON(auxv_size != sizeof(current->mm->saved_auxv)); Nit: size_t auxv_size = sizeof(user_auxv); BUILD_BUG_ON(sizeof(user_auxv) != sizeof(current->mm->saved_auxv)); (to make it local variable instead of a parameter and get rid of a new BUG_ON()) > + > + if (!addr || len > auxv_size) > + return -EINVAL; > + > + memset(auxv, 0, auxv_size); > + if (len && copy_from_user(auxv, addr, len)) > + return -EFAULT; > + > + /* > + * Specification requires the vector to be > + * ended up with AT_NULL entry so user space > + * will notice where to stop enumerating. > + */ > + if (len == auxv_size) { > + auxv[AT_VECTOR_SIZE - 2] = AT_NULL; > + auxv[AT_VECTOR_SIZE - 1] = AT_NULL; I don't follow why it became conditional. Perhaps, you meant that memset(0) above will zerofy it anyway, but in case (len == auxv_size - 1) it won't work. Or I'm missing something obvious :-) Thanks, Dima
Re: [PATCH v5 2/3] Revert "mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio"
On 3/23/21 6:25 PM, Brian Geffon wrote: > This reverts commit cd544fd1dc9293c6702fab6effa63dac1cc67e99. > > As discussed in [1] this commit was a no-op because the mapping type was > checked in vma_to_resize before move_vma is ever called. This meant that > vm_ops->mremap() would never be called on such mappings. Furthermore, > we've since expanded support of MREMAP_DONTUNMAP to non-anonymous > mappings, and these special mappings are still protected by the existing > check of !VM_DONTEXPAND and !VM_PFNMAP which will result in a -EINVAL. > > 1. https://lkml.org/lkml/2020/12/28/2340 > > Signed-off-by: Brian Geffon > Acked-by: Hugh Dickins Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thanks, Dmitry
Re: [PATCH v5 1/3] mm: Extend MREMAP_DONTUNMAP to non-anonymous mappings
On 3/23/21 6:25 PM, Brian Geffon wrote: > Currently MREMAP_DONTUNMAP only accepts private anonymous mappings. > This restriction was placed initially for simplicity and not because > there exists a technical reason to do so. > > This change will widen the support to include any mappings which are not > VM_DONTEXPAND or VM_PFNMAP. The primary use case is to support > MREMAP_DONTUNMAP on mappings which may have been created from a memfd. > This change will result in mremap(MREMAP_DONTUNMAP) returning -EINVAL > if VM_DONTEXPAND or VM_PFNMAP mappings are specified. > > Lokesh Gidra who works on the Android JVM, provided an explanation of how > such a feature will improve Android JVM garbage collection: > "Android is developing a new garbage collector (GC), based on userfaultfd. > The garbage collector will use userfaultfd (uffd) on the java heap during > compaction. On accessing any uncompacted page, the application threads will > find it missing, at which point the thread will create the compacted page > and then use UFFDIO_COPY ioctl to get it mapped and then resume execution. > Before starting this compaction, in a stop-the-world pause the heap will be > mremap(MREMAP_DONTUNMAP) so that the java heap is ready to receive > UFFD_EVENT_PAGEFAULT events after resuming execution. Pretty interesting idea :-) > > To speedup mremap operations, pagetable movement was optimized by moving > PUD entries instead of PTE entries [1]. It was necessary as mremap of even > modest sized memory ranges also took several milliseconds, and stopping the > application for that long isn't acceptable in response-time sensitive > cases. > > With UFFDIO_CONTINUE feature [2], it will be even more efficient to > implement this GC, particularly the 'non-moveable' portions of the heap. > It will also help in reducing the need to copy (UFFDIO_COPY) the pages. > However, for this to work, the java heap has to be on a 'shared' vma. 
> Currently MREMAP_DONTUNMAP only supports private anonymous mappings, this > patch will enable using UFFDIO_CONTINUE for the new userfaultfd-based heap > compaction." > > [1] > https://lore.kernel.org/linux-mm/20201215030730.nc3cu98e4%25a...@linux-foundation.org/ > [2] > https://lore.kernel.org/linux-mm/20210302000133.272579-1-axelrasmus...@google.com/ > > Signed-off-by: Brian Geffon > Acked-by: Hugh Dickins > Tested-by: Lokesh Gidra Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thanks, Dmitry
Re: [PATCH] mm: Allow shmem mappings with MREMAP_DONTUNMAP
Hi Brian, Hugh, On 3/16/21 7:18 PM, Brian Geffon wrote: > Hi Hugh, > Thanks for this suggestion, responses in line. > >> A better patch would say: >> >> - if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || >> - vma->vm_flags & VM_SHARED)) >> + if ((flags & MREMAP_DONTUNMAP) && >> + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) >> return ERR_PTR(-EINVAL); >> >> VM_DONTEXPAND is what has long been used on special mappings, to prevent >> surprises from mremap changing the size of the mapping: MREMAP_DONTUNMAP >> introduced a different way of expanding the mapping, so VM_DONTEXPAND >> still seems a reasonable name (I've thrown in VM_PFNMAP there because >> it's in the VM_DONTEXPAND test lower down: for safety I guess, and best >> if both behave the same - though one says -EINVAL and the other -EFAULT). > > I like this idea and am happy to mail a new patch. I think it may make > sense to bring the lower block up here so that it becomes more clear > that it's not duplicate code and that the MREMAP_DONTUNMAP case > returns -EINVAL and other cases return -EFAULT. I wonder if the > -EFAULT error code would have made more sense from the start for both > cases, do you have any thoughts on changing the error code at this > point? > >> With that VM_DONTEXPAND check in, Dmitry's commit cd544fd1dc92 >> ("mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio") >> can still be reverted (as you agreed on 28th December), even though >> vma_is_anonymous() will no longer protect it. > > I agree and if Dmitry does not have time I would be happy to mail a > revert to cd544fd1dc92 as we discussed in [1]. Dmitry, would you like > me to do that? Ack. I was planning to send a patches set that includes the revert, but that's stalled a bit. As the patch just adds excessive checks, but doesn't introduce an issue, I haven't sent it separately. Feel free to revert it :-) Thanks, Dmitry
[PATCH] perf diff: Don't crash on freeing errno-session
__cmd_diff() sets result of perf_session__new() to d->session. In case of failure, it's errno and perf-diff may crash with: failed to open perf.data: Permission denied Failed to open perf.data Segmentation fault (core dumped) From the coredump: 0 0x5569a62b5955 in auxtrace__free (session=0x) at util/auxtrace.c:2681 1 0x5569a626b37d in perf_session__delete (session=0x) at util/session.c:295 2 perf_session__delete (session=0x) at util/session.c:291 3 0x5569a618008a in __cmd_diff () at builtin-diff.c:1239 4 cmd_diff (argc=, argv=) at builtin-diff.c:2011 [..] Funny enough, it won't always crash. For me it crashes only if failed file is second in cmd-line: the reason is that cmd_diff() checks files for branch-stacks [in check_file_brstack()] and if the first file doesn't have brstacks, it doesn't proceed to try open other files from cmd-line. Check d->session before calling perf_session__delete(). Another solution would be assigning to temporary variable, checking it, but I find it easier to follow with IS_ERR() check in the same function. After some time it's still obvious why the check is needed, and with temp variable it's possible to make the same mistake. Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Dmitry Safonov --- tools/perf/builtin-diff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index cefc71506409..b0c57e55052d 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1236,7 +1236,8 @@ static int __cmd_diff(void) out_delete: data__for_each_file(i, d) { - perf_session__delete(d->session); + if (!IS_ERR(d->session)) + perf_session__delete(d->session); data__free(d); } -- 2.30.1
[PATCH] perf: Use (long) for iterator for bfd symbols
GCC (GCC) 8.4.0 20200304 fails to build perf with: : util/symbol.c: In function 'dso__load_bfd_symbols': : util/symbol.c:1626:16: error: comparison of integer expressions of different signednes : for (i = 0; i < symbols_count; ++i) { : ^ : util/symbol.c:1632:16: error: comparison of integer expressions of different signednes :while (i + 1 < symbols_count && : ^ : util/symbol.c:1637:13: error: comparison of integer expressions of different signednes :if (i + 1 < symbols_count && : ^ : cc1: all warnings being treated as errors It's unlikely that the symtable will be that big, but the fix is oneliner and as perf has CORE_CFLAGS += -Wextra, which makes build to fail together with CORE_CFLAGS += -Werror Fixes: eac9a4342e54 ("perf symbols: Try reading the symbol table with libbfd") Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jacek Caban Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Remi Bernon Signed-off-by: Dmitry Safonov --- tools/perf/util/symbol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 64a039cbba1b..1645fb4ec9ed 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1561,12 +1561,11 @@ static int bfd2elf_binding(asymbol *symbol) int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) { int err = -1; - long symbols_size, symbols_count; + long symbols_size, symbols_count, i; asection *section; asymbol **symbols, *sym; struct symbol *symbol; bfd *abfd; - u_int i; u64 start, len; abfd = bfd_openr(dso->long_name, NULL); -- 2.30.0
Re: [PATCH 6/6] mm: Forbid splitting special mappings
On 1/22/21 1:00 PM, Will Deacon wrote: > On Fri, Jan 22, 2021 at 12:58:58PM +, Will Deacon wrote: >> On Tue, Oct 13, 2020 at 02:34:16AM +0100, Dmitry Safonov wrote: >>> Don't allow splitting of vm_special_mapping's. >>> It affects vdso/vvar areas. Uprobes have only one page in xol_area so >>> they aren't affected. >>> >>> Those restrictions were enforced by checks in .mremap() callbacks. >>> Restrict resizing with generic .split() callback. >>> >>> Signed-off-by: Dmitry Safonov >>> --- >>> arch/arm/kernel/vdso.c| 9 - >>> arch/arm64/kernel/vdso.c | 41 +++ >>> arch/mips/vdso/genvdso.c | 4 >>> arch/s390/kernel/vdso.c | 11 +-- >>> arch/x86/entry/vdso/vma.c | 17 >>> mm/mmap.c | 12 >>> 6 files changed, 16 insertions(+), 78 deletions(-) >> >> For arm64: >> >> Acked-by: Will Deacon > > Wait -- this got merged ages ago! Why am I reading such old email? It's alright, thanks for looking at this again! -- Dmitry
Re: [PATCH 3/6] mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio
[I moved your reply to avoid top-posting] On 12/28/20 6:03 PM, Brian Geffon wrote: > On Mon, Oct 12, 2020 at 6:34 PM Dmitry Safonov wrote: >> >> As kernel expect to see only one of such mappings, any further >> operations on the VMA-copy may be unexpected by the kernel. >> Maybe it's being on the safe side, but there doesn't seem to be any >> expected use-case for this, so restrict it now. >> >> Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") >> Signed-off-by: Dmitry Safonov > > I don't think this situation can ever happen MREMAP_DONTUNMAP is > already restricted to anonymous mappings (defined as not having > vm_ops) and vma_to_resize checks that the mapping is anonymous before > move_vma is called. I've looked again now, I think it is possible. One can call MREMAP_DONTUNMAP without MREMAP_FIXED and without resizing. So that the old VMA is copied at some free address. The calltrace would be: mremap()=>move_vma() [under if (flags & MREMAP_MAYMOVE)]. On the other side I agree with you that the fix could have been better if I realized the semantics that MREMAP_DONTUNMAP should only work with anonymous mappings. Probably, a better fix would be to move : if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || : vma->vm_flags & VM_SHARED)) : return ERR_PTR(-EINVAL); from vma_to_resize() into the mremap() syscall directly. What do you think? -- Dima
Re: [PATCH 3/6] mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio
On 12/28/20 7:33 PM, Dmitry Safonov wrote: > [I moved your reply to avoid top-posting] > > On 12/28/20 6:03 PM, Brian Geffon wrote: >> On Mon, Oct 12, 2020 at 6:34 PM Dmitry Safonov wrote: >>> >>> As kernel expect to see only one of such mappings, any further >>> operations on the VMA-copy may be unexpected by the kernel. >>> Maybe it's being on the safe side, but there doesn't seem to be any >>> expected use-case for this, so restrict it now. >>> >>> Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") >>> Signed-off-by: Dmitry Safonov >> >> I don't think this situation can ever happen MREMAP_DONTUNMAP is >> already restricted to anonymous mappings (defined as not having >> vm_ops) and vma_to_resize checks that the mapping is anonymous before >> move_vma is called. > > I've looked again now, I think it is possible. One can call > MREMAP_DONTUNMAP without MREMAP_FIXED and without resizing. So that the > old VMA is copied at some free address. > > The calltrace would be: mremap()=>move_vma() > [under if (flags & MREMAP_MAYMOVE)]. > > On the other side I agree with you that the fix could have been better > if I realized the semantics that MREMAP_DONTUNMAP should only work with > anonymous mappings. > > Probably, a better fix would be to move > : if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || > : vma->vm_flags & VM_SHARED)) > : return ERR_PTR(-EINVAL); > > from vma_to_resize() into the mremap() syscall directly. > What do you think? Ok, I've misread the code now, it checks vma_to_resize() before. I'll send a revert to this one. Thanks for noticing, Dima
Re: [PATCH 2/6] mm/mremap: For MREMAP_DONTUNMAP check security_vm_enough_memory_mm()
On 12/28/20 6:21 PM, Brian Geffon wrote: > This looks good to me with a small comment. > >> if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { >> /* OOM: unable to split vma, just get accounts right */ >> - if (vm_flags & VM_ACCOUNT) >> + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) >> vm_acct_memory(new_len >> PAGE_SHIFT); > > Checking MREMAP_DONTUNMAP in the do_munmap path is unnecessary as > MREMAP_DONTUNMAP will have already returned by this point. In this code it is also used as err-path. In case move_page_tables() fails to move all page tables or .mremap() callback fails, the new VMA is unmapped. IOW, MREMAP_DONTUNMAP returns under: : if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { -- Dima
Re: [RESEND PATCH v3 0/4] iommu/iova: Solve longterm IOVA issue
On Tue, 1 Dec 2020 at 21:50, Will Deacon wrote: > > On Tue, 17 Nov 2020 18:25:30 +0800, John Garry wrote: > > This series contains a patch to solve the longterm IOVA issue which > > leizhen originally tried to address at [0]. > > > > A sieved kernel log is at the following, showing periodic dumps of IOVA > > sizes, per CPU and per depot bin, per IOVA size granule: > > https://raw.githubusercontent.com/hisilicon/kernel-dev/topic-iommu-5.10-iova-debug-v3/aging_test > > > > [...] > > Applied the final patch to arm64 (for-next/iommu/iova), thanks! > > [4/4] iommu: avoid taking iova_rbtree_lock twice > https://git.kernel.org/arm64/c/3a651b3a27a1 Glad it made it into next; 2 years ago I couldn't convince the iommu maintainer it was worth it (but with a different justification): https://lore.kernel.org/linux-iommu/20180621180823.805-3-d...@arista.com/ Thanks, Dmitry
Re: [PATCH v2 00/19] Add generic vdso_base tracking
Hi Christophe, On 11/24/20 6:53 AM, Christophe Leroy wrote: > > > Le 24/11/2020 à 01:29, Dmitry Safonov a écrit : >> v2 Changes: >> - Rename user_landing to vdso_base as it tracks vDSO VMA start address, >> rather than the explicit address to land (Andy) >> - Reword and don't use "new-execed" and "new-born" task (Andy) >> - Fix failures reported by build robot >> >> Started from discussion [1], where was noted that currently a couple of >> architectures support mremap() for vdso/sigpage, but not munmap(). >> If an application maps something on the ex-place of vdso/sigpage, >> later after processing signal it will land there (good luck!) >> >> Patches set is based on linux-next (next-20201123) and it depends on >> changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also >> on my changes in akpm (fixing several mremap() issues). > > I have a series that cleans up VDSO init on powerpc and migrates powerpc > to _install_special_mapping() (patch 10 of the series). > > https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=204396=%2A=both > > > I'm wondering how we should coordinate with your series for merging. > > I guess your series will also imply removal of arch_unmap() ? see > https://elixir.bootlin.com/linux/v5.10-rc4/source/arch/powerpc/include/asm/mmu_context.h#L262 I think our series intersect only in that moment where I re-introduce arch_setup_additional_pages() parameters. So, in theory we could minimize the conflicts by merging both series in parallel and cleanup the result by moving to generic vdso_base on the top, what do you think? Thanks, Dmitry
Re: [PATCH v2 13/19] x86/signal: Check if vdso_image_32 is mapped before trying to land on it
On 11/24/20 11:43 PM, Andy Lutomirski wrote: > On Mon, Nov 23, 2020 at 4:29 PM Dmitry Safonov wrote: >> >> Provide current_has_vdso_image_32() helper and check it apriory landing >> attempt on vdso vma. >> The helper is a macro, not a static inline funciton to avoid >> linux/sched/task_stack.h inclusion in asm/vdso.h. > > Can you make this more general? For example: > > current_has_vdso(_image_whatever) Sounds good, will do. > > also: > >> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c >> index 18d8f17f755c..d9ab58cc765b 100644 >> --- a/arch/x86/entry/common.c >> +++ b/arch/x86/entry/common.c >> @@ -142,11 +142,16 @@ static noinstr bool __do_fast_syscall_32(struct >> pt_regs *regs) >> /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ >> __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) >> { >> + unsigned long landing_pad; >> + >> + if (!current_has_vdso_image_32()) >> + force_sigsegv(SIGSEGV); > > Falling through seems incorrect here. I would instead write 0 to rip, > do the force_sigsegv(), and exit, making sure that exiting goes > through the correct path. > Something like this? : if (!current_has_vdso(_image_32)) { : regs->ax = -EFAULT; : regs->ip = 0; : force_sigsegv(SIGSEGV); : syscall_exit_to_user_mode(regs); : } Thanks, Dmitry
Re: [PATCH v2 08/19] arm/vdso: Remove vdso pointer from mm->context
On 11/24/20 6:22 AM, Christophe Leroy wrote: > > > Le 24/11/2020 à 01:29, Dmitry Safonov a écrit : >> Not used any more. > > But what about mremap() ? Maybe you should explain why you can remove it ? Yep, it was only to keep track of context->vdso position. I'll add it to the description. Thanks, Dmitry
Re: [PATCH v2 05/19] elf: Remove compat_arch_setup_additional_pages()
On 11/24/20 6:13 AM, Christophe Leroy wrote: > > > Le 24/11/2020 à 01:29, Dmitry Safonov a écrit : >> Now that all users rely on detecting bitness of new-born task checking >> personality, remove compat_arch_setup_additional_pages() macro, >> simplifying the code. > > I understand from cover that you wanted to reword "new-born" ? Yes. I have reread the messages; not sure how I missed it. Thanks, Dmitry
Re: [PATCH v2 06/19] elf/vdso: Reuse arch_setup_additional_pages() parameters
On 11/24/20 6:18 AM, Christophe Leroy wrote: > "Reuse arch_setup_additional_pages() parameters" > > Did you mean "remove" ? Or "Revise" ? > > Maybe could be: > > "Modify arch_setup_additional_pages() parameters" Sure. Thanks, Dmitry
Re: [PATCH v2 02/19] elf: Move arch_setup_additional_pages() to generic elf.h
On 11/24/20 6:12 AM, Christophe Leroy wrote: [..] >> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >> index 6fd7d38a60c8..4221f171d1a9 100644 >> --- a/arch/powerpc/Kconfig >> +++ b/arch/powerpc/Kconfig >> @@ -134,6 +134,7 @@ config PPC >> select ARCH_HAS_PTE_SPECIAL >> select ARCH_HAS_MEMBARRIER_CALLBACKS >> select ARCH_HAS_MEMBARRIER_SYNC_CORE >> + select ARCH_HAS_SETUP_ADDITIONAL_PAGES > > We try to keep alphabetic order on powerpc, should go after > ARCH_HAS_SCALED_CPUTIME Yep, I think I have to learn the alphabet. Thanks for noticing, Dmitry
[PATCH v2 02/19] elf: Move arch_setup_additional_pages() to generic elf.h
Ifdef the function in the header, not in the code. Following kernel style, move it to Kconfig. All it makes it easier to follow when the option is enabled/disabled. Remove re-definition from compat_binfmt_elf, as it's always defined under compat_arch_setup_additional_pages (to be reworked). Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig| 1 + arch/arm/include/asm/elf.h | 5 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/elf.h| 6 +- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 arch/hexagon/Kconfig| 1 + arch/hexagon/include/asm/elf.h | 6 -- arch/mips/Kconfig | 1 + arch/mips/include/asm/elf.h | 5 - arch/nds32/Kconfig | 1 + arch/nds32/include/asm/elf.h| 3 --- arch/nios2/Kconfig | 1 + arch/nios2/include/asm/elf.h| 4 arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/elf.h | 5 - arch/riscv/Kconfig | 1 + arch/riscv/include/asm/elf.h| 4 arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 5 - arch/sh/Kconfig | 1 + arch/sh/include/asm/elf.h | 6 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/elf_64.h | 6 -- arch/x86/Kconfig| 1 + arch/x86/include/asm/elf.h | 4 arch/x86/um/asm/elf.h | 5 - fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 2 -- fs/binfmt_elf_fdpic.c | 3 +-- fs/compat_binfmt_elf.c | 2 -- include/linux/elf.h | 12 32 files changed, 30 insertions(+), 73 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 3b1caaefbff0..a6330e7eb2b8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -15,6 +15,7 @@ config ARM select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_PHYS_TO_DMA + select ARCH_HAS_SETUP_ADDITIONAL_PAGES if MMU select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index b8102a6ddf16..47347d7412ec 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -137,7 +137,6 @@ extern int arm_elf_read_implies_exec(int); extern void 
elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) -#ifdef CONFIG_MMU #ifdef CONFIG_VDSO #define ARCH_DLINFO\ do { \ @@ -145,9 +144,5 @@ do { \ (elf_addr_t)current->mm->context.vdso); \ } while (0) #endif -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -struct linux_binprm; -int arch_setup_additional_pages(struct linux_binprm *, int); -#endif #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 629a293cc408..35d8e4acd38d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -26,6 +26,7 @@ config ARM64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 8d1c8dcb87fd..d1073ffa7f24 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -181,11 +181,6 @@ do { \ NEW_AUX_ENT(AT_IGNORE, 0); \ } while (0) -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES -struct linux_binprm; -extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); - /* 1GB of VA */ #ifdef CONFIG_COMPAT #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ @@ -242,6 +237,7 @@ do { \ #else #define COMPAT_ARCH_DLINFO #endif +struct linux_binprm; extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages \ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 21b2ab099c8b..f068d67456a2 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -4,6 +4,7 @@ config CSKY select ARCH_32BIT_OFF_T select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEV
[PATCH v2 08/19] arm/vdso: Remove vdso pointer from mm->context
Not used any more. Signed-off-by: Dmitry Safonov --- arch/arm/include/asm/mmu.h | 3 --- arch/arm/kernel/vdso.c | 12 2 files changed, 15 deletions(-) diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 1592a4264488..2397b0a19f59 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -12,9 +12,6 @@ typedef struct { #endif unsigned intvmalloc_seq; unsigned long sigpage; -#ifdef CONFIG_VDSO - unsigned long vdso; -#endif #ifdef CONFIG_BINFMT_ELF_FDPIC unsigned long exec_fdpic_loadmap; unsigned long interp_fdpic_loadmap; diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 710e5ca99a53..4b39c7d8f525 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -47,17 +47,8 @@ static const struct vm_special_mapping vdso_data_mapping = { .pages = _data_page, }; -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = new_vma->vm_start; - - return 0; -} - static struct vm_special_mapping vdso_text_mapping __ro_after_init = { .name = "[vdso]", - .mremap = vdso_mremap, }; struct elfinfo { @@ -239,8 +230,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; unsigned long len; - mm->context.vdso = 0; - if (vdso_text_pagelist == NULL) return; @@ -258,7 +247,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, if (IS_ERR(vma)) return; - mm->context.vdso = addr; *sysinfo_ehdr = addr; } -- 2.29.2
[PATCH v2 06/19] elf/vdso: Reuse arch_setup_additional_pages() parameters
Both parameters of arch_setup_additional_pages() are currently unused. commit fc5243d98ac2 ("[S390] arch_setup_additional_pages arguments") tried to introduce useful arguments, but they still are not used. Remove old parameters and introduce sysinfo_ehdr argument that will be used to return vdso address to put as AT_SYSINFO_EHDR tag in auxiliary vector. The reason to add this parameter is that many architecture have vDSO pointer saved in their mm->context with the only purpose to use it later in ARCH_DLINFO. That's the macro for elf loader to setup sysinfo_ehdr tag. Return sysinfo_ehdr address that will be later used by ARCH_DLINFO as an argument. That will allow to drop vDSO pointer from mm->context and any code responsible to track vDSO position on platforms that don't use vDSO as a landing in userspace (arm/s390/sparc). Cc: Albert Ou Cc: "David S. Miller" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: linux-fsde...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/arm/include/asm/vdso.h| 6 -- arch/arm/kernel/process.c | 4 ++-- arch/arm/kernel/vdso.c | 10 +++--- arch/arm64/kernel/vdso.c | 17 arch/csky/kernel/vdso.c| 3 ++- arch/hexagon/kernel/vdso.c | 3 ++- arch/mips/kernel/vdso.c| 3 ++- arch/nds32/kernel/vdso.c | 3 ++- arch/nios2/mm/init.c | 2 +- arch/powerpc/kernel/vdso.c | 3 ++- arch/riscv/kernel/vdso.c | 11 +- arch/s390/kernel/vdso.c| 3 ++- arch/sh/kernel/vsyscall/vsyscall.c | 3 ++- arch/sparc/vdso/vma.c | 15 +++--- arch/x86/entry/vdso/vma.c | 32 +- arch/x86/um/vdso/vma.c | 2 +- fs/binfmt_elf.c| 3 ++- fs/binfmt_elf_fdpic.c | 3 ++- include/linux/elf.h| 17 +++- 19 files changed, 85 insertions(+), 58 deletions(-) diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h index 5b85889f82ee..6b2b3b1fe833 100644 --- a/arch/arm/include/asm/vdso.h +++ b/arch/arm/include/asm/vdso.h @@ -10,13 +10,15 @@ struct mm_struct; #ifdef CONFIG_VDSO -void arm_install_vdso(struct mm_struct *mm, unsigned long addr); +void arm_install_vdso(struct mm_struct *mm, unsigned 
long addr, + unsigned long *sysinfo_ehdr); extern unsigned int vdso_total_pages; #else /* CONFIG_VDSO */ -static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d0220da1d1b1..0e90cba8ac7a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -389,7 +389,7 @@ static const struct vm_special_mapping sigpage_mapping = { .mremap = sigpage_mremap, }; -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -430,7 +430,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * to be fatal to the process, so no error check needed * here. */ - arm_install_vdso(mm, addr + PAGE_SIZE); + arm_install_vdso(mm, addr + PAGE_SIZE, sysinfo_ehdr); up_fail: mmap_write_unlock(mm); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 3408269d19c7..710e5ca99a53 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -233,7 +233,8 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) } /* assumes mmap_lock is write-locked */ -void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { struct vm_area_struct *vma; unsigned long len; @@ -254,7 +255,10 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, _text_mapping); - if (!IS_ERR(vma)) - mm->context.vdso = addr; + if (IS_ERR(vma)) + return; + + mm->context.vdso = addr; + *sysinfo_ehdr = addr; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 1b710deb84d6..666338724a07 100644 --- a/arch/arm64/kernel/vdso.c 
+++ b/arch/arm64/kernel/vdso.c @@ -213,8 +213,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, - struct linux_binprm *bprm, -
[PATCH v2 13/19] x86/signal: Check if vdso_image_32 is mapped before trying to land on it
Provide current_has_vdso_image_32() helper and check it apriory landing attempt on vdso vma. The helper is a macro, not a static inline funciton to avoid linux/sched/task_stack.h inclusion in asm/vdso.h. Signed-off-by: Dmitry Safonov --- arch/x86/entry/common.c | 7 ++- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/vdso.h | 4 arch/x86/kernel/signal.c| 4 ++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 18d8f17f755c..d9ab58cc765b 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -142,11 +142,16 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { + unsigned long landing_pad; + + if (!current_has_vdso_image_32()) + force_sigsegv(SIGSEGV); + /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. 
*/ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; /* diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index ea3db15b57bf..f87ed1d53938 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -255,7 +255,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; } else { /* Return stub is in 32bit vsyscall page */ - if (current->mm->context.vdso) + if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -336,7 +336,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index b5d23470f56b..e3829c3a6149 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -41,6 +41,10 @@ extern const struct vdso_image vdso_image_x32; #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const struct vdso_image vdso_image_32; + +#define current_has_vdso_image_32()\ + likely(current->mm->context.vdso_image == _image_32 && \ + !!current->mm->context.vdso) #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 372ec09dc4ac..6fed2e523e0a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -319,7 +319,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(set->sig[1], >extramask[0], Efault); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = 
current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -381,7 +381,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else -- 2.29.2
[PATCH v2 15/19] x86/vdso: Migrate to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 2 +- arch/x86/entry/vdso/extable.c | 4 ++-- arch/x86/entry/vdso/vma.c | 9 - arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/mmu.h| 1 - arch/x86/include/asm/vdso.h | 2 +- arch/x86/kernel/signal.c | 4 ++-- 8 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d676e0bad1f1..4f06760f849e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_BASE select ARCH_HAS_DEBUG_WX select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d9ab58cc765b..655b7d8fe734 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -151,7 +151,7 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. 
*/ - landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->vdso_base + vdso_image_32.sym_int80_landing_pad; /* diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c index afcf5b65beef..16688c2d032c 100644 --- a/arch/x86/entry/vdso/extable.c +++ b/arch/x86/entry/vdso/extable.c @@ -25,10 +25,10 @@ bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) return false; - if (!current->mm->context.vdso) + if (current->mm->vdso_base == (void *)UNMAPPED_VDSO_BASE) return false; - base = (unsigned long)current->mm->context.vdso + image->extable_base; + base = (unsigned long)current->mm->vdso_base + image->extable_base; nr_entries = image->extable_len / (sizeof(*extable)); extable = image->extable; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 65780a0164e3..29a795face9d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -77,7 +77,7 @@ static void vdso_fix_landing(const struct vdso_image *image, struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; unsigned long old_land_addr = vdso_land + - (unsigned long)current->mm->context.vdso; + (unsigned long)current->mm->vdso_base; /* Fixing userspace landing - look at do_fast_syscall_32 */ if (regs->ip == old_land_addr) @@ -92,7 +92,6 @@ static void vdso_mremap(const struct vm_special_mapping *sm, const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); - current->mm->context.vdso = (void __user *)new_vma->vm_start; } #ifdef CONFIG_TIME_NS @@ -287,7 +286,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr, ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); } else { - current->mm->context.vdso = (void __user *)text_start; + current->mm->vdso_base = (void __user *)text_start; current->mm->context.vdso_image = image; *sysinfo_ehdr = 
text_start; } @@ -362,8 +361,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may * not do accounting and rlimit right. -* We could search vma near context.vdso, but it's a slowpath, -* so let's explicitly check all VMAs to be completely sure. +* It's a slowpath, let's explicitly check all VMAs to be +* completely sure. */ for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, _mapping) || diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f87ed1d53938..67204b1eeea0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -256,7 +256,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, } else { /* Return stub is in 32bit vsyscall page */ if (current_has_vdso
[PATCH v2 19/19] mips/vdso: Migrate to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Cc: Thomas Bogendoerfer Cc: linux-m...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/mips/Kconfig | 1 + arch/mips/kernel/signal.c | 11 +++ arch/mips/kernel/vdso.c | 2 +- arch/mips/vdso/genvdso.c | 8 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 818a9b2c78f1..70424605710f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -10,6 +10,7 @@ config MIPS select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_BASE select ARCH_SUPPORTS_UPROBES select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index f1e985109da0..e0beaf2cdc0f 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -806,11 +806,13 @@ struct mips_abi mips_abi = { static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { + void *land = (void *)current->mm->vdso_base; sigset_t *oldset = sigmask_to_save(); - int ret; + int ret = 1; struct mips_abi *abi = current->thread.abi; - void *vdso = current->mm->context.vdso; + if (land == (void *)UNMAPPED_VDSO_BASE) + goto err; /* * If we were emulating a delay slot instruction, exit that frame such * that addresses in the sigframe are as expected for userland and we @@ -843,12 +845,13 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) rseq_signal_deliver(ksig, regs); if (sig_uses_siginfo(>ka, abi)) - ret = abi->setup_rt_frame(vdso + abi->vdso->off_rt_sigreturn, + ret = abi->setup_rt_frame(land + abi->vdso->off_rt_sigreturn, ksig, regs, oldset); else - ret = abi->setup_frame(vdso + abi->vdso->off_sigreturn, + ret = abi->setup_frame(land + 
abi->vdso->off_sigreturn, ksig, regs, oldset); +err: signal_setup_done(ret, ksig, 0); } diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index e124c68322bb..b2b2e596f03b 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -183,7 +183,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out; } - mm->context.vdso = (void *)vdso_addr; + mm->vdso_base = (void __user *)vdso_addr; *sysinfo_ehdr = vdso_addr; ret = 0; diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 0303d30cde03..8f581a2c8578 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,6 @@ int main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static void vdso_mremap(\n"); - fprintf(out_file, " const struct vm_special_mapping *sm,\n"); - fprintf(out_file, " struct vm_area_struct *new_vma)\n"); - fprintf(out_file, "{\n"); - fprintf(out_file, " current->mm->context.vdso =\n"); - fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ fprintf(out_file, @@ -290,7 +283,6 @@ int main(int argc, char **argv) fprintf(out_file, "\t.mapping = {\n"); fprintf(out_file, "\t\t.name = \"[vdso]\",\n"); fprintf(out_file, "\t\t.pages = vdso_pages,\n"); - fprintf(out_file, "\t\t.mremap = vdso_mremap,\n"); fprintf(out_file, "\t},\n"); /* Calculate and write symbol offsets to */ -- 2.29.2
[PATCH v2 18/19] arm64/vdso: Migrate native signals to generic vdso_base
Generic way to track the land vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/arm64/kernel/signal.c | 10 +++--- arch/arm64/kernel/vdso.c | 16 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bec6ef69704f..96e8988f033d 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -723,9 +723,10 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +static int setup_return(struct pt_regs *regs, struct k_sigaction *ka, struct rt_sigframe_user_layout *user, int usig) { + unsigned long land = (unsigned long)current->mm->vdso_base; __sigrestore_t sigtramp; regs->regs[0] = usig; @@ -754,10 +755,13 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; + else if (land != UNMAPPED_VDSO_BASE) + sigtramp = VDSO_SYMBOL(land, sigtramp); else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + return 1; regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -780,7 +784,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(>uc.uc_stack, regs->sp); err |= setup_sigframe(, regs, set); if (err == 0) { - setup_return(regs, >ka, , usig); + err = setup_return(regs, >ka, , usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(>info, >info); regs->regs[1] = (unsigned long)>info; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 03a6519d50cd..5b44463b739a 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,6 @@ static union { } vdso_data_store 
__page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static void vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = (void *)new_vma->vm_start; -} - static int __vdso_init(enum vdso_abi abi) { int i; @@ -239,7 +233,6 @@ static int __setup_additional_pages(enum vdso_abi abi, gp_flags = VM_ARM64_BTI; vdso_base += VVAR_NR_PAGES * PAGE_SIZE; - mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, @@ -247,12 +240,17 @@ static int __setup_additional_pages(enum vdso_abi abi, if (IS_ERR(ret)) goto up_fail; + /* +* 32-bit ABI is to land on sigpage (see aarch32_sigreturn_setup()), +* 64-bit on vDSO. +*/ + if (abi == VDSO_ABI_AA64) + mm->vdso_base = (void __user *)vdso_base; *sysinfo_ehdr = vdso_base; return 0; up_fail: - mm->context.vdso = NULL; return PTR_ERR(ret); } @@ -285,7 +283,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { }, [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; @@ -431,7 +428,6 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { }, [AA64_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; -- 2.29.2
[PATCH v2 16/19] arm/vdso: Migrate to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig | 1 + arch/arm/kernel/process.c | 9 + arch/arm/kernel/signal.c | 6 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a6330e7eb2b8..1df40666b024 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,6 +24,7 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VDSO_BASE select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 5f4eced738f5..c6d44b34b05e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,16 +376,9 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static void sigpage_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.sigpage = new_vma->vm_start; -} - static const struct vm_special_mapping sigpage_mapping = { .name = "[sigpage]", .pages = _page, - .mremap = sigpage_mremap, }; int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) @@ -423,7 +416,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto up_fail; } - mm->context.sigpage = addr; + mm->vdso_base = (void __user *)addr; /* Unlike the sigpage, failure to install the vdso is unlikely * to be fatal to the process, so no error check needed diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 9d2e916121be..ac9c0b4869c9 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -451,13 +451,17 @@ setup_return(struct pt_regs *regs, struct ksignal 
*ksig, #ifdef CONFIG_MMU if (cpsr & MODE32_BIT) { struct mm_struct *mm = current->mm; + unsigned long land = (unsigned long)mm->vdso_base; + + if (land == UNMAPPED_VDSO_BASE) + return 1; /* * 32-bit code can use the signal return page * except when the MPU has protected the vectors * page from PL0 */ - retcode = mm->context.sigpage + signal_return_offset + + retcode = land + signal_return_offset + (idx << 2) + thumb; } else #endif -- 2.29.2
[PATCH v2 17/19] arm64/vdso: Migrate compat signals to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/signal32.c | 17 - arch/arm64/kernel/vdso.c | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 35d8e4acd38d..a93ec99644b8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -38,6 +38,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VDSO_BASE select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK if !PREEMPTION diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 2f507f565c48..3d2ac8062e99 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -315,7 +315,7 @@ static void __user *compat_get_sigframe(struct ksignal *ksig, return frame; } -static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, +static int compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, compat_ulong_t __user *rc, void __user *frame, int usig) { @@ -342,13 +342,16 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, retcode = ptr_to_compat(ka->sa.sa_restorer); } else { /* Set up sigreturn pointer */ + unsigned long land = (unsigned long)current->mm->vdso_base; unsigned int idx = thumb << 1; if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = (unsigned long)current->mm->context.sigpage + - (idx << 2) + thumb; + if (land == UNMAPPED_VDSO_BASE) + return 1; + + retcode = land + (idx << 2) + thumb; } regs->regs[0] = usig; @@ -356,6 +359,8 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->compat_lr = retcode; regs->pc= handler; regs->pstate= 
spsr; + + return 0; } static int compat_setup_sigframe(struct compat_sigframe __user *sf, @@ -425,7 +430,8 @@ int compat_setup_rt_frame(int usig, struct ksignal *ksig, err |= compat_setup_sigframe(>sig, regs, set); if (err == 0) { - compat_setup_return(regs, >ka, frame->sig.retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->sig.retcode, frame, usig); regs->regs[1] = (compat_ulong_t)(unsigned long)>info; regs->regs[2] = (compat_ulong_t)(unsigned long)>sig.uc; } @@ -448,7 +454,8 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= compat_setup_sigframe(frame, regs, set); if (err == 0) - compat_setup_return(regs, >ka, frame->retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->retcode, frame, usig); return err; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 91c1b7c716b7..03a6519d50cd 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -394,7 +394,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) if (IS_ERR(ret)) goto out; - mm->context.sigpage = (void *)addr; + mm->vdso_base = (void __user *)addr; out: return PTR_ERR_OR_ZERO(ret); -- 2.29.2
[PATCH v2 12/19] x86/signal: Land on &frame->retcode when vdso isn't mapped
Since commit 9fbbd4dd17d0 ("x86: Don't require the vDSO for handling a.out signals") after processing 32-bit signal if there is no vdso mapped frame->retcode is used as a landing. Do the same for rt ia32 signals. It also makes the ia32 compat signals match the native ia32 case. This shouldn't be mistaken for encouragement for running binaries with executable stack, rather something to do in hopefully very rare situation with disabled or unmapped vdso and absent SA_RESTORER. For non-executable stack it'll segfault on attempt to land, rather than land on a random address where vdso was previously mapped. For programs with executable stack it'll just do the same for rt signals as for non-rt. Discouraging users to run with executable stack is done separately in commit 47a2ebb7f505 ("execve: warn if process starts with executable stack"). Signed-off-by: Dmitry Safonov --- arch/x86/ia32/ia32_signal.c | 12 +++- arch/x86/kernel/signal.c| 23 ++- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 81cf22398cd1..ea3db15b57bf 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -270,8 +270,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, unsafe_put_user(set->sig[1], >extramask[0], Efault); unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* These are actually not used anymore, but left because some -* gdb versions depend on them as a marker. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); user_access_end(); @@ -336,14 +336,16 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* Not actually used anymore, but left because some gdb -* versions need it. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. */ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); unsafe_put_sigcontext32(>uc.uc_mcontext, fp, regs, set, Efault); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..372ec09dc4ac 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -317,23 +317,20 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(sig, >sig, Efault); unsafe_put_sigcontext(>sc, fp, regs, set, Efault); unsafe_put_user(set->sig[1], >extramask[0], Efault); - if (current->mm->context.vdso) + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else restorer = >retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ unsafe_put_user(restorer, >pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 -* -* WE DO NOT USE IT ANY MORE! It's only left here for historical -* reasons and because gdb uses it as a signature to notice -* signal handler stack frames. +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 *)frame->retcode, Efault); user_access_end(); @@ -382,18 +379,18 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, unsafe_save_altstack(>uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(restorer, >pretcode, Efault); /* * This
[PATCH v2 14/19] mm: Add vdso_base in mm_struct
Instead of having every architecture to define vdso_base/vdso_addr etc, provide a generic mechanism to track vdso_base for landing in userspace. It'll minimize per-architecture difference, the number of callbacks to provide. Originally, it started from thread [1] where the need for .close() callback on vm_special_mapping was pointed, this generic code besides removing duplicated .mremap() callbacks provides a cheaper way to support munmap() on vdso mappings without introducing .close() callbacks for every architecture (with would bring even more code duplication). [1]: https://lore.kernel.org/linux-arch/cajwjo6zanqykshbq+3b+fi_vt80mtrzev5yreqawx-l8j8x...@mail.gmail.com/ Cc: Thomas Bogendoerfer Cc: linux-m...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 3 ++- fs/aio.c | 3 ++- include/linux/mm.h| 3 ++- include/linux/mm_types.h | 10 ++ mm/Kconfig| 3 +++ mm/mmap.c | 19 ++- mm/mremap.c | 2 +- 7 files changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e916646adc69..786c97203bf6 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,8 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) +static int pseudo_lock_dev_mremap(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index d213be7b8a7e..9b205ebf17e8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -323,7 +323,8 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct *old_vma, + struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; struct 
mm_struct *mm = vma->vm_mm; diff --git a/include/linux/mm.h b/include/linux/mm.h index a1c25da94663..dbda1b91f971 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -559,7 +559,8 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma, unsigned long flags); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 95a257927dae..e1ca750874af 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -496,6 +496,16 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; +#ifdef CONFIG_ARCH_HAS_VDSO_BASE + /* +* Address of special mapping VMA to land after processing +* a signal. Reads are unprotected: if a thread unmaps or +* mremaps the mapping while another thread is processing +* a signal, it can segfault while landing. 
+*/ + void __user *vdso_base; +#endif +#define UNMAPPED_VDSO_BASE TASK_SIZE_MAX unsigned long flags; /* Must use atomic bitops to access */ diff --git a/mm/Kconfig b/mm/Kconfig index 29904dc16bfc..941ae0597734 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -893,4 +893,7 @@ config SECRETMEM select GENERIC_ALLOCATOR select CMA +config ARCH_HAS_VDSO_BASE + bool + endmenu diff --git a/mm/mmap.c b/mm/mmap.c index 17fe59a9780b..561afc3f1744 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3409,11 +3409,25 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) static vm_fault_t special_mapping_fault(struct vm_fault *vmf); +static void update_vdso_base(struct vm_area_struct *old_vma, + unsigned long new_addr) +{ +#ifdef CONFIG_ARCH_HAS_VDSO_BASE + struct mm_struct *mm = old_vma->vm_mm; + + if (WARN_ON_ONCE(!mm)) + return; + if (old_vma->vm_start == (unsigned long)mm->vdso_base) + mm->vdso_base = (void __user *)new_addr; +#endif +} + /* * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { + up
[PATCH v2 10/19] sparc/vdso: Remove vdso pointer from mm->context
Not used any more. Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/sparc/include/asm/mmu_64.h | 1 - arch/sparc/vdso/vma.c | 5 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 7e2704c770e9..8e7892890d14 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -111,7 +111,6 @@ typedef struct { unsigned long thp_pte_count; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; - void*vdso; booladi; tag_storage_desc_t *tag_store; spinlock_t tag_lock; diff --git a/arch/sparc/vdso/vma.c b/arch/sparc/vdso/vma.c index bf9195fe9bcc..255e052223ca 100644 --- a/arch/sparc/vdso/vma.c +++ b/arch/sparc/vdso/vma.c @@ -389,7 +389,6 @@ static int map_vdso(const struct vdso_image *image, } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -418,9 +417,7 @@ static int map_vdso(const struct vdso_image *image, } up_fail: - if (ret) - current->mm->context.vdso = NULL; - else + if (!ret) *sysinfo_ehdr = text_start; mmap_write_unlock(mm); -- 2.29.2
[PATCH v2 11/19] mm/mmap: Make vm_special_mapping::mremap return void
Previously .mremap() callback needed (int) return to provide way to restrict resizing of a special mapping. Now it's restricted by providing .may_split = special_mapping_split. Removing (int) return simplifies further changes to special_mapping_mremap() as it won't need save ret code from the callback. Also, it removes needless `return 0` from callbacks. Signed-off-by: Dmitry Safonov --- arch/arm/kernel/process.c | 3 +-- arch/arm64/kernel/vdso.c | 4 +--- arch/mips/vdso/genvdso.c | 3 +-- arch/x86/entry/vdso/vma.c | 4 +--- include/linux/mm_types.h | 2 +- mm/mmap.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e90cba8ac7a..5f4eced738f5 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,11 +376,10 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static int sigpage_mremap(const struct vm_special_mapping *sm, +static void sigpage_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.sigpage = new_vma->vm_start; - return 0; } static const struct vm_special_mapping sigpage_mapping = { diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 666338724a07..91c1b7c716b7 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,10 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.vdso = (void *)new_vma->vm_start; - - return 0; } static int __vdso_init(enum vdso_abi abi) diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 09e30eb4be86..0303d30cde03 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,12 @@ int 
main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static int vdso_mremap(\n"); + fprintf(out_file, "static void vdso_mremap(\n"); fprintf(out_file, " const struct vm_special_mapping *sm,\n"); fprintf(out_file, " struct vm_area_struct *new_vma)\n"); fprintf(out_file, "{\n"); fprintf(out_file, " current->mm->context.vdso =\n"); fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, " return 0;\n"); fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b9020742e66..65780a0164e3 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -86,15 +86,13 @@ static void vdso_fix_landing(const struct vdso_image *image, #endif } -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; - - return 0; } #ifdef CONFIG_TIME_NS diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 65df8abd90bd..95a257927dae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -780,7 +780,7 @@ struct vm_special_mapping { struct vm_area_struct *vma, struct vm_fault *vmf); - int (*mremap)(const struct vm_special_mapping *sm, + void (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); }; diff --git a/mm/mmap.c b/mm/mmap.c index d0d458632c1b..17fe59a9780b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3433,7 +3433,7 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma, return -EFAULT; if (sm->mremap) - return sm->mremap(sm, new_vma); + sm->mremap(sm, new_vma); return 0; } -- 2.29.2
[PATCH v2 09/19] s390/vdso: Remove vdso_base pointer from mm->context
Not used any more. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/s390/include/asm/mmu.h | 1 - arch/s390/kernel/vdso.c | 10 -- 2 files changed, 11 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index e12ff0f29d1a..095d0596f700 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -15,7 +15,6 @@ typedef struct { unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; - unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ atomic_t is_protected; /* diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 810b72f8985c..3f07711a07c1 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -58,18 +58,9 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, return 0; } -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *vma) -{ - current->mm->context.vdso_base = vma->vm_start; - - return 0; -} - static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, - .mremap = vdso_mremap, }; static int __init vdso_setup(char *str) @@ -204,7 +195,6 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out_up; } - current->mm->context.vdso_base = vdso_base; *sysinfo_ehdr = vdso_base; rc = 0; -- 2.29.2
[PATCH v2 04/19] x86: Remove compat_arch_setup_additional_pages()
The same as for x32 task, detect ia32 task by in_ia32_syscall(). It's valid as execed task is pretending to be in a syscall of relevant bitness/ABI, see the comment near in_32bit_syscall(). Removing compat_arch_setup_additional_pages() provides single point of entry - arch_setup_additional_pages(), makes ifdeffery easier to read, aligns the code with powerpc and sparc (mips also has single vdso setup function, but instead of taking bitness from mm.context, takes vdso image pointer there). Together with arm64 code align to use in_compat_task(), it makes possible to remove compat_arch_setup_additional_pages() macro re-definition from compat elf code (another redefined marco less). Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 41 +++--- arch/x86/include/asm/elf.h | 5 - 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4eea508e9b10..aace862ed9a1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -376,48 +376,49 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) } #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static int load_vdso32(void) +static int load_vdso_ia32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; return map_vdso(_image_32, 0); } +#else +static int load_vdso_ia32(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; +} #endif #ifdef CONFIG_X86_64 -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int load_vdso_64(void) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(_image_64); -} - -#ifdef CONFIG_COMPAT -int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) -{ #ifdef CONFIG_X86_X32_ABI - if (in_x32_syscall()) { - if (!vdso64_enabled) - return 0; + if (in_x32_syscall()) return map_vdso_randomized(_image_x32); - } #endif -#ifdef CONFIG_IA32_EMULATION - return load_vdso32(); + + return 
map_vdso_randomized(_image_64); +} #else - return 0; -#endif +static int load_vdso_64(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; } #endif -#else + int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return load_vdso32(); + if (in_ia32_syscall()) + return load_vdso_ia32(); + + return load_vdso_64(); } -#endif #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index d00b723eea2d..51a08f6b18e5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -377,11 +377,6 @@ else \ ((unsigned long)current->mm->context.vdso + \ vdso_image_32.sym___kernel_vsyscall) -struct linux_binprm; -extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), -- 2.29.2
[PATCH v2 05/19] elf: Remove compat_arch_setup_additional_pages()
Now that all users rely on detecting bitness of new-born task checking personality, remove compat_arch_setup_additional_pages() macro, simplifying the code. Signed-off-by: Dmitry Safonov --- fs/compat_binfmt_elf.c | 5 - 1 file changed, 5 deletions(-) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 3606dd3a32f5..da8ee4d6e451 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,11 +115,6 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#undef arch_setup_additional_pages -#definearch_setup_additional_pages compat_arch_setup_additional_pages -#endif - #ifdef compat_elf_read_implies_exec #undef elf_read_implies_exec #defineelf_read_implies_exec compat_elf_read_implies_exec -- 2.29.2
[PATCH v2 07/19] elf: Use sysinfo_ehdr in ARCH_DLINFO()
Instead mm->context.vdso use the pointer provided by elf loader. That allows to drop the pointer on arm/s390/sparc. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/alpha/include/asm/elf.h| 2 +- arch/arm/include/asm/elf.h | 5 ++--- arch/arm64/include/asm/elf.h| 18 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/include/asm/elf.h | 5 ++--- arch/nds32/include/asm/elf.h| 5 ++--- arch/powerpc/include/asm/elf.h | 4 ++-- arch/riscv/include/asm/elf.h| 5 ++--- arch/s390/include/asm/elf.h | 5 ++--- arch/sh/include/asm/elf.h | 10 +- arch/sparc/include/asm/elf_64.h | 5 ++--- arch/x86/include/asm/elf.h | 33 ++--- arch/x86/um/asm/elf.h | 4 ++-- fs/binfmt_elf.c | 6 +++--- fs/binfmt_elf_fdpic.c | 11 ++- 15 files changed, 51 insertions(+), 69 deletions(-) diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h index 8049997fa372..701e820f28f0 100644 --- a/arch/alpha/include/asm/elf.h +++ b/arch/alpha/include/asm/elf.h @@ -155,7 +155,7 @@ extern int alpha_l2_cacheshape; extern int alpha_l3_cacheshape; /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ NEW_AUX_ENT(AT_L1I_CACHESHAPE, alpha_l1i_cacheshape); \ NEW_AUX_ENT(AT_L1D_CACHESHAPE, alpha_l1d_cacheshape); \ diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 47347d7412ec..76a0f04190f0 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -138,10 +138,9 @@ extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) #ifdef CONFIG_VDSO -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #endif diff --git a/arch/arm64/include/asm/elf.h 
b/arch/arm64/include/asm/elf.h index a81953bcc1cf..e62818967a69 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -165,10 +165,9 @@ typedef struct user_fpsimd_state elf_fpregset_t; }) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ \ /* \ * Should always be nonzero unless there's a kernel bug.\ @@ -223,19 +222,12 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; set_thread_flag(TIF_32BIT); \ }) #ifdef CONFIG_COMPAT_VDSO -#define COMPAT_ARCH_DLINFO \ +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) \ do { \ - /* \ -* Note that we use Elf64_Off instead of elf_addr_t because \ -* elf_addr_t in compat is defined as Elf32_Addr and casting\ -* current->mm->context.vdso to it triggers a cast warning of \ -* cast from pointer to integer of different size. \ -*/ \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (Elf64_Off)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #else -#define COMPAT_ARCH_DLINFO +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) #endif #endif /* CONFIG_COMPAT */ diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index 6629301a2620..a257e5abddce 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -208,7 +208,7 @@ struct task_struct; #define GATE_EHD
[PATCH v2 01/19] x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages()
Partly revert commit 3316ec8ccd34 ("x86/elf: Use e_machine to check for x32/ia32 in setup_additional_pages()") and commit 9a29a671902c ("elf: Expose ELF header on arch_setup_additional_pages()". Both patches did a good thing: removed usage of TIF_X32, but with a price of additional macros ARCH_SETUP_ADDITIONAL_PAGES() and ifdeffs. Instead, use in_x32_syscall() - the first thing load_elf_binary() does after parsing and checking new ELF binary. It's done that early after exec() that way also allows to use it in mmap() code straight away, which needs it to know which mmap_base to use (see arch_pick_mmap_layout()). Add comments that describe how it works. Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 4 ++-- arch/x86/include/asm/compat.h | 6 ++ arch/x86/include/asm/elf.h| 6 ++ fs/binfmt_elf.c | 10 +++--- fs/compat_binfmt_elf.c| 11 +++ include/linux/elf.h | 5 - 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 44de75b21fab..4eea508e9b10 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -396,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32) + int uses_interp) { #ifdef CONFIG_X86_X32_ABI - if (x32) { + if (in_x32_syscall()) { if (!vdso64_enabled) return 0; return map_vdso_randomized(_image_x32); diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index f145e3326c6d..4489bd60640b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -197,6 +197,12 @@ static inline bool in_x32_syscall(void) return false; } +/* + * Valid all time on the context of process that performs a syscall. + * Just born process has __X32_SYSCALL_BIT or TS_COMPAT set very + * early in load_binary() on setting personality and flags. + * See also set_personality_ia32(). 
+ */ static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 44a9b9940535..109697a19eb1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -383,10 +383,8 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32); -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter, \ - (ex->e_machine == EM_X86_64)) + int uses_interp); +#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages /* Do not change the values. See get_align_mask() */ enum align_flags { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ac0b5fc30ea6..3de72c0e0406 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -999,8 +999,12 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages - may depend on the personality. */ + /* +* Do this immediately, since STACK_TOP as used in setup_arg_pages +* may depend on the personality. At this moment we start +* pretending that we are in a context of compat syscall for +* compatible applications on x86, in_compat_syscall() starts working. 
+*/ SET_PERSONALITY2(*elf_ex, _state); if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -1246,7 +1250,7 @@ static int load_elf_binary(struct linux_binprm *bprm) set_binfmt(_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); + retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2c557229696a..12b991368f0a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,16 +115,11 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter) -#endif - -#ifdef COMPAT_ARCH_S
[PATCH v2 00/19] Add generic vdso_base tracking
v2 Changes: - Rename user_landing to vdso_base as it tracks vDSO VMA start address, rather than the explicit address to land (Andy) - Reword and don't use "new-execed" and "new-born" task (Andy) - Fix failures reported by build robot Started from discussion [1], where was noted that currently a couple of architectures support mremap() for vdso/sigpage, but not munmap(). If an application maps something on the ex-place of vdso/sigpage, later after processing signal it will land there (good luck!) Patches set is based on linux-next (next-20201123) and it depends on changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also on my changes in akpm (fixing several mremap() issues). Logically, the patches set divides on: - patch 1: a cleanup for patches in x86/cleanups - patches 2-11: cleanups for arch_setup_additional_pages() - patches 12-13: x86 signal changes for unmapped vdso - patches 14-19: provide generic vdso_base in mm_struct In the end, besides cleanups, it's now more predictable what happens for applications with unmapped vdso on architectures those support .mremap() for vdso/sigpage. I'm aware of only one user that unmaps vdso - Valgrind [2]. (there possibly are more, but this one is "special", it unmaps vdso, but not vvar, which confuses CRIU [Checkpoint Restore In Userspace], that's why I'm aware of it) Patches as a .git branch: https://github.com/0x7f454c46/linux/tree/setup_additional_pages-v2 v1 Link: https://lore.kernel.org/lkml/20201108051730.2042693-1-d...@arista.com/ [1]: https://lore.kernel.org/linux-arch/cajwjo6zanqykshbq+3b+fi_vt80mtrzev5yreqawx-l8j8x...@mail.gmail.com/ [2]: https://github.com/checkpoint-restore/criu/issues/488 Cc: Alexander Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Guo Ren Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Will Deacon Cc: x...@kernel.org Dmitry Safonov (19): x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages() elf: Move arch_setup_additional_pages() to generic elf.h arm64: Use in_compat_task() in arch_setup_additional_pages() x86: Remove compat_arch_setup_additional_pages() elf: Remove compat_arch_setup_additional_pages() elf/vdso: Reuse arch_setup_additional_pages() parameters elf: Use sysinfo_ehdr in ARCH_DLINFO() arm/vdso: Remove vdso pointer from mm->context s390/vdso: Remove vdso_base pointer from mm->context sparc/vdso: Remove vdso pointer from mm->context mm/mmap: Make vm_special_mapping::mremap return void x86/signal: Land on >retcode when vdso isn't mapped x86/signal: Check if vdso_image_32 is mapped before trying to land on it mm: Add vdso_base in mm_struct x86/vdso: Migrate to generic vdso_base arm/vdso: Migrate to generic vdso_base arm64/vdso: Migrate compat signals to generic vdso_base arm64/vdso: Migrate native signals to generic vdso_base mips/vdso: Migrate to generic vdso_base arch/alpha/include/asm/elf.h | 2 +- arch/arm/Kconfig | 2 + arch/arm/include/asm/elf.h| 10 +--- arch/arm/include/asm/mmu.h| 3 - arch/arm/include/asm/vdso.h | 6 +- arch/arm/kernel/process.c | 14 + arch/arm/kernel/signal.c | 6 +- arch/arm/kernel/vdso.c| 20 ++- arch/arm64/Kconfig| 2 + arch/arm64/include/asm/elf.h | 27 ++--- arch/arm64/kernel/signal.c| 10 +++- arch/arm64/kernel/signal32.c | 17 -- arch/arm64/kernel/vdso.c | 50 +++- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 -- arch/csky/kernel/vdso.c | 3 +- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/elf.h| 6 -- arch/hexagon/kernel/vdso.c| 3 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/Kconfig | 2 + arch/mips/include/asm/elf.h | 10 +--- arch/mips/kernel/signal.c | 11 ++-- arch/mips/kernel/vdso.c | 5 +- arch/mips/vdso/genvdso.c | 9 --- arch/nds32/Kconfig| 1 + 
arch/nds32/include/asm/elf.h | 8 +-- arch/nds32/kernel/vdso.c | 3 +- arch/nios2/Kconfig| 1 + arch/nios2/include/asm/elf.h | 4 -- arch/nios2/mm/init.c | 2 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/elf.h| 9 +-- arch/powerpc/kernel/vdso.c| 3 +- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/elf.h | 9
[PATCH v2 03/19] arm64: Use in_compat_task() in arch_setup_additional_pages()
it'll allow to completely remove the compat_arch_setup_additional_pages() macro after doing the same for x86, simplifying the binfmt code in the end.
linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + if (is_compat_task()) + ret = aarch32_setup_additional_pages(bprm, uses_interp); + else + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); return ret; -- 2.29.2
Re: [PATCH v2] drivers/virt: vmgenid: add vm generation id driver
Hello, +Cc Eric, Adrian On 11/19/20 6:36 PM, Alexander Graf wrote: > On 19.11.20 18:38, Mike Rapoport wrote: >> On Thu, Nov 19, 2020 at 01:51:18PM +0100, Alexander Graf wrote: >>> On 19.11.20 13:02, Christian Borntraeger wrote: On 16.11.20 16:34, Catangiu, Adrian Costin wrote: > - Background > > The VM Generation ID is a feature defined by Microsoft (paper: > http://go.microsoft.com/fwlink/?LinkId=260709) and supported by > multiple hypervisor vendors. > > The feature is required in virtualized environments by apps that work > with local copies/caches of world-unique data such as random values, > uuids, monotonically increasing counters, etc. > Such apps can be negatively affected by VM snapshotting when the VM > is either cloned or returned to an earlier point in time. > > The VM Generation ID is a simple concept meant to alleviate the issue > by providing a unique ID that changes each time the VM is restored > from a snapshot. The hw provided UUID value can be used to > differentiate between VMs or different generations of the same VM. > > - Problem > > The VM Generation ID is exposed through an ACPI device by multiple > hypervisor vendors but neither the vendors or upstream Linux have no > default driver for it leaving users to fend for themselves. [..] >>> The only piece where I'm unsure is how this will interact with CRIU. >> >> To C/R applications that use /dev/vmgenid CRIU need to be aware of it. >> Checkpointing and restoring withing the same "VM generation" shouldn't be >> a problem, but IMHO, making restore work after genid bump could be >> challenging. >> >> Alex, what scenario involving CRIU did you have in mind? > > You can in theory run into the same situation with containers that this > patch is solving for virtual machines. You could for example do a > snapshot of a prewarmed Java runtime with CRIU to get full JIT speeds > starting from the first request. > > That however means you run into the problem of predictable randomness > again. 
> >> >>> Can containers emulate ioctls and device nodes? >> >> Containers do not emulate ioctls but they can have /dev/vmgenid inside >> the container, so applications can use it the same way as outside the >> container. > > Hm. I suppose we could add a CAP_ADMIN ioctl interface to /dev/vmgenid > (when container people get to the point of needing it) that sets the > generation to "at least X". That way on restore, you could just call > that with "generation at snapshot"+1. > > That also means we need to have this interface available without virtual > machines then though, right? Sounds like a good idea. I guess, genvmid can be global on host, rather than per-userns or per-process for simplicity. Later if somebody will have a bottleneck on restore when every process on the machine wakes up from read() it could be virtualized, but doing it now sounds too early. ioctl() probably should go under checkpoint_restore_ns_capable(current_user_ns()), rather than CAP_SYS_ADMIN (I believe it should be safe from DOS as only CRIU should run with this capability, but worth to document this). Thanks, Dmitry
[PATCH] brcmsmac: ampdu: Check BA window size before checking block ack
As a result, isset() will try to dereference something on the stack, causing panics:
MacBookAir3,1/Mac-942452F5819B1C1B, BIOS MBA31.88Z.0061.B07.1201241641 01/24/12 RIP: 0010:brcms_c_ampdu_dotxstatus+0x343/0x9f0 [brcmsmac] Code: 54 24 20 66 81 e2 ff 0f 41 83 e4 07 89 d1 0f b7 d2 66 c1 e9 03 0f b7 c9 4c 8d 5c 0c 48 49 8b 4d 10 48 8b 79 68 41 57 44 89 e1 <41> 0f b6 33 41 d3 e0 48 c7 c1 38 e0 ea c0 48 83 c7 10 44 21 c6 4c RSP: 0018:a742800ecdd0 EFLAGS: 00010207 RAX: 0019 RBX: 000b RCX: 0006 RDX: 0ffe RSI: 0004 RDI: 8fc6ad776800 RBP: 8fc6855acb00 R08: 0001 R09: 05d9 R10: fffe R11: a742800ed01f R12: 0006 R13: 8fc68d75a000 R14: 05db R15: 0019 FS: () GS:8fc6aad0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: a742800ed01f CR3: 2480a000 CR4: 000406e0 Call Trace: brcms_c_dpc+0xb46/0x1020 [brcmsmac] ? wlc_intstatus+0xc8/0x180 [brcmsmac] ? __raise_softirq_irqoff+0x1a/0x80 brcms_dpc+0x37/0xd0 [brcmsmac] tasklet_action_common.constprop.0+0x51/0xb0 __do_softirq+0xff/0x340 ? handle_level_irq+0x1a0/0x1a0 asm_call_on_stack+0x12/0x20 do_softirq_own_stack+0x5f/0x80 irq_exit_rcu+0xcb/0x120 common_interrupt+0xd1/0x200 asm_common_interrupt+0x1e/0x40 RIP: 0010:cpuidle_enter_state+0xb3/0x420 Check if the block is within BA window and only then check block's status. Otherwise as Behan wrote: "When I came back to Dublin I was courtmartialed in my absence and sentenced to death in my absence, so I said they could shoot me in my absence." 
Also reported: https://bbs.archlinux.org/viewtopic.php?id=258428 https://lore.kernel.org/linux-wireless/87tuwgi92n@yujinakao.com/ Reported-by: Yuji Nakao Signed-off-by: Dmitry Safonov --- .../net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c index c9fb4b0cffaf..2631eb7569eb 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c @@ -942,14 +942,19 @@ brcms_c_ampdu_dotxstatus_complete(struct ampdu_info *ampdu, struct scb *scb, index = TX_SEQ_TO_INDEX(seq); ack_recd = false; if (ba_recd) { + int block_acked; + bindex = MODSUB_POW2(seq, start_seq, SEQNUM_MAX); + if (bindex < AMPDU_TX_BA_MAX_WSIZE) + block_acked = isset(bitmap, bindex); + else + block_acked = 0; brcms_dbg_ht(wlc->hw->d11core, "tid %d seq %d, start_seq %d, bindex %d set %d, index %d\n", tid, seq, start_seq, bindex, -isset(bitmap, bindex), index); +block_acked, index); /* if acked then clear bit and free packet */ - if ((bindex < AMPDU_TX_BA_MAX_WSIZE) - && isset(bitmap, bindex)) { + if (block_acked) { ini->txretry[index] = 0; /* -- 2.29.2
Re: [PATCH 00/19] Add generic user_landing tracking
On 11/8/20 7:07 PM, Andy Lutomirski wrote: > On Sat, Nov 7, 2020 at 9:17 PM Dmitry Safonov wrote: >> >> Started from discussion [1], where was noted that currently a couple of >> architectures support mremap() for vdso/sigpage, but not munmap(). >> If an application maps something on the ex-place of vdso/sigpage, >> later after processing signal it will land there (good luck!) >> >> Patches set is based on linux-next (next-20201106) and it depends on >> changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also >> on my changes in akpm (fixing several mremap() issues). >> >> Logically, the patches set divides on: >> - patch 1: cleanup for patches in x86/cleanups >> - patches 2-11: cleanups for arch_setup_additional_pages() > > I like these cleanups, although I think you should stop using terms > like "new-born". A task being exec'd is not newborn at all -- it's in > the middle of a transformation. Thank you for looking at them, Andy :-) Yeah, somehow I thought about new-execed process as a new-born binary. I'll try to improve changelogs in v2. Thanks, Dmitry
Re: [PATCH 14/19] mm: Add user_landing in mm_struct
On 11/8/20 7:04 PM, Andy Lutomirski wrote: > On Sat, Nov 7, 2020 at 9:18 PM Dmitry Safonov wrote: >> >> Instead of having every architecture to define vdso_base/vdso_addr etc, >> provide a generic mechanism to track landing in userspace. >> It'll minimize per-architecture difference, the number of callbacks to >> provide. >> >> Originally, it started from thread [1] where the need for .close() >> callback on vm_special_mapping was pointed, this generic code besides >> removing duplicated .mremap() callbacks provides a cheaper way to >> support munmap() on vdso mappings without introducing .close() callbacks >> for every architecture (with would bring even more code duplication). > > I find the naming odd. It's called "user_landing", which is > presumably a hard-to-understand shorthand for "user mode landing pad > for return from a signal handler if SA_RESTORER is not set". But, > looking at the actual code, it's not this at all -- it's just the vDSO > base address. Agree. Originally, I tried to track the actual landing address on the vdso, but .mremap() seemed simpler when tracking the vma base. > So how about just calling it vdso_base? I'm very much in favor of > consolidating and cleaning up, and improving the vdso remap/unmap > code, but I'm not convinced that we should call it anything other than > the vdso base. Sure. Thanks, Dmitry
Re: [PATCH 12/19] x86/signal: Land on >retcode when vdso isn't mapped
On 11/8/20 7:06 PM, Andy Lutomirski wrote: > On Sat, Nov 7, 2020 at 9:17 PM Dmitry Safonov wrote: >> >> Since commit 9fbbd4dd17d0 ("x86: Don't require the vDSO for handling >> a.out signals") after processing 32-bit signal if there is no vdso >> mapped frame->retcode is used as a landing. >> Do the same for rt ia32 signals. > > Am I reading correctly that this makes the ia32 compat signals match > the native ia32 case? Yes, probably I should have added it to the changelog. Thanks, Dmitry
[PATCH 06/19] elf/vdso: Reuse arch_setup_additional_pages() parameters
The reason to do it is that many architectures have the vDSO pointer saved in their mm->context with the only purpose to use it later in ARCH_DLINFO.
unsigned long *sysinfo_ehdr); extern unsigned int vdso_total_pages; #else /* CONFIG_VDSO */ -static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d0220da1d1b1..0e90cba8ac7a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -389,7 +389,7 @@ static const struct vm_special_mapping sigpage_mapping = { .mremap = sigpage_mremap, }; -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -430,7 +430,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * to be fatal to the process, so no error check needed * here. */ - arm_install_vdso(mm, addr + PAGE_SIZE); + arm_install_vdso(mm, addr + PAGE_SIZE, sysinfo_ehdr); up_fail: mmap_write_unlock(mm); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 3408269d19c7..710e5ca99a53 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -233,7 +233,8 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) } /* assumes mmap_lock is write-locked */ -void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { struct vm_area_struct *vma; unsigned long len; @@ -254,7 +255,10 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, _text_mapping); - if (!IS_ERR(vma)) - mm->context.vdso = addr; + if (IS_ERR(vma)) + return; + + mm->context.vdso = addr; + *sysinfo_ehdr = addr; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 1b710deb84d6..666338724a07 100644 --- a/arch/arm64/kernel/vdso.c +++ 
b/arch/arm64/kernel/vdso.c @@ -213,8 +213,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, - struct linux_binprm *bprm, - int uses_interp) +
[PATCH 19/19] mips/vdso: Migrate to user_landing
A generic way to track the landing vma area.
signal_setup_done(ret, ksig, 0); } diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index a4a321252df6..5523ba25ab3d 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -183,7 +183,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out; } - mm->context.vdso = (void *)vdso_addr; + mm->user_landing = (void __user *)vdso_addr; *sysinfo_ehdr = vdso_addr; ret = 0; diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 0303d30cde03..8f581a2c8578 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,6 @@ int main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static void vdso_mremap(\n"); - fprintf(out_file, " const struct vm_special_mapping *sm,\n"); - fprintf(out_file, " struct vm_area_struct *new_vma)\n"); - fprintf(out_file, "{\n"); - fprintf(out_file, " current->mm->context.vdso =\n"); - fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ fprintf(out_file, @@ -290,7 +283,6 @@ int main(int argc, char **argv) fprintf(out_file, "\t.mapping = {\n"); fprintf(out_file, "\t\t.name = \"[vdso]\",\n"); fprintf(out_file, "\t\t.pages = vdso_pages,\n"); - fprintf(out_file, "\t\t.mremap = vdso_mremap,\n"); fprintf(out_file, "\t},\n"); /* Calculate and write symbol offsets to */ -- 2.28.0
[PATCH 17/19] arm64/vdso: Migrate compat signals to user_landing
A generic way to track the landing vma area.
compat_sigframe __user *sf, @@ -425,7 +430,8 @@ int compat_setup_rt_frame(int usig, struct ksignal *ksig, err |= compat_setup_sigframe(>sig, regs, set); if (err == 0) { - compat_setup_return(regs, >ka, frame->sig.retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->sig.retcode, frame, usig); regs->regs[1] = (compat_ulong_t)(unsigned long)>info; regs->regs[2] = (compat_ulong_t)(unsigned long)>sig.uc; } @@ -448,7 +454,8 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= compat_setup_sigframe(frame, regs, set); if (err == 0) - compat_setup_return(regs, >ka, frame->retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->retcode, frame, usig); return err; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 91c1b7c716b7..08e8f1d56d92 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -394,7 +394,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) if (IS_ERR(ret)) goto out; - mm->context.sigpage = (void *)addr; + mm->user_landing = (void __user *)addr; out: return PTR_ERR_OR_ZERO(ret); -- 2.28.0
[PATCH 14/19] mm: Add user_landing in mm_struct
for every architecture (which would bring even more code duplication).
vma->vm_mm; diff --git a/include/linux/mm.h b/include/linux/mm.h index 427911d2c83e..4b0f97a289b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -559,7 +559,8 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b035caff6abe..f888257e973a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -488,6 +488,16 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; +#ifdef CONFIG_ARCH_HAS_USER_LANDING + /* +* Address of special mapping VMA to land after processing +* a signal. Reads are unprotected: if a thread unmaps or +* mremaps the mapping while another thread is processing +* a signal, it can segfault while landing. 
+*/ + void __user *user_landing; +#endif +#define UNMAPPED_USER_LANDING TASK_SIZE_MAX unsigned long flags; /* Must use atomic bitops to access */ diff --git a/mm/Kconfig b/mm/Kconfig index 01b0ae0cd9d3..d43b61a21be8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -883,4 +883,7 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config ARCH_HAS_USER_LANDING + bool + endmenu diff --git a/mm/mmap.c b/mm/mmap.c index 2376f3972f13..8a17ffdedacb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3410,11 +3410,25 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) static vm_fault_t special_mapping_fault(struct vm_fault *vmf); +static void update_user_landing(struct vm_area_struct *old_vma, + unsigned long new_addr) +{ +#ifdef CONFIG_ARCH_HAS_USER_LANDING + struct mm_struct *mm = old_vma->vm_mm; + + if (WARN_ON_ONCE(!mm)) + return; + if (old_vma->vm_start == (unsigned long)mm->user_landing) + mm->user_landing = (void __user *)new_addr; +#endif +} + /* * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct v
[PATCH 11/19] mm/mmap: Make vm_special_mapping::mremap return void
Previously .mremap() callback needed (int) return to provide way to restrict resizing of a special mapping. Now it's restricted by providing .may_split = special_mapping_split. Removing (int) return simplifies further changes to special_mapping_mremap() as it won't need save ret code from the callback. Also, it removes needless `return 0` from callbacks. Signed-off-by: Dmitry Safonov --- arch/arm/kernel/process.c | 3 +-- arch/arm64/kernel/vdso.c | 4 +--- arch/mips/vdso/genvdso.c | 3 +-- arch/x86/entry/vdso/vma.c | 4 +--- include/linux/mm_types.h | 2 +- mm/mmap.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e90cba8ac7a..5f4eced738f5 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,11 +376,10 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static int sigpage_mremap(const struct vm_special_mapping *sm, +static void sigpage_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.sigpage = new_vma->vm_start; - return 0; } static const struct vm_special_mapping sigpage_mapping = { diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 666338724a07..91c1b7c716b7 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,10 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.vdso = (void *)new_vma->vm_start; - - return 0; } static int __vdso_init(enum vdso_abi abi) diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 09e30eb4be86..0303d30cde03 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,12 @@ int 
main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static int vdso_mremap(\n"); + fprintf(out_file, "static void vdso_mremap(\n"); fprintf(out_file, " const struct vm_special_mapping *sm,\n"); fprintf(out_file, " struct vm_area_struct *new_vma)\n"); fprintf(out_file, "{\n"); fprintf(out_file, " current->mm->context.vdso =\n"); fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, " return 0;\n"); fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b9020742e66..65780a0164e3 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -86,15 +86,13 @@ static void vdso_fix_landing(const struct vdso_image *image, #endif } -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; - - return 0; } #ifdef CONFIG_TIME_NS diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a6b078b9d6a..b035caff6abe 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -772,7 +772,7 @@ struct vm_special_mapping { struct vm_area_struct *vma, struct vm_fault *vmf); - int (*mremap)(const struct vm_special_mapping *sm, + void (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); }; diff --git a/mm/mmap.c b/mm/mmap.c index 61f72b09d990..2376f3972f13 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3434,7 +3434,7 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma, return -EFAULT; if (sm->mremap) - return sm->mremap(sm, new_vma); + sm->mremap(sm, new_vma); return 0; } -- 2.28.0
[PATCH 16/19] arm/vdso: Migrate to user_landing
Generic way to track the land vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position. Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig | 1 + arch/arm/kernel/process.c | 9 + arch/arm/kernel/signal.c | 6 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index bece41f3b3b9..c161d7313911 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,6 +24,7 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_USER_LANDING select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 5f4eced738f5..ac08241e5cf8 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,16 +376,9 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static void sigpage_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.sigpage = new_vma->vm_start; -} - static const struct vm_special_mapping sigpage_mapping = { .name = "[sigpage]", .pages = _page, - .mremap = sigpage_mremap, }; int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) @@ -423,7 +416,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto up_fail; } - mm->context.sigpage = addr; + mm->user_landing = (void __user *)addr; /* Unlike the sigpage, failure to install the vdso is unlikely * to be fatal to the process, so no error check needed diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 9d2e916121be..270b17a9dc0f 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -451,13 +451,17 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig, #ifdef CONFIG_MMU if (cpsr & MODE32_BIT) { struct 
mm_struct *mm = current->mm; + unsigned long land = (unsigned long)mm->user_landing; + + if (land == UNMAPPED_USER_LANDING) + return 1; /* * 32-bit code can use the signal return page * except when the MPU has protected the vectors * page from PL0 */ - retcode = mm->context.sigpage + signal_return_offset + + retcode = land + signal_return_offset + (idx << 2) + thumb; } else #endif -- 2.28.0
[PATCH 07/19] elf: Use sysinfo_ehdr in ARCH_DLINFO()
Instead mm->context.vdso use the pointer provided by elf loader. That allows to drop the pointer on arm/s390/sparc. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/alpha/include/asm/elf.h| 2 +- arch/arm/include/asm/elf.h | 5 ++--- arch/arm64/include/asm/elf.h| 18 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/include/asm/elf.h | 5 ++--- arch/nds32/include/asm/elf.h| 5 ++--- arch/powerpc/include/asm/elf.h | 4 ++-- arch/riscv/include/asm/elf.h| 5 ++--- arch/s390/include/asm/elf.h | 5 ++--- arch/sh/include/asm/elf.h | 10 +- arch/sparc/include/asm/elf_64.h | 5 ++--- arch/x86/include/asm/elf.h | 33 ++--- arch/x86/um/asm/elf.h | 4 ++-- fs/binfmt_elf.c | 6 +++--- fs/binfmt_elf_fdpic.c | 11 ++- 15 files changed, 51 insertions(+), 69 deletions(-) diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h index 8049997fa372..701e820f28f0 100644 --- a/arch/alpha/include/asm/elf.h +++ b/arch/alpha/include/asm/elf.h @@ -155,7 +155,7 @@ extern int alpha_l2_cacheshape; extern int alpha_l3_cacheshape; /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ NEW_AUX_ENT(AT_L1I_CACHESHAPE, alpha_l1i_cacheshape); \ NEW_AUX_ENT(AT_L1D_CACHESHAPE, alpha_l1d_cacheshape); \ diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 1f4b91a17a91..7bb07056242f 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -133,10 +133,9 @@ extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) #ifdef CONFIG_VDSO -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #endif diff --git a/arch/arm64/include/asm/elf.h 
b/arch/arm64/include/asm/elf.h index a81953bcc1cf..e62818967a69 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -165,10 +165,9 @@ typedef struct user_fpsimd_state elf_fpregset_t; }) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ \ /* \ * Should always be nonzero unless there's a kernel bug.\ @@ -223,19 +222,12 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; set_thread_flag(TIF_32BIT); \ }) #ifdef CONFIG_COMPAT_VDSO -#define COMPAT_ARCH_DLINFO \ +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) \ do { \ - /* \ -* Note that we use Elf64_Off instead of elf_addr_t because \ -* elf_addr_t in compat is defined as Elf32_Addr and casting\ -* current->mm->context.vdso to it triggers a cast warning of \ -* cast from pointer to integer of different size. \ -*/ \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (Elf64_Off)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #else -#define COMPAT_ARCH_DLINFO +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) #endif #endif /* CONFIG_COMPAT */ diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index 6629301a2620..a257e5abddce 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -208,7 +208,7 @@ struct task_struct; #define GATE_EHD
[PATCH 09/19] s390/vdso: Remove vdso_base pointer from mm->context
Not used any more. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/s390/include/asm/mmu.h | 1 - arch/s390/kernel/vdso.c | 10 -- 2 files changed, 11 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index e12ff0f29d1a..095d0596f700 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -15,7 +15,6 @@ typedef struct { unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; - unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ atomic_t is_protected; /* diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 810b72f8985c..3f07711a07c1 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -58,18 +58,9 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, return 0; } -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *vma) -{ - current->mm->context.vdso_base = vma->vm_start; - - return 0; -} - static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, - .mremap = vdso_mremap, }; static int __init vdso_setup(char *str) @@ -204,7 +195,6 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out_up; } - current->mm->context.vdso_base = vdso_base; *sysinfo_ehdr = vdso_base; rc = 0; -- 2.28.0
[PATCH 12/19] x86/signal: Land on &frame->retcode when vdso isn't mapped
Since commit 9fbbd4dd17d0 ("x86: Don't require the vDSO for handling a.out signals") after processing 32-bit signal if there is no vdso mapped frame->retcode is used as a landing. Do the same for rt ia32 signals. This shouldn't be mistaken for encouragement for running binaries with executable stack, rather something to do in hopefully very rare situation with disabled or unmapped vdso and absent SA_RESTORER. For non-executable stack it'll segfault on attempt to land, rather than land on a random address where vdso was previously mapped. For programs with executable stack it'll just do the same for rt signals as for non-rt. Discouraging users to run with executable stack is done separately in commit 47a2ebb7f505 ("execve: warn if process starts with executable stack"). Signed-off-by: Dmitry Safonov --- arch/x86/ia32/ia32_signal.c | 12 +++- arch/x86/kernel/signal.c| 23 ++- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 81cf22398cd1..ea3db15b57bf 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -270,8 +270,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, unsafe_put_user(set->sig[1], >extramask[0], Efault); unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* These are actually not used anymore, but left because some -* gdb versions depend on them as a marker. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); user_access_end(); @@ -336,14 +336,16 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* Not actually used anymore, but left because some gdb -* versions need it. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. */ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); unsafe_put_sigcontext32(>uc.uc_mcontext, fp, regs, set, Efault); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..372ec09dc4ac 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -317,23 +317,20 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(sig, >sig, Efault); unsafe_put_sigcontext(>sc, fp, regs, set, Efault); unsafe_put_user(set->sig[1], >extramask[0], Efault); - if (current->mm->context.vdso) + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else restorer = >retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ unsafe_put_user(restorer, >pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 -* -* WE DO NOT USE IT ANY MORE! It's only left here for historical -* reasons and because gdb uses it as a signature to notice -* signal handler stack frames. +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 *)frame->retcode, Efault); user_access_end(); @@ -382,18 +379,18 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, unsafe_save_altstack(>uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(restorer, >pretcode, Efault); /* * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 -* -*
[PATCH 15/19] x86/vdso: Migrate to user_landing
Generic way to track the land vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position. Signed-off-by: Dmitry Safonov --- arch/x86/Kconfig| 1 + arch/x86/entry/common.c | 3 ++- arch/x86/entry/vdso/vma.c | 9 - arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/mmu.h | 1 - arch/x86/include/asm/vdso.h | 2 +- arch/x86/kernel/signal.c| 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b068f949d2e4..2449844ef0e9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -84,6 +84,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_USER_LANDING select ARCH_HAS_DEBUG_WX select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1be1bdbe55d4..1a3f795af331 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -151,9 +151,10 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. */ - landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->user_landing + vdso_image_32.sym_int80_landing_pad; + /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. 
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 65780a0164e3..2c426997a7a4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -77,7 +77,7 @@ static void vdso_fix_landing(const struct vdso_image *image, struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; unsigned long old_land_addr = vdso_land + - (unsigned long)current->mm->context.vdso; + (unsigned long)current->mm->user_landing; /* Fixing userspace landing - look at do_fast_syscall_32 */ if (regs->ip == old_land_addr) @@ -92,7 +92,6 @@ static void vdso_mremap(const struct vm_special_mapping *sm, const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); - current->mm->context.vdso = (void __user *)new_vma->vm_start; } #ifdef CONFIG_TIME_NS @@ -287,7 +286,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr, ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); } else { - current->mm->context.vdso = (void __user *)text_start; + current->mm->user_landing = (void __user *)text_start; current->mm->context.vdso_image = image; *sysinfo_ehdr = text_start; } @@ -362,8 +361,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may * not do accounting and rlimit right. -* We could search vma near context.vdso, but it's a slowpath, -* so let's explicitly check all VMAs to be completely sure. +* It's a slowpath, let's explicitly check all VMAs to be +* completely sure. 
*/ for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, _mapping) || diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f87ed1d53938..fb84c0fcdc4e 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -256,7 +256,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, } else { /* Return stub is in 32bit vsyscall page */ if (current_has_vdso_image_32()) - restorer = current->mm->context.vdso + + restorer = current->mm->user_landing + vdso_image_32.sym___kernel_sigreturn; else restorer = >retcode; @@ -337,7 +337,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; else if (current_has_vdso_image_32()) - restorer = current->mm->context.vdso + + restorer = current->mm->user_landing + vdso_image_32.sym___kernel_rt_sigreturn; else
[PATCH 10/19] sparc/vdso: Remove vdso pointer from mm->context
Not used any more. Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/sparc/include/asm/mmu_64.h | 1 - arch/sparc/vdso/vma.c | 5 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 7e2704c770e9..8e7892890d14 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -111,7 +111,6 @@ typedef struct { unsigned long thp_pte_count; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; - void*vdso; booladi; tag_storage_desc_t *tag_store; spinlock_t tag_lock; diff --git a/arch/sparc/vdso/vma.c b/arch/sparc/vdso/vma.c index bf9195fe9bcc..255e052223ca 100644 --- a/arch/sparc/vdso/vma.c +++ b/arch/sparc/vdso/vma.c @@ -389,7 +389,6 @@ static int map_vdso(const struct vdso_image *image, } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -418,9 +417,7 @@ static int map_vdso(const struct vdso_image *image, } up_fail: - if (ret) - current->mm->context.vdso = NULL; - else + if (!ret) *sysinfo_ehdr = text_start; mmap_write_unlock(mm); -- 2.28.0
[PATCH 08/19] arm/vdso: Remove vdso pointer from mm->context
Not used any more. Signed-off-by: Dmitry Safonov --- arch/arm/include/asm/mmu.h | 3 --- arch/arm/kernel/vdso.c | 12 2 files changed, 15 deletions(-) diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 1592a4264488..2397b0a19f59 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -12,9 +12,6 @@ typedef struct { #endif unsigned intvmalloc_seq; unsigned long sigpage; -#ifdef CONFIG_VDSO - unsigned long vdso; -#endif #ifdef CONFIG_BINFMT_ELF_FDPIC unsigned long exec_fdpic_loadmap; unsigned long interp_fdpic_loadmap; diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 710e5ca99a53..4b39c7d8f525 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -47,17 +47,8 @@ static const struct vm_special_mapping vdso_data_mapping = { .pages = _data_page, }; -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = new_vma->vm_start; - - return 0; -} - static struct vm_special_mapping vdso_text_mapping __ro_after_init = { .name = "[vdso]", - .mremap = vdso_mremap, }; struct elfinfo { @@ -239,8 +230,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; unsigned long len; - mm->context.vdso = 0; - if (vdso_text_pagelist == NULL) return; @@ -258,7 +247,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, if (IS_ERR(vma)) return; - mm->context.vdso = addr; *sysinfo_ehdr = addr; } -- 2.28.0
[PATCH 18/19] arm64/vdso: Migrate native signals to user_landing
Generic way to track the land vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position. Signed-off-by: Dmitry Safonov --- arch/arm64/kernel/signal.c | 10 +++--- arch/arm64/kernel/vdso.c | 13 +++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bec6ef69704f..4c1dfbc1aed3 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -723,9 +723,10 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +static int setup_return(struct pt_regs *regs, struct k_sigaction *ka, struct rt_sigframe_user_layout *user, int usig) { + unsigned long land = (unsigned long)current->mm->user_landing; __sigrestore_t sigtramp; regs->regs[0] = usig; @@ -754,10 +755,13 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; + else if (land != UNMAPPED_USER_LANDING) + sigtramp = VDSO_SYMBOL(land, sigtramp); else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + return 1; regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -780,7 +784,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(>uc.uc_stack, regs->sp); err |= setup_sigframe(, regs, set); if (err == 0) { - setup_return(regs, >ka, , usig); + err = setup_return(regs, >ka, , usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(>info, >info); regs->regs[1] = (unsigned long)>info; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 08e8f1d56d92..d710fcd7141c 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,6 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; 
-static void vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = (void *)new_vma->vm_start; -} - static int __vdso_init(enum vdso_abi abi) { int i; @@ -239,7 +233,6 @@ static int __setup_additional_pages(enum vdso_abi abi, gp_flags = VM_ARM64_BTI; vdso_base += VVAR_NR_PAGES * PAGE_SIZE; - mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, @@ -247,12 +240,14 @@ static int __setup_additional_pages(enum vdso_abi abi, if (IS_ERR(ret)) goto up_fail; + /* 32-bit ABI is to land on sigpage, 64-bit on vdso */ + if (abi == VDSO_ABI_AA64) + mm->user_landing = (void __user *)vdso_base; *sysinfo_ehdr = vdso_base; return 0; up_fail: - mm->context.vdso = NULL; return PTR_ERR(ret); } @@ -285,7 +280,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { }, [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; @@ -431,7 +425,6 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { }, [AA64_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; -- 2.28.0
[PATCH 13/19] x86/signal: Check if vdso_image_32 is mapped before trying to land on it
Provide current_has_vdso_image_32() helper and check it apriory landing attempt on vdso vma. The helper is a macro, not a static inline funciton to avoid linux/sched/task_stack.h inclusion in asm/vdso.h. Signed-off-by: Dmitry Safonov --- arch/x86/entry/common.c | 7 ++- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/vdso.h | 4 arch/x86/kernel/signal.c| 4 ++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..1be1bdbe55d4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -142,11 +142,16 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { + unsigned long landing_pad; + + if (!current_has_vdso_image_32()) + force_sigsegv(SIGSEGV); + /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. 
*/ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; /* diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index ea3db15b57bf..f87ed1d53938 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -255,7 +255,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; } else { /* Return stub is in 32bit vsyscall page */ - if (current->mm->context.vdso) + if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -336,7 +336,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bbcdc7b8f963..a19e0a7bae2d 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -39,6 +39,10 @@ extern const struct vdso_image vdso_image_x32; #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const struct vdso_image vdso_image_32; + +#define current_has_vdso_image_32()\ + likely(current->mm->context.vdso_image == _image_32 && \ + !!current->mm->context.vdso) #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 372ec09dc4ac..6fed2e523e0a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -319,7 +319,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(set->sig[1], >extramask[0], Efault); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = 
current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -381,7 +381,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else -- 2.28.0
[PATCH 02/19] elf: Move arch_setup_additional_pages() to generic elf.h
Ifdef the function in the header, not in the code. Following kernel style, move it to Kconfig. All it makes it easier to follow when the option is enabled/disabled. Remove re-definition from compat_binfmt_elf, as it's always defined under compat_arch_setup_additional_pages (to be reworked). Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig| 1 + arch/arm/include/asm/elf.h | 5 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/elf.h| 6 +- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 arch/hexagon/Kconfig| 1 + arch/hexagon/include/asm/elf.h | 6 -- arch/mips/Kconfig | 1 + arch/mips/include/asm/elf.h | 5 - arch/nds32/Kconfig | 1 + arch/nds32/include/asm/elf.h| 3 --- arch/nios2/Kconfig | 1 + arch/nios2/include/asm/elf.h| 4 arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/elf.h | 5 - arch/riscv/Kconfig | 1 + arch/riscv/include/asm/elf.h| 5 - arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 5 - arch/sh/Kconfig | 1 + arch/sh/include/asm/elf.h | 6 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/elf_64.h | 6 -- arch/x86/Kconfig| 1 + arch/x86/include/asm/elf.h | 4 arch/x86/um/asm/elf.h | 5 - fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 2 -- fs/binfmt_elf_fdpic.c | 3 +-- fs/compat_binfmt_elf.c | 2 -- include/linux/elf.h | 12 32 files changed, 30 insertions(+), 74 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fb700e471332..bece41f3b3b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -15,6 +15,7 @@ config ARM select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_PHYS_TO_DMA + select ARCH_HAS_SETUP_ADDITIONAL_PAGES if MMU select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 61941f369861..1f4b91a17a91 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -132,7 +132,6 @@ extern int arm_elf_read_implies_exec(int); extern void 
elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) -#ifdef CONFIG_MMU #ifdef CONFIG_VDSO #define ARCH_DLINFO\ do { \ @@ -140,9 +139,5 @@ do { \ (elf_addr_t)current->mm->context.vdso); \ } while (0) #endif -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -struct linux_binprm; -int arch_setup_additional_pages(struct linux_binprm *, int); -#endif #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d1b81221c6a9..7111cf335ede 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -26,6 +26,7 @@ config ARM64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 8d1c8dcb87fd..d1073ffa7f24 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -181,11 +181,6 @@ do { \ NEW_AUX_ENT(AT_IGNORE, 0); \ } while (0) -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES -struct linux_binprm; -extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); - /* 1GB of VA */ #ifdef CONFIG_COMPAT #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ @@ -242,6 +237,7 @@ do { \ #else #define COMPAT_ARCH_DLINFO #endif +struct linux_binprm; extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages \ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 8fec85ab5da6..00e7b063f6ca 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -4,6 +4,7 @@ config CSKY select ARCH_32BIT_OFF_T select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEV
[PATCH 03/19] arm64: Use is_compat_task() in arch_setup_additional_pages()
Instead of providing compat_arch_setup_additional_pages(), check if the task is compatible from personality, which is set earlier in load_elf_binary(). That will align code with powerpc and sparc, also it'll allow to completely remove compat_arch_setup_additional_pages() macro after doing the same for x86, simplifying the binfmt code in the end. Cc: linux-arm-ker...@lists.infradead.org Signed-off-by: Dmitry Safonov --- arch/arm64/include/asm/elf.h | 5 - arch/arm64/kernel/vdso.c | 21 ++--- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index d1073ffa7f24..a81953bcc1cf 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -237,11 +237,6 @@ do { \ #else #define COMPAT_ARCH_DLINFO #endif -struct linux_binprm; -extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages \ - aarch32_setup_additional_pages #endif /* CONFIG_COMPAT */ diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index cee5d04ea9ad..1b710deb84d6 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -401,29 +401,24 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) return PTR_ERR_OR_ZERO(ret); } -int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int aarch32_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) { struct mm_struct *mm = current->mm; int ret; - if (mmap_write_lock_killable(mm)) - return -EINTR; - ret = aarch32_kuser_helpers_setup(mm); if (ret) - goto out; + return ret; if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, uses_interp); if (ret) - goto out; + return ret; } - ret = aarch32_sigreturn_setup(mm); -out: - mmap_write_unlock(mm); - return ret; + return aarch32_sigreturn_setup(mm); } #endif /* CONFIG_COMPAT */ @@ -460,7 +455,11 @@ int arch_setup_additional_pages(struct
linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + if (is_compat_task()) + ret = aarch32_setup_additional_pages(bprm, uses_interp); + else + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); return ret; -- 2.28.0
[PATCH 05/19] elf: Remove compat_arch_setup_additional_pages()
Now that all users rely on detecting bitness of new-born task checking personality, remove compat_arch_setup_additional_pages() macro, simplifying the code. Signed-off-by: Dmitry Safonov --- fs/compat_binfmt_elf.c | 5 - 1 file changed, 5 deletions(-) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 3606dd3a32f5..da8ee4d6e451 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,11 +115,6 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#undef arch_setup_additional_pages -#definearch_setup_additional_pages compat_arch_setup_additional_pages -#endif - #ifdef compat_elf_read_implies_exec #undef elf_read_implies_exec #defineelf_read_implies_exec compat_elf_read_implies_exec -- 2.28.0
[PATCH 00/19] Add generic user_landing tracking
Started from discussion [1], where was noted that currently a couple of architectures support mremap() for vdso/sigpage, but not munmap(). If an application maps something on the ex-place of vdso/sigpage, later after processing signal it will land there (good luck!) Patches set is based on linux-next (next-20201106) and it depends on changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also on my changes in akpm (fixing several mremap() issues). Logically, the patches set divides on: - patch 1: cleanup for patches in x86/cleanups - patches 2-11: cleanups for arch_setup_additional_pages() - patches 12-13: x86 signal changes for unmapped vdso - patches 14-19: provide generic user_landing in mm_struct In the end, besides cleanups, it's now more predictable what happens for applications with unmapped vdso on architectures those support .mremap() for vdso/sigpage. I'm aware of only one user that unmaps vdso - Valgrind [2]. (there possibly are more, but this one is "special", it unmaps vdso, but not vvar, which confuses CRIU [Checkpoint Restore In Userspace], that's why I'm aware of it) Patches as a .git branch: https://github.com/0x7f454c46/linux/tree/setup_additional_pages [1]: https://lore.kernel.org/linux-arch/cajwjo6zanqykshbq+3b+fi_vt80mtrzev5yreqawx-l8j8x...@mail.gmail.com/ [2]: https://github.com/checkpoint-restore/criu/issues/488 Cc: Alexander Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Guo Ren Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Will Deacon Cc: x...@kernel.org Dmitry Safonov (19): x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages() elf: Move arch_setup_additional_pages() to generic elf.h arm64: Use in_compat_task() in arch_setup_additional_pages() x86: Remove compat_arch_setup_additional_pages() elf: Remove compat_arch_setup_additional_pages() elf/vdso: Reuse arch_setup_additional_pages() parameters elf: Use sysinfo_ehdr in ARCH_DLINFO() arm/vdso: Remove vdso pointer from mm->context s390/vdso: Remove vdso_base pointer from mm->context sparc/vdso: Remove vdso pointer from mm->context mm/mmap: Make vm_special_mapping::mremap return void x86/signal: Land on >retcode when vdso isn't mapped x86/signal: Check if vdso_image_32 is mapped before trying to land on it mm: Add user_landing in mm_struct x86/vdso: Migrate to user_landing arm/vdso: Migrate to user_landing arm64/vdso: Migrate compat signals to user_landing arm64/vdso: Migrate native signals to user_landing mips/vdso: Migrate to user_landing arch/alpha/include/asm/elf.h | 2 +- arch/arm/Kconfig | 2 + arch/arm/include/asm/elf.h| 10 +--- arch/arm/include/asm/mmu.h| 3 - arch/arm/include/asm/vdso.h | 6 +- arch/arm/kernel/process.c | 14 + arch/arm/kernel/signal.c | 6 +- arch/arm/kernel/vdso.c| 20 ++- arch/arm64/Kconfig| 2 + arch/arm64/include/asm/elf.h | 27 ++--- arch/arm64/kernel/signal.c| 10 +++- arch/arm64/kernel/signal32.c | 17 -- arch/arm64/kernel/vdso.c | 47 ++- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 -- arch/csky/kernel/vdso.c | 3 +- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/elf.h| 6 -- arch/hexagon/kernel/vdso.c| 3 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/Kconfig | 2 + arch/mips/include/asm/elf.h | 10 +--- arch/mips/kernel/signal.c | 11 ++-- arch/mips/kernel/vdso.c | 5 +- arch/mips/vdso/genvdso.c | 9 --- arch/nds32/Kconfig| 1 + 
arch/nds32/include/asm/elf.h | 8 +-- arch/nds32/kernel/vdso.c | 3 +- arch/nios2/Kconfig| 1 + arch/nios2/include/asm/elf.h | 4 -- arch/nios2/mm/init.c | 2 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/elf.h| 9 +-- arch/powerpc/kernel/vdso.c| 3 +- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/elf.h | 10 +--- arch/riscv/kernel/vdso.c | 9 +-- arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 10 +--- arch/s390/include/asm/mmu.h | 1 - arch/s390/kernel/vdso.c | 13 +--- arch/sh/Kconfig | 1 + arch/sh/include/asm/elf.h | 16 ++--- ar
[PATCH 04/19] x86: Remove compat_arch_setup_additional_pages()
The same as for x32 task, detect ia32 task by in_ia32_syscall(). It's valid as new-execed task is pretending to be in a syscall of relevant bitness/ABI, see the comment near in_32bit_syscall(). Removing compat_arch_setup_additional_pages() provides single point of entry - arch_setup_additional_pages(), makes ifdeffery easier to read, aligns the code with powerpc and sparc (mips also has single vdso setup function, but instead of taking bitness from mm.context, takes vdso image pointer there). Together with arm64 code align to use in_compat_task(), it makes possible to remove compat_arch_setup_additional_pages() macro re-definition from compat elf code (another redefined marco less). Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 41 +++--- arch/x86/include/asm/elf.h | 5 - 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4eea508e9b10..aace862ed9a1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -376,48 +376,49 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) } #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static int load_vdso32(void) +static int load_vdso_ia32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; return map_vdso(_image_32, 0); } +#else +static int load_vdso_ia32(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; +} #endif #ifdef CONFIG_X86_64 -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int load_vdso_64(void) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(_image_64); -} - -#ifdef CONFIG_COMPAT -int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) -{ #ifdef CONFIG_X86_X32_ABI - if (in_x32_syscall()) { - if (!vdso64_enabled) - return 0; + if (in_x32_syscall()) return map_vdso_randomized(_image_x32); - } #endif -#ifdef CONFIG_IA32_EMULATION - return load_vdso32(); + + 
return map_vdso_randomized(_image_64); +} #else - return 0; -#endif +static int load_vdso_64(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; } #endif -#else + int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return load_vdso32(); + if (in_ia32_syscall()) + return load_vdso_ia32(); + + return load_vdso_64(); } -#endif #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index d00b723eea2d..51a08f6b18e5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -377,11 +377,6 @@ else \ ((unsigned long)current->mm->context.vdso + \ vdso_image_32.sym___kernel_vsyscall) -struct linux_binprm; -extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), -- 2.28.0
[PATCH 01/19] x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages()
Partly revert commit 3316ec8ccd34 ("x86/elf: Use e_machine to check for x32/ia32 in setup_additional_pages()") and commit 9a29a671902c ("elf: Expose ELF header on arch_setup_additional_pages()". Both patches did a good thing: removed usage of TIF_X32, but with a price of additional macros ARCH_SETUP_ADDITIONAL_PAGES() and ifdeffs. Instead, use in_x32_syscall() - the first thing load_elf_binary() does after parsing and checking new ELF binary. It's done that early which also allows to use it in mmap() code straight away, which needs it to know which mmap_base to use (see arch_pick_mmap_layout()). Add comments that describe how it works. Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 4 ++-- arch/x86/include/asm/compat.h | 6 ++ arch/x86/include/asm/elf.h| 6 ++ fs/binfmt_elf.c | 10 +++--- fs/compat_binfmt_elf.c| 11 +++ include/linux/elf.h | 5 - 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 44de75b21fab..4eea508e9b10 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -396,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32) + int uses_interp) { #ifdef CONFIG_X86_X32_ABI - if (x32) { + if (in_x32_syscall()) { if (!vdso64_enabled) return 0; return map_vdso_randomized(_image_x32); diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index f145e3326c6d..4489bd60640b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -197,6 +197,12 @@ static inline bool in_x32_syscall(void) return false; } +/* + * Valid all time on the context of process that performs a syscall. + * Just born process has __X32_SYSCALL_BIT or TS_COMPAT set very + * early in load_binary() on setting personality and flags. + * See also set_personality_ia32(). 
+ */ static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 44a9b9940535..109697a19eb1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -383,10 +383,8 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32); -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter, \ - (ex->e_machine == EM_X86_64)) + int uses_interp); +#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages /* Do not change the values. See get_align_mask() */ enum align_flags { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ac0b5fc30ea6..3de72c0e0406 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -999,8 +999,12 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages - may depend on the personality. */ + /* +* Do this immediately, since STACK_TOP as used in setup_arg_pages +* may depend on the personality. At this moment we start +* pretending that we are in a context of compat syscall for +* compatible applications on x86, in_compat_syscall() starts working. 
+*/ SET_PERSONALITY2(*elf_ex, _state); if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -1246,7 +1250,7 @@ static int load_elf_binary(struct linux_binprm *bprm) set_binfmt(_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); + retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2c557229696a..12b991368f0a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,16 +115,11 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter) -#endif - -#ifdef COMPAT_ARCH_S
Re: [PATCH v3 10/10] x86: Reclaim TIF_IA32 and TIF_X32
On Sun, 4 Oct 2020 at 04:31, Gabriel Krisman Bertazi wrote: > > Now that these flags are no longer used, reclaim those TI bits. > > Signed-off-by: Gabriel Krisman Bertazi Oh wow! I've just started rebasing patches that do essentially the same on linux-next and found that it's already done and merged. Thanks for doing this! :-) -- Dmitry
Re: [PATCH] x86/mpx: fix recursive munmap() corruption
Hi Laurent, Christophe, Michael, all, On 11/3/20 5:11 PM, Laurent Dufour wrote: > Le 23/10/2020 à 14:28, Christophe Leroy a écrit : [..] That seems like it would work for CRIU and make sense in general? >>> >>> Sorry for the late answer, yes this would make more sense. >>> >>> Here is a patch doing that. >>> >> >> In your patch, the test seems overkill: >> >> + if ((start <= vdso_base && vdso_end <= end) || /* 1 */ >> + (vdso_base <= start && start < vdso_end) || /* 3,4 */ >> + (vdso_base < end && end <= vdso_end)) /* 2,3 */ >> + mm->context.vdso_base = mm->context.vdso_end = 0; >> >> What about >> >> if (start < vdso_end && vdso_start < end) >> mm->context.vdso_base = mm->context.vdso_end = 0; >> >> This should cover all cases, or am I missing something ? >> >> >> And do we really need to store vdso_end in the context ? >> I think it should be possible to re-calculate it: the size of the VDSO >> should be (_end - _start) + PAGE_SIZE for 32 bits VDSO, >> and (_end - _start) + PAGE_SIZE for the 64 bits VDSO. > > Thanks Christophe for the advise. > > That is covering all the cases, and indeed is similar to the Michael's > proposal I missed last year. > > I'll send a patch fixing this issue following your proposal. It's probably not necessary anymore. I've sent patches [1], currently in akpm, the last one forbids splitting of vm_special_mapping. So, a user is able munmap() or mremap() vdso as a whole, but not partly. [1]: https://lore.kernel.org/linux-mm/20201013013416.390574-1-d...@arista.com/ Thanks, Dmitry
[PATCH v2 0/3] xfrm/compat: syzbot-found fixes
v2: Added "Fixes" tags to the patches. WARN_ON() for XFRMA_UNSPEC translation which likely no-one except syzkaller uses; properly zerofy tail-padding for 64-bit attribute; don't use __GFP_ZERO as the memory is initialized during translation. Cc: Steffen Klassert Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Herbert Xu Cc: Hillf Danton Cc: net...@vger.kernel.org Thanks, Dmitry Dmitry Safonov (3): xfrm/compat: Translate by copying XFRMA_UNSPEC attribute xfrm/compat: memset(0) 64-bit padding at right place xfrm/compat: Don't allocate memory with __GFP_ZERO net/xfrm/xfrm_compat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) base-commit: 3cea11cd5e3b00d91caf0b4730194039b45c5891 -- 2.28.0
[PATCH v2 3/3] xfrm/compat: Don't allocate memory with __GFP_ZERO
32-bit to 64-bit messages translator zerofies needed paddings in the translation, the rest is the actual payload. Don't allocate zero pages as they are not needed. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 556e9f33b815..d8e8a11ca845 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -564,7 +564,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return NULL; len += NLMSG_HDRLEN; - h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + h64 = kvmalloc(len, GFP_KERNEL); if (!h64) return ERR_PTR(-ENOMEM); -- 2.28.0
[PATCH v2 2/3] xfrm/compat: memset(0) 64-bit padding at right place
32-bit messages translated by xfrm_compat can have attributes attached. For all, but XFRMA_SA, XFRMA_POLICY the size of payload is the same in 32-bit UABI and 64-bit UABI. For XFRMA_SA (struct xfrm_usersa_info) and XFRMA_POLICY (struct xfrm_userpolicy_info) it's only tail-padding that is present in 64-bit payload, but not in 32-bit. The proper size for destination nlattr is already calculated by xfrm_user_rcv_calculate_len64() and allocated with kvmalloc(). xfrm_attr_cpy32() copies 32-bit copy_len into 64-bit attribute translated payload, zero-filling possible padding for SA/POLICY. Due to a typo, *pos already has 64-bit payload size, in a result next memset(0) is called on the memory after the translated attribute, not on the tail-padding of it. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Reported-by: syzbot+c43831072e7df506a...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 17edbf935e35..556e9f33b815 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -388,7 +388,7 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, memcpy(nla, src, nla_attr_size(copy_len)); nla->nla_len = nla_attr_size(payload); - *pos += nla_attr_size(payload); + *pos += nla_attr_size(copy_len); nlmsg->nlmsg_len += nla->nla_len; memset(dst + *pos, 0, payload - copy_len); -- 2.28.0
[PATCH v2 1/3] xfrm/compat: Translate by copying XFRMA_UNSPEC attribute
xfrm_xlate32() translates 64-bit message provided by kernel to be sent for 32-bit listener (acknowledge or monitor). Translator code doesn't expect XFRMA_UNSPEC attribute as it doesn't know its payload. Kernel never attaches such attribute, but a user can. I've searched if any opensource does it and the answer is no. Nothing on github and google finds only tfcproject that has such code commented-out. What will happen if a user sends a netlink message with XFRMA_UNSPEC attribute? Ipsec code ignores this attribute. But if there is a monitor-process or 32-bit user requested ack - kernel will try to translate such message and will hit WARN_ONCE() in xfrm_xlate64_attr(). Deal with XFRMA_UNSPEC by copying the attribute payload with xfrm_nla_cpy(). In result, the default switch-case in xfrm_xlate64_attr() becomes an unused code. Leave those 3 lines in case a new xfrm attribute will be added. Fixes: 5461fc0c8d9f ("xfrm/compat: Add 64=>32-bit messages translator") Reported-by: syzbot+a7e701c8385bd8543...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index e28f0c9ecd6a..17edbf935e35 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -234,6 +234,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_PAD: /* Ignore */ return 0; + case XFRMA_UNSPEC: case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: -- 2.28.0
Re: [PATCH] xfrm/compat: Remove use of kmalloc_track_caller
On 11/1/20 10:08 PM, Alistair Delva wrote: > The __kmalloc_track_caller symbol is not exported if SLUB/SLOB are > enabled instead of SLAB, which breaks the build on such configs when > CONFIG_XFRM_USER_COMPAT=m. > > ERROR: "__kmalloc_track_caller" [net/xfrm/xfrm_compat.ko] undefined! > > Other users of this symbol are 'bool' options, but changing this to > bool would require XFRM_USER to be built in as well, which doesn't > seem worth it. Go back to kmalloc(). > > Fixes: 96392ee5a13b9 ("xfrm/compat: Translate 32-bit user_policy from > sockptr") > Cc: Dmitry Safonov <0x7f454...@gmail.com> > Cc: Maciej Żenczykowski > Cc: Steffen Klassert > Signed-off-by: Alistair Delva Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thank you! -- Dmitry
[PATCH 1/3] xfrm/compat: Translate by copying XFRMA_UNSPEC attribute
xfrm_xlate32() translates 64-bit message provided by kernel to be sent for 32-bit listener (acknowledge or monitor). Translator code doesn't expect XFRMA_UNSPEC attribute as it doesn't know its payload. Kernel never attaches such attribute, but a user can. I've searched if any opensource does it and the answer is no. Nothing on github and google finds only tfcproject that has such code commented-out. What will happen if a user sends a netlink message with XFRMA_UNSPEC attribute? Ipsec code ignores this attribute. But if there is a monitor-process or 32-bit user requested ack - kernel will try to translate such message and will hit WARN_ONCE() in xfrm_xlate64_attr(). Deal with XFRMA_UNSPEC by copying the attribute payload with xfrm_nla_cpy(). In result, the default switch-case in xfrm_xlate64_attr() becomes an unused code. Leave those 3 lines in case a new xfrm attribute will be added. Reported-by: syzbot+a7e701c8385bd8543...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index e28f0c9ecd6a..17edbf935e35 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -234,6 +234,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_PAD: /* Ignore */ return 0; + case XFRMA_UNSPEC: case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: -- 2.28.0
[PATCH 3/3] xfrm/compat: Don't allocate memory with __GFP_ZERO
32-bit to 64-bit messages translator zerofies needed paddings in the translation, the rest is the actual payload. Don't allocate zero pages as they are not needed. Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 556e9f33b815..d8e8a11ca845 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -564,7 +564,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return NULL; len += NLMSG_HDRLEN; - h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + h64 = kvmalloc(len, GFP_KERNEL); if (!h64) return ERR_PTR(-ENOMEM); -- 2.28.0
[PATCH 2/3] xfrm/compat: memset(0) 64-bit padding at right place
32-bit messages translated by xfrm_compat can have attributes attached. For all, but XFRMA_SA, XFRMA_POLICY the size of payload is the same in 32-bit UABI and 64-bit UABI. For XFRMA_SA (struct xfrm_usersa_info) and XFRMA_POLICY (struct xfrm_userpolicy_info) it's only tail-padding that is present in 64-bit payload, but not in 32-bit. The proper size for destination nlattr is already calculated by xfrm_user_rcv_calculate_len64() and allocated with kvmalloc(). xfrm_attr_cpy32() copies 32-bit copy_len into 64-bit attribute translated payload, zero-filling possible padding for SA/POLICY. Due to a typo, *pos already has 64-bit payload size, in a result next memset(0) is called on the memory after the translated attribute, not on the tail-padding of it. Reported-by: syzbot+c43831072e7df506a...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 17edbf935e35..556e9f33b815 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -388,7 +388,7 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, memcpy(nla, src, nla_attr_size(copy_len)); nla->nla_len = nla_attr_size(payload); - *pos += nla_attr_size(payload); + *pos += nla_attr_size(copy_len); nlmsg->nlmsg_len += nla->nla_len; memset(dst + *pos, 0, payload - copy_len); -- 2.28.0
[PATCH 0/3] xfrm/compat: syzbot-found fixes
WARN_ON() for XFRMA_UNSPEC translation which likely no-one except syzkaller uses; properly zerofy tail-padding for 64-bit attribute; don't use __GFP_ZERO as the memory is initialized during translation. Cc: Steffen Klassert Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Herbert Xu Cc: Hillf Danton Cc: net...@vger.kernel.org Thanks, Dmitry Dmitry Safonov (3): xfrm/compat: Translate by copying XFRMA_UNSPEC attribute xfrm/compat: memset(0) 64-bit padding at right place xfrm/compat: Don't allocate memory with __GFP_ZERO net/xfrm/xfrm_compat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) base-commit: 07e0887302450a62f51dba72df6afb5fabb23d1c -- 2.28.0
Re: [PATCH 2/4] arm64: hide more compat_vdso code
On 10/29/20 1:35 PM, Arnd Bergmann wrote: > On Mon, Oct 26, 2020 at 5:55 PM Mark Rutland wrote: >> On Mon, Oct 26, 2020 at 05:03:29PM +0100, Arnd Bergmann wrote: >>> From: Arnd Bergmann >>> >>> When CONFIG_COMPAT_VDSO is disabled, we get a warning >>> about a potential out-of-bounds access: >>> >>> arch/arm64/kernel/vdso.c: In function 'aarch32_vdso_mremap': >>> arch/arm64/kernel/vdso.c:86:37: warning: array subscript 1 is above array >>> bounds of 'struct vdso_abi_info[1]' [-Warray-bounds] >>>86 | unsigned long vdso_size = vdso_info[abi].vdso_code_end - >>> |~^ >>> >>> This is all in dead code however that the compiler is unable to >>> eliminate by itself. >>> >>> Change the array to individual local variables that can be >>> dropped in dead code elimination to let the compiler understand >>> this better. >>> >>> Fixes: 0cbc2659123e ("arm64: vdso32: Remove a bunch of #ifdef >>> CONFIG_COMPAT_VDSO guards") >>> Signed-off-by: Arnd Bergmann >> >> This looks like a nice cleanup to me! I agree we don't need the array >> here. >> >> Reviewed-by: Mark Rutland > > Thanks! > > I see the patch now conflicts with "mm: forbid splitting special mappings" > in -mm, by Dmitry Safonov. I have rebased my patch on top, should > I send it to Andrew for inclusion in -mm then? Makes sense to me. I plan to add some more patches on top that will make tracking of user landing (on vdso/sigpage/etc) common between architectures in generic code. So, I think it's probably good idea to keep it in one place, -mm tree seems like a proper place for it. Thanks, Dmitry
Re: WARNING in xfrm_alloc_compat
On 10/28/20 10:45 AM, Steffen Klassert wrote: > Same here, Dmitry please look into it. Looking on both, thanks! > I guess we can just remove the WARN_ON() that > triggeres here. Thanks, Dmitry
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
Hi Christophe, Will, On 10/23/20 12:57 PM, Christophe Leroy wrote: > > > Le 23/10/2020 à 13:25, Will Deacon a écrit : >> On Fri, Oct 23, 2020 at 01:22:04PM +0200, Christophe Leroy wrote: >>> Hi Dmitry, [..] >>> I haven't seen the patches, did you sent them out finally ? I was working on .close() hook, but while cooking it, I thought it may be better to make tracking of user landing generic. Note that the vdso base address is mostly needed by kernel as an address to land in userspace after processing a signal. I have some raw patches that add +#ifdef CONFIG_ARCH_HAS_USER_LANDING + struct vm_area_struct *user_landing; +#endif inside mm_struct and I plan to finish them after rc1 gets released. While working on that, I noticed that arm32 and some other architectures track vdso position in mm.context with the only reason to add AT_SYSINFO_EHDR in the elf header that's being loaded. That's quite overkill to have a pointer in mm.context that rather can be a local variable in elf binfmt loader. Also, I found some issues with mremap code. The patches series mentioned are at the base of the branch with generic user landing. I have sent only those patches not the full branch as I remember there was a policy that during merge window one should send only fixes, rather than refactoring/new code. >> I think it's this series: >> >> https://lore.kernel.org/r/20201013013416.390574-1-d...@arista.com >> >> but they look really invasive to me, so I may cook a small hack for arm64 >> in the meantine / for stable. I don't mind small hacks, but I'm concerned that the suggested fix which sets `mm->context.vdso_base = 0` on munmap() may have it's issue: that won't work if a user for whatever-broken-reason will mremap() vdso on 0 address. As the fix supposes to fix an issue that hasn't fired for anyone yet, it probably shouldn't introduce another. That's why I've used vm_area_struct to track vdso position in the patches set. 
Probably, temporarily, you could use something like: #define BAD_VDSO_ADDRESS (-1UL) Or a non-page-aligned address. But the signal code that checks if it can land on vdso/sigpage should also be aligned with the new definition. > Not sure we are talking about the same thing. > > I can't see any new .close function added to vm_special_mapping in order > to replace arch_unmap() hook. Thanks, Dmitry
Re: [PATCH 1/2 v2] futex: adjust a futex timeout with a per-timens offset
On 10/15/20 5:00 PM, Andrei Vagin wrote: > For all commands except FUTEX_WAIT, timeout is interpreted as an > absolute value. This absolute value is inside the task's time namespace > and has to be converted to the host's time. > > Cc: > Fixes: 5a590f35add9 ("posix-clocks: Wire up clock_gettime() with timens > offsets") > Reported-by: Hans van der Laan > Signed-off-by: Andrei Vagin Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> > --- > > v2: > * check FUTEX_CLOCK_REALTIME properly > * fix futex_time32 too > > kernel/futex.c | 5 + > 1 file changed, 5 insertions(+) > > diff --git a/kernel/futex.c b/kernel/futex.c > index a5876694a60e..32056d2d4171 100644 > --- a/kernel/futex.c > +++ b/kernel/futex.c > @@ -39,6 +39,7 @@ > #include > #include > #include > +#include > > #include > > @@ -3797,6 +3798,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, > u32, val, > t = timespec64_to_ktime(ts); > if (cmd == FUTEX_WAIT) > t = ktime_add_safe(ktime_get(), t); > + else if (!(op & FUTEX_CLOCK_REALTIME)) > + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); > tp = > } > /* > @@ -3989,6 +3992,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, > op, u32, val, > t = timespec64_to_ktime(ts); > if (cmd == FUTEX_WAIT) > t = ktime_add_safe(ktime_get(), t); > + else if (!(op & FUTEX_CLOCK_REALTIME)) > + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); > tp = > } > if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || > Thanks, Dmitry
Re: [PATCH 1/2] futex: adjust a futex timeout with a per-timens offset
On 10/15/20 8:29 AM, Andrei Vagin wrote: > For all commands except FUTEX_WAIT, timeout is interpreted as an > absolute value. This absolute value is inside the task's time namespace > and has to be converted to the host's time. > > Cc: > Fixes: 5a590f35add9 ("posix-clocks: Wire up clock_gettime() with timens > offsets") > Reported-by: Hans van der Laan > Signed-off-by: Andrei Vagin [..] > @@ -3797,6 +3798,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, > u32, val, > t = timespec64_to_ktime(ts); > if (cmd == FUTEX_WAIT) > t = ktime_add_safe(ktime_get(), t); > + else if (!(cmd & FUTEX_CLOCK_REALTIME)) > + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); Err, it probably should be : else if (!(op & FUTEX_CLOCK_REALTIME)) And there's also : SYSCALL_DEFINE6(futex_time32, ...) which wants to be patched. Thanks, Dmitry
[PATCH 5/6] mremap: Check if it's possible to split original vma
If original VMA can't be split at the desired address, do_munmap() will fail and leave both new-copied VMA and old VMA. De-facto it's MREMAP_DONTUNMAP behaviour, which is unexpected. Currently, it may fail such way for hugetlbfs and dax device mappings. Minimize such unpleasant situations to OOM by checking .may_split() before attempting to create a VMA copy. Signed-off-by: Dmitry Safonov --- mm/mremap.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 898e9818ba6d..3c4047c23673 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -343,7 +343,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; - int err; + int err = 0; bool need_rmap_locks; /* @@ -353,6 +353,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (vma->vm_ops && vma->vm_ops->may_split) { + if (vma->vm_start != old_addr) + err = vma->vm_ops->may_split(vma, old_addr); + if (!err && vma->vm_end != old_addr + old_len) + err = vma->vm_ops->may_split(vma, old_addr + old_len); + if (err) + return err; + } + /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new -- 2.28.0
[PATCH 0/6] mremap: move_vma() fixes
1 - seems to be historical issue on a rarely taken path 2,3 - fixes related to the new mremap() flag 5 - dax device/hugetlbfs possible issue 4,6 - refactoring As those seems to be actual issues, sending this during the merge-window. (Changes to architecture code are in the 6 patch, but Cc'ing maintainers on cover for the context, I hope it's fine). Cc: Alexander Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Mike Kravetz Cc: Minchan Kim Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Cc: linux-...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: linux...@kvack.org Dmitry Safonov (6): mm/mremap: Account memory on do_munmap() failure mm/mremap: For MREMAP_DONTUNMAP check security_vm_enough_memory_mm() mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio vm_ops: Rename .split() callback to .may_split() mremap: Check if it's possible to split original vma mm: Forbid splitting special mappings arch/arm/kernel/vdso.c| 9 arch/arm64/kernel/vdso.c | 41 ++- arch/mips/vdso/genvdso.c | 4 -- arch/s390/kernel/vdso.c | 11 + arch/x86/entry/vdso/vma.c | 17 arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- drivers/dax/device.c | 4 +- fs/aio.c | 5 ++- include/linux/mm.h| 5 ++- ipc/shm.c | 8 ++-- mm/hugetlb.c | 2 +- mm/mmap.c | 22 -- mm/mremap.c | 50 +++ 13 files changed, 63 insertions(+), 117 deletions(-) base-commit: bbf5c979011a099af5dc76498918ed7df445635b -- 2.28.0
[PATCH 1/6] mm/mremap: Account memory on do_munmap() failure
move_vma() copies VMA without adding it to account, then unmaps old part of VMA. On failure it unmaps the new VMA. With hacks accounting in munmap is disabled as it's a copy of existing VMA. Account the memory on munmap() failure which was previously copied into a new VMA. Fixes: commit e2ea83742133 ("[PATCH] mremap: move_vma fixes and cleanup") Signed-off-by: Dmitry Safonov --- mm/mremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 138abbae4f75..03d31a0d4c67 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -450,7 +450,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - vm_unacct_memory(excess >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT) + vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } -- 2.28.0
[PATCH 6/6] mm: Forbid splitting special mappings
Don't allow splitting of vm_special_mapping's. It affects vdso/vvar areas. Uprobes have only one page in xol_area so they aren't affected. Those restrictions were enforced by checks in .mremap() callbacks. Restrict resizing with generic .split() callback. Signed-off-by: Dmitry Safonov --- arch/arm/kernel/vdso.c| 9 - arch/arm64/kernel/vdso.c | 41 +++ arch/mips/vdso/genvdso.c | 4 arch/s390/kernel/vdso.c | 11 +-- arch/x86/entry/vdso/vma.c | 17 mm/mmap.c | 12 6 files changed, 16 insertions(+), 78 deletions(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index fddd08a6e063..3408269d19c7 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -50,15 +50,6 @@ static const struct vm_special_mapping vdso_data_mapping = { static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size; - - /* without VVAR page */ - vdso_size = (vdso_total_pages - 1) << PAGE_SHIFT; - - if (vdso_size != new_size) - return -EINVAL; - current->mm->context.vdso = new_vma->vm_start; return 0; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index d4202a32abc9..a1a4220a405b 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -82,17 +82,9 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int __vdso_remap(enum vdso_abi abi, - const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_info[abi].vdso_code_end - - vdso_info[abi].vdso_code_start; - - if (vdso_size != new_size) - return -EINVAL; - current->mm->context.vdso = (void *)new_vma->vm_start; return 0; @@ -223,17 +215,6 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return 
vmf_insert_pfn(vma, vmf->address, pfn); } -static int vvar_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - - if (new_size != VVAR_NR_PAGES * PAGE_SIZE) - return -EINVAL; - - return 0; -} - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -284,14 +265,6 @@ static int __setup_additional_pages(enum vdso_abi abi, /* * Create and map the vectors page for AArch32 tasks. */ -#ifdef CONFIG_COMPAT_VDSO -static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - return __vdso_remap(VDSO_ABI_AA32, sm, new_vma); -} -#endif /* CONFIG_COMPAT_VDSO */ - enum aarch32_map { AA32_MAP_VECTORS, /* kuser helpers */ #ifdef CONFIG_COMPAT_VDSO @@ -313,11 +286,10 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { [AA32_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, - .mremap = vvar_mremap, }, [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = aarch32_vdso_mremap, + .mremap = vdso_mremap, }, #endif /* CONFIG_COMPAT_VDSO */ [AA32_MAP_SIGPAGE] = { @@ -465,12 +437,6 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif /* CONFIG_COMPAT */ -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - return __vdso_remap(VDSO_ABI_AA64, sm, new_vma); -} - enum aarch64_map { AA64_MAP_VVAR, AA64_MAP_VDSO, @@ -480,7 +446,6 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, - .mremap = vvar_mremap, }, [AA64_MAP_VDSO] = { .name = "[vdso]", diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index abb06ae04b40..09e30eb4be86 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -263,10 +263,6 @@ int main(int argc, char **argv) fprintf(out_file, " const struct vm_special_mapping *sm,\n"); 
fprintf(out_file, " struct vm_area_struct *new_vma)\n"); fprintf(out_file, "{\n&qu
[PATCH 4/6] vm_ops: Rename .split() callback to .may_split()
Rename the callback to reflect that it's not called *on* or *after* split, but rather some time before the splitting to check if it's possible. Signed-off-by: Dmitry Safonov --- drivers/dax/device.c | 4 ++-- include/linux/mm.h | 3 ++- ipc/shm.c| 8 mm/hugetlb.c | 2 +- mm/mmap.c| 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1e89513f3c59..223dd1d13d5c 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -276,7 +276,7 @@ static vm_fault_t dev_dax_fault(struct vm_fault *vmf) return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } -static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) +static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; @@ -299,7 +299,7 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) static const struct vm_operations_struct dax_vm_ops = { .fault = dev_dax_fault, .huge_fault = dev_dax_huge_fault, - .split = dev_dax_split, + .may_split = dev_dax_may_split, .pagesize = dev_dax_pagesize, }; diff --git a/include/linux/mm.h b/include/linux/mm.h index fd51a4a1f722..90887661ef44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -549,7 +549,8 @@ enum page_entry_size { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); - int (*split)(struct vm_area_struct * area, unsigned long addr); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, diff --git a/ipc/shm.c b/ipc/shm.c index e25c7c6106bc..febd88daba8c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -434,13 +434,13 @@ static vm_fault_t shm_fault(struct vm_fault *vmf) return 
sfd->vm_ops->fault(vmf); } -static int shm_split(struct vm_area_struct *vma, unsigned long addr) +static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops->split) - return sfd->vm_ops->split(vma, addr); + if (sfd->vm_ops->may_split) + return sfd->vm_ops->may_split(vma, addr); return 0; } @@ -582,7 +582,7 @@ static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close,/* callback for when the vm-area is released */ .fault = shm_fault, - .split = shm_split, + .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67fc6383995b..c4dc0e453be1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3759,7 +3759,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, - .split = hugetlb_vm_op_split, + .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; diff --git a/mm/mmap.c b/mm/mmap.c index 50f853b0ec39..a62cb3ccafce 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2693,8 +2693,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (vma->vm_ops && vma->vm_ops->split) { - err = vma->vm_ops->split(vma, addr); + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); if (err) return err; } -- 2.28.0
[PATCH 2/6] mm/mremap: For MREMAP_DONTUNMAP check security_vm_enough_memory_mm()
Currently memory is accounted post-mremap() with MREMAP_DONTUNMAP, which may break overcommit policy. So, check if there's enough memory before doing actual VMA copy. Don't unset VM_ACCOUNT on MREMAP_DONTUNMAP. By semantics, such mremap() is actually a memory allocation. That also simplifies the error-path a little. Also, as it's memory allocation on success don't reset hiwater_vm value. Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov --- mm/mremap.c | 36 +--- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 03d31a0d4c67..c248f9a52125 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -365,11 +365,19 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { + if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + return -ENOMEM; + } + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(, new_addr, new_len, new_pgoff, _rmap_locks); - if (!new_vma) + if (!new_vma) { + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) + vm_unacct_memory(new_len >> PAGE_SHIFT); return -ENOMEM; + } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); @@ -398,7 +406,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT) { + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; excess = vma->vm_end - vma->vm_start - old_len; if (old_addr > vma->vm_start && @@ -423,34 +431,16 @@ static unsigned long move_vma(struct vm_area_struct *vma, untrack_pfn_moved(vma); if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { - if (vm_flags & VM_ACCOUNT) { - /* Always put back VM_ACCOUNT since we won't unmap */ - vma->vm_flags |= VM_ACCOUNT; - - vm_acct_memory(new_len >> PAGE_SHIFT); - } - 
- /* -* VMAs can actually be merged back together in copy_vma -* calling merge_vma. This can happen with anonymous vmas -* which have not yet been faulted, so if we were to consider -* this VMA split we'll end up adding VM_ACCOUNT on the -* next VMA, which is completely unrelated if this VMA -* was re-merged. -*/ - if (split && new_vma == vma) - split = 0; - /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; /* Because we won't unmap we don't need to touch locked_vm */ - goto out; + return new_addr; } if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - if (vm_flags & VM_ACCOUNT) + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } @@ -459,7 +449,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->locked_vm += new_len >> PAGE_SHIFT; *locked = true; } -out: + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ -- 2.28.0
[PATCH 3/6] mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio
As kernel expect to see only one of such mappings, any further operations on the VMA-copy may be unexpected by the kernel. Maybe it's being on the safe side, but there doesn't seem to be any expected use-case for this, so restrict it now. Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- fs/aio.c | 5 - include/linux/mm.h| 2 +- mm/mmap.c | 6 +- mm/mremap.c | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 0daf2f1cf7a8..e916646adc69 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index d5ec30385566..3be3c0f77548 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -324,13 +324,16 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + spin_lock(>ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/include/linux/mm.h b/include/linux/mm.h index 16b799a0522c..fd51a4a1f722 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -550,7 +550,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct 
vm_area_struct * area); + int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/mm/mmap.c b/mm/mmap.c index bdd19f5b994e..50f853b0ec39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3372,10 +3372,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma) +static int special_mapping_mremap(struct vm_area_struct *new_vma, + unsigned long flags) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mremap.c b/mm/mremap.c index c248f9a52125..898e9818ba6d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -384,7 +384,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { -- 2.28.0
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
On 9/27/20 8:43 AM, Christophe Leroy wrote: > > > Le 21/09/2020 à 13:26, Will Deacon a écrit : >> On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote: >>> Dmitry Safonov <0x7f454...@gmail.com> writes: [..] >>>> I'll cook a patch for vm_special_mapping if you don't mind :-) >>> >>> That would be great, thanks! >> >> I lost track of this one. Is there a patch kicking around to resolve >> this, >> or is the segfault expected behaviour? >> > > IIUC dmitry said he will cook a patch. I have not seen any patch yet. Yes, sorry about the delay - I was a bit busy with xfrm patches. I'll send patches for .close() this week, working on them now. > AFAIKS, among the architectures having VDSO sigreturn trampolines, only > SH, X86 and POWERPC provide alternative trampoline on stack when VDSO is > not there. > > All other architectures just having a VDSO don't expect VDSO to not be > mapped. > > As far as nowadays stacks are mapped non-executable, getting a segfault > is expected behaviour. However, I think we should really make it > cleaner. Today it segfaults because it is still pointing to the VDSO > trampoline that has been unmapped. But should the user map some other > code at the same address, we'll run in the weeds on signal return instead > of segfaulting. +1. > So VDSO unmapping should really be properly managed, the reference > should be properly cleared in order to segfault in a controllable manner. > > Only powerpc has a hook to properly clear the VDSO pointer when VDSO is > unmapped. Thanks, Dmitry
[PATCH v3 6/7] xfrm/compat: Translate 32-bit user_policy from sockptr
Provide compat_xfrm_userpolicy_info translation for xfrm setsocketopt(). Reallocate buffer and put the missing padding for 64-bit message. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 3 +++ net/xfrm/xfrm_compat.c | 26 ++ net/xfrm/xfrm_state.c | 17 ++--- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fa18cb6bb3f7..53618a31634b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2012,6 +2012,9 @@ struct xfrm_translator { int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack); + /* Translate 32-bit user_policy from sockptr */ + int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen); + struct module *owner; }; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index b1b5f972538d..e28f0c9ecd6a 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -576,10 +576,36 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return h64; } +static int xfrm_user_policy_compat(u8 **pdata32, int optlen) +{ + struct compat_xfrm_userpolicy_info *p = (void *)*pdata32; + u8 *src_templates, *dst_templates; + u8 *data64; + + if (optlen < sizeof(*p)) + return -EINVAL; + + data64 = kmalloc_track_caller(optlen + 4, GFP_USER | __GFP_NOWARN); + if (!data64) + return -ENOMEM; + + memcpy(data64, *pdata32, sizeof(*p)); + memset(data64 + sizeof(*p), 0, 4); + + src_templates = *pdata32 + sizeof(*p); + dst_templates = data64 + sizeof(*p) + 4; + memcpy(dst_templates, src_templates, optlen - sizeof(*p)); + + kfree(*pdata32); + *pdata32 = data64; + return 0; +} + static struct xfrm_translator xfrm_translator = { .owner = THIS_MODULE, .alloc_compat = xfrm_alloc_compat, .rcv_msg_compat = xfrm_user_rcv_msg_compat, + .xlate_user_policy_sockptr = xfrm_user_policy_compat, }; static int __init xfrm_compat_init(void) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index cc206ca3df78..f9961884500b 100644 --- a/net/xfrm/xfrm_state.c +++ 
b/net/xfrm/xfrm_state.c @@ -2331,9 +2331,6 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; - if (in_compat_syscall()) - return -EOPNOTSUPP; - if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); @@ -2348,6 +2345,20 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) if (IS_ERR(data)) return PTR_ERR(data); + if (in_compat_syscall()) { + struct xfrm_translator *xtr = xfrm_get_translator(); + + if (!xtr) + return -EOPNOTSUPP; + + err = xtr->xlate_user_policy_sockptr(, optlen); + xfrm_put_translator(xtr); + if (err) { + kfree(data); + return err; + } + } + err = -EINVAL; rcu_read_lock(); list_for_each_entry_rcu(km, _km_list, list) { -- 2.28.0
[PATCH v3 1/7] xfrm: Provide API to register translator module
Add a skeleton for xfrm_compat module and provide API to register it in xfrm_state.ko. struct xfrm_translator will have function pointers to translate messages received from 32-bit userspace or to be sent to it from 64-bit kernel. module_get()/module_put() are used instead of rcu_read_lock() as the module will vmalloc() memory for translation. The new API is registered with xfrm_state module, not with xfrm_user as the former needs translator for user_policy set by setsockopt() and xfrm_user already uses functions from xfrm_state. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 19 + net/xfrm/Kconfig | 10 +++ net/xfrm/Makefile | 1 + net/xfrm/xfrm_compat.c | 29 net/xfrm/xfrm_state.c | 60 ++ 5 files changed, 119 insertions(+) create mode 100644 net/xfrm/xfrm_compat.c diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2737d24ec244..fe2e3717da14 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,6 +2000,25 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +struct xfrm_translator { + struct module *owner; +}; + +#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) +extern int xfrm_register_translator(struct xfrm_translator *xtr); +extern int xfrm_unregister_translator(struct xfrm_translator *xtr); +extern struct xfrm_translator *xfrm_get_translator(void); +extern void xfrm_put_translator(struct xfrm_translator *xtr); +#else +static inline struct xfrm_translator *xfrm_get_translator(void) +{ + return NULL; +} +static inline void xfrm_put_translator(struct xfrm_translator *xtr) +{ +} +#endif + #if IS_ENABLED(CONFIG_IPV6) static inline bool xfrm6_local_dontfrag(const struct sock *sk) { diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 5b9a5ab48111..e79b48dab61b 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -28,6 +28,16 @@ config XFRM_USER If unsure, say Y. 
+config XFRM_USER_COMPAT + tristate "Compatible ABI support" + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + select WANT_COMPAT_NETLINK_MESSAGES + help + Transformation(XFRM) user configuration interface like IPsec + used by compatible Linux applications. + + If unsure, say N. + config XFRM_INTERFACE tristate "Transformation virtual interface" depends on XFRM && IPV6 diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 2d4bb4b9f75e..494aa744bfb9 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o +obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c new file mode 100644 index ..f01d9af41c55 --- /dev/null +++ b/net/xfrm/xfrm_compat.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM compat layer + * Author: Dmitry Safonov + * Based on code and translator idea by: Florian Westphal + */ +#include +#include +#include + +static struct xfrm_translator xfrm_translator = { + .owner = THIS_MODULE, +}; + +static int __init xfrm_compat_init(void) +{ + return xfrm_register_translator(_translator); +} + +static void __exit xfrm_compat_exit(void) +{ + xfrm_unregister_translator(_translator); +} + +module_init(xfrm_compat_init); +module_exit(xfrm_compat_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dmitry Safonov"); +MODULE_DESCRIPTION("XFRM 32-bit compatibility layer"); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 69520ad3d83b..cc206ca3df78 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2264,6 +2264,66 @@ static bool km_is_alive(const struct km_event *c) return is_alive; } +#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) +static 
DEFINE_SPINLOCK(xfrm_translator_lock); +static struct xfrm_translator __rcu *xfrm_translator; + +struct xfrm_translator *xfrm_get_translator(void) +{ + struct xfrm_translator *xtr; + + rcu_read_lock(); + xtr = rcu_dereference(xfrm_translator); + if (unlikely(!xtr)) + goto out; + if (!try_module_get(xtr->owner)) + xtr = NULL; +out: + rcu_read_unlock(); + return xtr; +} +EXPORT_SYMBOL_GPL(xfrm_get_translator); + +void xfrm_put_translator(struct xfrm_translator *xtr) +{ + module_put(xtr->owner); +} +EXPORT_SYMBOL_GPL(xfrm_put_translator); + +int xfrm_register_translator(struct xfrm_translator *xtr) +{ + int err = 0; + + spin_lock_bh(_translator_lock); + if (unlike
[PATCH v3 3/7] xfrm/compat: Attach xfrm dumps to 64=>32 bit translator
Currently nlmsg_unicast() is used by functions that dump structures that can be different in size for compat tasks, see dump_one_state() and dump_one_policy(). The following nlmsg_unicast() users exist today in xfrm: Function |Message can be different | in size on compat ---|-- xfrm_get_spdinfo() | N xfrm_get_sadinfo() | N xfrm_get_sa() | Y xfrm_alloc_userspi() | Y xfrm_get_policy() | Y xfrm_get_ae() | N Besides, dump_one_state() and dump_one_policy() can be used by filtered netlink dump for XFRM_MSG_GETSA, XFRM_MSG_GETPOLICY. Just as for xfrm multicast, allocate frag_list for compat skb journey down to recvmsg() which will give user the desired skb according to syscall bitness. Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_user.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 3769227ed4e1..7fd7b16a8805 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -975,6 +975,7 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) struct xfrm_dump_info *sp = ptr; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; + struct xfrm_translator *xtr; struct xfrm_usersa_info *p; struct nlmsghdr *nlh; int err; @@ -992,6 +993,18 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) return err; } nlmsg_end(skb, nlh); + + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlh); + + xfrm_put_translator(xtr); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + } + return 0; } @@ -1320,6 +1333,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct xfrm_userspi_info *p; + struct xfrm_translator *xtr; struct sk_buff *resp_skb; xfrm_address_t *daddr; int family; @@ -1370,6 +1384,17 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, 
nlmsg_hdr(skb)); + + xfrm_put_translator(xtr); + if (err) { + kfree_skb(resp_skb); + goto out; + } + } + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); out: @@ -1776,6 +1801,7 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr struct xfrm_userpolicy_info *p; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; + struct xfrm_translator *xtr; struct nlmsghdr *nlh; int err; @@ -1800,6 +1826,18 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr return err; } nlmsg_end(skb, nlh); + + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlh); + + xfrm_put_translator(xtr); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + } + return 0; } -- 2.28.0
[PATCH v3 0/7] xfrm: Add compat layer
Changes since v2: - added struct xfrm_translator as API to register xfrm_compat.ko with xfrm_state.ko. This allows compilation of translator as a loadable module - fixed indentation and collected reviewed-by (Johannes Berg) - moved boilerplate from commit messages into cover-letter (Steffen Klassert) - found on KASAN build and fixed non-initialised stack variable usage in the translator The resulting v2/v3 diff can be found here: https://gist.github.com/0x7f454c46/8f68311dfa1f240959fdbe7c77ed2259 Patches as a .git branch: https://github.com/0x7f454c46/linux/tree/xfrm-compat-v3 Changes since v1: - reworked the patch set to use a translator - separated the compat layer into xfrm_compat.c, compiled under XFRM_USER_COMPAT config - 32-bit messages now being sent in frag_list (like wext-core does) - instead of __packed add compat_u64 members in compat structures - selftest reworked to kselftest lib API - added netlink dump testing to the selftest XFRM is disabled for compatible users because of the UABI difference. The difference is in structure padding, and as a result the size of netlink messages differs. The possibility for a compatible application to manage xfrm tunnels was disabled by: the commit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems") and the commit 74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host"). This is my second attempt to resolve the xfrm/compat problem by adding the 64=>32 and 32=>64 bit translators that, invisibly to the user, provide translation between a compatible userspace and the kernel. The previous attempt was to interpret the message ABI according to the syscall bitness in xfrm_user, which resulted in over-complicated code [1]. Florian Westphal provided the idea of the translator and some draft patches in the discussion. In these patches, his idea is reused and some of his initial code is also present. 
There were a couple of attempts to solve the xfrm compat problem: https://lkml.org/lkml/2017/1/20/733 https://patchwork.ozlabs.org/patch/44600/ http://netdev.vger.kernel.narkive.com/2Gesykj6/patch-net-next-xfrm-correctly-parse-netlink-msg-from-32bits-ip-command-on-64bits-host All the discussions ended in the conclusion that xfrm should have a full compatibility layer to work correctly with 32-bit applications on 64-bit kernels: https://lkml.org/lkml/2017/1/23/413 https://patchwork.ozlabs.org/patch/433279/ In a recent lkml discussion, Linus said that it's worth fixing this problem rather than giving people an excuse to stay on a 32-bit kernel: https://lkml.org/lkml/2018/2/13/752 There is also a selftest for ipsec tunnels. It doesn't depend on any library and a compat version can be easily built with: make CFLAGS=-m32 net/ipsec [1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com Cc: "David S. Miller" Cc: Florian Westphal Cc: Herbert Xu Cc: Jakub Kicinski Cc: Johannes Berg Cc: Steffen Klassert Cc: Stephen Suryaputra Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: net...@vger.kernel.org Dmitry Safonov (7): xfrm: Provide API to register translator module xfrm/compat: Add 64=>32-bit messages translator xfrm/compat: Attach xfrm dumps to 64=>32 bit translator netlink/compat: Append NLMSG_DONE/extack to frag_list xfrm/compat: Add 32=>64-bit messages translator xfrm/compat: Translate 32-bit user_policy from sockptr selftest/net/xfrm: Add test for ipsec tunnel MAINTAINERS|1 + include/net/xfrm.h | 33 + net/netlink/af_netlink.c | 47 +- net/xfrm/Kconfig | 11 + net/xfrm/Makefile |1 + net/xfrm/xfrm_compat.c | 625 +++ net/xfrm/xfrm_state.c | 77 +- net/xfrm/xfrm_user.c | 110 +- tools/testing/selftests/net/.gitignore |1 + tools/testing/selftests/net/Makefile |1 + tools/testing/selftests/net/ipsec.c| 2195 11 files changed, 3066 insertions(+), 36 deletions(-) create mode 100644 net/xfrm/xfrm_compat.c create mode 100644 tools/testing/selftests/net/ipsec.c base-commit: 
ba4f184e126b751d1bffad5897f263108befc780 -- 2.28.0
[PATCH v3 5/7] xfrm/compat: Add 32=>64-bit messages translator
Provide the user-to-kernel translator under XFRM_USER_COMPAT, that creates for 32-bit xfrm-user message a 64-bit translation. The translation is afterwards reused by xfrm_user code just as if userspace had sent 64-bit message. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 6 + net/xfrm/Kconfig | 3 +- net/xfrm/xfrm_compat.c | 274 + net/xfrm/xfrm_user.c | 57 ++--- 4 files changed, 321 insertions(+), 19 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5b6cc62c9354..fa18cb6bb3f7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2001,11 +2001,17 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, } extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; +extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; struct xfrm_translator { /* Allocate frag_list and put compat translation there */ int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); + /* Allocate nlmsg with 64-bit translaton of received 32-bit message */ + struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh, + int maxtype, const struct nla_policy *policy, + struct netlink_ext_ack *extack); + struct module *owner; }; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index e79b48dab61b..3adf31a83a79 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -30,7 +30,8 @@ config XFRM_USER config XFRM_USER_COMPAT tristate "Compatible ABI support" - depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT && \ + HAVE_EFFICIENT_UNALIGNED_ACCESS select WANT_COMPAT_NETLINK_MESSAGES help Transformation(XFRM) user configuration interface like IPsec diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index aece41b44ff2..b1b5f972538d 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -96,6 +96,39 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping) }; +static const struct nla_policy 
compat_policy[XFRMA_MAX+1] = { + [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, + [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, + [XFRMA_LASTUSED]= { .type = NLA_U64}, + [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)}, + [XFRMA_ALG_AEAD]= { .len = sizeof(struct xfrm_algo_aead) }, + [XFRMA_ALG_AUTH]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_COMP]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, + [XFRMA_TMPL]= { .len = sizeof(struct xfrm_user_tmpl) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, + [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, + [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, + [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)}, + [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) }, + [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) }, + [XFRMA_MARK]= { .len = sizeof(struct xfrm_mark) }, + [XFRMA_TFCPAD] = { .type = NLA_U32 }, + [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, + [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, + [XFRMA_PROTO] = { .type = NLA_U8 }, + [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, + [XFRMA_SET_MARK]= { .type = NLA_U32 }, + [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, + [XFRMA_IF_ID] = { .type = NLA_U32 }, +}; + static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src, u16 type) { @@ -303,9 +336,250 @@ static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src return 0; } +/* Calculates len 
of translated 64-bit message. */ +static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src, + struct nlattr *attrs[XFRMA_MAX+1]) +{ + size_t len = nlmsg_len(src); + + switch (src->nlmsg_type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_NEWP
[PATCH v3 4/7] netlink/compat: Append NLMSG_DONE/extack to frag_list
Modules that use netlink may supply a 2nd skb (via frag_list) that contains an alternative data set meant for applications using 32bit compatibility mode. In such a case, netlink_recvmsg will use this 2nd skb instead of the original one. Without this patch, such compat applications will retrieve all netlink dump data, but will then get an unexpected EOF. Cc: Johannes Berg Signed-off-by: Florian Westphal Signed-off-by: Dmitry Safonov Reviewed-by: Johannes Berg --- net/netlink/af_netlink.c | 47 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d2d1448274f5..de12dd3136f9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2186,13 +2186,35 @@ EXPORT_SYMBOL(__nlmsg_put); * It would be better to create kernel thread. */ +static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, +struct netlink_callback *cb, +struct netlink_ext_ack *extack) +{ + struct nlmsghdr *nlh; + + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), + NLM_F_MULTI | cb->answer_flags); + if (WARN_ON(!nlh)) + return -ENOBUFS; + + nl_dump_check_consistent(cb, nlh); + memcpy(nlmsg_data(nlh), >dump_done_errno, sizeof(nlk->dump_done_errno)); + + if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) { + nlh->nlmsg_flags |= NLM_F_ACK_TLVS; + if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) + nlmsg_end(skb, nlh); + } + + return 0; +} + static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; struct module *module; int err = -ENOBUFS; int alloc_min_size; @@ -2258,22 +2280,19 @@ static int netlink_dump(struct sock *sk) return 0; } - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, - sizeof(nlk->dump_done_errno), - NLM_F_MULTI | cb->answer_flags); - if (WARN_ON(!nlh)) + if (netlink_dump_done(nlk, skb, cb, )) goto errout_skb; - 
nl_dump_check_consistent(cb, nlh); - - memcpy(nlmsg_data(nlh), >dump_done_errno, - sizeof(nlk->dump_done_errno)); - - if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) { - nlh->nlmsg_flags |= NLM_F_ACK_TLVS; - if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg)) - nlmsg_end(skb, nlh); +#ifdef CONFIG_COMPAT_NETLINK_MESSAGES + /* frag_list skb's data is used for compat tasks +* and the regular skb's data for normal (non-compat) tasks. +* See netlink_recvmsg(). +*/ + if (unlikely(skb_shinfo(skb)->frag_list)) { + if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, )) + goto errout_skb; } +#endif if (sk_filter(sk, skb)) kfree_skb(skb); -- 2.28.0