Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
On 3/31/21 10:59 AM, Michael Ellerman wrote: > Christophe Leroy writes: [..] >> >>> @@ -133,7 +135,13 @@ static int __arch_setup_additional_pages(struct >>> linux_binprm *bprm, int uses_int >>> * install_special_mapping or the perf counter mmap tracking code >>> * will fail to recognise it as a vDSO. >>> */ >>> - mm->context.vdso = (void __user *)vdso_base + PAGE_SIZE; >>> + mm->context.vdso = (void __user *)vdso_base + vvar_size; >>> + >>> + vma = _install_special_mapping(mm, vdso_base, vvar_size, >>> + VM_READ | VM_MAYREAD | VM_IO | >>> + VM_DONTDUMP | VM_PFNMAP, _spec); >>> + if (IS_ERR(vma)) >>> + return PTR_ERR(vma); >>> >>> /* >>> * our vma flags don't have VM_WRITE so by default, the process isn't >> >> >> IIUC, VM_PFNMAP is for when we have a vvar_fault handler. >> Allthough we will soon have one for handle TIME_NS, at the moment >> powerpc doesn't have that handler. >> Isn't it dangerous to set VM_PFNMAP then ? I believe, it's fine, special_mapping_fault() does: : if (sm->fault) : return sm->fault(sm, vmf->vma, vmf); > Some of the other flags seem odd too. > eg. VM_IO ? VM_DONTDUMP ? Yeah, so: VM_PFNMAP | VM_IO is a protection from remote access on pages. So one can't access such page with ptrace(), /proc/$pid/mem or process_vm_write(). Otherwise, it would create COW mapping and the tracee will stop working with stale vvar. VM_DONTDUMP restricts the area from coredumping and gdb will also avoid accessing those[1][2]. I agree that VM_PFNMAP was probably excessive in this patch alone and rather synchronized code with other architectures, but it makes more sense now in the new patches set by Christophe: https://lore.kernel.org/linux-arch/cover.1617209141.git.christophe.le...@csgroup.eu/ [1] https://lore.kernel.org/lkml/550731af.6080...@redhat.com/T/ [2] https://sourceware.org/legacy-ml/gdb-patches/2015-03/msg00383.html Thanks, Dmitry
Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
On 3/30/21 11:17 AM, Christophe Leroy wrote: > > > Le 26/03/2021 à 20:17, Dmitry Safonov a écrit : [..] >> --- a/arch/powerpc/kernel/vdso.c >> +++ b/arch/powerpc/kernel/vdso.c >> @@ -55,10 +55,10 @@ static int vdso_mremap(const struct >> vm_special_mapping *sm, struct vm_area_struc >> { >> unsigned long new_size = new_vma->vm_end - new_vma->vm_start; >> - if (new_size != text_size + PAGE_SIZE) >> + if (new_size != text_size) >> return -EINVAL; > > In ARM64 you have removed the above test in commit 871402e05b24cb56 > ("mm: forbid splitting special mappings"). Do we need to keep it here ? > >> - current->mm->context.vdso = (void __user *)new_vma->vm_start + >> PAGE_SIZE; >> + current->mm->context.vdso = (void __user *)new_vma->vm_start; >> return 0; >> } > Yes, right you are, this can be dropped. Thanks, Dmitry
[PATCH] xfrm/compat: Cleanup WARN()s that can be user-triggered
Replace WARN_ONCE() that can be triggered from userspace with pr_warn_once(). Those still give user a hint what's the issue. I've left WARN()s that are not possible to trigger with current code-base and that would mean that the code has issues: - relying on current compat_msg_min[type] <= xfrm_msg_min[type] - expected 4-byte padding size difference between compat_msg_min[type] and xfrm_msg_min[type] - compat_policy[type].len <= xfrma_policy[type].len (for every type) Reported-by: syzbot+834ffd1afc7212eb8...@syzkaller.appspotmail.com Fixes: 5f3eea6b7e8f ("xfrm/compat: Attach xfrm dumps to 64=>32 bit translator") Cc: "David S. Miller" Cc: Eric Dumazet Cc: Herbert Xu Cc: Jakub Kicinski Cc: Steffen Klassert Cc: net...@vger.kernel.org Cc: sta...@vger.kernel.org Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index d8e8a11ca845..a20aec9d7393 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -216,7 +216,7 @@ static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, case XFRM_MSG_GETSADINFO: case XFRM_MSG_GETSPDINFO: default: - WARN_ONCE(1, "unsupported nlmsg_type %d", nlh_src->nlmsg_type); + pr_warn_once("unsupported nlmsg_type %d\n", nlh_src->nlmsg_type); return ERR_PTR(-EOPNOTSUPP); } @@ -277,7 +277,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) return xfrm_nla_cpy(dst, src, nla_len(src)); default: BUILD_BUG_ON(XFRMA_MAX != XFRMA_IF_ID); - WARN_ONCE(1, "unsupported nla_type %d", src->nla_type); + pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } } @@ -315,8 +315,10 @@ static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src struct sk_buff *new = NULL; int err; - if (WARN_ON_ONCE(type >= ARRAY_SIZE(xfrm_msg_min))) + if (type >= ARRAY_SIZE(xfrm_msg_min)) { + pr_warn_once("unsupported nlmsg_type %d\n", nlh_src->nlmsg_type); return 
-EOPNOTSUPP; + } if (skb_shinfo(skb)->frag_list == NULL) { new = alloc_skb(skb->len + skb_tailroom(skb), GFP_ATOMIC); @@ -378,6 +380,10 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, struct nlmsghdr *nlmsg = dst; struct nlattr *nla; + /* xfrm_user_rcv_msg_compat() relies on fact that 32-bit messages +* have the same len or shorter than 64-bit ones. +* 32-bit translation that is bigger than 64-bit original is unexpected. +*/ if (WARN_ON_ONCE(copy_len > payload)) copy_len = payload; -- 2.31.0
Re: [syzbot] WARNING in xfrm_alloc_compat (2)
On 3/29/21 9:31 PM, Eric Dumazet wrote: > > > On 3/29/21 9:57 PM, Dmitry Safonov wrote: [..] >>> [ cut here ] >>> unsupported nla_type 356 >> >> This doesn't seem to be an issue. >> Userspace sent message with nla_type 356, which is > __XFRM_MSG_MAX, so >> this warning is expected. I've added it so when a new XFRM_MSG_* will be >> added, to make sure that there will be translations to such messages in >> xfrm_compat.ko (if the translation needed). >> This is WARN_ON_ONCE(), so it can't be used as DOS. >> >> Ping me if you'd expect something else than once-a-boot warning for >> this. Not sure how-to close syzkaller bug, docs have only `invalid' tag, >> which isn't `not-a-bug'/`expected' tag: >> https://github.com/google/syzkaller/blob/master/docs/syzbot.md >> > > You should not use WARN_ON_ONCE() for this case (if user space can trigger it) > > https://lwn.net/Articles/769365/ > > > Greg Kroah-Hartman raised the problem of core kernel API code that will use > WARN_ON_ONCE() to complain about bad usage; that will not generate the > desired result if WARN_ON_ONCE() is configured to crash the machine. He was > told that the code should just call pr_warn() instead, and that the called > function should return an error in such situations. It was generally agreed > that any WARN_ON() or WARN_ON_ONCE() calls that can be triggered from user > space need to be fixed. > Yeah, fair enough, I've already thought after sending the reply that WARN_ON*() prints registers and that may be unwanted. I'll cook a patch to convert this and other WARNs in the module. I wish there was something like pr_warn_backtrace_once(), but I guess it's fine without dumpstack(), after all. Thanks, Dmitry
Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
On 3/29/21 4:14 PM, Laurent Dufour wrote: > Le 26/03/2021 à 20:17, Dmitry Safonov a écrit : >> Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") >> VVAR page is in front of the VDSO area. In result it breaks CRIU >> (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" >> from /proc/../maps points at ELF/vdso image, rather than at VVAR data >> page. >> Laurent made a patch to keep CRIU working (by reading aux vector). >> But I think it still makes sence to separate two mappings into different >> VMAs. It will also make ppc64 less "special" for userspace and as >> a side-bonus will make VVAR page un-writable by debugger (which >> previously >> would COW page and can be unexpected). >> >> I opportunistically Cc stable on it: I understand that usually such >> stuff isn't a stable material, but that will allow us in CRIU have >> one workaround less that is needed just for one release (v5.11) on >> one platform (ppc64), which we otherwise have to maintain. >> I wouldn't go as far as to say that the commit 511157ab641e is ABI >> regression as no other userspace got broken, but I'd really appreciate >> if it gets backported to v5.11 after v5.12 is released, so as not >> to complicate already non-simple CRIU-vdso code. Thanks! >> >> Cc: Andrei Vagin >> Cc: Andy Lutomirski >> Cc: Benjamin Herrenschmidt >> Cc: Christophe Leroy >> Cc: Laurent Dufour >> Cc: Michael Ellerman >> Cc: Paul Mackerras >> Cc: linuxppc-...@lists.ozlabs.org >> Cc: sta...@vger.kernel.org # v5.11 >> [1]: https://github.com/checkpoint-restore/criu/issues/1417 >> Signed-off-by: Dmitry Safonov >> Tested-by: Christophe Leroy > > I run the CRIU's test suite and except the usual suspects, all the tests > passed. > > Tested-by: Laurent Dufour Thank you, Laurent! -- Dmitry
Re: [syzbot] WARNING in xfrm_alloc_compat (2)
Hi, On 3/29/21 8:04 PM, syzbot wrote: > Hello, > > syzbot found the following issue on: > > HEAD commit:6c996e19 net: change netdev_unregister_timeout_secs min va.. > git tree: net-next > console output: https://syzkaller.appspot.com/x/log.txt?x=102e5926d0 > kernel config: https://syzkaller.appspot.com/x/.config?x=c0ac79756537274e > dashboard link: https://syzkaller.appspot.com/bug?extid=834ffd1afc7212eb8147 > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=10a7b1aad0 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=17ae6b7cd0 > > The issue was bisected to: > > commit 5f3eea6b7e8f58cf5c8a9d4b9679dc19e9e67ba3 > Author: Dmitry Safonov > Date: Mon Sep 21 14:36:53 2020 + > > xfrm/compat: Attach xfrm dumps to 64=>32 bit translator > > bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=10694b3ad0 > final oops: https://syzkaller.appspot.com/x/report.txt?x=12694b3ad0 > console output: https://syzkaller.appspot.com/x/log.txt?x=14694b3ad0 > > IMPORTANT: if you fix the issue, please add the following tag to the commit: > Reported-by: syzbot+834ffd1afc7212eb8...@syzkaller.appspotmail.com > Fixes: 5f3eea6b7e8f ("xfrm/compat: Attach xfrm dumps to 64=>32 bit > translator") > > netlink: 208 bytes leftover after parsing attributes in process > `syz-executor193'. > [ cut here ] > unsupported nla_type 356 This doesn't seem to be an issue. Userspace sent message with nla_type 356, which is > __XFRM_MSG_MAX, so this warning is expected. I've added it so when a new XFRM_MSG_* will be added, to make sure that there will be translations to such messages in xfrm_compat.ko (if the translation needed). This is WARN_ON_ONCE(), so it can't be used as DOS. Ping me if you'd expect something else than once-a-boot warning for this. 
Not sure how-to close syzkaller bug, docs have only `invalid' tag, which isn't `not-a-bug'/`expected' tag: https://github.com/google/syzkaller/blob/master/docs/syzbot.md > WARNING: CPU: 1 PID: 8423 at net/xfrm/xfrm_compat.c:280 xfrm_xlate64_attr > net/xfrm/xfrm_compat.c:280 [inline] > WARNING: CPU: 1 PID: 8423 at net/xfrm/xfrm_compat.c:280 xfrm_xlate64 > net/xfrm/xfrm_compat.c:301 [inline] > WARNING: CPU: 1 PID: 8423 at net/xfrm/xfrm_compat.c:280 > xfrm_alloc_compat+0xf39/0x10d0 net/xfrm/xfrm_compat.c:328 > Modules linked in: > CPU: 1 PID: 8423 Comm: syz-executor193 Not tainted 5.12.0-rc4-syzkaller #0 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS > Google 01/01/2011 > RIP: 0010:xfrm_xlate64_attr net/xfrm/xfrm_compat.c:280 [inline] > RIP: 0010:xfrm_xlate64 net/xfrm/xfrm_compat.c:301 [inline] > RIP: 0010:xfrm_alloc_compat+0xf39/0x10d0 net/xfrm/xfrm_compat.c:328 > Code: de e8 5b d7 c3 f9 84 db 0f 85 b0 f8 ff ff e8 9e d0 c3 f9 8b 74 24 08 48 > c7 c7 80 42 76 8a c6 05 39 2e 02 06 01 e8 74 b7 16 01 <0f> 0b e9 8d f8 ff ff > e8 7b d0 c3 f9 8b 14 24 48 c7 c7 40 42 76 8a > RSP: 0018:c9fff4b8 EFLAGS: 00010282 > RAX: RBX: RCX: > RDX: 88801b1554c0 RSI: 815c4d75 RDI: f520001ffe89 > RBP: 00d8 R08: R09: > R10: 815bdb0e R11: R12: ffa1 > R13: 8880248eb8f8 R14: 888013256dc0 R15: 8880132568c0 > FS: 0092f300() GS:8880b9d0() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 21e0 CR3: 23271000 CR4: 001506e0 > DR0: DR1: DR2: > DR3: DR6: fffe0ff0 DR7: 0400 > Call Trace: > xfrm_alloc_userspi+0x66a/0xa30 net/xfrm/xfrm_user.c:1448 > xfrm_user_rcv_msg+0x42c/0x8b0 net/xfrm/xfrm_user.c:2812 > netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2502 > xfrm_netlink_rcv+0x6b/0x90 net/xfrm/xfrm_user.c:2824 > netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] > netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 > netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 > sock_sendmsg_nosec net/socket.c:654 [inline] > sock_sendmsg+0xcf/0x120 
net/socket.c:674 > sys_sendmsg+0x6e8/0x810 net/socket.c:2350 > ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 > __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 > do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 > entry_SYSCALL_64_after_hwframe+0x44/0xae > RIP: 0033:0x43f009 > Code: 28 c3 e8 2a 14 00 00 66 2e 0f 1f 84 00 00 00 00 00 48 89 f8 48 89 f7 48 > 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 > 01 c3 48 c7 c1 c0 ff ff ff f7 d8 64 89 01 48 > RSP: 002b:7ffe0986bcb8 EFLA
Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso
Hi Christophe, On 3/27/21 5:19 PM, Christophe Leroy wrote: [..] >> I opportunistically Cc stable on it: I understand that usually such >> stuff isn't a stable material, but that will allow us in CRIU have >> one workaround less that is needed just for one release (v5.11) on >> one platform (ppc64), which we otherwise have to maintain. > > Why is that a workaround, and why for one release only ? I think the > solution proposed by Laurent to use the aux vector AT_SYSINFO_EHDR should > work with any past and future release. Yeah, I guess. Previously, (before v5.11/power) all kernels had ELF start at "[vdso]" VMA start, now we'll have to carry the offset in the VMA. Probably, not the worst thing, but as it will be only for v5.11 release it can break, so needs separate testing. Kinda life was a bit easier without this additional code. >> I wouldn't go as far as to say that the commit 511157ab641e is ABI >> regression as no other userspace got broken, but I'd really appreciate >> if it gets backported to v5.11 after v5.12 is released, so as not >> to complicate already non-simple CRIU-vdso code. Thanks! >> >> Cc: Andrei Vagin >> Cc: Andy Lutomirski >> Cc: Benjamin Herrenschmidt >> Cc: Christophe Leroy >> Cc: Laurent Dufour >> Cc: Michael Ellerman >> Cc: Paul Mackerras >> Cc: linuxppc-...@lists.ozlabs.org >> Cc: sta...@vger.kernel.org # v5.11 >> [1]: https://github.com/checkpoint-restore/criu/issues/1417 >> Signed-off-by: Dmitry Safonov >> Tested-by: Christophe Leroy > > I tested it with sigreturn_vdso selftest and it worked, because that > selftest doesn't involve VDSO data. Thanks again on helping with testing it, I appreciate it! > But if I do a mremap() on the VDSO text vma without remapping VVAR to > keep the same distance between the two vmas, gettimeofday() crashes.
The > reason is that the code obtains the address of the data by calculating a > fix difference from its own address with the below macro, the delta > being resolved at link time: > > .macro get_datapage ptr > bcl 20, 31, .+4 > 999: > mflr \ptr > #if CONFIG_PPC_PAGE_SHIFT > 14 > addis \ptr, \ptr, (_vdso_datapage - 999b)@ha > #endif > addi \ptr, \ptr, (_vdso_datapage - 999b)@l > .endm > > So the datapage needs to remain at the same distance from the code at > all time. > > Wondering how the other architectures do to have two independent VMAs > and be able to move one independently of the other. It's alright as far as I know. If userspace remaps vdso/vvar it should be aware of this (CRIU keeps this in mind, also old vdso image is dumped to compare on restore with the one that the host has). Thanks, Dmitry
[PATCH] powerpc/vdso: Separate vvar vma from vdso
Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front") VVAR page is in front of the VDSO area. In result it breaks CRIU (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]" from /proc/../maps points at ELF/vdso image, rather than at VVAR data page. Laurent made a patch to keep CRIU working (by reading aux vector). But I think it still makes sense to separate two mappings into different VMAs. It will also make ppc64 less "special" for userspace and as a side-bonus will make VVAR page un-writable by debugger (which previously would COW page and can be unexpected). I opportunistically Cc stable on it: I understand that usually such stuff isn't a stable material, but that will allow us in CRIU have one workaround less that is needed just for one release (v5.11) on one platform (ppc64), which we otherwise have to maintain. I wouldn't go as far as to say that the commit 511157ab641e is ABI regression as no other userspace got broken, but I'd really appreciate if it gets backported to v5.11 after v5.12 is released, so as not to complicate already non-simple CRIU-vdso code. Thanks!
Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Christophe Leroy Cc: Laurent Dufour Cc: Michael Ellerman Cc: Paul Mackerras Cc: linuxppc-...@lists.ozlabs.org Cc: sta...@vger.kernel.org # v5.11 [1]: https://github.com/checkpoint-restore/criu/issues/1417 Signed-off-by: Dmitry Safonov Tested-by: Christophe Leroy --- arch/powerpc/include/asm/mmu_context.h | 2 +- arch/powerpc/kernel/vdso.c | 54 +++--- 2 files changed, 40 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..4bc45d3ed8b0 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, unsigned long start, unsigned long end) { - unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE; + unsigned long vdso_base = (unsigned long)mm->context.vdso; if (start <= vdso_base && vdso_base < end) mm->context.vdso = NULL; diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index e839a906fdf2..b14907209822 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struc { unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - if (new_size != text_size + PAGE_SIZE) + if (new_size != text_size) return -EINVAL; - current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE; + current->mm->context.vdso = (void __user *)new_vma->vm_start; return 0; } @@ -73,6 +73,10 @@ static int vdso64_mremap(const struct vm_special_mapping *sm, struct vm_area_str return vdso_mremap(sm, new_vma, _end - _start); } +static struct vm_special_mapping vvar_spec __ro_after_init = { + .name = "[vvar]", +}; + static struct vm_special_mapping vdso32_spec __ro_after_init = { .name = "[vdso]", .mremap = vdso32_mremap, @@ -89,11 
+93,11 @@ static struct vm_special_mapping vdso64_spec __ro_after_init = { */ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - struct mm_struct *mm = current->mm; + unsigned long vdso_size, vdso_base, mappings_size; struct vm_special_mapping *vdso_spec; + unsigned long vvar_size = PAGE_SIZE; + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_size; - unsigned long vdso_base; if (is_32bit_task()) { vdso_spec = _spec; @@ -110,8 +114,8 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int vdso_base = 0; } - /* Add a page to the vdso size for the data page */ - vdso_size += PAGE_SIZE; + mappings_size = vdso_size + vvar_size; + mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK; /* * pick a base address for the vDSO in process space. We try to put it @@ -119,9 +123,7 @@ static int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_int * and end up putting it elsewhere. * Add enough to the size so that the result can be aligned. */ - vdso_base = get_unmapped_area(NULL, vdso_base, - vdso_size + ((VDSO_ALIGNMENT - 1) & PAGE_MASK), - 0, 0); + vdso_base = get_unmapped_area(NULL, vdso_base, mappings_size, 0, 0); if (IS_ERR_VALUE(vdso_
Re: [PATCH v2] prctl: PR_SET_MM - unify copying of user's auvx
Hi Cyrill, On 3/23/21 10:06 PM, Cyrill Gorcunov wrote: [..] > --- linux-tip.git.orig/kernel/sys.c > +++ linux-tip.git/kernel/sys.c > @@ -1961,6 +1961,30 @@ out: > return error; > } > > +static int copy_auxv_from_user(unsigned long *auxv, size_t auxv_size, > +const void __user *addr, size_t len) > +{ > + BUG_ON(auxv_size != sizeof(current->mm->saved_auxv)); Nit: size_t auxv_size = sizeof(user_auxv); BUILD_BUG_ON(sizeof(user_auxv) != sizeof(current->mm->saved_auxv)); (to make it local variable instead of a parameter and get rid of a new BUG_ON()) > + > + if (!addr || len > auxv_size) > + return -EINVAL; > + > + memset(auxv, 0, auxv_size); > + if (len && copy_from_user(auxv, addr, len)) > + return -EFAULT; > + > + /* > + * Specification requires the vector to be > + * ended up with AT_NULL entry so user space > + * will notice where to stop enumerating. > + */ > + if (len == auxv_size) { > + auxv[AT_VECTOR_SIZE - 2] = AT_NULL; > + auxv[AT_VECTOR_SIZE - 1] = AT_NULL; I don't follow why it became conditional. Perhaps, you meant that memset(0) above will zerofy it anyway, but in case (len == auxv_size - 1) it won't work. Or I'm missing something obvious :-) Thanks, Dima
Re: [PATCH v5 2/3] Revert "mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio"
On 3/23/21 6:25 PM, Brian Geffon wrote: > This reverts commit cd544fd1dc9293c6702fab6effa63dac1cc67e99. > > As discussed in [1] this commit was a no-op because the mapping type was > checked in vma_to_resize before move_vma is ever called. This meant that > vm_ops->mremap() would never be called on such mappings. Furthermore, > we've since expanded support of MREMAP_DONTUNMAP to non-anonymous > mappings, and these special mappings are still protected by the existing > check of !VM_DONTEXPAND and !VM_PFNMAP which will result in a -EINVAL. > > 1. https://lkml.org/lkml/2020/12/28/2340 > > Signed-off-by: Brian Geffon > Acked-by: Hugh Dickins Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thanks, Dmitry
Re: [PATCH v5 1/3] mm: Extend MREMAP_DONTUNMAP to non-anonymous mappings
On 3/23/21 6:25 PM, Brian Geffon wrote: > Currently MREMAP_DONTUNMAP only accepts private anonymous mappings. > This restriction was placed initially for simplicity and not because > there exists a technical reason to do so. > > This change will widen the support to include any mappings which are not > VM_DONTEXPAND or VM_PFNMAP. The primary use case is to support > MREMAP_DONTUNMAP on mappings which may have been created from a memfd. > This change will result in mremap(MREMAP_DONTUNMAP) returning -EINVAL > if VM_DONTEXPAND or VM_PFNMAP mappings are specified. > > Lokesh Gidra who works on the Android JVM, provided an explanation of how > such a feature will improve Android JVM garbage collection: > "Android is developing a new garbage collector (GC), based on userfaultfd. > The garbage collector will use userfaultfd (uffd) on the java heap during > compaction. On accessing any uncompacted page, the application threads will > find it missing, at which point the thread will create the compacted page > and then use UFFDIO_COPY ioctl to get it mapped and then resume execution. > Before starting this compaction, in a stop-the-world pause the heap will be > mremap(MREMAP_DONTUNMAP) so that the java heap is ready to receive > UFFD_EVENT_PAGEFAULT events after resuming execution. Pretty interesting idea :-) > > To speedup mremap operations, pagetable movement was optimized by moving > PUD entries instead of PTE entries [1]. It was necessary as mremap of even > modest sized memory ranges also took several milliseconds, and stopping the > application for that long isn't acceptable in response-time sensitive > cases. > > With UFFDIO_CONTINUE feature [2], it will be even more efficient to > implement this GC, particularly the 'non-moveable' portions of the heap. > It will also help in reducing the need to copy (UFFDIO_COPY) the pages. > However, for this to work, the java heap has to be on a 'shared' vma. 
> Currently MREMAP_DONTUNMAP only supports private anonymous mappings, this > patch will enable using UFFDIO_CONTINUE for the new userfaultfd-based heap > compaction." > > [1] > https://lore.kernel.org/linux-mm/20201215030730.nc3cu98e4%25a...@linux-foundation.org/ > [2] > https://lore.kernel.org/linux-mm/20210302000133.272579-1-axelrasmus...@google.com/ > > Signed-off-by: Brian Geffon > Acked-by: Hugh Dickins > Tested-by: Lokesh Gidra Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thanks, Dmitry
Re: [PATCH] mm: Allow shmem mappings with MREMAP_DONTUNMAP
Hi Brian, Hugh, On 3/16/21 7:18 PM, Brian Geffon wrote: > Hi Hugh, > Thanks for this suggestion, responses in line. > >> A better patch would say: >> >> - if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || >> - vma->vm_flags & VM_SHARED)) >> + if ((flags & MREMAP_DONTUNMAP) && >> + (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) >> return ERR_PTR(-EINVAL); >> >> VM_DONTEXPAND is what has long been used on special mappings, to prevent >> surprises from mremap changing the size of the mapping: MREMAP_DONTUNMAP >> introduced a different way of expanding the mapping, so VM_DONTEXPAND >> still seems a reasonable name (I've thrown in VM_PFNMAP there because >> it's in the VM_DONTEXPAND test lower down: for safety I guess, and best >> if both behave the same - though one says -EINVAL and the other -EFAULT). > > I like this idea and am happy to mail a new patch. I think it may make > sense to bring the lower block up here so that it becomes more clear > that it's not duplicate code and that the MREMAP_DONTUNMAP case > returns -EINVAL and other cases return -EFAULT. I wonder if the > -EFAULT error code would have made more sense from the start for both > cases, do you have any thoughts on changing the error code at this > point? > >> With that VM_DONTEXPAND check in, Dmitry's commit cd544fd1dc92 >> ("mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio") >> can still be reverted (as you agreed on 28th December), even though >> vma_is_anonymous() will no longer protect it. > > I agree and if Dmitry does not have time I would be happy to mail a > revert to cd544fd1dc92 as we discussed in [1]. Dmitry, would you like > me to do that? Ack. I was planning to send a patches set that includes the revert, but that's stalled a bit. As the patch just adds excessive checks, but doesn't introduce an issue, I haven't sent it separately. Feel free to revert it :-) Thanks, Dmitry
[PATCH] perf diff: Don't crash on freeing errno-session
__cmd_diff() sets result of perf_session__new() to d->session. In case of failure, it's errno and perf-diff may crash with: failed to open perf.data: Permission denied Failed to open perf.data Segmentation fault (core dumped) From the coredump: 0 0x5569a62b5955 in auxtrace__free (session=0x) at util/auxtrace.c:2681 1 0x5569a626b37d in perf_session__delete (session=0x) at util/session.c:295 2 perf_session__delete (session=0x) at util/session.c:291 3 0x5569a618008a in __cmd_diff () at builtin-diff.c:1239 4 cmd_diff (argc=, argv=) at builtin-diff.c:2011 [..] Funny enough, it won't always crash. For me it crashes only if failed file is second in cmd-line: the reason is that cmd_diff() checks files for branch-stacks [in check_file_brstack()] and if the first file doesn't have brstacks, it doesn't proceed to try open other files from cmd-line. Check d->session before calling perf_session__delete(). Another solution would be assigning to temporary variable, checking it, but I find it easier to follow with IS_ERR() check in the same function. After some time it's still obvious why the check is needed, and with temp variable it's possible to make the same mistake. Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Signed-off-by: Dmitry Safonov --- tools/perf/builtin-diff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index cefc71506409..b0c57e55052d 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -1236,7 +1236,8 @@ static int __cmd_diff(void) out_delete: data__for_each_file(i, d) { - perf_session__delete(d->session); + if (!IS_ERR(d->session)) + perf_session__delete(d->session); data__free(d); } -- 2.30.1
[PATCH] perf: Use (long) for iterator for bfd symbols
GCC (GCC) 8.4.0 20200304 fails to build perf with: : util/symbol.c: In function 'dso__load_bfd_symbols': : util/symbol.c:1626:16: error: comparison of integer expressions of different signednes : for (i = 0; i < symbols_count; ++i) { : ^ : util/symbol.c:1632:16: error: comparison of integer expressions of different signednes :while (i + 1 < symbols_count && : ^ : util/symbol.c:1637:13: error: comparison of integer expressions of different signednes :if (i + 1 < symbols_count && : ^ : cc1: all warnings being treated as errors It's unlikely that the symtable will be that big, but the fix is oneliner and as perf has CORE_CFLAGS += -Wextra, which makes build to fail together with CORE_CFLAGS += -Werror Fixes: eac9a4342e54 ("perf symbols: Try reading the symbol table with libbfd") Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ingo Molnar Cc: Jacek Caban Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Remi Bernon Signed-off-by: Dmitry Safonov --- tools/perf/util/symbol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 64a039cbba1b..1645fb4ec9ed 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1561,12 +1561,11 @@ static int bfd2elf_binding(asymbol *symbol) int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) { int err = -1; - long symbols_size, symbols_count; + long symbols_size, symbols_count, i; asection *section; asymbol **symbols, *sym; struct symbol *symbol; bfd *abfd; - u_int i; u64 start, len; abfd = bfd_openr(dso->long_name, NULL); -- 2.30.0
Re: [PATCH 6/6] mm: Forbid splitting special mappings
On 1/22/21 1:00 PM, Will Deacon wrote: > On Fri, Jan 22, 2021 at 12:58:58PM +, Will Deacon wrote: >> On Tue, Oct 13, 2020 at 02:34:16AM +0100, Dmitry Safonov wrote: >>> Don't allow splitting of vm_special_mapping's. >>> It affects vdso/vvar areas. Uprobes have only one page in xol_area so >>> they aren't affected. >>> >>> Those restrictions were enforced by checks in .mremap() callbacks. >>> Restrict resizing with generic .split() callback. >>> >>> Signed-off-by: Dmitry Safonov >>> --- >>> arch/arm/kernel/vdso.c| 9 - >>> arch/arm64/kernel/vdso.c | 41 +++ >>> arch/mips/vdso/genvdso.c | 4 >>> arch/s390/kernel/vdso.c | 11 +-- >>> arch/x86/entry/vdso/vma.c | 17 >>> mm/mmap.c | 12 >>> 6 files changed, 16 insertions(+), 78 deletions(-) >> >> For arm64: >> >> Acked-by: Will Deacon > > Wait -- this got merged ages ago! Why am I reading such old email? It's alright, thanks for looking at this again! -- Dmitry
Re: [PATCH 3/6] mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio
[I moved your reply to avoid top-posting] On 12/28/20 6:03 PM, Brian Geffon wrote: > On Mon, Oct 12, 2020 at 6:34 PM Dmitry Safonov wrote: >> >> As kernel expect to see only one of such mappings, any further >> operations on the VMA-copy may be unexpected by the kernel. >> Maybe it's being on the safe side, but there doesn't seem to be any >> expected use-case for this, so restrict it now. >> >> Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") >> Signed-off-by: Dmitry Safonov > > I don't think this situation can ever happen MREMAP_DONTUNMAP is > already restricted to anonymous mappings (defined as not having > vm_ops) and vma_to_resize checks that the mapping is anonymous before > move_vma is called. I've looked again now, I think it is possible. One can call MREMAP_DONTUNMAP without MREMAP_FIXED and without resizing. So that the old VMA is copied at some free address. The calltrace would be: mremap()=>move_vma() [under if (flags & MREMAP_MAYMOVE)]. On the other side I agree with you that the fix could have been better if I realized the semantics that MREMAP_DONTUNMAP should only work with anonymous mappings. Probably, a better fix would be to move : if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || : vma->vm_flags & VM_SHARED)) : return ERR_PTR(-EINVAL); from vma_to_resize() into the mremap() syscall directly. What do you think? -- Dima
Re: [PATCH 3/6] mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio
On 12/28/20 7:33 PM, Dmitry Safonov wrote: > [I moved your reply to avoid top-posting] > > On 12/28/20 6:03 PM, Brian Geffon wrote: >> On Mon, Oct 12, 2020 at 6:34 PM Dmitry Safonov wrote: >>> >>> As kernel expect to see only one of such mappings, any further >>> operations on the VMA-copy may be unexpected by the kernel. >>> Maybe it's being on the safe side, but there doesn't seem to be any >>> expected use-case for this, so restrict it now. >>> >>> Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") >>> Signed-off-by: Dmitry Safonov >> >> I don't think this situation can ever happen MREMAP_DONTUNMAP is >> already restricted to anonymous mappings (defined as not having >> vm_ops) and vma_to_resize checks that the mapping is anonymous before >> move_vma is called. > > I've looked again now, I think it is possible. One can call > MREMAP_DONTUNMAP without MREMAP_FIXED and without resizing. So that the > old VMA is copied at some free address. > > The calltrace would be: mremap()=>move_vma() > [under if (flags & MREMAP_MAYMOVE)]. > > On the other side I agree with you that the fix could have been better > if I realized the semantics that MREMAP_DONTUNMAP should only work with > anonymous mappings. > > Probably, a better fix would be to move > : if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || > : vma->vm_flags & VM_SHARED)) > : return ERR_PTR(-EINVAL); > > from vma_to_resize() into the mremap() syscall directly. > What do you think? Ok, I've misread the code now, it checks vma_to_resize() before. I'll send a revert to this one. Thanks for noticing, Dima
Re: [PATCH 2/6] mm/mremap: For MREMAP_DONTUNMAP check security_vm_enough_memory_mm()
On 12/28/20 6:21 PM, Brian Geffon wrote: > This looks good to me with a small comment. > >> if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { >> /* OOM: unable to split vma, just get accounts right */ >> - if (vm_flags & VM_ACCOUNT) >> + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) >> vm_acct_memory(new_len >> PAGE_SHIFT); > > Checking MREMAP_DONTUNMAP in the do_munmap path is unnecessary as > MREMAP_DONTUNMAP will have already returned by this point. In this code it is also used as err-path. In case move_page_tables() fails to move all page tables or .mremap() callback fails, the new VMA is unmapped. IOW, MREMAP_DONTUNMAP returns under: : if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { -- Dima
Re: [RESEND PATCH v3 0/4] iommu/iova: Solve longterm IOVA issue
On Tue, 1 Dec 2020 at 21:50, Will Deacon wrote: > > On Tue, 17 Nov 2020 18:25:30 +0800, John Garry wrote: > > This series contains a patch to solve the longterm IOVA issue which > > leizhen originally tried to address at [0]. > > > > A sieved kernel log is at the following, showing periodic dumps of IOVA > > sizes, per CPU and per depot bin, per IOVA size granule: > > https://raw.githubusercontent.com/hisilicon/kernel-dev/topic-iommu-5.10-iova-debug-v3/aging_test > > > > [...] > > Applied the final patch to arm64 (for-next/iommu/iova), thanks! > > [4/4] iommu: avoid taking iova_rbtree_lock twice > https://git.kernel.org/arm64/c/3a651b3a27a1 Glad it made it into next; 2 years ago I couldn't convince the iommu maintainer it was worth it (but with a different justification): https://lore.kernel.org/linux-iommu/20180621180823.805-3-d...@arista.com/ Thanks, Dmitry
Re: [PATCH v2 00/19] Add generic vdso_base tracking
Hi Christophe, On 11/24/20 6:53 AM, Christophe Leroy wrote: > > > Le 24/11/2020 à 01:29, Dmitry Safonov a écrit : >> v2 Changes: >> - Rename user_landing to vdso_base as it tracks vDSO VMA start address, >> rather than the explicit address to land (Andy) >> - Reword and don't use "new-execed" and "new-born" task (Andy) >> - Fix failures reported by build robot >> >> Started from discussion [1], where was noted that currently a couple of >> architectures support mremap() for vdso/sigpage, but not munmap(). >> If an application maps something on the ex-place of vdso/sigpage, >> later after processing signal it will land there (good luck!) >> >> Patches set is based on linux-next (next-20201123) and it depends on >> changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also >> on my changes in akpm (fixing several mremap() issues). > > I have a series that cleans up VDSO init on powerpc and migrates powerpc > to _install_special_mapping() (patch 10 of the series). > > https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=204396=%2A=both > > > I'm wondering how we should coordinate with your series for merging. > > I guess your series will also imply removal of arch_unmap() ? see > https://elixir.bootlin.com/linux/v5.10-rc4/source/arch/powerpc/include/asm/mmu_context.h#L262 I think our series intersect only in that moment where I re-introduce arch_setup_additional_pages() parameters. So, in theory we could minimize the conflicts by merging both series in parallel and cleanup the result by moving to generic vdso_base on the top, what do you think? Thanks, Dmitry
Re: [PATCH v2 13/19] x86/signal: Check if vdso_image_32 is mapped before trying to land on it
On 11/24/20 11:43 PM, Andy Lutomirski wrote: > On Mon, Nov 23, 2020 at 4:29 PM Dmitry Safonov wrote: >> >> Provide current_has_vdso_image_32() helper and check it apriory landing >> attempt on vdso vma. >> The helper is a macro, not a static inline funciton to avoid >> linux/sched/task_stack.h inclusion in asm/vdso.h. > > Can you make this more general? For example: > > current_has_vdso(_image_whatever) Sounds good, will do. > > also: > >> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c >> index 18d8f17f755c..d9ab58cc765b 100644 >> --- a/arch/x86/entry/common.c >> +++ b/arch/x86/entry/common.c >> @@ -142,11 +142,16 @@ static noinstr bool __do_fast_syscall_32(struct >> pt_regs *regs) >> /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ >> __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) >> { >> + unsigned long landing_pad; >> + >> + if (!current_has_vdso_image_32()) >> + force_sigsegv(SIGSEGV); > > Falling through seems incorrect here. I would instead write 0 to rip, > do the force_sigsegv(), and exit, making sure that exiting goes > through the correct path. > Something like this? : if (!current_has_vdso(_image_32)) { : regs->ax = -EFAULT; : regs->ip = 0; : force_sigsegv(SIGSEGV); : syscall_exit_to_user_mode(regs); : } Thanks, Dmitry
Re: [PATCH v2 08/19] arm/vdso: Remove vdso pointer from mm->context
On 11/24/20 6:22 AM, Christophe Leroy wrote: > > > Le 24/11/2020 à 01:29, Dmitry Safonov a écrit : >> Not used any more. > > But what about mremap() ? Maybe you should explain why you can remove it ? Yep, it was only to keep track of context->vdso position. I'll add it to the description. Thanks, Dmitry
Re: [PATCH v2 05/19] elf: Remove compat_arch_setup_additional_pages()
On 11/24/20 6:13 AM, Christophe Leroy wrote: > > > Le 24/11/2020 à 01:29, Dmitry Safonov a écrit : >> Now that all users rely on detecting bitness of new-born task checking >> personality, remove compat_arch_setup_additional_pages() macro, >> simplifying the code. > > I understand from cover that you wanted to reword "new-born" ? Yes. I have reread the messages; not sure how I missed it. Thanks, Dmitry
Re: [PATCH v2 06/19] elf/vdso: Reuse arch_setup_additional_pages() parameters
On 11/24/20 6:18 AM, Christophe Leroy wrote: > "Reuse arch_setup_additional_pages() parameters" > > Did you mean "remove" ? Or "Revise" ? > > Maybe could be: > > "Modify arch_setup_additional_pages() parameters" Sure. Thanks, Dmitry
Re: [PATCH v2 02/19] elf: Move arch_setup_additional_pages() to generic elf.h
On 11/24/20 6:12 AM, Christophe Leroy wrote: [..] >> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig >> index 6fd7d38a60c8..4221f171d1a9 100644 >> --- a/arch/powerpc/Kconfig >> +++ b/arch/powerpc/Kconfig >> @@ -134,6 +134,7 @@ config PPC >> select ARCH_HAS_PTE_SPECIAL >> select ARCH_HAS_MEMBARRIER_CALLBACKS >> select ARCH_HAS_MEMBARRIER_SYNC_CORE >> + select ARCH_HAS_SETUP_ADDITIONAL_PAGES > > We try to keep alphabetic order on powerpc, should go after > ARCH_HAS_SCALED_CPUTIME Yep, I think I have to learn the alphabet. Thanks for noticing, Dmitry
[PATCH v2 02/19] elf: Move arch_setup_additional_pages() to generic elf.h
Ifdef the function in the header, not in the code. Following kernel style, move it to Kconfig. All it makes it easier to follow when the option is enabled/disabled. Remove re-definition from compat_binfmt_elf, as it's always defined under compat_arch_setup_additional_pages (to be reworked). Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig| 1 + arch/arm/include/asm/elf.h | 5 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/elf.h| 6 +- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 arch/hexagon/Kconfig| 1 + arch/hexagon/include/asm/elf.h | 6 -- arch/mips/Kconfig | 1 + arch/mips/include/asm/elf.h | 5 - arch/nds32/Kconfig | 1 + arch/nds32/include/asm/elf.h| 3 --- arch/nios2/Kconfig | 1 + arch/nios2/include/asm/elf.h| 4 arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/elf.h | 5 - arch/riscv/Kconfig | 1 + arch/riscv/include/asm/elf.h| 4 arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 5 - arch/sh/Kconfig | 1 + arch/sh/include/asm/elf.h | 6 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/elf_64.h | 6 -- arch/x86/Kconfig| 1 + arch/x86/include/asm/elf.h | 4 arch/x86/um/asm/elf.h | 5 - fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 2 -- fs/binfmt_elf_fdpic.c | 3 +-- fs/compat_binfmt_elf.c | 2 -- include/linux/elf.h | 12 32 files changed, 30 insertions(+), 73 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 3b1caaefbff0..a6330e7eb2b8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -15,6 +15,7 @@ config ARM select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_PHYS_TO_DMA + select ARCH_HAS_SETUP_ADDITIONAL_PAGES if MMU select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index b8102a6ddf16..47347d7412ec 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -137,7 +137,6 @@ extern int arm_elf_read_implies_exec(int); extern void 
elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) -#ifdef CONFIG_MMU #ifdef CONFIG_VDSO #define ARCH_DLINFO\ do { \ @@ -145,9 +144,5 @@ do { \ (elf_addr_t)current->mm->context.vdso); \ } while (0) #endif -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -struct linux_binprm; -int arch_setup_additional_pages(struct linux_binprm *, int); -#endif #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 629a293cc408..35d8e4acd38d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -26,6 +26,7 @@ config ARM64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 8d1c8dcb87fd..d1073ffa7f24 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -181,11 +181,6 @@ do { \ NEW_AUX_ENT(AT_IGNORE, 0); \ } while (0) -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES -struct linux_binprm; -extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); - /* 1GB of VA */ #ifdef CONFIG_COMPAT #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ @@ -242,6 +237,7 @@ do { \ #else #define COMPAT_ARCH_DLINFO #endif +struct linux_binprm; extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages \ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 21b2ab099c8b..f068d67456a2 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -4,6 +4,7 @@ config CSKY select ARCH_32BIT_OFF_T select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEV
[PATCH v2 08/19] arm/vdso: Remove vdso pointer from mm->context
Not used any more. Signed-off-by: Dmitry Safonov --- arch/arm/include/asm/mmu.h | 3 --- arch/arm/kernel/vdso.c | 12 2 files changed, 15 deletions(-) diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 1592a4264488..2397b0a19f59 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -12,9 +12,6 @@ typedef struct { #endif unsigned intvmalloc_seq; unsigned long sigpage; -#ifdef CONFIG_VDSO - unsigned long vdso; -#endif #ifdef CONFIG_BINFMT_ELF_FDPIC unsigned long exec_fdpic_loadmap; unsigned long interp_fdpic_loadmap; diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 710e5ca99a53..4b39c7d8f525 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -47,17 +47,8 @@ static const struct vm_special_mapping vdso_data_mapping = { .pages = _data_page, }; -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = new_vma->vm_start; - - return 0; -} - static struct vm_special_mapping vdso_text_mapping __ro_after_init = { .name = "[vdso]", - .mremap = vdso_mremap, }; struct elfinfo { @@ -239,8 +230,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; unsigned long len; - mm->context.vdso = 0; - if (vdso_text_pagelist == NULL) return; @@ -258,7 +247,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, if (IS_ERR(vma)) return; - mm->context.vdso = addr; *sysinfo_ehdr = addr; } -- 2.29.2
[PATCH v2 06/19] elf/vdso: Reuse arch_setup_additional_pages() parameters
Both parameters of arch_setup_additional_pages() are currently unused. commit fc5243d98ac2 ("[S390] arch_setup_additional_pages arguments") tried to introduce useful arguments, but they still are not used. Remove old parameters and introduce sysinfo_ehdr argument that will be used to return vdso address to put as AT_SYSINFO_EHDR tag in auxiliary vector. The reason to add this parameter is that many architecture have vDSO pointer saved in their mm->context with the only purpose to use it later in ARCH_DLINFO. That's the macro for elf loader to setup sysinfo_ehdr tag. Return sysinfo_ehdr address that will be later used by ARCH_DLINFO as an argument. That will allow to drop vDSO pointer from mm->context and any code responsible to track vDSO position on platforms that don't use vDSO as a landing in userspace (arm/s390/sparc). Cc: Albert Ou Cc: "David S. Miller" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: linux-fsde...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/arm/include/asm/vdso.h| 6 -- arch/arm/kernel/process.c | 4 ++-- arch/arm/kernel/vdso.c | 10 +++--- arch/arm64/kernel/vdso.c | 17 arch/csky/kernel/vdso.c| 3 ++- arch/hexagon/kernel/vdso.c | 3 ++- arch/mips/kernel/vdso.c| 3 ++- arch/nds32/kernel/vdso.c | 3 ++- arch/nios2/mm/init.c | 2 +- arch/powerpc/kernel/vdso.c | 3 ++- arch/riscv/kernel/vdso.c | 11 +- arch/s390/kernel/vdso.c| 3 ++- arch/sh/kernel/vsyscall/vsyscall.c | 3 ++- arch/sparc/vdso/vma.c | 15 +++--- arch/x86/entry/vdso/vma.c | 32 +- arch/x86/um/vdso/vma.c | 2 +- fs/binfmt_elf.c| 3 ++- fs/binfmt_elf_fdpic.c | 3 ++- include/linux/elf.h| 17 +++- 19 files changed, 85 insertions(+), 58 deletions(-) diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h index 5b85889f82ee..6b2b3b1fe833 100644 --- a/arch/arm/include/asm/vdso.h +++ b/arch/arm/include/asm/vdso.h @@ -10,13 +10,15 @@ struct mm_struct; #ifdef CONFIG_VDSO -void arm_install_vdso(struct mm_struct *mm, unsigned long addr); +void arm_install_vdso(struct mm_struct *mm, unsigned 
long addr, + unsigned long *sysinfo_ehdr); extern unsigned int vdso_total_pages; #else /* CONFIG_VDSO */ -static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d0220da1d1b1..0e90cba8ac7a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -389,7 +389,7 @@ static const struct vm_special_mapping sigpage_mapping = { .mremap = sigpage_mremap, }; -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -430,7 +430,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * to be fatal to the process, so no error check needed * here. */ - arm_install_vdso(mm, addr + PAGE_SIZE); + arm_install_vdso(mm, addr + PAGE_SIZE, sysinfo_ehdr); up_fail: mmap_write_unlock(mm); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 3408269d19c7..710e5ca99a53 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -233,7 +233,8 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) } /* assumes mmap_lock is write-locked */ -void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { struct vm_area_struct *vma; unsigned long len; @@ -254,7 +255,10 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, _text_mapping); - if (!IS_ERR(vma)) - mm->context.vdso = addr; + if (IS_ERR(vma)) + return; + + mm->context.vdso = addr; + *sysinfo_ehdr = addr; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 1b710deb84d6..666338724a07 100644 --- a/arch/arm64/kernel/vdso.c 
+++ b/arch/arm64/kernel/vdso.c @@ -213,8 +213,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, - struct linux_binprm *bprm, -
[PATCH v2 13/19] x86/signal: Check if vdso_image_32 is mapped before trying to land on it
Provide current_has_vdso_image_32() helper and check it apriory landing attempt on vdso vma. The helper is a macro, not a static inline funciton to avoid linux/sched/task_stack.h inclusion in asm/vdso.h. Signed-off-by: Dmitry Safonov --- arch/x86/entry/common.c | 7 ++- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/vdso.h | 4 arch/x86/kernel/signal.c| 4 ++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 18d8f17f755c..d9ab58cc765b 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -142,11 +142,16 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { + unsigned long landing_pad; + + if (!current_has_vdso_image_32()) + force_sigsegv(SIGSEGV); + /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. 
*/ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; /* diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index ea3db15b57bf..f87ed1d53938 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -255,7 +255,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; } else { /* Return stub is in 32bit vsyscall page */ - if (current->mm->context.vdso) + if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -336,7 +336,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index b5d23470f56b..e3829c3a6149 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -41,6 +41,10 @@ extern const struct vdso_image vdso_image_x32; #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const struct vdso_image vdso_image_32; + +#define current_has_vdso_image_32()\ + likely(current->mm->context.vdso_image == _image_32 && \ + !!current->mm->context.vdso) #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 372ec09dc4ac..6fed2e523e0a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -319,7 +319,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(set->sig[1], >extramask[0], Efault); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = 
current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -381,7 +381,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else -- 2.29.2
[PATCH v2 15/19] x86/vdso: Migrate to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/x86/Kconfig | 1 + arch/x86/entry/common.c | 2 +- arch/x86/entry/vdso/extable.c | 4 ++-- arch/x86/entry/vdso/vma.c | 9 - arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/mmu.h| 1 - arch/x86/include/asm/vdso.h | 2 +- arch/x86/kernel/signal.c | 4 ++-- 8 files changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d676e0bad1f1..4f06760f849e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_BASE select ARCH_HAS_DEBUG_WX select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d9ab58cc765b..655b7d8fe734 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -151,7 +151,7 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. 
*/ - landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->vdso_base + vdso_image_32.sym_int80_landing_pad; /* diff --git a/arch/x86/entry/vdso/extable.c b/arch/x86/entry/vdso/extable.c index afcf5b65beef..16688c2d032c 100644 --- a/arch/x86/entry/vdso/extable.c +++ b/arch/x86/entry/vdso/extable.c @@ -25,10 +25,10 @@ bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) return false; - if (!current->mm->context.vdso) + if (current->mm->vdso_base == (void *)UNMAPPED_VDSO_BASE) return false; - base = (unsigned long)current->mm->context.vdso + image->extable_base; + base = (unsigned long)current->mm->vdso_base + image->extable_base; nr_entries = image->extable_len / (sizeof(*extable)); extable = image->extable; diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 65780a0164e3..29a795face9d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -77,7 +77,7 @@ static void vdso_fix_landing(const struct vdso_image *image, struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; unsigned long old_land_addr = vdso_land + - (unsigned long)current->mm->context.vdso; + (unsigned long)current->mm->vdso_base; /* Fixing userspace landing - look at do_fast_syscall_32 */ if (regs->ip == old_land_addr) @@ -92,7 +92,6 @@ static void vdso_mremap(const struct vm_special_mapping *sm, const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); - current->mm->context.vdso = (void __user *)new_vma->vm_start; } #ifdef CONFIG_TIME_NS @@ -287,7 +286,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr, ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); } else { - current->mm->context.vdso = (void __user *)text_start; + current->mm->vdso_base = (void __user *)text_start; current->mm->context.vdso_image = image; *sysinfo_ehdr = 
text_start; } @@ -362,8 +361,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may * not do accounting and rlimit right. -* We could search vma near context.vdso, but it's a slowpath, -* so let's explicitly check all VMAs to be completely sure. +* It's a slowpath, let's explicitly check all VMAs to be +* completely sure. */ for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, _mapping) || diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f87ed1d53938..67204b1eeea0 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -256,7 +256,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, } else { /* Return stub is in 32bit vsyscall page */ if (current_has_vdso
[PATCH v2 19/19] mips/vdso: Migrate to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Cc: Thomas Bogendoerfer Cc: linux-m...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/mips/Kconfig | 1 + arch/mips/kernel/signal.c | 11 +++ arch/mips/kernel/vdso.c | 2 +- arch/mips/vdso/genvdso.c | 8 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 818a9b2c78f1..70424605710f 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -10,6 +10,7 @@ config MIPS select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VDSO_BASE select ARCH_SUPPORTS_UPROBES select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if 64BIT diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index f1e985109da0..e0beaf2cdc0f 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -806,11 +806,13 @@ struct mips_abi mips_abi = { static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { + void *land = (void *)current->mm->vdso_base; sigset_t *oldset = sigmask_to_save(); - int ret; + int ret = 1; struct mips_abi *abi = current->thread.abi; - void *vdso = current->mm->context.vdso; + if (land == (void *)UNMAPPED_VDSO_BASE) + goto err; /* * If we were emulating a delay slot instruction, exit that frame such * that addresses in the sigframe are as expected for userland and we @@ -843,12 +845,13 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) rseq_signal_deliver(ksig, regs); if (sig_uses_siginfo(>ka, abi)) - ret = abi->setup_rt_frame(vdso + abi->vdso->off_rt_sigreturn, + ret = abi->setup_rt_frame(land + abi->vdso->off_rt_sigreturn, ksig, regs, oldset); else - ret = abi->setup_frame(vdso + abi->vdso->off_sigreturn, + ret = abi->setup_frame(land + 
abi->vdso->off_sigreturn, ksig, regs, oldset); +err: signal_setup_done(ret, ksig, 0); } diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index e124c68322bb..b2b2e596f03b 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -183,7 +183,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out; } - mm->context.vdso = (void *)vdso_addr; + mm->vdso_base = (void __user *)vdso_addr; *sysinfo_ehdr = vdso_addr; ret = 0; diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 0303d30cde03..8f581a2c8578 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,6 @@ int main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static void vdso_mremap(\n"); - fprintf(out_file, " const struct vm_special_mapping *sm,\n"); - fprintf(out_file, " struct vm_area_struct *new_vma)\n"); - fprintf(out_file, "{\n"); - fprintf(out_file, " current->mm->context.vdso =\n"); - fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ fprintf(out_file, @@ -290,7 +283,6 @@ int main(int argc, char **argv) fprintf(out_file, "\t.mapping = {\n"); fprintf(out_file, "\t\t.name = \"[vdso]\",\n"); fprintf(out_file, "\t\t.pages = vdso_pages,\n"); - fprintf(out_file, "\t\t.mremap = vdso_mremap,\n"); fprintf(out_file, "\t},\n"); /* Calculate and write symbol offsets to */ -- 2.29.2
[PATCH v2 18/19] arm64/vdso: Migrate native signals to generic vdso_base
Generic way to track the land vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/arm64/kernel/signal.c | 10 +++--- arch/arm64/kernel/vdso.c | 16 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bec6ef69704f..96e8988f033d 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -723,9 +723,10 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +static int setup_return(struct pt_regs *regs, struct k_sigaction *ka, struct rt_sigframe_user_layout *user, int usig) { + unsigned long land = (unsigned long)current->mm->vdso_base; __sigrestore_t sigtramp; regs->regs[0] = usig; @@ -754,10 +755,13 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; + else if (land != UNMAPPED_VDSO_BASE) + sigtramp = VDSO_SYMBOL(land, sigtramp); else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + return 1; regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -780,7 +784,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(>uc.uc_stack, regs->sp); err |= setup_sigframe(, regs, set); if (err == 0) { - setup_return(regs, >ka, , usig); + err = setup_return(regs, >ka, , usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(>info, >info); regs->regs[1] = (unsigned long)>info; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 03a6519d50cd..5b44463b739a 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,6 @@ static union { } vdso_data_store 
__page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static void vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = (void *)new_vma->vm_start; -} - static int __vdso_init(enum vdso_abi abi) { int i; @@ -239,7 +233,6 @@ static int __setup_additional_pages(enum vdso_abi abi, gp_flags = VM_ARM64_BTI; vdso_base += VVAR_NR_PAGES * PAGE_SIZE; - mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, @@ -247,12 +240,17 @@ static int __setup_additional_pages(enum vdso_abi abi, if (IS_ERR(ret)) goto up_fail; + /* +* 32-bit ABI is to land on sigpage (see aarch32_sigreturn_setup()), +* 64-bit on vDSO. +*/ + if (abi == VDSO_ABI_AA64) + mm->vdso_base = (void __user *)vdso_base; *sysinfo_ehdr = vdso_base; return 0; up_fail: - mm->context.vdso = NULL; return PTR_ERR(ret); } @@ -285,7 +283,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { }, [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; @@ -431,7 +428,6 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { }, [AA64_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; -- 2.29.2
[PATCH v2 16/19] arm/vdso: Migrate to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig | 1 + arch/arm/kernel/process.c | 9 + arch/arm/kernel/signal.c | 6 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a6330e7eb2b8..1df40666b024 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,6 +24,7 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VDSO_BASE select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 5f4eced738f5..c6d44b34b05e 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,16 +376,9 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static void sigpage_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.sigpage = new_vma->vm_start; -} - static const struct vm_special_mapping sigpage_mapping = { .name = "[sigpage]", .pages = _page, - .mremap = sigpage_mremap, }; int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) @@ -423,7 +416,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto up_fail; } - mm->context.sigpage = addr; + mm->vdso_base = (void __user *)addr; /* Unlike the sigpage, failure to install the vdso is unlikely * to be fatal to the process, so no error check needed diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 9d2e916121be..ac9c0b4869c9 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -451,13 +451,17 @@ setup_return(struct pt_regs *regs, struct ksignal 
*ksig, #ifdef CONFIG_MMU if (cpsr & MODE32_BIT) { struct mm_struct *mm = current->mm; + unsigned long land = (unsigned long)mm->vdso_base; + + if (land == UNMAPPED_VDSO_BASE) + return 1; /* * 32-bit code can use the signal return page * except when the MPU has protected the vectors * page from PL0 */ - retcode = mm->context.sigpage + signal_return_offset + + retcode = land + signal_return_offset + (idx << 2) + thumb; } else #endif -- 2.29.2
[PATCH v2 17/19] arm64/vdso: Migrate compat signals to generic vdso_base
Generic way to track the landing vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position (due to UNMAPPED_VDSO_BASE check instead of context.vdso ?= 0 check). Signed-off-by: Dmitry Safonov --- arch/arm64/Kconfig | 1 + arch/arm64/kernel/signal32.c | 17 - arch/arm64/kernel/vdso.c | 2 +- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 35d8e4acd38d..a93ec99644b8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -38,6 +38,7 @@ config ARM64 select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_VDSO_BASE select ARCH_HAVE_ELF_PROT select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_INLINE_READ_LOCK if !PREEMPTION diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 2f507f565c48..3d2ac8062e99 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -315,7 +315,7 @@ static void __user *compat_get_sigframe(struct ksignal *ksig, return frame; } -static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, +static int compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, compat_ulong_t __user *rc, void __user *frame, int usig) { @@ -342,13 +342,16 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, retcode = ptr_to_compat(ka->sa.sa_restorer); } else { /* Set up sigreturn pointer */ + unsigned long land = (unsigned long)current->mm->vdso_base; unsigned int idx = thumb << 1; if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = (unsigned long)current->mm->context.sigpage + - (idx << 2) + thumb; + if (land == UNMAPPED_VDSO_BASE) + return 1; + + retcode = land + (idx << 2) + thumb; } regs->regs[0] = usig; @@ -356,6 +359,8 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, regs->compat_lr = retcode; regs->pc= handler; regs->pstate= 
spsr; + + return 0; } static int compat_setup_sigframe(struct compat_sigframe __user *sf, @@ -425,7 +430,8 @@ int compat_setup_rt_frame(int usig, struct ksignal *ksig, err |= compat_setup_sigframe(>sig, regs, set); if (err == 0) { - compat_setup_return(regs, >ka, frame->sig.retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->sig.retcode, frame, usig); regs->regs[1] = (compat_ulong_t)(unsigned long)>info; regs->regs[2] = (compat_ulong_t)(unsigned long)>sig.uc; } @@ -448,7 +454,8 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= compat_setup_sigframe(frame, regs, set); if (err == 0) - compat_setup_return(regs, >ka, frame->retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->retcode, frame, usig); return err; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 91c1b7c716b7..03a6519d50cd 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -394,7 +394,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) if (IS_ERR(ret)) goto out; - mm->context.sigpage = (void *)addr; + mm->vdso_base = (void __user *)addr; out: return PTR_ERR_OR_ZERO(ret); -- 2.29.2
[PATCH v2 12/19] x86/signal: Land on &frame->retcode when vdso isn't mapped
Since commit 9fbbd4dd17d0 ("x86: Don't require the vDSO for handling a.out signals") after processing 32-bit signal if there is no vdso mapped frame->retcode is used as a landing. Do the same for rt ia32 signals. It also makes the ia32 compat signals match the native ia32 case. This shouldn't be mistaken for encouragement for running binaries with executable stack, rather something to do in hopefully very rare situation with disabled or unmapped vdso and absent SA_RESTORER. For non-executable stack it'll segfault on attempt to land, rather than land on a random address where vdso was previously mapped. For programs with executable stack it'll just do the same for rt signals as for non-rt. Discouraging users to run with executable stack is done separately in commit 47a2ebb7f505 ("execve: warn if process starts with executable stack"). Signed-off-by: Dmitry Safonov --- arch/x86/ia32/ia32_signal.c | 12 +++- arch/x86/kernel/signal.c| 23 ++- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 81cf22398cd1..ea3db15b57bf 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -270,8 +270,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, unsafe_put_user(set->sig[1], >extramask[0], Efault); unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* These are actually not used anymore, but left because some -* gdb versions depend on them as a marker. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); user_access_end(); @@ -336,14 +336,16 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* Not actually used anymore, but left because some gdb -* versions need it. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. */ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); unsafe_put_sigcontext32(>uc.uc_mcontext, fp, regs, set, Efault); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..372ec09dc4ac 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -317,23 +317,20 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(sig, >sig, Efault); unsafe_put_sigcontext(>sc, fp, regs, set, Efault); unsafe_put_user(set->sig[1], >extramask[0], Efault); - if (current->mm->context.vdso) + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else restorer = >retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ unsafe_put_user(restorer, >pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 -* -* WE DO NOT USE IT ANY MORE! It's only left here for historical -* reasons and because gdb uses it as a signature to notice -* signal handler stack frames. +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 *)frame->retcode, Efault); user_access_end(); @@ -382,18 +379,18 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, unsafe_save_altstack(>uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(restorer, >pretcode, Efault); /* * This
[PATCH v2 14/19] mm: Add vdso_base in mm_struct
Instead of having every architecture to define vdso_base/vdso_addr etc, provide a generic mechanism to track vdso_base for landing in userspace. It'll minimize per-architecture difference, the number of callbacks to provide. Originally, it started from thread [1] where the need for .close() callback on vm_special_mapping was pointed, this generic code besides removing duplicated .mremap() callbacks provides a cheaper way to support munmap() on vdso mappings without introducing .close() callbacks for every architecture (with would bring even more code duplication). [1]: https://lore.kernel.org/linux-arch/cajwjo6zanqykshbq+3b+fi_vt80mtrzev5yreqawx-l8j8x...@mail.gmail.com/ Cc: Thomas Bogendoerfer Cc: linux-m...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 3 ++- fs/aio.c | 3 ++- include/linux/mm.h| 3 ++- include/linux/mm_types.h | 10 ++ mm/Kconfig| 3 +++ mm/mmap.c | 19 ++- mm/mremap.c | 2 +- 7 files changed, 38 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index e916646adc69..786c97203bf6 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,8 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) +static int pseudo_lock_dev_mremap(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index d213be7b8a7e..9b205ebf17e8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -323,7 +323,8 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct *old_vma, + struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; struct 
mm_struct *mm = vma->vm_mm; diff --git a/include/linux/mm.h b/include/linux/mm.h index a1c25da94663..dbda1b91f971 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -559,7 +559,8 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma, unsigned long flags); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 95a257927dae..e1ca750874af 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -496,6 +496,16 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; +#ifdef CONFIG_ARCH_HAS_VDSO_BASE + /* +* Address of special mapping VMA to land after processing +* a signal. Reads are unprotected: if a thread unmaps or +* mremaps the mapping while another thread is processing +* a signal, it can segfault while landing. 
+*/ + void __user *vdso_base; +#endif +#define UNMAPPED_VDSO_BASE TASK_SIZE_MAX unsigned long flags; /* Must use atomic bitops to access */ diff --git a/mm/Kconfig b/mm/Kconfig index 29904dc16bfc..941ae0597734 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -893,4 +893,7 @@ config SECRETMEM select GENERIC_ALLOCATOR select CMA +config ARCH_HAS_VDSO_BASE + bool + endmenu diff --git a/mm/mmap.c b/mm/mmap.c index 17fe59a9780b..561afc3f1744 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3409,11 +3409,25 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) static vm_fault_t special_mapping_fault(struct vm_fault *vmf); +static void update_vdso_base(struct vm_area_struct *old_vma, + unsigned long new_addr) +{ +#ifdef CONFIG_ARCH_HAS_VDSO_BASE + struct mm_struct *mm = old_vma->vm_mm; + + if (WARN_ON_ONCE(!mm)) + return; + if (old_vma->vm_start == (unsigned long)mm->vdso_base) + mm->vdso_base = (void __user *)new_addr; +#endif +} + /* * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct vm_area_struct *vma) { + up
[PATCH v2 10/19] sparc/vdso: Remove vdso pointer from mm->context
Not used any more. Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/sparc/include/asm/mmu_64.h | 1 - arch/sparc/vdso/vma.c | 5 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 7e2704c770e9..8e7892890d14 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -111,7 +111,6 @@ typedef struct { unsigned long thp_pte_count; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; - void*vdso; booladi; tag_storage_desc_t *tag_store; spinlock_t tag_lock; diff --git a/arch/sparc/vdso/vma.c b/arch/sparc/vdso/vma.c index bf9195fe9bcc..255e052223ca 100644 --- a/arch/sparc/vdso/vma.c +++ b/arch/sparc/vdso/vma.c @@ -389,7 +389,6 @@ static int map_vdso(const struct vdso_image *image, } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -418,9 +417,7 @@ static int map_vdso(const struct vdso_image *image, } up_fail: - if (ret) - current->mm->context.vdso = NULL; - else + if (!ret) *sysinfo_ehdr = text_start; mmap_write_unlock(mm); -- 2.29.2
[PATCH v2 11/19] mm/mmap: Make vm_special_mapping::mremap return void
Previously .mremap() callback needed (int) return to provide way to restrict resizing of a special mapping. Now it's restricted by providing .may_split = special_mapping_split. Removing (int) return simplifies further changes to special_mapping_mremap() as it won't need save ret code from the callback. Also, it removes needless `return 0` from callbacks. Signed-off-by: Dmitry Safonov --- arch/arm/kernel/process.c | 3 +-- arch/arm64/kernel/vdso.c | 4 +--- arch/mips/vdso/genvdso.c | 3 +-- arch/x86/entry/vdso/vma.c | 4 +--- include/linux/mm_types.h | 2 +- mm/mmap.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e90cba8ac7a..5f4eced738f5 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,11 +376,10 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static int sigpage_mremap(const struct vm_special_mapping *sm, +static void sigpage_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.sigpage = new_vma->vm_start; - return 0; } static const struct vm_special_mapping sigpage_mapping = { diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 666338724a07..91c1b7c716b7 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,10 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.vdso = (void *)new_vma->vm_start; - - return 0; } static int __vdso_init(enum vdso_abi abi) diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 09e30eb4be86..0303d30cde03 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,12 @@ int 
main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static int vdso_mremap(\n"); + fprintf(out_file, "static void vdso_mremap(\n"); fprintf(out_file, " const struct vm_special_mapping *sm,\n"); fprintf(out_file, " struct vm_area_struct *new_vma)\n"); fprintf(out_file, "{\n"); fprintf(out_file, " current->mm->context.vdso =\n"); fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, " return 0;\n"); fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b9020742e66..65780a0164e3 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -86,15 +86,13 @@ static void vdso_fix_landing(const struct vdso_image *image, #endif } -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; - - return 0; } #ifdef CONFIG_TIME_NS diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 65df8abd90bd..95a257927dae 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -780,7 +780,7 @@ struct vm_special_mapping { struct vm_area_struct *vma, struct vm_fault *vmf); - int (*mremap)(const struct vm_special_mapping *sm, + void (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); }; diff --git a/mm/mmap.c b/mm/mmap.c index d0d458632c1b..17fe59a9780b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3433,7 +3433,7 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma, return -EFAULT; if (sm->mremap) - return sm->mremap(sm, new_vma); + sm->mremap(sm, new_vma); return 0; } -- 2.29.2
[PATCH v2 09/19] s390/vdso: Remove vdso_base pointer from mm->context
Not used any more. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/s390/include/asm/mmu.h | 1 - arch/s390/kernel/vdso.c | 10 -- 2 files changed, 11 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index e12ff0f29d1a..095d0596f700 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -15,7 +15,6 @@ typedef struct { unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; - unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ atomic_t is_protected; /* diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 810b72f8985c..3f07711a07c1 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -58,18 +58,9 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, return 0; } -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *vma) -{ - current->mm->context.vdso_base = vma->vm_start; - - return 0; -} - static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, - .mremap = vdso_mremap, }; static int __init vdso_setup(char *str) @@ -204,7 +195,6 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out_up; } - current->mm->context.vdso_base = vdso_base; *sysinfo_ehdr = vdso_base; rc = 0; -- 2.29.2
[PATCH v2 04/19] x86: Remove compat_arch_setup_additional_pages()
The same as for x32 task, detect ia32 task by in_ia32_syscall(). It's valid as execed task is pretending to be in a syscall of relevant bitness/ABI, see the comment near in_32bit_syscall(). Removing compat_arch_setup_additional_pages() provides single point of entry - arch_setup_additional_pages(), makes ifdeffery easier to read, aligns the code with powerpc and sparc (mips also has single vdso setup function, but instead of taking bitness from mm.context, takes vdso image pointer there). Together with arm64 code align to use in_compat_task(), it makes possible to remove compat_arch_setup_additional_pages() macro re-definition from compat elf code (another redefined marco less). Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 41 +++--- arch/x86/include/asm/elf.h | 5 - 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4eea508e9b10..aace862ed9a1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -376,48 +376,49 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) } #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static int load_vdso32(void) +static int load_vdso_ia32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; return map_vdso(_image_32, 0); } +#else +static int load_vdso_ia32(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; +} #endif #ifdef CONFIG_X86_64 -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int load_vdso_64(void) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(_image_64); -} - -#ifdef CONFIG_COMPAT -int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) -{ #ifdef CONFIG_X86_X32_ABI - if (in_x32_syscall()) { - if (!vdso64_enabled) - return 0; + if (in_x32_syscall()) return map_vdso_randomized(_image_x32); - } #endif -#ifdef CONFIG_IA32_EMULATION - return load_vdso32(); + + return 
map_vdso_randomized(_image_64); +} #else - return 0; -#endif +static int load_vdso_64(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; } #endif -#else + int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return load_vdso32(); + if (in_ia32_syscall()) + return load_vdso_ia32(); + + return load_vdso_64(); } -#endif #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index d00b723eea2d..51a08f6b18e5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -377,11 +377,6 @@ else \ ((unsigned long)current->mm->context.vdso + \ vdso_image_32.sym___kernel_vsyscall) -struct linux_binprm; -extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), -- 2.29.2
[PATCH v2 05/19] elf: Remove compat_arch_setup_additional_pages()
Now that all users rely on detecting bitness of new-born task checking personality, remove compat_arch_setup_additional_pages() macro, simplifying the code. Signed-off-by: Dmitry Safonov --- fs/compat_binfmt_elf.c | 5 - 1 file changed, 5 deletions(-) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 3606dd3a32f5..da8ee4d6e451 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,11 +115,6 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#undef arch_setup_additional_pages -#definearch_setup_additional_pages compat_arch_setup_additional_pages -#endif - #ifdef compat_elf_read_implies_exec #undef elf_read_implies_exec #defineelf_read_implies_exec compat_elf_read_implies_exec -- 2.29.2
[PATCH v2 07/19] elf: Use sysinfo_ehdr in ARCH_DLINFO()
Instead mm->context.vdso use the pointer provided by elf loader. That allows to drop the pointer on arm/s390/sparc. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/alpha/include/asm/elf.h| 2 +- arch/arm/include/asm/elf.h | 5 ++--- arch/arm64/include/asm/elf.h| 18 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/include/asm/elf.h | 5 ++--- arch/nds32/include/asm/elf.h| 5 ++--- arch/powerpc/include/asm/elf.h | 4 ++-- arch/riscv/include/asm/elf.h| 5 ++--- arch/s390/include/asm/elf.h | 5 ++--- arch/sh/include/asm/elf.h | 10 +- arch/sparc/include/asm/elf_64.h | 5 ++--- arch/x86/include/asm/elf.h | 33 ++--- arch/x86/um/asm/elf.h | 4 ++-- fs/binfmt_elf.c | 6 +++--- fs/binfmt_elf_fdpic.c | 11 ++- 15 files changed, 51 insertions(+), 69 deletions(-) diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h index 8049997fa372..701e820f28f0 100644 --- a/arch/alpha/include/asm/elf.h +++ b/arch/alpha/include/asm/elf.h @@ -155,7 +155,7 @@ extern int alpha_l2_cacheshape; extern int alpha_l3_cacheshape; /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ NEW_AUX_ENT(AT_L1I_CACHESHAPE, alpha_l1i_cacheshape); \ NEW_AUX_ENT(AT_L1D_CACHESHAPE, alpha_l1d_cacheshape); \ diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 47347d7412ec..76a0f04190f0 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -138,10 +138,9 @@ extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) #ifdef CONFIG_VDSO -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #endif diff --git a/arch/arm64/include/asm/elf.h 
b/arch/arm64/include/asm/elf.h index a81953bcc1cf..e62818967a69 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -165,10 +165,9 @@ typedef struct user_fpsimd_state elf_fpregset_t; }) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ \ /* \ * Should always be nonzero unless there's a kernel bug.\ @@ -223,19 +222,12 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; set_thread_flag(TIF_32BIT); \ }) #ifdef CONFIG_COMPAT_VDSO -#define COMPAT_ARCH_DLINFO \ +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) \ do { \ - /* \ -* Note that we use Elf64_Off instead of elf_addr_t because \ -* elf_addr_t in compat is defined as Elf32_Addr and casting\ -* current->mm->context.vdso to it triggers a cast warning of \ -* cast from pointer to integer of different size. \ -*/ \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (Elf64_Off)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #else -#define COMPAT_ARCH_DLINFO +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) #endif #endif /* CONFIG_COMPAT */ diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index 6629301a2620..a257e5abddce 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -208,7 +208,7 @@ struct task_struct; #define GATE_EHD
[PATCH v2 01/19] x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages()
Partly revert commit 3316ec8ccd34 ("x86/elf: Use e_machine to check for x32/ia32 in setup_additional_pages()") and commit 9a29a671902c ("elf: Expose ELF header on arch_setup_additional_pages()". Both patches did a good thing: removed usage of TIF_X32, but with a price of additional macros ARCH_SETUP_ADDITIONAL_PAGES() and ifdeffs. Instead, use in_x32_syscall() - the first thing load_elf_binary() does after parsing and checking new ELF binary. It's done that early after exec() that way also allows to use it in mmap() code straight away, which needs it to know which mmap_base to use (see arch_pick_mmap_layout()). Add comments that describe how it works. Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 4 ++-- arch/x86/include/asm/compat.h | 6 ++ arch/x86/include/asm/elf.h| 6 ++ fs/binfmt_elf.c | 10 +++--- fs/compat_binfmt_elf.c| 11 +++ include/linux/elf.h | 5 - 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 44de75b21fab..4eea508e9b10 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -396,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32) + int uses_interp) { #ifdef CONFIG_X86_X32_ABI - if (x32) { + if (in_x32_syscall()) { if (!vdso64_enabled) return 0; return map_vdso_randomized(_image_x32); diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index f145e3326c6d..4489bd60640b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -197,6 +197,12 @@ static inline bool in_x32_syscall(void) return false; } +/* + * Valid all time on the context of process that performs a syscall. + * Just born process has __X32_SYSCALL_BIT or TS_COMPAT set very + * early in load_binary() on setting personality and flags. + * See also set_personality_ia32(). 
+ */ static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 44a9b9940535..109697a19eb1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -383,10 +383,8 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32); -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter, \ - (ex->e_machine == EM_X86_64)) + int uses_interp); +#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages /* Do not change the values. See get_align_mask() */ enum align_flags { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ac0b5fc30ea6..3de72c0e0406 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -999,8 +999,12 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages - may depend on the personality. */ + /* +* Do this immediately, since STACK_TOP as used in setup_arg_pages +* may depend on the personality. At this moment we start +* pretending that we are in a context of compat syscall for +* compatible applications on x86, in_compat_syscall() starts working. 
+*/ SET_PERSONALITY2(*elf_ex, _state); if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -1246,7 +1250,7 @@ static int load_elf_binary(struct linux_binprm *bprm) set_binfmt(_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); + retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2c557229696a..12b991368f0a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,16 +115,11 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter) -#endif - -#ifdef COMPAT_ARCH_S
[PATCH v2 00/19] Add generic vdso_base tracking
v2 Changes: - Rename user_landing to vdso_base as it tracks vDSO VMA start address, rather than the explicit address to land (Andy) - Reword and don't use "new-execed" and "new-born" task (Andy) - Fix failures reported by build robot Started from discussion [1], where was noted that currently a couple of architectures support mremap() for vdso/sigpage, but not munmap(). If an application maps something on the ex-place of vdso/sigpage, later after processing signal it will land there (good luck!) Patches set is based on linux-next (next-20201123) and it depends on changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also on my changes in akpm (fixing several mremap() issues). Logically, the patches set divides on: - patch 1: a cleanup for patches in x86/cleanups - patches 2-11: cleanups for arch_setup_additional_pages() - patches 12-13: x86 signal changes for unmapped vdso - patches 14-19: provide generic vdso_base in mm_struct In the end, besides cleanups, it's now more predictable what happens for applications with unmapped vdso on architectures those support .mremap() for vdso/sigpage. I'm aware of only one user that unmaps vdso - Valgrind [2]. (there possibly are more, but this one is "special", it unmaps vdso, but not vvar, which confuses CRIU [Checkpoint Restore In Userspace], that's why I'm aware of it) Patches as a .git branch: https://github.com/0x7f454c46/linux/tree/setup_additional_pages-v2 v1 Link: https://lore.kernel.org/lkml/20201108051730.2042693-1-d...@arista.com/ [1]: https://lore.kernel.org/linux-arch/cajwjo6zanqykshbq+3b+fi_vt80mtrzev5yreqawx-l8j8x...@mail.gmail.com/ [2]: https://github.com/checkpoint-restore/criu/issues/488 Cc: Alexander Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Guo Ren Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Will Deacon Cc: x...@kernel.org Dmitry Safonov (19): x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages() elf: Move arch_setup_additional_pages() to generic elf.h arm64: Use in_compat_task() in arch_setup_additional_pages() x86: Remove compat_arch_setup_additional_pages() elf: Remove compat_arch_setup_additional_pages() elf/vdso: Reuse arch_setup_additional_pages() parameters elf: Use sysinfo_ehdr in ARCH_DLINFO() arm/vdso: Remove vdso pointer from mm->context s390/vdso: Remove vdso_base pointer from mm->context sparc/vdso: Remove vdso pointer from mm->context mm/mmap: Make vm_special_mapping::mremap return void x86/signal: Land on >retcode when vdso isn't mapped x86/signal: Check if vdso_image_32 is mapped before trying to land on it mm: Add vdso_base in mm_struct x86/vdso: Migrate to generic vdso_base arm/vdso: Migrate to generic vdso_base arm64/vdso: Migrate compat signals to generic vdso_base arm64/vdso: Migrate native signals to generic vdso_base mips/vdso: Migrate to generic vdso_base arch/alpha/include/asm/elf.h | 2 +- arch/arm/Kconfig | 2 + arch/arm/include/asm/elf.h| 10 +--- arch/arm/include/asm/mmu.h| 3 - arch/arm/include/asm/vdso.h | 6 +- arch/arm/kernel/process.c | 14 + arch/arm/kernel/signal.c | 6 +- arch/arm/kernel/vdso.c| 20 ++- arch/arm64/Kconfig| 2 + arch/arm64/include/asm/elf.h | 27 ++--- arch/arm64/kernel/signal.c| 10 +++- arch/arm64/kernel/signal32.c | 17 -- arch/arm64/kernel/vdso.c | 50 +++- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 -- arch/csky/kernel/vdso.c | 3 +- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/elf.h| 6 -- arch/hexagon/kernel/vdso.c| 3 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/Kconfig | 2 + arch/mips/include/asm/elf.h | 10 +--- arch/mips/kernel/signal.c | 11 ++-- arch/mips/kernel/vdso.c | 5 +- arch/mips/vdso/genvdso.c | 9 --- arch/nds32/Kconfig| 1 + 
arch/nds32/include/asm/elf.h | 8 +-- arch/nds32/kernel/vdso.c | 3 +- arch/nios2/Kconfig| 1 + arch/nios2/include/asm/elf.h | 4 -- arch/nios2/mm/init.c | 2 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/elf.h| 9 +-- arch/powerpc/kernel/vdso.c| 3 +- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/elf.h | 9
[PATCH v2 03/19] arm64: Use in_compat_task() in arch_setup_additional_pages()
it'll allow to completely remove the compat_arch_setup_additional_pages() macro after doing the same for x86, simplifying the binfmt code in the end.
linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + if (is_compat_task()) + ret = aarch32_setup_additional_pages(bprm, uses_interp); + else + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); return ret; -- 2.29.2
Re: [PATCH v2] drivers/virt: vmgenid: add vm generation id driver
Hello, +Cc Eric, Adrian On 11/19/20 6:36 PM, Alexander Graf wrote: > On 19.11.20 18:38, Mike Rapoport wrote: >> On Thu, Nov 19, 2020 at 01:51:18PM +0100, Alexander Graf wrote: >>> On 19.11.20 13:02, Christian Borntraeger wrote: On 16.11.20 16:34, Catangiu, Adrian Costin wrote: > - Background > > The VM Generation ID is a feature defined by Microsoft (paper: > http://go.microsoft.com/fwlink/?LinkId=260709) and supported by > multiple hypervisor vendors. > > The feature is required in virtualized environments by apps that work > with local copies/caches of world-unique data such as random values, > uuids, monotonically increasing counters, etc. > Such apps can be negatively affected by VM snapshotting when the VM > is either cloned or returned to an earlier point in time. > > The VM Generation ID is a simple concept meant to alleviate the issue > by providing a unique ID that changes each time the VM is restored > from a snapshot. The hw provided UUID value can be used to > differentiate between VMs or different generations of the same VM. > > - Problem > > The VM Generation ID is exposed through an ACPI device by multiple > hypervisor vendors but neither the vendors or upstream Linux have no > default driver for it leaving users to fend for themselves. [..] >>> The only piece where I'm unsure is how this will interact with CRIU. >> >> To C/R applications that use /dev/vmgenid CRIU need to be aware of it. >> Checkpointing and restoring withing the same "VM generation" shouldn't be >> a problem, but IMHO, making restore work after genid bump could be >> challenging. >> >> Alex, what scenario involving CRIU did you have in mind? > > You can in theory run into the same situation with containers that this > patch is solving for virtual machines. You could for example do a > snapshot of a prewarmed Java runtime with CRIU to get full JIT speeds > starting from the first request. > > That however means you run into the problem of predictable randomness > again. 
> >> >>> Can containers emulate ioctls and device nodes? >> >> Containers do not emulate ioctls but they can have /dev/vmgenid inside >> the container, so applications can use it the same way as outside the >> container. > > Hm. I suppose we could add a CAP_ADMIN ioctl interface to /dev/vmgenid > (when container people get to the point of needing it) that sets the > generation to "at least X". That way on restore, you could just call > that with "generation at snapshot"+1. > > That also means we need to have this interface available without virtual > machines then though, right? Sounds like a good idea. I guess, genvmid can be global on host, rather than per-userns or per-process for simplicity. Later if somebody will have a bottleneck on restore when every process on the machine wakes up from read() it could be virtualized, but doing it now sounds too early. ioctl() probably should go under checkpoint_restore_ns_capable(current_user_ns()), rather than CAP_SYS_ADMIN (I believe it should be safe from DOS as only CRIU should run with this capability, but worth to document this). Thanks, Dmitry
[PATCH] brcmsmac: ampdu: Check BA window size before checking block ack
As a result, isset() will try to dereference something on the stack, causing panics:
MacBookAir3,1/Mac-942452F5819B1C1B, BIOS MBA31.88Z.0061.B07.1201241641 01/24/12 RIP: 0010:brcms_c_ampdu_dotxstatus+0x343/0x9f0 [brcmsmac] Code: 54 24 20 66 81 e2 ff 0f 41 83 e4 07 89 d1 0f b7 d2 66 c1 e9 03 0f b7 c9 4c 8d 5c 0c 48 49 8b 4d 10 48 8b 79 68 41 57 44 89 e1 <41> 0f b6 33 41 d3 e0 48 c7 c1 38 e0 ea c0 48 83 c7 10 44 21 c6 4c RSP: 0018:a742800ecdd0 EFLAGS: 00010207 RAX: 0019 RBX: 000b RCX: 0006 RDX: 0ffe RSI: 0004 RDI: 8fc6ad776800 RBP: 8fc6855acb00 R08: 0001 R09: 05d9 R10: fffe R11: a742800ed01f R12: 0006 R13: 8fc68d75a000 R14: 05db R15: 0019 FS: () GS:8fc6aad0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: a742800ed01f CR3: 2480a000 CR4: 000406e0 Call Trace: brcms_c_dpc+0xb46/0x1020 [brcmsmac] ? wlc_intstatus+0xc8/0x180 [brcmsmac] ? __raise_softirq_irqoff+0x1a/0x80 brcms_dpc+0x37/0xd0 [brcmsmac] tasklet_action_common.constprop.0+0x51/0xb0 __do_softirq+0xff/0x340 ? handle_level_irq+0x1a0/0x1a0 asm_call_on_stack+0x12/0x20 do_softirq_own_stack+0x5f/0x80 irq_exit_rcu+0xcb/0x120 common_interrupt+0xd1/0x200 asm_common_interrupt+0x1e/0x40 RIP: 0010:cpuidle_enter_state+0xb3/0x420 Check if the block is within BA window and only then check block's status. Otherwise as Behan wrote: "When I came back to Dublin I was courtmartialed in my absence and sentenced to death in my absence, so I said they could shoot me in my absence." 
Also reported: https://bbs.archlinux.org/viewtopic.php?id=258428 https://lore.kernel.org/linux-wireless/87tuwgi92n@yujinakao.com/ Reported-by: Yuji Nakao Signed-off-by: Dmitry Safonov --- .../net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c index c9fb4b0cffaf..2631eb7569eb 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/ampdu.c @@ -942,14 +942,19 @@ brcms_c_ampdu_dotxstatus_complete(struct ampdu_info *ampdu, struct scb *scb, index = TX_SEQ_TO_INDEX(seq); ack_recd = false; if (ba_recd) { + int block_acked; + bindex = MODSUB_POW2(seq, start_seq, SEQNUM_MAX); + if (bindex < AMPDU_TX_BA_MAX_WSIZE) + block_acked = isset(bitmap, bindex); + else + block_acked = 0; brcms_dbg_ht(wlc->hw->d11core, "tid %d seq %d, start_seq %d, bindex %d set %d, index %d\n", tid, seq, start_seq, bindex, -isset(bitmap, bindex), index); +block_acked, index); /* if acked then clear bit and free packet */ - if ((bindex < AMPDU_TX_BA_MAX_WSIZE) - && isset(bitmap, bindex)) { + if (block_acked) { ini->txretry[index] = 0; /* -- 2.29.2
Re: [PATCH 00/19] Add generic user_landing tracking
On 11/8/20 7:07 PM, Andy Lutomirski wrote: > On Sat, Nov 7, 2020 at 9:17 PM Dmitry Safonov wrote: >> >> Started from discussion [1], where was noted that currently a couple of >> architectures support mremap() for vdso/sigpage, but not munmap(). >> If an application maps something on the ex-place of vdso/sigpage, >> later after processing signal it will land there (good luck!) >> >> Patches set is based on linux-next (next-20201106) and it depends on >> changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also >> on my changes in akpm (fixing several mremap() issues). >> >> Logically, the patches set divides on: >> - patch 1: cleanup for patches in x86/cleanups >> - patches 2-11: cleanups for arch_setup_additional_pages() > > I like these cleanups, although I think you should stop using terms > like "new-born". A task being exec'd is not newborn at all -- it's in > the middle of a transformation. Thank you for looking at them, Andy :-) Yeah, somehow I thought about new-execed process as a new-born binary. I'll try to improve changelogs in v2. Thanks, Dmitry
Re: [PATCH 14/19] mm: Add user_landing in mm_struct
On 11/8/20 7:04 PM, Andy Lutomirski wrote: > On Sat, Nov 7, 2020 at 9:18 PM Dmitry Safonov wrote: >> >> Instead of having every architecture to define vdso_base/vdso_addr etc, >> provide a generic mechanism to track landing in userspace. >> It'll minimize per-architecture difference, the number of callbacks to >> provide. >> >> Originally, it started from thread [1] where the need for .close() >> callback on vm_special_mapping was pointed, this generic code besides >> removing duplicated .mremap() callbacks provides a cheaper way to >> support munmap() on vdso mappings without introducing .close() callbacks >> for every architecture (with would bring even more code duplication). > > I find the naming odd. It's called "user_landing", which is > presumably a hard-to-understand shorthand for "user mode landing pad > for return from a signal handler if SA_RESTORER is not set". But, > looking at the actual code, it's not this at all -- it's just the vDSO > base address. Agree. Originally, I tried to track the actual landing address on the vdso, but .mremap() seemed simpler when tracking the vma base. > So how about just calling it vdso_base? I'm very much in favor of > consolidating and cleaning up, and improving the vdso remap/unmap > code, but I'm not convinced that we should call it anything other than > the vdso base. Sure. Thanks, Dmitry
Re: [PATCH 12/19] x86/signal: Land on >retcode when vdso isn't mapped
On 11/8/20 7:06 PM, Andy Lutomirski wrote: > On Sat, Nov 7, 2020 at 9:17 PM Dmitry Safonov wrote: >> >> Since commit 9fbbd4dd17d0 ("x86: Don't require the vDSO for handling >> a.out signals") after processing 32-bit signal if there is no vdso >> mapped frame->retcode is used as a landing. >> Do the same for rt ia32 signals. > > Am I reading correctly that this makes the ia32 compat signals match > the native ia32 case? Yes, probably I should have added it to the changelog. Thanks, Dmitry
[PATCH 06/19] elf/vdso: Reuse arch_setup_additional_pages() parameters
The reason to do it is that many architectures have the vDSO pointer saved in their mm->context with the only purpose to use it later in ARCH_DLINFO.
unsigned long *sysinfo_ehdr); extern unsigned int vdso_total_pages; #else /* CONFIG_VDSO */ -static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +static inline void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d0220da1d1b1..0e90cba8ac7a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -389,7 +389,7 @@ static const struct vm_special_mapping sigpage_mapping = { .mremap = sigpage_mremap, }; -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -430,7 +430,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * to be fatal to the process, so no error check needed * here. */ - arm_install_vdso(mm, addr + PAGE_SIZE); + arm_install_vdso(mm, addr + PAGE_SIZE, sysinfo_ehdr); up_fail: mmap_write_unlock(mm); diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 3408269d19c7..710e5ca99a53 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -233,7 +233,8 @@ static int install_vvar(struct mm_struct *mm, unsigned long addr) } /* assumes mmap_lock is write-locked */ -void arm_install_vdso(struct mm_struct *mm, unsigned long addr) +void arm_install_vdso(struct mm_struct *mm, unsigned long addr, + unsigned long *sysinfo_ehdr) { struct vm_area_struct *vma; unsigned long len; @@ -254,7 +255,10 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr) VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, _text_mapping); - if (!IS_ERR(vma)) - mm->context.vdso = addr; + if (IS_ERR(vma)) + return; + + mm->context.vdso = addr; + *sysinfo_ehdr = addr; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 1b710deb84d6..666338724a07 100644 --- a/arch/arm64/kernel/vdso.c +++ 
b/arch/arm64/kernel/vdso.c @@ -213,8 +213,7 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, - struct linux_binprm *bprm, - int uses_interp) +
[PATCH 19/19] mips/vdso: Migrate to user_landing
A generic way to track the landing vma area.
signal_setup_done(ret, ksig, 0); } diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index a4a321252df6..5523ba25ab3d 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -183,7 +183,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out; } - mm->context.vdso = (void *)vdso_addr; + mm->user_landing = (void __user *)vdso_addr; *sysinfo_ehdr = vdso_addr; ret = 0; diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 0303d30cde03..8f581a2c8578 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,6 @@ int main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static void vdso_mremap(\n"); - fprintf(out_file, " const struct vm_special_mapping *sm,\n"); - fprintf(out_file, " struct vm_area_struct *new_vma)\n"); - fprintf(out_file, "{\n"); - fprintf(out_file, " current->mm->context.vdso =\n"); - fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ fprintf(out_file, @@ -290,7 +283,6 @@ int main(int argc, char **argv) fprintf(out_file, "\t.mapping = {\n"); fprintf(out_file, "\t\t.name = \"[vdso]\",\n"); fprintf(out_file, "\t\t.pages = vdso_pages,\n"); - fprintf(out_file, "\t\t.mremap = vdso_mremap,\n"); fprintf(out_file, "\t},\n"); /* Calculate and write symbol offsets to */ -- 2.28.0
[PATCH 17/19] arm64/vdso: Migrate compat signals to user_landing
A generic way to track the landing vma area.
compat_sigframe __user *sf, @@ -425,7 +430,8 @@ int compat_setup_rt_frame(int usig, struct ksignal *ksig, err |= compat_setup_sigframe(>sig, regs, set); if (err == 0) { - compat_setup_return(regs, >ka, frame->sig.retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->sig.retcode, frame, usig); regs->regs[1] = (compat_ulong_t)(unsigned long)>info; regs->regs[2] = (compat_ulong_t)(unsigned long)>sig.uc; } @@ -448,7 +454,8 @@ int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= compat_setup_sigframe(frame, regs, set); if (err == 0) - compat_setup_return(regs, >ka, frame->retcode, frame, usig); + err = compat_setup_return(regs, >ka, + frame->retcode, frame, usig); return err; } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 91c1b7c716b7..08e8f1d56d92 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -394,7 +394,7 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) if (IS_ERR(ret)) goto out; - mm->context.sigpage = (void *)addr; + mm->user_landing = (void __user *)addr; out: return PTR_ERR_OR_ZERO(ret); -- 2.28.0
[PATCH 14/19] mm: Add user_landing in mm_struct
for every architecture (which would bring even more code duplication).
vma->vm_mm; diff --git a/include/linux/mm.h b/include/linux/mm.h index 427911d2c83e..4b0f97a289b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -559,7 +559,8 @@ struct vm_operations_struct { void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); - int (*mremap)(struct vm_area_struct *area, unsigned long flags); + int (*mremap)(struct vm_area_struct *old_vma, + struct vm_area_struct *new_vma, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b035caff6abe..f888257e973a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -488,6 +488,16 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; +#ifdef CONFIG_ARCH_HAS_USER_LANDING + /* +* Address of special mapping VMA to land after processing +* a signal. Reads are unprotected: if a thread unmaps or +* mremaps the mapping while another thread is processing +* a signal, it can segfault while landing. 
+*/ + void __user *user_landing; +#endif +#define UNMAPPED_USER_LANDING TASK_SIZE_MAX unsigned long flags; /* Must use atomic bitops to access */ diff --git a/mm/Kconfig b/mm/Kconfig index 01b0ae0cd9d3..d43b61a21be8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -883,4 +883,7 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config ARCH_HAS_USER_LANDING + bool + endmenu diff --git a/mm/mmap.c b/mm/mmap.c index 2376f3972f13..8a17ffdedacb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3410,11 +3410,25 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) static vm_fault_t special_mapping_fault(struct vm_fault *vmf); +static void update_user_landing(struct vm_area_struct *old_vma, + unsigned long new_addr) +{ +#ifdef CONFIG_ARCH_HAS_USER_LANDING + struct mm_struct *mm = old_vma->vm_mm; + + if (WARN_ON_ONCE(!mm)) + return; + if (old_vma->vm_start == (unsigned long)mm->user_landing) + mm->user_landing = (void __user *)new_addr; +#endif +} + /* * Having a close hook prevents vma merging regardless of flags. */ static void special_mapping_close(struct v
[PATCH 11/19] mm/mmap: Make vm_special_mapping::mremap return void
Previously .mremap() callback needed (int) return to provide way to restrict resizing of a special mapping. Now it's restricted by providing .may_split = special_mapping_split. Removing (int) return simplifies further changes to special_mapping_mremap() as it won't need save ret code from the callback. Also, it removes needless `return 0` from callbacks. Signed-off-by: Dmitry Safonov --- arch/arm/kernel/process.c | 3 +-- arch/arm64/kernel/vdso.c | 4 +--- arch/mips/vdso/genvdso.c | 3 +-- arch/x86/entry/vdso/vma.c | 4 +--- include/linux/mm_types.h | 2 +- mm/mmap.c | 2 +- 6 files changed, 6 insertions(+), 12 deletions(-) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 0e90cba8ac7a..5f4eced738f5 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,11 +376,10 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static int sigpage_mremap(const struct vm_special_mapping *sm, +static void sigpage_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.sigpage = new_vma->vm_start; - return 0; } static const struct vm_special_mapping sigpage_mapping = { diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 666338724a07..91c1b7c716b7 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,10 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { current->mm->context.vdso = (void *)new_vma->vm_start; - - return 0; } static int __vdso_init(enum vdso_abi abi) diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index 09e30eb4be86..0303d30cde03 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -259,13 +259,12 @@ int 
main(int argc, char **argv) fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); fprintf(out_file, "#include \n"); - fprintf(out_file, "static int vdso_mremap(\n"); + fprintf(out_file, "static void vdso_mremap(\n"); fprintf(out_file, " const struct vm_special_mapping *sm,\n"); fprintf(out_file, " struct vm_area_struct *new_vma)\n"); fprintf(out_file, "{\n"); fprintf(out_file, " current->mm->context.vdso =\n"); fprintf(out_file, " (void *)(new_vma->vm_start);\n"); - fprintf(out_file, " return 0;\n"); fprintf(out_file, "}\n"); /* Write out the stripped VDSO data. */ diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 5b9020742e66..65780a0164e3 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -86,15 +86,13 @@ static void vdso_fix_landing(const struct vdso_image *image, #endif } -static int vdso_mremap(const struct vm_special_mapping *sm, +static void vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); current->mm->context.vdso = (void __user *)new_vma->vm_start; - - return 0; } #ifdef CONFIG_TIME_NS diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a6b078b9d6a..b035caff6abe 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -772,7 +772,7 @@ struct vm_special_mapping { struct vm_area_struct *vma, struct vm_fault *vmf); - int (*mremap)(const struct vm_special_mapping *sm, + void (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); }; diff --git a/mm/mmap.c b/mm/mmap.c index 61f72b09d990..2376f3972f13 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3434,7 +3434,7 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma, return -EFAULT; if (sm->mremap) - return sm->mremap(sm, new_vma); + sm->mremap(sm, new_vma); return 0; } -- 2.28.0
[PATCH 16/19] arm/vdso: Migrate to user_landing
Generic way to track the land vma area. As a bonus, after unmapping sigpage, kernel won't try to land on its previous position. Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig | 1 + arch/arm/kernel/process.c | 9 + arch/arm/kernel/signal.c | 6 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index bece41f3b3b9..c161d7313911 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -24,6 +24,7 @@ config ARM select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select ARCH_HAS_TEARDOWN_DMA_OPS if MMU select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST + select ARCH_HAS_USER_LANDING select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_KEEP_MEMBLOCK diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 5f4eced738f5..ac08241e5cf8 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -376,16 +376,9 @@ static unsigned long sigpage_addr(const struct mm_struct *mm, static struct page *signal_page; extern struct page *get_signal_page(void); -static void sigpage_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.sigpage = new_vma->vm_start; -} - static const struct vm_special_mapping sigpage_mapping = { .name = "[sigpage]", .pages = _page, - .mremap = sigpage_mremap, }; int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) @@ -423,7 +416,7 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto up_fail; } - mm->context.sigpage = addr; + mm->user_landing = (void __user *)addr; /* Unlike the sigpage, failure to install the vdso is unlikely * to be fatal to the process, so no error check needed diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 9d2e916121be..270b17a9dc0f 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -451,13 +451,17 @@ setup_return(struct pt_regs *regs, struct ksignal *ksig, #ifdef CONFIG_MMU if (cpsr & MODE32_BIT) { struct 
mm_struct *mm = current->mm; + unsigned long land = (unsigned long)mm->user_landing; + + if (land == UNMAPPED_USER_LANDING) + return 1; /* * 32-bit code can use the signal return page * except when the MPU has protected the vectors * page from PL0 */ - retcode = mm->context.sigpage + signal_return_offset + + retcode = land + signal_return_offset + (idx << 2) + thumb; } else #endif -- 2.28.0
[PATCH 07/19] elf: Use sysinfo_ehdr in ARCH_DLINFO()
Instead mm->context.vdso use the pointer provided by elf loader. That allows to drop the pointer on arm/s390/sparc. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/alpha/include/asm/elf.h| 2 +- arch/arm/include/asm/elf.h | 5 ++--- arch/arm64/include/asm/elf.h| 18 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/include/asm/elf.h | 5 ++--- arch/nds32/include/asm/elf.h| 5 ++--- arch/powerpc/include/asm/elf.h | 4 ++-- arch/riscv/include/asm/elf.h| 5 ++--- arch/s390/include/asm/elf.h | 5 ++--- arch/sh/include/asm/elf.h | 10 +- arch/sparc/include/asm/elf_64.h | 5 ++--- arch/x86/include/asm/elf.h | 33 ++--- arch/x86/um/asm/elf.h | 4 ++-- fs/binfmt_elf.c | 6 +++--- fs/binfmt_elf_fdpic.c | 11 ++- 15 files changed, 51 insertions(+), 69 deletions(-) diff --git a/arch/alpha/include/asm/elf.h b/arch/alpha/include/asm/elf.h index 8049997fa372..701e820f28f0 100644 --- a/arch/alpha/include/asm/elf.h +++ b/arch/alpha/include/asm/elf.h @@ -155,7 +155,7 @@ extern int alpha_l2_cacheshape; extern int alpha_l3_cacheshape; /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ NEW_AUX_ENT(AT_L1I_CACHESHAPE, alpha_l1i_cacheshape); \ NEW_AUX_ENT(AT_L1D_CACHESHAPE, alpha_l1d_cacheshape); \ diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 1f4b91a17a91..7bb07056242f 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -133,10 +133,9 @@ extern void elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) #ifdef CONFIG_VDSO -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #endif diff --git a/arch/arm64/include/asm/elf.h 
b/arch/arm64/include/asm/elf.h index a81953bcc1cf..e62818967a69 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -165,10 +165,9 @@ typedef struct user_fpsimd_state elf_fpregset_t; }) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO\ +#define ARCH_DLINFO(sysinfo_ehdr) \ do { \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (elf_addr_t)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ \ /* \ * Should always be nonzero unless there's a kernel bug.\ @@ -223,19 +222,12 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; set_thread_flag(TIF_32BIT); \ }) #ifdef CONFIG_COMPAT_VDSO -#define COMPAT_ARCH_DLINFO \ +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) \ do { \ - /* \ -* Note that we use Elf64_Off instead of elf_addr_t because \ -* elf_addr_t in compat is defined as Elf32_Addr and casting\ -* current->mm->context.vdso to it triggers a cast warning of \ -* cast from pointer to integer of different size. \ -*/ \ - NEW_AUX_ENT(AT_SYSINFO_EHDR,\ - (Elf64_Off)current->mm->context.vdso); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, sysinfo_ehdr); \ } while (0) #else -#define COMPAT_ARCH_DLINFO +#define COMPAT_ARCH_DLINFO(sysinfo_ehdr) #endif #endif /* CONFIG_COMPAT */ diff --git a/arch/ia64/include/asm/elf.h b/arch/ia64/include/asm/elf.h index 6629301a2620..a257e5abddce 100644 --- a/arch/ia64/include/asm/elf.h +++ b/arch/ia64/include/asm/elf.h @@ -208,7 +208,7 @@ struct task_struct; #define GATE_EHD
[PATCH 09/19] s390/vdso: Remove vdso_base pointer from mm->context
Not used any more. Cc: Christian Borntraeger Cc: Heiko Carstens Cc: Vasily Gorbik Cc: linux-s...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/s390/include/asm/mmu.h | 1 - arch/s390/kernel/vdso.c | 10 -- 2 files changed, 11 deletions(-) diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index e12ff0f29d1a..095d0596f700 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -15,7 +15,6 @@ typedef struct { unsigned long gmap_asce; unsigned long asce; unsigned long asce_limit; - unsigned long vdso_base; /* The mmu context belongs to a secure guest. */ atomic_t is_protected; /* diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 810b72f8985c..3f07711a07c1 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -58,18 +58,9 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, return 0; } -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *vma) -{ - current->mm->context.vdso_base = vma->vm_start; - - return 0; -} - static const struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .fault = vdso_fault, - .mremap = vdso_mremap, }; static int __init vdso_setup(char *str) @@ -204,7 +195,6 @@ int arch_setup_additional_pages(unsigned long *sysinfo_ehdr) goto out_up; } - current->mm->context.vdso_base = vdso_base; *sysinfo_ehdr = vdso_base; rc = 0; -- 2.28.0
[PATCH 12/19] x86/signal: Land on &frame->retcode when vdso isn't mapped
Since commit 9fbbd4dd17d0 ("x86: Don't require the vDSO for handling a.out signals") after processing 32-bit signal if there is no vdso mapped frame->retcode is used as a landing. Do the same for rt ia32 signals. This shouldn't be mistaken for encouragement for running binaries with executable stack, rather something to do in hopefully very rare situation with disabled or unmapped vdso and absent SA_RESTORER. For non-executable stack it'll segfault on attempt to land, rather than land on a random address where vdso was previously mapped. For programs with executable stack it'll just do the same for rt signals as for non-rt. Discouraging users to run with executable stack is done separately in commit 47a2ebb7f505 ("execve: warn if process starts with executable stack"). Signed-off-by: Dmitry Safonov --- arch/x86/ia32/ia32_signal.c | 12 +++- arch/x86/kernel/signal.c| 23 ++- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 81cf22398cd1..ea3db15b57bf 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -270,8 +270,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, unsafe_put_user(set->sig[1], >extramask[0], Efault); unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* These are actually not used anymore, but left because some -* gdb versions depend on them as a marker. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); user_access_end(); @@ -336,14 +336,16 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(ptr_to_compat(restorer), >pretcode, Efault); /* -* Not actually used anymore, but left because some gdb -* versions need it. +* This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 +* gdb uses it as a signature to notice signal handler stack frames. */ unsafe_put_user(*((u64 *)), (u64 __user *)frame->retcode, Efault); unsafe_put_sigcontext32(>uc.uc_mcontext, fp, regs, set, Efault); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index ea794a083c44..372ec09dc4ac 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -317,23 +317,20 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(sig, >sig, Efault); unsafe_put_sigcontext(>sc, fp, regs, set, Efault); unsafe_put_user(set->sig[1], >extramask[0], Efault); - if (current->mm->context.vdso) + if (ksig->ka.sa.sa_flags & SA_RESTORER) + restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else restorer = >retcode; - if (ksig->ka.sa.sa_flags & SA_RESTORER) - restorer = ksig->ka.sa.sa_restorer; /* Set up to return from userspace. */ unsafe_put_user(restorer, >pretcode, Efault); /* * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80 -* -* WE DO NOT USE IT ANY MORE! It's only left here for historical -* reasons and because gdb uses it as a signature to notice -* signal handler stack frames. +* gdb uses it as a signature to notice signal handler stack frames. 
*/ unsafe_put_user(*((u64 *)), (u64 *)frame->retcode, Efault); user_access_end(); @@ -382,18 +379,18 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, unsafe_save_altstack(>uc.uc_stack, regs->sp, Efault); /* Set up to return from userspace. */ - restorer = current->mm->context.vdso + - vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; + else if (current->mm->context.vdso) + restorer = current->mm->context.vdso + + vdso_image_32.sym___kernel_rt_sigreturn; + else + restorer = >retcode; unsafe_put_user(restorer, >pretcode, Efault); /* * This is movl $__NR_rt_sigreturn, %ax ; int $0x80 -* -*
[PATCH 15/19] x86/vdso: Migrate to user_landing
Generic way to track the land vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position. Signed-off-by: Dmitry Safonov --- arch/x86/Kconfig| 1 + arch/x86/entry/common.c | 3 ++- arch/x86/entry/vdso/vma.c | 9 - arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/mmu.h | 1 - arch/x86/include/asm/vdso.h | 2 +- arch/x86/kernel/signal.c| 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b068f949d2e4..2449844ef0e9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -84,6 +84,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_USER_LANDING select ARCH_HAS_DEBUG_WX select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1be1bdbe55d4..1a3f795af331 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -151,9 +151,10 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. */ - landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->user_landing + vdso_image_32.sym_int80_landing_pad; + /* * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. 
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 65780a0164e3..2c426997a7a4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -77,7 +77,7 @@ static void vdso_fix_landing(const struct vdso_image *image, struct pt_regs *regs = current_pt_regs(); unsigned long vdso_land = image->sym_int80_landing_pad; unsigned long old_land_addr = vdso_land + - (unsigned long)current->mm->context.vdso; + (unsigned long)current->mm->user_landing; /* Fixing userspace landing - look at do_fast_syscall_32 */ if (regs->ip == old_land_addr) @@ -92,7 +92,6 @@ static void vdso_mremap(const struct vm_special_mapping *sm, const struct vdso_image *image = current->mm->context.vdso_image; vdso_fix_landing(image, new_vma); - current->mm->context.vdso = (void __user *)new_vma->vm_start; } #ifdef CONFIG_TIME_NS @@ -287,7 +286,7 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr, ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size, NULL); } else { - current->mm->context.vdso = (void __user *)text_start; + current->mm->user_landing = (void __user *)text_start; current->mm->context.vdso_image = image; *sysinfo_ehdr = text_start; } @@ -362,8 +361,8 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * Check if we have already mapped vdso blob - fail to prevent * abusing from userspace install_speciall_mapping, which may * not do accounting and rlimit right. -* We could search vma near context.vdso, but it's a slowpath, -* so let's explicitly check all VMAs to be completely sure. +* It's a slowpath, let's explicitly check all VMAs to be +* completely sure. 
*/ for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma_is_special_mapping(vma, _mapping) || diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f87ed1d53938..fb84c0fcdc4e 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -256,7 +256,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, } else { /* Return stub is in 32bit vsyscall page */ if (current_has_vdso_image_32()) - restorer = current->mm->context.vdso + + restorer = current->mm->user_landing + vdso_image_32.sym___kernel_sigreturn; else restorer = >retcode; @@ -337,7 +337,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; else if (current_has_vdso_image_32()) - restorer = current->mm->context.vdso + + restorer = current->mm->user_landing + vdso_image_32.sym___kernel_rt_sigreturn; else
[PATCH 10/19] sparc/vdso: Remove vdso pointer from mm->context
Not used any more. Cc: "David S. Miller" Cc: sparcli...@vger.kernel.org Signed-off-by: Dmitry Safonov --- arch/sparc/include/asm/mmu_64.h | 1 - arch/sparc/vdso/vma.c | 5 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h index 7e2704c770e9..8e7892890d14 100644 --- a/arch/sparc/include/asm/mmu_64.h +++ b/arch/sparc/include/asm/mmu_64.h @@ -111,7 +111,6 @@ typedef struct { unsigned long thp_pte_count; struct tsb_config tsb_block[MM_NUM_TSBS]; struct hv_tsb_descr tsb_descr[MM_NUM_TSBS]; - void*vdso; booladi; tag_storage_desc_t *tag_store; spinlock_t tag_lock; diff --git a/arch/sparc/vdso/vma.c b/arch/sparc/vdso/vma.c index bf9195fe9bcc..255e052223ca 100644 --- a/arch/sparc/vdso/vma.c +++ b/arch/sparc/vdso/vma.c @@ -389,7 +389,6 @@ static int map_vdso(const struct vdso_image *image, } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -418,9 +417,7 @@ static int map_vdso(const struct vdso_image *image, } up_fail: - if (ret) - current->mm->context.vdso = NULL; - else + if (!ret) *sysinfo_ehdr = text_start; mmap_write_unlock(mm); -- 2.28.0
[PATCH 08/19] arm/vdso: Remove vdso pointer from mm->context
Not used any more. Signed-off-by: Dmitry Safonov --- arch/arm/include/asm/mmu.h | 3 --- arch/arm/kernel/vdso.c | 12 2 files changed, 15 deletions(-) diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 1592a4264488..2397b0a19f59 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -12,9 +12,6 @@ typedef struct { #endif unsigned intvmalloc_seq; unsigned long sigpage; -#ifdef CONFIG_VDSO - unsigned long vdso; -#endif #ifdef CONFIG_BINFMT_ELF_FDPIC unsigned long exec_fdpic_loadmap; unsigned long interp_fdpic_loadmap; diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 710e5ca99a53..4b39c7d8f525 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -47,17 +47,8 @@ static const struct vm_special_mapping vdso_data_mapping = { .pages = _data_page, }; -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = new_vma->vm_start; - - return 0; -} - static struct vm_special_mapping vdso_text_mapping __ro_after_init = { .name = "[vdso]", - .mremap = vdso_mremap, }; struct elfinfo { @@ -239,8 +230,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, struct vm_area_struct *vma; unsigned long len; - mm->context.vdso = 0; - if (vdso_text_pagelist == NULL) return; @@ -258,7 +247,6 @@ void arm_install_vdso(struct mm_struct *mm, unsigned long addr, if (IS_ERR(vma)) return; - mm->context.vdso = addr; *sysinfo_ehdr = addr; } -- 2.28.0
[PATCH 18/19] arm64/vdso: Migrate native signals to user_landing
Generic way to track the land vma area. As a bonus, after unmapping vdso, kernel won't try to land on its previous position. Signed-off-by: Dmitry Safonov --- arch/arm64/kernel/signal.c | 10 +++--- arch/arm64/kernel/vdso.c | 13 +++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index bec6ef69704f..4c1dfbc1aed3 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -723,9 +723,10 @@ static int get_sigframe(struct rt_sigframe_user_layout *user, return 0; } -static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, +static int setup_return(struct pt_regs *regs, struct k_sigaction *ka, struct rt_sigframe_user_layout *user, int usig) { + unsigned long land = (unsigned long)current->mm->user_landing; __sigrestore_t sigtramp; regs->regs[0] = usig; @@ -754,10 +755,13 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; + else if (land != UNMAPPED_USER_LANDING) + sigtramp = VDSO_SYMBOL(land, sigtramp); else - sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp); + return 1; regs->regs[30] = (unsigned long)sigtramp; + return 0; } static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, @@ -780,7 +784,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, err |= __save_altstack(>uc.uc_stack, regs->sp); err |= setup_sigframe(, regs, set); if (err == 0) { - setup_return(regs, >ka, , usig); + err = setup_return(regs, >ka, , usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(>info, >info); regs->regs[1] = (unsigned long)>info; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 08e8f1d56d92..d710fcd7141c 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -78,12 +78,6 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; 
-static void vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - current->mm->context.vdso = (void *)new_vma->vm_start; -} - static int __vdso_init(enum vdso_abi abi) { int i; @@ -239,7 +233,6 @@ static int __setup_additional_pages(enum vdso_abi abi, gp_flags = VM_ARM64_BTI; vdso_base += VVAR_NR_PAGES * PAGE_SIZE; - mm->context.vdso = (void *)vdso_base; ret = _install_special_mapping(mm, vdso_base, vdso_text_len, VM_READ|VM_EXEC|gp_flags| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, @@ -247,12 +240,14 @@ static int __setup_additional_pages(enum vdso_abi abi, if (IS_ERR(ret)) goto up_fail; + /* 32-bit ABI is to land on sigpage, 64-bit on vdso */ + if (abi == VDSO_ABI_AA64) + mm->user_landing = (void __user *)vdso_base; *sysinfo_ehdr = vdso_base; return 0; up_fail: - mm->context.vdso = NULL; return PTR_ERR(ret); } @@ -285,7 +280,6 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { }, [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; @@ -431,7 +425,6 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { }, [AA64_MAP_VDSO] = { .name = "[vdso]", - .mremap = vdso_mremap, }, }; -- 2.28.0
[PATCH 13/19] x86/signal: Check if vdso_image_32 is mapped before trying to land on it
Provide current_has_vdso_image_32() helper and check it apriory landing attempt on vdso vma. The helper is a macro, not a static inline funciton to avoid linux/sched/task_stack.h inclusion in asm/vdso.h. Signed-off-by: Dmitry Safonov --- arch/x86/entry/common.c | 7 ++- arch/x86/ia32/ia32_signal.c | 4 ++-- arch/x86/include/asm/vdso.h | 4 arch/x86/kernel/signal.c| 4 ++-- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..1be1bdbe55d4 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -142,11 +142,16 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs) /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs) { + unsigned long landing_pad; + + if (!current_has_vdso_image_32()) + force_sigsegv(SIGSEGV); + /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling * convention. Adjust regs so it looks like we entered using int80. 
*/ - unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + landing_pad = (unsigned long)current->mm->context.vdso + vdso_image_32.sym_int80_landing_pad; /* diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index ea3db15b57bf..f87ed1d53938 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -255,7 +255,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; } else { /* Return stub is in 32bit vsyscall page */ - if (current->mm->context.vdso) + if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -336,7 +336,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bbcdc7b8f963..a19e0a7bae2d 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -39,6 +39,10 @@ extern const struct vdso_image vdso_image_x32; #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const struct vdso_image vdso_image_32; + +#define current_has_vdso_image_32()\ + likely(current->mm->context.vdso_image == _image_32 && \ + !!current->mm->context.vdso) #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 372ec09dc4ac..6fed2e523e0a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -319,7 +319,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, unsafe_put_user(set->sig[1], >extramask[0], Efault); if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = 
current->mm->context.vdso + vdso_image_32.sym___kernel_sigreturn; else @@ -381,7 +381,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; - else if (current->mm->context.vdso) + else if (current_has_vdso_image_32()) restorer = current->mm->context.vdso + vdso_image_32.sym___kernel_rt_sigreturn; else -- 2.28.0
[PATCH 02/19] elf: Move arch_setup_additional_pages() to generic elf.h
Ifdef the function in the header, not in the code. Following kernel style, move it to Kconfig. All it makes it easier to follow when the option is enabled/disabled. Remove re-definition from compat_binfmt_elf, as it's always defined under compat_arch_setup_additional_pages (to be reworked). Signed-off-by: Dmitry Safonov --- arch/arm/Kconfig| 1 + arch/arm/include/asm/elf.h | 5 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/elf.h| 6 +- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 arch/hexagon/Kconfig| 1 + arch/hexagon/include/asm/elf.h | 6 -- arch/mips/Kconfig | 1 + arch/mips/include/asm/elf.h | 5 - arch/nds32/Kconfig | 1 + arch/nds32/include/asm/elf.h| 3 --- arch/nios2/Kconfig | 1 + arch/nios2/include/asm/elf.h| 4 arch/powerpc/Kconfig| 1 + arch/powerpc/include/asm/elf.h | 5 - arch/riscv/Kconfig | 1 + arch/riscv/include/asm/elf.h| 5 - arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 5 - arch/sh/Kconfig | 1 + arch/sh/include/asm/elf.h | 6 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/elf_64.h | 6 -- arch/x86/Kconfig| 1 + arch/x86/include/asm/elf.h | 4 arch/x86/um/asm/elf.h | 5 - fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 2 -- fs/binfmt_elf_fdpic.c | 3 +-- fs/compat_binfmt_elf.c | 2 -- include/linux/elf.h | 12 32 files changed, 30 insertions(+), 74 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fb700e471332..bece41f3b3b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -15,6 +15,7 @@ config ARM select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_PHYS_TO_DMA + select ARCH_HAS_SETUP_ADDITIONAL_PAGES if MMU select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_MEMORY select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index 61941f369861..1f4b91a17a91 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -132,7 +132,6 @@ extern int arm_elf_read_implies_exec(int); extern void 
elf_set_personality(const struct elf32_hdr *); #define SET_PERSONALITY(ex)elf_set_personality(&(ex)) -#ifdef CONFIG_MMU #ifdef CONFIG_VDSO #define ARCH_DLINFO\ do { \ @@ -140,9 +139,5 @@ do { \ (elf_addr_t)current->mm->context.vdso); \ } while (0) #endif -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -struct linux_binprm; -int arch_setup_additional_pages(struct linux_binprm *, int); -#endif #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d1b81221c6a9..7111cf335ede 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -26,6 +26,7 @@ config ARM64 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SETUP_DMA_OPS select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_SET_MEMORY diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 8d1c8dcb87fd..d1073ffa7f24 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -181,11 +181,6 @@ do { \ NEW_AUX_ENT(AT_IGNORE, 0); \ } while (0) -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES -struct linux_binprm; -extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); - /* 1GB of VA */ #ifdef CONFIG_COMPAT #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ @@ -242,6 +237,7 @@ do { \ #else #define COMPAT_ARCH_DLINFO #endif +struct linux_binprm; extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages \ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 8fec85ab5da6..00e7b063f6ca 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -4,6 +4,7 @@ config CSKY select ARCH_32BIT_OFF_T select ARCH_HAS_DMA_PREP_COHERENT select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_HAS_SETUP_ADDITIONAL_PAGES select ARCH_HAS_SYNC_DMA_FOR_CPU select ARCH_HAS_SYNC_DMA_FOR_DEV
[PATCH 03/19] arm64: Use is_compat_task() in arch_setup_additional_pages()
Instead of providing compat_arch_setup_additional_pages(), check if the task is compatible from personality, which is set earlier in load_elf_binary(). That will align code with powerpc and sparc, also it'll allow to completely remove compat_arch_setup_additional_pages() macro after doing the same for x86, simplifying the binfmt code in the end. Cc: linux-arm-ker...@lists.infradead.org Signed-off-by: Dmitry Safonov --- arch/arm64/include/asm/elf.h | 5 - arch/arm64/kernel/vdso.c | 21 ++--- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index d1073ffa7f24..a81953bcc1cf 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -237,11 +237,6 @@ do { \ #else #define COMPAT_ARCH_DLINFO #endif -struct linux_binprm; -extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages \ - aarch32_setup_additional_pages #endif /* CONFIG_COMPAT */ diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index cee5d04ea9ad..1b710deb84d6 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -401,29 +401,24 @@ static int aarch32_sigreturn_setup(struct mm_struct *mm) return PTR_ERR_OR_ZERO(ret); } -int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int aarch32_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) { struct mm_struct *mm = current->mm; int ret; - if (mmap_write_lock_killable(mm)) - return -EINTR; - ret = aarch32_kuser_helpers_setup(mm); if (ret) - goto out; + return ret; if (IS_ENABLED(CONFIG_COMPAT_VDSO)) { ret = __setup_additional_pages(VDSO_ABI_AA32, mm, bprm, uses_interp); if (ret) - goto out; + return ret; } - ret = aarch32_sigreturn_setup(mm); -out: - mmap_write_unlock(mm); - return ret; + return aarch32_sigreturn_setup(mm); } #endif /* CONFIG_COMPAT */ @@ -460,7 +455,11 @@ int arch_setup_additional_pages(struct
linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + if (is_compat_task()) + ret = aarch32_setup_additional_pages(bprm, uses_interp); + else + ret = __setup_additional_pages(VDSO_ABI_AA64, mm, bprm, uses_interp); + mmap_write_unlock(mm); return ret; -- 2.28.0
[PATCH 05/19] elf: Remove compat_arch_setup_additional_pages()
Now that all users rely on detecting bitness of new-born task checking personality, remove compat_arch_setup_additional_pages() macro, simplifying the code. Signed-off-by: Dmitry Safonov --- fs/compat_binfmt_elf.c | 5 - 1 file changed, 5 deletions(-) diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 3606dd3a32f5..da8ee4d6e451 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,11 +115,6 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#undef arch_setup_additional_pages -#definearch_setup_additional_pages compat_arch_setup_additional_pages -#endif - #ifdef compat_elf_read_implies_exec #undef elf_read_implies_exec #defineelf_read_implies_exec compat_elf_read_implies_exec -- 2.28.0
[PATCH 00/19] Add generic user_landing tracking
Started from discussion [1], where was noted that currently a couple of architectures support mremap() for vdso/sigpage, but not munmap(). If an application maps something on the ex-place of vdso/sigpage, later after processing signal it will land there (good luck!) Patches set is based on linux-next (next-20201106) and it depends on changes in x86/cleanups (those reclaim TIF_IA32/TIF_X32) and also on my changes in akpm (fixing several mremap() issues). Logically, the patches set divides on: - patch 1: cleanup for patches in x86/cleanups - patches 2-11: cleanups for arch_setup_additional_pages() - patches 12-13: x86 signal changes for unmapped vdso - patches 14-19: provide generic user_landing in mm_struct In the end, besides cleanups, it's now more predictable what happens for applications with unmapped vdso on architectures those support .mremap() for vdso/sigpage. I'm aware of only one user that unmaps vdso - Valgrind [2]. (there possibly are more, but this one is "special", it unmaps vdso, but not vvar, which confuses CRIU [Checkpoint Restore In Userspace], that's why I'm aware of it) Patches as a .git branch: https://github.com/0x7f454c46/linux/tree/setup_additional_pages [1]: https://lore.kernel.org/linux-arch/cajwjo6zanqykshbq+3b+fi_vt80mtrzev5yreqawx-l8j8x...@mail.gmail.com/ [2]: https://github.com/checkpoint-restore/criu/issues/488 Cc: Alexander Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christophe Leroy Cc: Guo Ren Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Will Deacon Cc: x...@kernel.org Dmitry Safonov (19): x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages() elf: Move arch_setup_additional_pages() to generic elf.h arm64: Use in_compat_task() in arch_setup_additional_pages() x86: Remove compat_arch_setup_additional_pages() elf: Remove compat_arch_setup_additional_pages() elf/vdso: Reuse arch_setup_additional_pages() parameters elf: Use sysinfo_ehdr in ARCH_DLINFO() arm/vdso: Remove vdso pointer from mm->context s390/vdso: Remove vdso_base pointer from mm->context sparc/vdso: Remove vdso pointer from mm->context mm/mmap: Make vm_special_mapping::mremap return void x86/signal: Land on >retcode when vdso isn't mapped x86/signal: Check if vdso_image_32 is mapped before trying to land on it mm: Add user_landing in mm_struct x86/vdso: Migrate to user_landing arm/vdso: Migrate to user_landing arm64/vdso: Migrate compat signals to user_landing arm64/vdso: Migrate native signals to user_landing mips/vdso: Migrate to user_landing arch/alpha/include/asm/elf.h | 2 +- arch/arm/Kconfig | 2 + arch/arm/include/asm/elf.h| 10 +--- arch/arm/include/asm/mmu.h| 3 - arch/arm/include/asm/vdso.h | 6 +- arch/arm/kernel/process.c | 14 + arch/arm/kernel/signal.c | 6 +- arch/arm/kernel/vdso.c| 20 ++- arch/arm64/Kconfig| 2 + arch/arm64/include/asm/elf.h | 27 ++--- arch/arm64/kernel/signal.c| 10 +++- arch/arm64/kernel/signal32.c | 17 -- arch/arm64/kernel/vdso.c | 47 ++- arch/csky/Kconfig | 1 + arch/csky/include/asm/elf.h | 4 -- arch/csky/kernel/vdso.c | 3 +- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/elf.h| 6 -- arch/hexagon/kernel/vdso.c| 3 +- arch/ia64/include/asm/elf.h | 2 +- arch/mips/Kconfig | 2 + arch/mips/include/asm/elf.h | 10 +--- arch/mips/kernel/signal.c | 11 ++-- arch/mips/kernel/vdso.c | 5 +- arch/mips/vdso/genvdso.c | 9 --- arch/nds32/Kconfig| 1 + 
arch/nds32/include/asm/elf.h | 8 +-- arch/nds32/kernel/vdso.c | 3 +- arch/nios2/Kconfig| 1 + arch/nios2/include/asm/elf.h | 4 -- arch/nios2/mm/init.c | 2 +- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/elf.h| 9 +-- arch/powerpc/kernel/vdso.c| 3 +- arch/riscv/Kconfig| 1 + arch/riscv/include/asm/elf.h | 10 +--- arch/riscv/kernel/vdso.c | 9 +-- arch/s390/Kconfig | 1 + arch/s390/include/asm/elf.h | 10 +--- arch/s390/include/asm/mmu.h | 1 - arch/s390/kernel/vdso.c | 13 +--- arch/sh/Kconfig | 1 + arch/sh/include/asm/elf.h | 16 ++--- ar
[PATCH 04/19] x86: Remove compat_arch_setup_additional_pages()
The same as for x32 task, detect ia32 task by in_ia32_syscall(). It's valid as new-execed task is pretending to be in a syscall of relevant bitness/ABI, see the comment near in_32bit_syscall(). Removing compat_arch_setup_additional_pages() provides single point of entry - arch_setup_additional_pages(), makes ifdeffery easier to read, aligns the code with powerpc and sparc (mips also has single vdso setup function, but instead of taking bitness from mm.context, takes vdso image pointer there). Together with arm64 code align to use in_compat_task(), it makes possible to remove compat_arch_setup_additional_pages() macro re-definition from compat elf code (another redefined marco less). Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 41 +++--- arch/x86/include/asm/elf.h | 5 - 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4eea508e9b10..aace862ed9a1 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -376,48 +376,49 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) } #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) -static int load_vdso32(void) +static int load_vdso_ia32(void) { if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; return map_vdso(_image_32, 0); } +#else +static int load_vdso_ia32(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; +} #endif #ifdef CONFIG_X86_64 -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +static int load_vdso_64(void) { if (!vdso64_enabled) return 0; - return map_vdso_randomized(_image_64); -} - -#ifdef CONFIG_COMPAT -int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) -{ #ifdef CONFIG_X86_X32_ABI - if (in_x32_syscall()) { - if (!vdso64_enabled) - return 0; + if (in_x32_syscall()) return map_vdso_randomized(_image_x32); - } #endif -#ifdef CONFIG_IA32_EMULATION - return load_vdso32(); + + 
return map_vdso_randomized(_image_64); +} #else - return 0; -#endif +static int load_vdso_64(void) +{ + WARN_ON_ONCE(1); + return -ENODATA; } #endif -#else + int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return load_vdso32(); + if (in_ia32_syscall()) + return load_vdso_ia32(); + + return load_vdso_64(); } -#endif #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index d00b723eea2d..51a08f6b18e5 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -377,11 +377,6 @@ else \ ((unsigned long)current->mm->context.vdso + \ vdso_image_32.sym___kernel_vsyscall) -struct linux_binprm; -extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), -- 2.28.0
[PATCH 01/19] x86/elf: Check in_x32_syscall() in compat_arch_setup_additional_pages()
Partly revert commit 3316ec8ccd34 ("x86/elf: Use e_machine to check for x32/ia32 in setup_additional_pages()") and commit 9a29a671902c ("elf: Expose ELF header on arch_setup_additional_pages()". Both patches did a good thing: removed usage of TIF_X32, but with a price of additional macros ARCH_SETUP_ADDITIONAL_PAGES() and ifdeffs. Instead, use in_x32_syscall() - the first thing load_elf_binary() does after parsing and checking new ELF binary. It's done that early which also allows to use it in mmap() code straight away, which needs it to know which mmap_base to use (see arch_pick_mmap_layout()). Add comments that describe how it works. Cc: x...@kernel.org Signed-off-by: Dmitry Safonov --- arch/x86/entry/vdso/vma.c | 4 ++-- arch/x86/include/asm/compat.h | 6 ++ arch/x86/include/asm/elf.h| 6 ++ fs/binfmt_elf.c | 10 +++--- fs/compat_binfmt_elf.c| 11 +++ include/linux/elf.h | 5 - 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 44de75b21fab..4eea508e9b10 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -396,10 +396,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) #ifdef CONFIG_COMPAT int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32) + int uses_interp) { #ifdef CONFIG_X86_X32_ABI - if (x32) { + if (in_x32_syscall()) { if (!vdso64_enabled) return 0; return map_vdso_randomized(_image_x32); diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index f145e3326c6d..4489bd60640b 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -197,6 +197,12 @@ static inline bool in_x32_syscall(void) return false; } +/* + * Valid all time on the context of process that performs a syscall. + * Just born process has __X32_SYSCALL_BIT or TS_COMPAT set very + * early in load_binary() on setting personality and flags. + * See also set_personality_ia32(). 
+ */ static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 44a9b9940535..109697a19eb1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -383,10 +383,8 @@ struct linux_binprm; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, bool x32); -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter, \ - (ex->e_machine == EM_X86_64)) + int uses_interp); +#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages /* Do not change the values. See get_align_mask() */ enum align_flags { diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ac0b5fc30ea6..3de72c0e0406 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -999,8 +999,12 @@ static int load_elf_binary(struct linux_binprm *bprm) if (retval) goto out_free_dentry; - /* Do this immediately, since STACK_TOP as used in setup_arg_pages - may depend on the personality. */ + /* +* Do this immediately, since STACK_TOP as used in setup_arg_pages +* may depend on the personality. At this moment we start +* pretending that we are in a context of compat syscall for +* compatible applications on x86, in_compat_syscall() starts working. 
+*/ SET_PERSONALITY2(*elf_ex, _state); if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -1246,7 +1250,7 @@ static int load_elf_binary(struct linux_binprm *bprm) set_binfmt(_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES - retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); + retval = arch_setup_additional_pages(bprm, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 2c557229696a..12b991368f0a 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -115,16 +115,11 @@ #define START_THREAD COMPAT_START_THREAD #endif -#ifdef compat_arch_setup_additional_pages -#define COMPAT_ARCH_SETUP_ADDITIONAL_PAGES(bprm, ex, interpreter) \ - compat_arch_setup_additional_pages(bprm, interpreter) -#endif - -#ifdef COMPAT_ARCH_S
Re: [PATCH v3 10/10] x86: Reclaim TIF_IA32 and TIF_X32
On Sun, 4 Oct 2020 at 04:31, Gabriel Krisman Bertazi wrote: > > Now that these flags are no longer used, reclaim those TI bits. > > Signed-off-by: Gabriel Krisman Bertazi Oh wow! I've just started rebasing patches that do essentially the same on linux-next and found that it's already done and merged. Thanks for doing this! :-) -- Dmitry
Re: [PATCH] x86/mpx: fix recursive munmap() corruption
Hi Laurent, Christophe, Michael, all, On 11/3/20 5:11 PM, Laurent Dufour wrote: > Le 23/10/2020 à 14:28, Christophe Leroy a écrit : [..] That seems like it would work for CRIU and make sense in general? >>> >>> Sorry for the late answer, yes this would make more sense. >>> >>> Here is a patch doing that. >>> >> >> In your patch, the test seems overkill: >> >> + if ((start <= vdso_base && vdso_end <= end) || /* 1 */ >> + (vdso_base <= start && start < vdso_end) || /* 3,4 */ >> + (vdso_base < end && end <= vdso_end)) /* 2,3 */ >> + mm->context.vdso_base = mm->context.vdso_end = 0; >> >> What about >> >> if (start < vdso_end && vdso_start < end) >> mm->context.vdso_base = mm->context.vdso_end = 0; >> >> This should cover all cases, or am I missing something ? >> >> >> And do we really need to store vdso_end in the context ? >> I think it should be possible to re-calculate it: the size of the VDSO >> should be (_end - _start) + PAGE_SIZE for 32 bits VDSO, >> and (_end - _start) + PAGE_SIZE for the 64 bits VDSO. > > Thanks Christophe for the advise. > > That is covering all the cases, and indeed is similar to the Michael's > proposal I missed last year. > > I'll send a patch fixing this issue following your proposal. It's probably not necessary anymore. I've sent patches [1], currently in akpm, the last one forbids splitting of vm_special_mapping. So, a user is able munmap() or mremap() vdso as a whole, but not partly. [1]: https://lore.kernel.org/linux-mm/20201013013416.390574-1-d...@arista.com/ Thanks, Dmitry
[PATCH v2 0/3] xfrm/compat: syzbot-found fixes
v2: Added "Fixes" tags to the patches. WARN_ON() for XFRMA_UNSPEC translation which likely no-one except syzkaller uses; properly zerofy tail-padding for 64-bit attribute; don't use __GFP_ZERO as the memory is initialized during translation. Cc: Steffen Klassert Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Herbert Xu Cc: Hillf Danton Cc: net...@vger.kernel.org Thanks, Dmitry Dmitry Safonov (3): xfrm/compat: Translate by copying XFRMA_UNSPEC attribute xfrm/compat: memset(0) 64-bit padding at right place xfrm/compat: Don't allocate memory with __GFP_ZERO net/xfrm/xfrm_compat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) base-commit: 3cea11cd5e3b00d91caf0b4730194039b45c5891 -- 2.28.0
[PATCH v2 3/3] xfrm/compat: Don't allocate memory with __GFP_ZERO
32-bit to 64-bit messages translator zerofies needed paddings in the translation, the rest is the actual payload. Don't allocate zero pages as they are not needed. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 556e9f33b815..d8e8a11ca845 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -564,7 +564,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return NULL; len += NLMSG_HDRLEN; - h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + h64 = kvmalloc(len, GFP_KERNEL); if (!h64) return ERR_PTR(-ENOMEM); -- 2.28.0
[PATCH v2 2/3] xfrm/compat: memset(0) 64-bit padding at right place
32-bit messages translated by xfrm_compat can have attributes attached. For all, but XFRMA_SA, XFRMA_POLICY the size of payload is the same in 32-bit UABI and 64-bit UABI. For XFRMA_SA (struct xfrm_usersa_info) and XFRMA_POLICY (struct xfrm_userpolicy_info) it's only tail-padding that is present in 64-bit payload, but not in 32-bit. The proper size for destination nlattr is already calculated by xfrm_user_rcv_calculate_len64() and allocated with kvmalloc(). xfrm_attr_cpy32() copies 32-bit copy_len into 64-bit attribute translated payload, zero-filling possible padding for SA/POLICY. Due to a typo, *pos already has 64-bit payload size, in a result next memset(0) is called on the memory after the translated attribute, not on the tail-padding of it. Fixes: 5106f4a8acff ("xfrm/compat: Add 32=>64-bit messages translator") Reported-by: syzbot+c43831072e7df506a...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 17edbf935e35..556e9f33b815 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -388,7 +388,7 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, memcpy(nla, src, nla_attr_size(copy_len)); nla->nla_len = nla_attr_size(payload); - *pos += nla_attr_size(payload); + *pos += nla_attr_size(copy_len); nlmsg->nlmsg_len += nla->nla_len; memset(dst + *pos, 0, payload - copy_len); -- 2.28.0
[PATCH v2 1/3] xfrm/compat: Translate by copying XFRMA_UNSPEC attribute
xfrm_xlate32() translates 64-bit message provided by kernel to be sent for 32-bit listener (acknowledge or monitor). Translator code doesn't expect XFRMA_UNSPEC attribute as it doesn't know its payload. Kernel never attaches such attribute, but a user can. I've searched if any opensource does it and the answer is no. Nothing on github and google finds only tfcproject that has such code commented-out. What will happen if a user sends a netlink message with XFRMA_UNSPEC attribute? Ipsec code ignores this attribute. But if there is a monitor-process or 32-bit user requested ack - kernel will try to translate such message and will hit WARN_ONCE() in xfrm_xlate64_attr(). Deal with XFRMA_UNSPEC by copying the attribute payload with xfrm_nla_cpy(). In result, the default switch-case in xfrm_xlate64_attr() becomes an unused code. Leave those 3 lines in case a new xfrm attribute will be added. Fixes: 5461fc0c8d9f ("xfrm/compat: Add 64=>32-bit messages translator") Reported-by: syzbot+a7e701c8385bd8543...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index e28f0c9ecd6a..17edbf935e35 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -234,6 +234,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_PAD: /* Ignore */ return 0; + case XFRMA_UNSPEC: case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: -- 2.28.0
Re: [PATCH] xfrm/compat: Remove use of kmalloc_track_caller
On 11/1/20 10:08 PM, Alistair Delva wrote: > The __kmalloc_track_caller symbol is not exported if SLUB/SLOB are > enabled instead of SLAB, which breaks the build on such configs when > CONFIG_XFRM_USER_COMPAT=m. > > ERROR: "__kmalloc_track_caller" [net/xfrm/xfrm_compat.ko] undefined! > > Other users of this symbol are 'bool' options, but changing this to > bool would require XFRM_USER to be built in as well, which doesn't > seem worth it. Go back to kmalloc(). > > Fixes: 96392ee5a13b9 ("xfrm/compat: Translate 32-bit user_policy from > sockptr") > Cc: Dmitry Safonov <0x7f454...@gmail.com> > Cc: Maciej Żenczykowski > Cc: Steffen Klassert > Signed-off-by: Alistair Delva Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> Thank you! -- Dmitry
[PATCH 1/3] xfrm/compat: Translate by copying XFRMA_UNSPEC attribute
xfrm_xlate32() translates 64-bit message provided by kernel to be sent for 32-bit listener (acknowledge or monitor). Translator code doesn't expect XFRMA_UNSPEC attribute as it doesn't know its payload. Kernel never attaches such attribute, but a user can. I've searched if any opensource does it and the answer is no. Nothing on github and google finds only tfcproject that has such code commented-out. What will happen if a user sends a netlink message with XFRMA_UNSPEC attribute? Ipsec code ignores this attribute. But if there is a monitor-process or 32-bit user requested ack - kernel will try to translate such message and will hit WARN_ONCE() in xfrm_xlate64_attr(). Deal with XFRMA_UNSPEC by copying the attribute payload with xfrm_nla_cpy(). In result, the default switch-case in xfrm_xlate64_attr() becomes an unused code. Leave those 3 lines in case a new xfrm attribute will be added. Reported-by: syzbot+a7e701c8385bd8543...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index e28f0c9ecd6a..17edbf935e35 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -234,6 +234,7 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_PAD: /* Ignore */ return 0; + case XFRMA_UNSPEC: case XFRMA_ALG_AUTH: case XFRMA_ALG_CRYPT: case XFRMA_ALG_COMP: -- 2.28.0
[PATCH 3/3] xfrm/compat: Don't allocate memory with __GFP_ZERO
32-bit to 64-bit messages translator zerofies needed paddings in the translation, the rest is the actual payload. Don't allocate zero pages as they are not needed. Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 556e9f33b815..d8e8a11ca845 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -564,7 +564,7 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return NULL; len += NLMSG_HDRLEN; - h64 = kvmalloc(len, GFP_KERNEL | __GFP_ZERO); + h64 = kvmalloc(len, GFP_KERNEL); if (!h64) return ERR_PTR(-ENOMEM); -- 2.28.0
[PATCH 2/3] xfrm/compat: memset(0) 64-bit padding at right place
32-bit messages translated by xfrm_compat can have attributes attached. For all, but XFRMA_SA, XFRMA_POLICY the size of payload is the same in 32-bit UABI and 64-bit UABI. For XFRMA_SA (struct xfrm_usersa_info) and XFRMA_POLICY (struct xfrm_userpolicy_info) it's only tail-padding that is present in 64-bit payload, but not in 32-bit. The proper size for destination nlattr is already calculated by xfrm_user_rcv_calculate_len64() and allocated with kvmalloc(). xfrm_attr_cpy32() copies 32-bit copy_len into 64-bit attribute translated payload, zero-filling possible padding for SA/POLICY. Due to a typo, *pos already has 64-bit payload size, in a result next memset(0) is called on the memory after the translated attribute, not on the tail-padding of it. Reported-by: syzbot+c43831072e7df506a...@syzkaller.appspotmail.com Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 17edbf935e35..556e9f33b815 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -388,7 +388,7 @@ static int xfrm_attr_cpy32(void *dst, size_t *pos, const struct nlattr *src, memcpy(nla, src, nla_attr_size(copy_len)); nla->nla_len = nla_attr_size(payload); - *pos += nla_attr_size(payload); + *pos += nla_attr_size(copy_len); nlmsg->nlmsg_len += nla->nla_len; memset(dst + *pos, 0, payload - copy_len); -- 2.28.0
[PATCH 0/3] xfrm/compat: syzbot-found fixes
WARN_ON() for XFRMA_UNSPEC translation which likely no-one except syzkaller uses; properly zerofy tail-padding for 64-bit attribute; don't use __GFP_ZERO as the memory is initialized during translation. Cc: Steffen Klassert Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Herbert Xu Cc: Hillf Danton Cc: net...@vger.kernel.org Thanks, Dmitry Dmitry Safonov (3): xfrm/compat: Translate by copying XFRMA_UNSPEC attribute xfrm/compat: memset(0) 64-bit padding at right place xfrm/compat: Don't allocate memory with __GFP_ZERO net/xfrm/xfrm_compat.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) base-commit: 07e0887302450a62f51dba72df6afb5fabb23d1c -- 2.28.0
Re: [PATCH 2/4] arm64: hide more compat_vdso code
On 10/29/20 1:35 PM, Arnd Bergmann wrote: > On Mon, Oct 26, 2020 at 5:55 PM Mark Rutland wrote: >> On Mon, Oct 26, 2020 at 05:03:29PM +0100, Arnd Bergmann wrote: >>> From: Arnd Bergmann >>> >>> When CONFIG_COMPAT_VDSO is disabled, we get a warning >>> about a potential out-of-bounds access: >>> >>> arch/arm64/kernel/vdso.c: In function 'aarch32_vdso_mremap': >>> arch/arm64/kernel/vdso.c:86:37: warning: array subscript 1 is above array >>> bounds of 'struct vdso_abi_info[1]' [-Warray-bounds] >>>86 | unsigned long vdso_size = vdso_info[abi].vdso_code_end - >>> |~^ >>> >>> This is all in dead code however that the compiler is unable to >>> eliminate by itself. >>> >>> Change the array to individual local variables that can be >>> dropped in dead code elimination to let the compiler understand >>> this better. >>> >>> Fixes: 0cbc2659123e ("arm64: vdso32: Remove a bunch of #ifdef >>> CONFIG_COMPAT_VDSO guards") >>> Signed-off-by: Arnd Bergmann >> >> This looks like a nice cleanup to me! I agree we don't need the array >> here. >> >> Reviewed-by: Mark Rutland > > Thanks! > > I see the patch now conflicts with "mm: forbid splitting special mappings" > in -mm, by Dmitry Safonov. I have rebased my patch on top, should > I send it to Andrew for inclusion in -mm then? Makes sense to me. I plan to add some more patches on top that will make tracking of user landing (on vdso/sigpage/etc) common between architectures in generic code. So, I think it's probably good idea to keep it in one place, -mm tree seems like a proper place for it. Thanks, Dmitry
Re: WARNING in xfrm_alloc_compat
On 10/28/20 10:45 AM, Steffen Klassert wrote: > Same here, Dmitry please look into it. Looking on both, thanks! > I guess we can just remove the WARN_ON() that > triggeres here. Thanks, Dmitry
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
Hi Christophe, Will, On 10/23/20 12:57 PM, Christophe Leroy wrote: > > > Le 23/10/2020 à 13:25, Will Deacon a écrit : >> On Fri, Oct 23, 2020 at 01:22:04PM +0200, Christophe Leroy wrote: >>> Hi Dmitry, [..] >>> I haven't seen the patches, did you sent them out finally ? I was working on .close() hook, but while cooking it, I thought it may be better to make tracking of user landing generic. Note that the vdso base address is mostly needed by kernel as an address to land in userspace after processing a signal. I have some raw patches that add +#ifdef CONFIG_ARCH_HAS_USER_LANDING + struct vm_area_struct *user_landing; +#endif inside mm_struct and I plan to finish them after rc1 gets released. While working on that, I noticed that arm32 and some other architectures track vdso position in mm.context with the only reason to add AT_SYSINFO_EHDR in the elf header that's being loaded. That's quite overkill to have a pointer in mm.context that rather can be a local variable in elf binfmt loader. Also, I found some issues with mremap code. The patches series mentioned are at the base of the branch with generic user landing. I have sent only those patches not the full branch as I remember there was a policy that during merge window one should send only fixes, rather than refactoring/new code. >> I think it's this series: >> >> https://lore.kernel.org/r/20201013013416.390574-1-d...@arista.com >> >> but they look really invasive to me, so I may cook a small hack for arm64 >> in the meantine / for stable. I don't mind small hacks, but I'm concerned that the suggested fix which sets `mm->context.vdso_base = 0` on munmap() may have it's issue: that won't work if a user for whatever-broken-reason will mremap() vdso on 0 address. As the fix supposes to fix an issue that hasn't fired for anyone yet, it probably shouldn't introduce another. That's why I've used vm_area_struct to track vdso position in the patches set. 
Probably, temporarily, you could use something like: #define BAD_VDSO_ADDRESS (-1UL) Or a non-page-aligned address. But the signal code that checks if it can land on vdso/sigpage should also be aligned with the new definition. > Not sure we are talking about the same thing. > > I can't see any new .close function added to vm_special_mapping in order > to replace arch_unmap() hook. Thanks, Dmitry
Re: [PATCH 1/2 v2] futex: adjust a futex timeout with a per-timens offset
On 10/15/20 5:00 PM, Andrei Vagin wrote: > For all commands except FUTEX_WAIT, timeout is interpreted as an > absolute value. This absolute value is inside the task's time namespace > and has to be converted to the host's time. > > Cc: > Fixes: 5a590f35add9 ("posix-clocks: Wire up clock_gettime() with timens > offsets") > Reported-by: Hans van der Laan > Signed-off-by: Andrei Vagin Reviewed-by: Dmitry Safonov <0x7f454...@gmail.com> > --- > > v2: > * check FUTEX_CLOCK_REALTIME properly > * fix futex_time32 too > > kernel/futex.c | 5 + > 1 file changed, 5 insertions(+) > > diff --git a/kernel/futex.c b/kernel/futex.c > index a5876694a60e..32056d2d4171 100644 > --- a/kernel/futex.c > +++ b/kernel/futex.c > @@ -39,6 +39,7 @@ > #include > #include > #include > +#include > > #include > > @@ -3797,6 +3798,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, > u32, val, > t = timespec64_to_ktime(ts); > if (cmd == FUTEX_WAIT) > t = ktime_add_safe(ktime_get(), t); > + else if (!(op & FUTEX_CLOCK_REALTIME)) > + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); > tp = > } > /* > @@ -3989,6 +3992,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, > op, u32, val, > t = timespec64_to_ktime(ts); > if (cmd == FUTEX_WAIT) > t = ktime_add_safe(ktime_get(), t); > + else if (!(op & FUTEX_CLOCK_REALTIME)) > + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); > tp = > } > if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || > Thanks, Dmitry
Re: [PATCH 1/2] futex: adjust a futex timeout with a per-timens offset
On 10/15/20 8:29 AM, Andrei Vagin wrote: > For all commands except FUTEX_WAIT, timeout is interpreted as an > absolute value. This absolute value is inside the task's time namespace > and has to be converted to the host's time. > > Cc: > Fixes: 5a590f35add9 ("posix-clocks: Wire up clock_gettime() with timens > offsets") > Reported-by: Hans van der Laan > Signed-off-by: Andrei Vagin [..] > @@ -3797,6 +3798,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, > u32, val, > t = timespec64_to_ktime(ts); > if (cmd == FUTEX_WAIT) > t = ktime_add_safe(ktime_get(), t); > + else if (!(cmd & FUTEX_CLOCK_REALTIME)) > + t = timens_ktime_to_host(CLOCK_MONOTONIC, t); Err, it probably should be : else if (!(op & FUTEX_CLOCK_REALTIME)) And there's also : SYSCALL_DEFINE6(futex_time32, ...) which wants to be patched. Thanks, Dmitry
[PATCH 5/6] mremap: Check if it's possible to split original vma
If original VMA can't be split at the desired address, do_munmap() will fail and leave both new-copied VMA and old VMA. De-facto it's MREMAP_DONTUNMAP behaviour, which is unexpected. Currently, it may fail such way for hugetlbfs and dax device mappings. Minimize such unpleasant situations to OOM by checking .may_split() before attempting to create a VMA copy. Signed-off-by: Dmitry Safonov --- mm/mremap.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 898e9818ba6d..3c4047c23673 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -343,7 +343,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; - int err; + int err = 0; bool need_rmap_locks; /* @@ -353,6 +353,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (vma->vm_ops && vma->vm_ops->may_split) { + if (vma->vm_start != old_addr) + err = vma->vm_ops->may_split(vma, old_addr); + if (!err && vma->vm_end != old_addr + old_len) + err = vma->vm_ops->may_split(vma, old_addr + old_len); + if (err) + return err; + } + /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new -- 2.28.0
[PATCH 0/6] mremap: move_vma() fixes
1 - seems to be historical issue on a rarely taken path 2,3 - fixes related to the new mremap() flag 5 - dax device/hugetlbfs possible issue 4,6 - refactoring As those seems to be actual issues, sending this during the merge-window. (Changes to architecture code are in the 6 patch, but Cc'ing maintainers on cover for the context, I hope it's fine). Cc: Alexander Viro Cc: Andrew Morton Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Mike Kravetz Cc: Minchan Kim Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Cc: linux-...@kvack.org Cc: linux-fsde...@vger.kernel.org Cc: linux...@kvack.org Dmitry Safonov (6): mm/mremap: Account memory on do_munmap() failure mm/mremap: For MREMAP_DONTUNMAP check security_vm_enough_memory_mm() mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio vm_ops: Rename .split() callback to .may_split() mremap: Check if it's possible to split original vma mm: Forbid splitting special mappings arch/arm/kernel/vdso.c| 9 arch/arm64/kernel/vdso.c | 41 ++- arch/mips/vdso/genvdso.c | 4 -- arch/s390/kernel/vdso.c | 11 + arch/x86/entry/vdso/vma.c | 17 arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- drivers/dax/device.c | 4 +- fs/aio.c | 5 ++- include/linux/mm.h| 5 ++- ipc/shm.c | 8 ++-- mm/hugetlb.c | 2 +- mm/mmap.c | 22 -- mm/mremap.c | 50 +++ 13 files changed, 63 insertions(+), 117 deletions(-) base-commit: bbf5c979011a099af5dc76498918ed7df445635b -- 2.28.0
[PATCH 1/6] mm/mremap: Account memory on do_munmap() failure
move_vma() copies VMA without adding it to account, then unmaps old part of VMA. On failure it unmaps the new VMA. With hacks accounting in munmap is disabled as it's a copy of existing VMA. Account the memory on munmap() failure which was previously copied into a new VMA. Fixes: commit e2ea83742133 ("[PATCH] mremap: move_vma fixes and cleanup") Signed-off-by: Dmitry Safonov --- mm/mremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 138abbae4f75..03d31a0d4c67 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -450,7 +450,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - vm_unacct_memory(excess >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT) + vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } -- 2.28.0
[PATCH 6/6] mm: Forbid splitting special mappings
Don't allow splitting of vm_special_mapping's. It affects vdso/vvar areas. Uprobes have only one page in xol_area so they aren't affected. Those restrictions were enforced by checks in .mremap() callbacks. Restrict resizing with generic .split() callback. Signed-off-by: Dmitry Safonov --- arch/arm/kernel/vdso.c| 9 - arch/arm64/kernel/vdso.c | 41 +++ arch/mips/vdso/genvdso.c | 4 arch/s390/kernel/vdso.c | 11 +-- arch/x86/entry/vdso/vma.c | 17 mm/mmap.c | 12 6 files changed, 16 insertions(+), 78 deletions(-) diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index fddd08a6e063..3408269d19c7 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -50,15 +50,6 @@ static const struct vm_special_mapping vdso_data_mapping = { static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size; - - /* without VVAR page */ - vdso_size = (vdso_total_pages - 1) << PAGE_SHIFT; - - if (vdso_size != new_size) - return -EINVAL; - current->mm->context.vdso = new_vma->vm_start; return 0; diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index d4202a32abc9..a1a4220a405b 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -82,17 +82,9 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -static int __vdso_remap(enum vdso_abi abi, - const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) { - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - unsigned long vdso_size = vdso_info[abi].vdso_code_end - - vdso_info[abi].vdso_code_start; - - if (vdso_size != new_size) - return -EINVAL; - current->mm->context.vdso = (void *)new_vma->vm_start; return 0; @@ -223,17 +215,6 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, return 
vmf_insert_pfn(vma, vmf->address, pfn); } -static int vvar_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - unsigned long new_size = new_vma->vm_end - new_vma->vm_start; - - if (new_size != VVAR_NR_PAGES * PAGE_SIZE) - return -EINVAL; - - return 0; -} - static int __setup_additional_pages(enum vdso_abi abi, struct mm_struct *mm, struct linux_binprm *bprm, @@ -284,14 +265,6 @@ static int __setup_additional_pages(enum vdso_abi abi, /* * Create and map the vectors page for AArch32 tasks. */ -#ifdef CONFIG_COMPAT_VDSO -static int aarch32_vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - return __vdso_remap(VDSO_ABI_AA32, sm, new_vma); -} -#endif /* CONFIG_COMPAT_VDSO */ - enum aarch32_map { AA32_MAP_VECTORS, /* kuser helpers */ #ifdef CONFIG_COMPAT_VDSO @@ -313,11 +286,10 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { [AA32_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, - .mremap = vvar_mremap, }, [AA32_MAP_VDSO] = { .name = "[vdso]", - .mremap = aarch32_vdso_mremap, + .mremap = vdso_mremap, }, #endif /* CONFIG_COMPAT_VDSO */ [AA32_MAP_SIGPAGE] = { @@ -465,12 +437,6 @@ int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif /* CONFIG_COMPAT */ -static int vdso_mremap(const struct vm_special_mapping *sm, - struct vm_area_struct *new_vma) -{ - return __vdso_remap(VDSO_ABI_AA64, sm, new_vma); -} - enum aarch64_map { AA64_MAP_VVAR, AA64_MAP_VDSO, @@ -480,7 +446,6 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = { [AA64_MAP_VVAR] = { .name = "[vvar]", .fault = vvar_fault, - .mremap = vvar_mremap, }, [AA64_MAP_VDSO] = { .name = "[vdso]", diff --git a/arch/mips/vdso/genvdso.c b/arch/mips/vdso/genvdso.c index abb06ae04b40..09e30eb4be86 100644 --- a/arch/mips/vdso/genvdso.c +++ b/arch/mips/vdso/genvdso.c @@ -263,10 +263,6 @@ int main(int argc, char **argv) fprintf(out_file, " const struct vm_special_mapping *sm,\n"); 
fprintf(out_file, " struct vm_area_struct *new_vma)\n"); fprintf(out_file, "{\n&qu
[PATCH 4/6] vm_ops: Rename .split() callback to .may_split()
Rename the callback to reflect that it's not called *on* or *after* split, but rather some time before the splitting to check if it's possible. Signed-off-by: Dmitry Safonov --- drivers/dax/device.c | 4 ++-- include/linux/mm.h | 3 ++- ipc/shm.c| 8 mm/hugetlb.c | 2 +- mm/mmap.c| 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 1e89513f3c59..223dd1d13d5c 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -276,7 +276,7 @@ static vm_fault_t dev_dax_fault(struct vm_fault *vmf) return dev_dax_huge_fault(vmf, PE_SIZE_PTE); } -static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr) +static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *filp = vma->vm_file; struct dev_dax *dev_dax = filp->private_data; @@ -299,7 +299,7 @@ static unsigned long dev_dax_pagesize(struct vm_area_struct *vma) static const struct vm_operations_struct dax_vm_ops = { .fault = dev_dax_fault, .huge_fault = dev_dax_huge_fault, - .split = dev_dax_split, + .may_split = dev_dax_may_split, .pagesize = dev_dax_pagesize, }; diff --git a/include/linux/mm.h b/include/linux/mm.h index fd51a4a1f722..90887661ef44 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -549,7 +549,8 @@ enum page_entry_size { struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); - int (*split)(struct vm_area_struct * area, unsigned long addr); + /* Called any time before splitting to check if it's allowed */ + int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, diff --git a/ipc/shm.c b/ipc/shm.c index e25c7c6106bc..febd88daba8c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -434,13 +434,13 @@ static vm_fault_t shm_fault(struct vm_fault *vmf) return 
sfd->vm_ops->fault(vmf); } -static int shm_split(struct vm_area_struct *vma, unsigned long addr) +static int shm_may_split(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct shm_file_data *sfd = shm_file_data(file); - if (sfd->vm_ops->split) - return sfd->vm_ops->split(vma, addr); + if (sfd->vm_ops->may_split) + return sfd->vm_ops->may_split(vma, addr); return 0; } @@ -582,7 +582,7 @@ static const struct vm_operations_struct shm_vm_ops = { .open = shm_open, /* callback for a new vm-area open */ .close = shm_close,/* callback for when the vm-area is released */ .fault = shm_fault, - .split = shm_split, + .may_split = shm_may_split, .pagesize = shm_pagesize, #if defined(CONFIG_NUMA) .set_policy = shm_set_policy, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67fc6383995b..c4dc0e453be1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3759,7 +3759,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, - .split = hugetlb_vm_op_split, + .may_split = hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; diff --git a/mm/mmap.c b/mm/mmap.c index 50f853b0ec39..a62cb3ccafce 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2693,8 +2693,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (vma->vm_ops && vma->vm_ops->split) { - err = vma->vm_ops->split(vma, addr); + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); if (err) return err; } -- 2.28.0
[PATCH 2/6] mm/mremap: For MREMAP_DONTUNMAP check security_vm_enough_memory_mm()
Currently memory is accounted post-mremap() with MREMAP_DONTUNMAP, which may break overcommit policy. So, check if there's enough memory before doing actual VMA copy. Don't unset VM_ACCOUNT on MREMAP_DONTUNMAP. By semantics, such mremap() is actually a memory allocation. That also simplifies the error-path a little. Also, as it's memory allocation on success don't reset hiwater_vm value. Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov --- mm/mremap.c | 36 +--- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 03d31a0d4c67..c248f9a52125 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -365,11 +365,19 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { + if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + return -ENOMEM; + } + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(, new_addr, new_len, new_pgoff, _rmap_locks); - if (!new_vma) + if (!new_vma) { + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) + vm_unacct_memory(new_len >> PAGE_SHIFT); return -ENOMEM; + } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); @@ -398,7 +406,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT) { + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; excess = vma->vm_end - vma->vm_start - old_len; if (old_addr > vma->vm_start && @@ -423,34 +431,16 @@ static unsigned long move_vma(struct vm_area_struct *vma, untrack_pfn_moved(vma); if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { - if (vm_flags & VM_ACCOUNT) { - /* Always put back VM_ACCOUNT since we won't unmap */ - vma->vm_flags |= VM_ACCOUNT; - - vm_acct_memory(new_len >> PAGE_SHIFT); - } - 
- /* -* VMAs can actually be merged back together in copy_vma -* calling merge_vma. This can happen with anonymous vmas -* which have not yet been faulted, so if we were to consider -* this VMA split we'll end up adding VM_ACCOUNT on the -* next VMA, which is completely unrelated if this VMA -* was re-merged. -*/ - if (split && new_vma == vma) - split = 0; - /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; /* Because we won't unmap we don't need to touch locked_vm */ - goto out; + return new_addr; } if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - if (vm_flags & VM_ACCOUNT) + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } @@ -459,7 +449,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->locked_vm += new_len >> PAGE_SHIFT; *locked = true; } -out: + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ -- 2.28.0
[PATCH 3/6] mremap: Don't allow MREMAP_DONTUNMAP on special_mappings and aio
As kernel expect to see only one of such mappings, any further operations on the VMA-copy may be unexpected by the kernel. Maybe it's being on the safe side, but there doesn't seem to be any expected use-case for this, so restrict it now. Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- fs/aio.c | 5 - include/linux/mm.h| 2 +- mm/mmap.c | 6 +- mm/mremap.c | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 0daf2f1cf7a8..e916646adc69 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index d5ec30385566..3be3c0f77548 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -324,13 +324,16 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + spin_lock(>ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/include/linux/mm.h b/include/linux/mm.h index 16b799a0522c..fd51a4a1f722 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -550,7 +550,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct 
vm_area_struct * area); + int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/mm/mmap.c b/mm/mmap.c index bdd19f5b994e..50f853b0ec39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3372,10 +3372,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma) +static int special_mapping_mremap(struct vm_area_struct *new_vma, + unsigned long flags) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mremap.c b/mm/mremap.c index c248f9a52125..898e9818ba6d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -384,7 +384,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { -- 2.28.0
Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
On 9/27/20 8:43 AM, Christophe Leroy wrote: > > > Le 21/09/2020 à 13:26, Will Deacon a écrit : >> On Fri, Aug 28, 2020 at 12:14:28PM +1000, Michael Ellerman wrote: >>> Dmitry Safonov <0x7f454...@gmail.com> writes: [..] >>>> I'll cook a patch for vm_special_mapping if you don't mind :-) >>> >>> That would be great, thanks! >> >> I lost track of this one. Is there a patch kicking around to resolve >> this, >> or is the segfault expected behaviour? >> > > IIUC dmitry said he will cook a patch. I have not seen any patch yet. Yes, sorry about the delay - I was a bit busy with xfrm patches. I'll send patches for .close() this week, working on them now. > AFAIKS, among the architectures having VDSO sigreturn trampolines, only > SH, X86 and POWERPC provide alternative trampoline on stack when VDSO is > not there. > > All other architectures just having a VDSO don't expect VDSO to not be > mapped. > > As far as nowadays stacks are mapped non-executable, getting a segfault > is expected behaviour. However, I think we should really make it > cleaner. Today it segfaults because it is still pointing to the VDSO > trampoline that has been unmapped. But should the user map some other > code at the same address, we'll run in the weeds on signal return instead > of segfaulting. +1. > So VDSO unmapping should really be properly managed, the reference > should be properly cleared in order to segfault in a controllable manner. > > Only powerpc has a hook to properly clear the VDSO pointer when VDSO is > unmapped. Thanks, Dmitry
[PATCH v3 6/7] xfrm/compat: Translate 32-bit user_policy from sockptr
Provide compat_xfrm_userpolicy_info translation for xfrm setsocketopt(). Reallocate buffer and put the missing padding for 64-bit message. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 3 +++ net/xfrm/xfrm_compat.c | 26 ++ net/xfrm/xfrm_state.c | 17 ++--- 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index fa18cb6bb3f7..53618a31634b 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2012,6 +2012,9 @@ struct xfrm_translator { int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack); + /* Translate 32-bit user_policy from sockptr */ + int (*xlate_user_policy_sockptr)(u8 **pdata32, int optlen); + struct module *owner; }; diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index b1b5f972538d..e28f0c9ecd6a 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -576,10 +576,36 @@ static struct nlmsghdr *xfrm_user_rcv_msg_compat(const struct nlmsghdr *h32, return h64; } +static int xfrm_user_policy_compat(u8 **pdata32, int optlen) +{ + struct compat_xfrm_userpolicy_info *p = (void *)*pdata32; + u8 *src_templates, *dst_templates; + u8 *data64; + + if (optlen < sizeof(*p)) + return -EINVAL; + + data64 = kmalloc_track_caller(optlen + 4, GFP_USER | __GFP_NOWARN); + if (!data64) + return -ENOMEM; + + memcpy(data64, *pdata32, sizeof(*p)); + memset(data64 + sizeof(*p), 0, 4); + + src_templates = *pdata32 + sizeof(*p); + dst_templates = data64 + sizeof(*p) + 4; + memcpy(dst_templates, src_templates, optlen - sizeof(*p)); + + kfree(*pdata32); + *pdata32 = data64; + return 0; +} + static struct xfrm_translator xfrm_translator = { .owner = THIS_MODULE, .alloc_compat = xfrm_alloc_compat, .rcv_msg_compat = xfrm_user_rcv_msg_compat, + .xlate_user_policy_sockptr = xfrm_user_policy_compat, }; static int __init xfrm_compat_init(void) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index cc206ca3df78..f9961884500b 100644 --- a/net/xfrm/xfrm_state.c +++ 
b/net/xfrm/xfrm_state.c @@ -2331,9 +2331,6 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; - if (in_compat_syscall()) - return -EOPNOTSUPP; - if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); @@ -2348,6 +2345,20 @@ int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) if (IS_ERR(data)) return PTR_ERR(data); + if (in_compat_syscall()) { + struct xfrm_translator *xtr = xfrm_get_translator(); + + if (!xtr) + return -EOPNOTSUPP; + + err = xtr->xlate_user_policy_sockptr(, optlen); + xfrm_put_translator(xtr); + if (err) { + kfree(data); + return err; + } + } + err = -EINVAL; rcu_read_lock(); list_for_each_entry_rcu(km, _km_list, list) { -- 2.28.0
[PATCH v3 1/7] xfrm: Provide API to register translator module
Add a skeleton for xfrm_compat module and provide API to register it in xfrm_state.ko. struct xfrm_translator will have function pointers to translate messages received from 32-bit userspace or to be sent to it from 64-bit kernel. module_get()/module_put() are used instead of rcu_read_lock() as the module will vmalloc() memory for translation. The new API is registered with xfrm_state module, not with xfrm_user as the former needs translator for user_policy set by setsockopt() and xfrm_user already uses functions from xfrm_state. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 19 + net/xfrm/Kconfig | 10 +++ net/xfrm/Makefile | 1 + net/xfrm/xfrm_compat.c | 29 net/xfrm/xfrm_state.c | 60 ++ 5 files changed, 119 insertions(+) create mode 100644 net/xfrm/xfrm_compat.c diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2737d24ec244..fe2e3717da14 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2000,6 +2000,25 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, return 0; } +struct xfrm_translator { + struct module *owner; +}; + +#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) +extern int xfrm_register_translator(struct xfrm_translator *xtr); +extern int xfrm_unregister_translator(struct xfrm_translator *xtr); +extern struct xfrm_translator *xfrm_get_translator(void); +extern void xfrm_put_translator(struct xfrm_translator *xtr); +#else +static inline struct xfrm_translator *xfrm_get_translator(void) +{ + return NULL; +} +static inline void xfrm_put_translator(struct xfrm_translator *xtr) +{ +} +#endif + #if IS_ENABLED(CONFIG_IPV6) static inline bool xfrm6_local_dontfrag(const struct sock *sk) { diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 5b9a5ab48111..e79b48dab61b 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -28,6 +28,16 @@ config XFRM_USER If unsure, say Y. 
+config XFRM_USER_COMPAT + tristate "Compatible ABI support" + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + select WANT_COMPAT_NETLINK_MESSAGES + help + Transformation(XFRM) user configuration interface like IPsec + used by compatible Linux applications. + + If unsure, say N. + config XFRM_INTERFACE tristate "Transformation virtual interface" depends on XFRM && IPV6 diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 2d4bb4b9f75e..494aa744bfb9 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o +obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c new file mode 100644 index ..f01d9af41c55 --- /dev/null +++ b/net/xfrm/xfrm_compat.c @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM compat layer + * Author: Dmitry Safonov + * Based on code and translator idea by: Florian Westphal + */ +#include +#include +#include + +static struct xfrm_translator xfrm_translator = { + .owner = THIS_MODULE, +}; + +static int __init xfrm_compat_init(void) +{ + return xfrm_register_translator(_translator); +} + +static void __exit xfrm_compat_exit(void) +{ + xfrm_unregister_translator(_translator); +} + +module_init(xfrm_compat_init); +module_exit(xfrm_compat_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dmitry Safonov"); +MODULE_DESCRIPTION("XFRM 32-bit compatibility layer"); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 69520ad3d83b..cc206ca3df78 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2264,6 +2264,66 @@ static bool km_is_alive(const struct km_event *c) return is_alive; } +#if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) +static 
DEFINE_SPINLOCK(xfrm_translator_lock); +static struct xfrm_translator __rcu *xfrm_translator; + +struct xfrm_translator *xfrm_get_translator(void) +{ + struct xfrm_translator *xtr; + + rcu_read_lock(); + xtr = rcu_dereference(xfrm_translator); + if (unlikely(!xtr)) + goto out; + if (!try_module_get(xtr->owner)) + xtr = NULL; +out: + rcu_read_unlock(); + return xtr; +} +EXPORT_SYMBOL_GPL(xfrm_get_translator); + +void xfrm_put_translator(struct xfrm_translator *xtr) +{ + module_put(xtr->owner); +} +EXPORT_SYMBOL_GPL(xfrm_put_translator); + +int xfrm_register_translator(struct xfrm_translator *xtr) +{ + int err = 0; + + spin_lock_bh(_translator_lock); + if (unlike
[PATCH v3 3/7] xfrm/compat: Attach xfrm dumps to 64=>32 bit translator
Currently nlmsg_unicast() is used by functions that dump structures that can be different in size for compat tasks, see dump_one_state() and dump_one_policy(). The following nlmsg_unicast() users exist today in xfrm: Function |Message can be different | in size on compat ---|-- xfrm_get_spdinfo() | N xfrm_get_sadinfo() | N xfrm_get_sa() | Y xfrm_alloc_userspi() | Y xfrm_get_policy() | Y xfrm_get_ae() | N Besides, dump_one_state() and dump_one_policy() can be used by filtered netlink dump for XFRM_MSG_GETSA, XFRM_MSG_GETPOLICY. Just as for xfrm multicast, allocate frag_list for compat skb journey down to recvmsg() which will give user the desired skb according to syscall bitness. Signed-off-by: Dmitry Safonov --- net/xfrm/xfrm_user.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 3769227ed4e1..7fd7b16a8805 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -975,6 +975,7 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) struct xfrm_dump_info *sp = ptr; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; + struct xfrm_translator *xtr; struct xfrm_usersa_info *p; struct nlmsghdr *nlh; int err; @@ -992,6 +993,18 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) return err; } nlmsg_end(skb, nlh); + + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlh); + + xfrm_put_translator(xtr); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + } + return 0; } @@ -1320,6 +1333,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct xfrm_state *x; struct xfrm_userspi_info *p; + struct xfrm_translator *xtr; struct sk_buff *resp_skb; xfrm_address_t *daddr; int family; @@ -1370,6 +1384,17 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, 
nlmsg_hdr(skb)); + + xfrm_put_translator(xtr); + if (err) { + kfree_skb(resp_skb); + goto out; + } + } + err = nlmsg_unicast(net->xfrm.nlsk, resp_skb, NETLINK_CB(skb).portid); out: @@ -1776,6 +1801,7 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr struct xfrm_userpolicy_info *p; struct sk_buff *in_skb = sp->in_skb; struct sk_buff *skb = sp->out_skb; + struct xfrm_translator *xtr; struct nlmsghdr *nlh; int err; @@ -1800,6 +1826,18 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr return err; } nlmsg_end(skb, nlh); + + xtr = xfrm_get_translator(); + if (xtr) { + err = xtr->alloc_compat(skb, nlh); + + xfrm_put_translator(xtr); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + } + return 0; } -- 2.28.0
[PATCH v3 0/7] xfrm: Add compat layer
Changes since v2: - added struct xfrm_translator as API to register xfrm_compat.ko with xfrm_state.ko. This allows compilation of translator as a loadable module - fixed indentation and collected reviewed-by (Johannes Berg) - moved boilerplate from commit messages into cover-letter (Steffen Klassert) - found on KASAN build and fixed non-initialised stack variable usage in the translator The resulting v2/v3 diff can be found here: https://gist.github.com/0x7f454c46/8f68311dfa1f240959fdbe7c77ed2259 Patches as a .git branch: https://github.com/0x7f454c46/linux/tree/xfrm-compat-v3 Changes since v1: - reworked the patch set to use a translator - separated the compat layer into xfrm_compat.c, compiled under XFRM_USER_COMPAT config - 32-bit messages now being sent in frag_list (like wext-core does) - instead of __packed add compat_u64 members in compat structures - selftest reworked to kselftest lib API - added netlink dump testing to the selftest XFRM is disabled for compatible users because of the UABI difference. The difference is in structure padding, and as a result the size of netlink messages differs. The possibility for a compatible application to manage xfrm tunnels was disabled by: the commit 19d7df69fdb2 ("xfrm: Refuse to insert 32 bit userspace socket policies on 64 bit systems") and the commit 74005991b78a ("xfrm: Do not parse 32bits compiled xfrm netlink msg on 64bits host"). This is my second attempt to resolve the xfrm/compat problem by adding the 64=>32 and 32=>64 bit translators that, invisibly to the user, provide translation between a compatible userspace and the kernel. The previous attempt was to interpret the message ABI according to the syscall bitness in xfrm_user, which resulted in over-complicated code [1]. Florian Westphal provided the idea of the translator and some draft patches in the discussion. In these patches, his idea is reused and some of his initial code is also present. 
There were a couple of attempts to solve the xfrm compat problem: https://lkml.org/lkml/2017/1/20/733 https://patchwork.ozlabs.org/patch/44600/ http://netdev.vger.kernel.narkive.com/2Gesykj6/patch-net-next-xfrm-correctly-parse-netlink-msg-from-32bits-ip-command-on-64bits-host All the discussions ended in the conclusion that xfrm should have a full compatibility layer to work correctly with 32-bit applications on 64-bit kernels: https://lkml.org/lkml/2017/1/23/413 https://patchwork.ozlabs.org/patch/433279/ In a recent lkml discussion, Linus said that it's worth fixing this problem rather than giving people an excuse to stay on a 32-bit kernel: https://lkml.org/lkml/2018/2/13/752 There is also a selftest for ipsec tunnels. It doesn't depend on any library and a compat version can be easily built with: make CFLAGS=-m32 net/ipsec [1]: https://lkml.kernel.org/r/20180726023144.31066-1-d...@arista.com Cc: "David S. Miller" Cc: Florian Westphal Cc: Herbert Xu Cc: Jakub Kicinski Cc: Johannes Berg Cc: Steffen Klassert Cc: Stephen Suryaputra Cc: Dmitry Safonov <0x7f454...@gmail.com> Cc: net...@vger.kernel.org Dmitry Safonov (7): xfrm: Provide API to register translator module xfrm/compat: Add 64=>32-bit messages translator xfrm/compat: Attach xfrm dumps to 64=>32 bit translator netlink/compat: Append NLMSG_DONE/extack to frag_list xfrm/compat: Add 32=>64-bit messages translator xfrm/compat: Translate 32-bit user_policy from sockptr selftest/net/xfrm: Add test for ipsec tunnel MAINTAINERS|1 + include/net/xfrm.h | 33 + net/netlink/af_netlink.c | 47 +- net/xfrm/Kconfig | 11 + net/xfrm/Makefile |1 + net/xfrm/xfrm_compat.c | 625 +++ net/xfrm/xfrm_state.c | 77 +- net/xfrm/xfrm_user.c | 110 +- tools/testing/selftests/net/.gitignore |1 + tools/testing/selftests/net/Makefile |1 + tools/testing/selftests/net/ipsec.c| 2195 11 files changed, 3066 insertions(+), 36 deletions(-) create mode 100644 net/xfrm/xfrm_compat.c create mode 100644 tools/testing/selftests/net/ipsec.c base-commit: 
ba4f184e126b751d1bffad5897f263108befc780 -- 2.28.0
[PATCH v3 5/7] xfrm/compat: Add 32=>64-bit messages translator
Provide the user-to-kernel translator under XFRM_USER_COMPAT, that creates for 32-bit xfrm-user message a 64-bit translation. The translation is afterwards reused by xfrm_user code just as if userspace had sent 64-bit message. Signed-off-by: Dmitry Safonov --- include/net/xfrm.h | 6 + net/xfrm/Kconfig | 3 +- net/xfrm/xfrm_compat.c | 274 + net/xfrm/xfrm_user.c | 57 ++--- 4 files changed, 321 insertions(+), 19 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 5b6cc62c9354..fa18cb6bb3f7 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2001,11 +2001,17 @@ static inline int xfrm_tunnel_check(struct sk_buff *skb, struct xfrm_state *x, } extern const int xfrm_msg_min[XFRM_NR_MSGTYPES]; +extern const struct nla_policy xfrma_policy[XFRMA_MAX+1]; struct xfrm_translator { /* Allocate frag_list and put compat translation there */ int (*alloc_compat)(struct sk_buff *skb, const struct nlmsghdr *src); + /* Allocate nlmsg with 64-bit translaton of received 32-bit message */ + struct nlmsghdr *(*rcv_msg_compat)(const struct nlmsghdr *nlh, + int maxtype, const struct nla_policy *policy, + struct netlink_ext_ack *extack); + struct module *owner; }; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index e79b48dab61b..3adf31a83a79 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -30,7 +30,8 @@ config XFRM_USER config XFRM_USER_COMPAT tristate "Compatible ABI support" - depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT + depends on XFRM_USER && COMPAT_FOR_U64_ALIGNMENT && \ + HAVE_EFFICIENT_UNALIGNED_ACCESS select WANT_COMPAT_NETLINK_MESSAGES help Transformation(XFRM) user configuration interface like IPsec diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index aece41b44ff2..b1b5f972538d 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -96,6 +96,39 @@ static const int compat_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_MAPPING - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_mapping) }; +static const struct nla_policy 
compat_policy[XFRMA_MAX+1] = { + [XFRMA_SA] = { .len = XMSGSIZE(compat_xfrm_usersa_info)}, + [XFRMA_POLICY] = { .len = XMSGSIZE(compat_xfrm_userpolicy_info)}, + [XFRMA_LASTUSED]= { .type = NLA_U64}, + [XFRMA_ALG_AUTH_TRUNC] = { .len = sizeof(struct xfrm_algo_auth)}, + [XFRMA_ALG_AEAD]= { .len = sizeof(struct xfrm_algo_aead) }, + [XFRMA_ALG_AUTH]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_CRYPT] = { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ALG_COMP]= { .len = sizeof(struct xfrm_algo) }, + [XFRMA_ENCAP] = { .len = sizeof(struct xfrm_encap_tmpl) }, + [XFRMA_TMPL]= { .len = sizeof(struct xfrm_user_tmpl) }, + [XFRMA_SEC_CTX] = { .len = sizeof(struct xfrm_sec_ctx) }, + [XFRMA_LTIME_VAL] = { .len = sizeof(struct xfrm_lifetime_cur) }, + [XFRMA_REPLAY_VAL] = { .len = sizeof(struct xfrm_replay_state) }, + [XFRMA_REPLAY_THRESH] = { .type = NLA_U32 }, + [XFRMA_ETIMER_THRESH] = { .type = NLA_U32 }, + [XFRMA_SRCADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_COADDR] = { .len = sizeof(xfrm_address_t) }, + [XFRMA_POLICY_TYPE] = { .len = sizeof(struct xfrm_userpolicy_type)}, + [XFRMA_MIGRATE] = { .len = sizeof(struct xfrm_user_migrate) }, + [XFRMA_KMADDRESS] = { .len = sizeof(struct xfrm_user_kmaddress) }, + [XFRMA_MARK]= { .len = sizeof(struct xfrm_mark) }, + [XFRMA_TFCPAD] = { .type = NLA_U32 }, + [XFRMA_REPLAY_ESN_VAL] = { .len = sizeof(struct xfrm_replay_state_esn) }, + [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, + [XFRMA_PROTO] = { .type = NLA_U8 }, + [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, + [XFRMA_SET_MARK]= { .type = NLA_U32 }, + [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 }, + [XFRMA_IF_ID] = { .type = NLA_U32 }, +}; + static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src, u16 type) { @@ -303,9 +336,250 @@ static int xfrm_alloc_compat(struct sk_buff *skb, const struct nlmsghdr *nlh_src return 0; } +/* Calculates len 
of translated 64-bit message. */ +static size_t xfrm_user_rcv_calculate_len64(const struct nlmsghdr *src, + struct nlattr *attrs[XFRMA_MAX+1]) +{ + size_t len = nlmsg_len(src); + + switch (src->nlmsg_type) { + case XFRM_MSG_NEWSA: + case XFRM_MSG_NEWP
[PATCH v3 4/7] netlink/compat: Append NLMSG_DONE/extack to frag_list
Modules that use netlink may supply a 2nd skb (via frag_list) that contains an alternative data set meant for applications using 32bit compatibility mode. In such a case, netlink_recvmsg will use this 2nd skb instead of the original one. Without this patch, such compat applications will retrieve all netlink dump data, but will then get an unexpected EOF. Cc: Johannes Berg Signed-off-by: Florian Westphal Signed-off-by: Dmitry Safonov Reviewed-by: Johannes Berg --- net/netlink/af_netlink.c | 47 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d2d1448274f5..de12dd3136f9 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2186,13 +2186,35 @@ EXPORT_SYMBOL(__nlmsg_put); * It would be better to create kernel thread. */ +static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb, +struct netlink_callback *cb, +struct netlink_ext_ack *extack) +{ + struct nlmsghdr *nlh; + + nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), + NLM_F_MULTI | cb->answer_flags); + if (WARN_ON(!nlh)) + return -ENOBUFS; + + nl_dump_check_consistent(cb, nlh); + memcpy(nlmsg_data(nlh), >dump_done_errno, sizeof(nlk->dump_done_errno)); + + if (extack->_msg && nlk->flags & NETLINK_F_EXT_ACK) { + nlh->nlmsg_flags |= NLM_F_ACK_TLVS; + if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)) + nlmsg_end(skb, nlh); + } + + return 0; +} + static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; - struct nlmsghdr *nlh; struct module *module; int err = -ENOBUFS; int alloc_min_size; @@ -2258,22 +2280,19 @@ static int netlink_dump(struct sock *sk) return 0; } - nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, - sizeof(nlk->dump_done_errno), - NLM_F_MULTI | cb->answer_flags); - if (WARN_ON(!nlh)) + if (netlink_dump_done(nlk, skb, cb, )) goto errout_skb; - 
nl_dump_check_consistent(cb, nlh); - - memcpy(nlmsg_data(nlh), >dump_done_errno, - sizeof(nlk->dump_done_errno)); - - if (extack._msg && nlk->flags & NETLINK_F_EXT_ACK) { - nlh->nlmsg_flags |= NLM_F_ACK_TLVS; - if (!nla_put_string(skb, NLMSGERR_ATTR_MSG, extack._msg)) - nlmsg_end(skb, nlh); +#ifdef CONFIG_COMPAT_NETLINK_MESSAGES + /* frag_list skb's data is used for compat tasks +* and the regular skb's data for normal (non-compat) tasks. +* See netlink_recvmsg(). +*/ + if (unlikely(skb_shinfo(skb)->frag_list)) { + if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, )) + goto errout_skb; } +#endif if (sk_filter(sk, skb)) kfree_skb(skb); -- 2.28.0