On Thu, Dec 6, 2018 at 7:02 AM David Abdurachmanov
<[email protected]> wrote:
>
> The patch adds support for SECCOMP and SECCOMP_FILTER (BPF).
>
> Signed-off-by: David Abdurachmanov <[email protected]>
> ---
>  arch/riscv/Kconfig                   | 14 ++++++++++++++
>  arch/riscv/include/asm/thread_info.h |  5 ++++-
>  arch/riscv/kernel/entry.S            | 27 +++++++++++++++++++++++++--
>  arch/riscv/kernel/ptrace.c           |  8 ++++++++
>  4 files changed, 51 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a4f48f757204..49cd8e251547 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -29,6 +29,7 @@ config RISCV
>  	select GENERIC_SMP_IDLE_THREAD
>  	select GENERIC_ATOMIC64 if !64BIT || !RISCV_ISA_A
>  	select HAVE_ARCH_AUDITSYSCALL
> +	select HAVE_ARCH_SECCOMP_FILTER
>  	select HAVE_MEMBLOCK_NODE_MAP
>  	select HAVE_DMA_CONTIGUOUS
>  	select HAVE_FUTEX_CMPXCHG if FUTEX
> @@ -228,6 +229,19 @@ menu "Kernel features"
>
>  source "kernel/Kconfig.hz"
>
> +config SECCOMP
> +	bool "Enable seccomp to safely compute untrusted bytecode"
> +	help
> +	  This kernel feature is useful for number crunching applications
> +	  that may need to compute untrusted bytecode during their
> +	  execution. By using pipes or other transports made available to
> +	  the process as file descriptors supporting the read/write
> +	  syscalls, it's possible to isolate those applications in
> +	  their own address space using seccomp. Once seccomp is
> +	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
> +	  and the task is only allowed to execute a few safe syscalls
> +	  defined by each seccomp mode.
> +
>  endmenu
>
>  menu "Boot options"
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 1c9cc8389928..1fd6e4130cab 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -81,6 +81,7 @@ struct thread_info {
>  #define TIF_MEMDIE		5	/* is terminating due to OOM killer */
>  #define TIF_SYSCALL_TRACEPOINT	6	/* syscall tracepoint instrumentation */
>  #define TIF_SYSCALL_AUDIT	7	/* syscall auditing */
> +#define TIF_SECCOMP		8	/* syscall secure computing */
>
>  #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
>  #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
> @@ -88,11 +89,13 @@ struct thread_info {
>  #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
>  #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
>  #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
> +#define _TIF_SECCOMP		(1 << TIF_SECCOMP)
>
>  #define _TIF_WORK_MASK \
>  	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED)
>
>  #define _TIF_SYSCALL_WORK \
> -	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT)
> +	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \
> +	 _TIF_SECCOMP)
>
>  #endif /* _ASM_RISCV_THREAD_INFO_H */
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index 355166f57205..e88ccbfa61ee 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -207,8 +207,25 @@ check_syscall_nr:
>  	/* Check to make sure we don't jump to a bogus syscall number. */
>  	li t0, __NR_syscalls
>  	la s0, sys_ni_syscall
> -	/* Syscall number held in a7 */
> -	bgeu a7, t0, 1f
> +	/*
> +	 * The tracer can change syscall number to valid/invalid value.
> +	 * We use syscall_set_nr helper in syscall_trace_enter thus we
> +	 * cannot trust the current value in a7 and have to reload from
> +	 * the current task pt_regs.
> +	 */
> +	REG_L a7, PT_A7(sp)
> +	/*
> +	 * Syscall number held in a7.
> +	 * If syscall number is above allowed value, redirect to ni_syscall.
> +	 */
> +	bge a7, t0, 1f
> +	/*
> +	 * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1.
> +	 * If yes, we pretend it was executed.
> +	 */
> +	li t1, -1
> +	beq a7, t1, ret_from_syscall_rejected
> +	/* Call syscall */
>  	la s0, sys_call_table
>  	slli t0, a7, RISCV_LGPTR
>  	add s0, s0, t0
> @@ -219,6 +236,12 @@ check_syscall_nr:
>  ret_from_syscall:
>  	/* Set user a0 to kernel a0 */
>  	REG_S a0, PT_A0(sp)
> +	/*
> +	 * We didn't execute the actual syscall.
> +	 * Seccomp already set return value for the current task pt_regs.
> +	 * (If it was configured with SECCOMP_RET_ERRNO/TRACE)
> +	 */
> +ret_from_syscall_rejected:
>  	/* Trace syscalls, but only if requested by the user. */
>  	REG_L t0, TASK_TI_FLAGS(tp)
>  	andi t0, t0, _TIF_SYSCALL_WORK
> diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
> index c1b51539c3e2..598e48b8ca2b 100644
> --- a/arch/riscv/kernel/ptrace.c
> +++ b/arch/riscv/kernel/ptrace.c
> @@ -160,6 +160,14 @@ void do_syscall_trace_enter(struct pt_regs *regs)
>  	if (tracehook_report_syscall_entry(regs))
>  		syscall_set_nr(current, regs, -1);
>
> +	/*
> +	 * Do the secure computing after ptrace; failures should be fast.
> +	 * If this fails we might have return value in a0 from seccomp
> +	 * (via SECCOMP_RET_ERRNO/TRACE).
> +	 */
> +	if (secure_computing(NULL) == -1)
> +		syscall_set_nr(current, regs, -1);
On a -1 return, this should return immediately -- it should not
continue to process trace_sys_enter(), etc.

-Kees

> +
>  #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
>  	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
>  		trace_sys_enter(regs, syscall_get_nr(current, regs));
> --
> 2.19.2
>

--
Kees Cook
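
For reference, a minimal sketch of the early-return shape being asked for
here, keeping the names used in the quoted patch. Having
do_syscall_trace_enter() report the rejection through an int return value
that the entry.S caller checks is an assumption of this sketch, not
something the submitted patch does:

/*
 * Sketch only: once secure_computing() rejects the syscall, stop all
 * further entry-time work (tracepoint, audit) and tell the caller to
 * skip the syscall.  Seccomp has already written any SECCOMP_RET_ERRNO
 * result into the task's pt_regs.
 */
int do_syscall_trace_enter(struct pt_regs *regs)
{
	if (test_thread_flag(TIF_SYSCALL_TRACE))
		if (tracehook_report_syscall_entry(regs))
			syscall_set_nr(current, regs, -1);

	/* Do the secure computing check after ptrace; failures should be fast. */
	if (secure_computing(NULL) == -1)
		return -1;

#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
	if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
		trace_sys_enter(regs, syscall_get_nr(current, regs));
#endif

	/* ... remaining entry work (e.g. audit) as in the existing file ... */
	return 0;
}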

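The SECCOMP_RET_ERRNO path referred to in the entry.S comments can be
observed from userspace with a small, self-contained test program. This is
only an illustration, not part of the patch: the choice of getppid() as the
blocked syscall is arbitrary, and a real filter should also validate
seccomp_data->arch before trusting the syscall number.

#include <errno.h>
#include <linux/filter.h>
#include <linux/seccomp.h>
#include <stddef.h>
#include <stdio.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* Load the syscall number from seccomp_data. */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* If it is getppid, fail it with EPERM ... */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getppid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		/* ... otherwise allow the syscall. */
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	/* Required so an unprivileged task may install a filter. */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	errno = 0;
	long ret = syscall(__NR_getppid);
	/* With the filter in place: ret == -1 and errno == EPERM. */
	printf("getppid() = %ld, errno = %d\n", ret, errno);
	return 0;
}

With the filter installed, the syscall body never runs and the caller simply
sees the errno chosen by the filter, which is the case the rejected-syscall
path in the assembly above is meant to handle.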
