Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-16 Thread Michael Kerrisk (man-pages)
Hi Kees,

On Wed, Jun 11, 2014 at 5:25 AM, Kees Cook  wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

I assume there'll be another iteration of these patches. With that
next iteration, could you write a man page (or at least free text
structured like a man page) that comprehensively documents the
user-space API?

Thanks,

Michael


> Signed-off-by: Kees Cook 
> Cc: linux-...@vger.kernel.org
> ---
>  arch/x86/syscalls/syscall_32.tbl  |1 +
>  arch/x86/syscalls/syscall_64.tbl  |1 +
>  include/linux/syscalls.h  |2 ++
>  include/uapi/asm-generic/unistd.h |4 ++-
>  include/uapi/linux/seccomp.h  |4 +++
>  kernel/seccomp.c  |   63 
> -
>  kernel/sys_ni.c   |3 ++
>  7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl 
> b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
>  351i386sched_setattr   sys_sched_setattr
>  352i386sched_getattr   sys_sched_getattr
>  353i386renameat2   sys_renameat2
> +354i386seccomp sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl 
> b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
>  314common  sched_setattr   sys_sched_setattr
>  315common  sched_getattr   sys_sched_getattr
>  316common  renameat2   sys_renameat2
> +317common  seccomp sys_seccomp
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>  unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
> flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> +   const char __user *uargs);
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h 
> b/include/uapi/asm-generic/unistd.h
> index 333640608087..65acbf0e2867 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
>  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
>  #define __NR_renameat2 276
>  __SYSCALL(__NR_renameat2, sys_renameat2)
> +#define __NR_seccomp 277
> +__SYSCALL(__NR_seccomp, sys_seccomp)
>
>  #undef __NR_syscalls
> -#define __NR_syscalls 277
> +#define __NR_syscalls 278
>
>  /*
>   * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f72973..b258878ba754 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,10 @@
>  #define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
>  #define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
>
> +/* Valid operations for seccomp syscall. */
> +#define SECCOMP_SET_MODE_STRICT0
> +#define SECCOMP_SET_MODE_FILTER1
> +
>  /*
>   * All BPF programs must return a 32-bit value.
>   * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 39d32c2904fc..c0cafa9e84af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  /* #define SECCOMP_DEBUG 1 */
>
> @@ -301,8 +302,8 @@ free_prog:
>   *
>   * Returns filter on success and ERR_PTR otherwise.
>   */
> -static
> -struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
> +static struct seccomp_filter *
> +seccomp_prepare_user_filter(const char __user *user_filter)
>  {
> struct sock_fprog fprog;
> struct seccomp_filter *filter = ERR_PTR(-EFAULT);
> @@ -325,19 +326,25 @@ out:
>
>  /**
>   * seccomp_attach_filter: validate and attach filter
> + * @flags:  flags to change filter behavior
>   * @filter: seccomp filter to add to the current process
>   *
>   * Caller must be holding current->sighand->siglock lock.
>   *
>   * Returns 0 on success, -ve on error.
>   */
> -static long seccomp_attach_filter(struct seccomp_filter *filter)
> +static long seccomp_attach_filter(unsigned int flags,
> + 

Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-16 Thread Michael Kerrisk (man-pages)
Hi Kees,

On Wed, Jun 11, 2014 at 5:25 AM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

I assume there'll be another iteration of these patches. With that
next iteration, could you write a man page (or at least free text
structured like a man page) that comprehensively documents the
user-space API?

Thanks,

Michael


 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);
  #endif
 diff --git a/include/uapi/asm-generic/unistd.h 
 b/include/uapi/asm-generic/unistd.h
 index 333640608087..65acbf0e2867 100644
 --- a/include/uapi/asm-generic/unistd.h
 +++ b/include/uapi/asm-generic/unistd.h
 @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
  #define __NR_renameat2 276
  __SYSCALL(__NR_renameat2, sys_renameat2)
 +#define __NR_seccomp 277
 +__SYSCALL(__NR_seccomp, sys_seccomp)

  #undef __NR_syscalls
 -#define __NR_syscalls 277
 +#define __NR_syscalls 278

  /*
   * All syscalls below here should go away really,
 diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
 index ac2dc9f72973..b258878ba754 100644
 --- a/include/uapi/linux/seccomp.h
 +++ b/include/uapi/linux/seccomp.h
 @@ -10,6 +10,10 @@
  #define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
  #define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */

 +/* Valid operations for seccomp syscall. */
 +#define SECCOMP_SET_MODE_STRICT0
 +#define SECCOMP_SET_MODE_FILTER1
 +
  /*
   * All BPF programs must return a 32-bit value.
   * The bottom 16-bits are for optional return data.
 diff --git a/kernel/seccomp.c b/kernel/seccomp.c
 index 39d32c2904fc..c0cafa9e84af 100644
 --- a/kernel/seccomp.c
 +++ b/kernel/seccomp.c
 @@ -19,6 +19,7 @@
  #include linux/sched.h
  #include linux/seccomp.h
  #include linux/slab.h
 +#include linux/syscalls.h

  /* #define SECCOMP_DEBUG 1 */

 @@ -301,8 +302,8 @@ free_prog:
   *
   * Returns filter on success and ERR_PTR otherwise.
   */
 -static
 -struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
 +static struct seccomp_filter *
 +seccomp_prepare_user_filter(const char __user *user_filter)
  {
 struct sock_fprog fprog;
 struct seccomp_filter *filter = ERR_PTR(-EFAULT);
 @@ -325,19 +326,25 @@ out:

  /**
   * seccomp_attach_filter: validate and attach filter
 + * @flags:  flags to change filter behavior
   * @filter: seccomp filter to add to the current process
   *
   * Caller must be holding current-sighand-siglock lock.
   *
   * Returns 0 on success, -ve on error.
   */
 -static long seccomp_attach_filter(struct seccomp_filter *filter)
 +static long seccomp_attach_filter(unsigned int flags,
 + struct 

Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Alexei Starovoitov
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski  wrote:
> On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov  wrote:
>> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski  wrote:
>>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov  
>>> wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>
> Signed-off-by: Kees Cook 
> Cc: linux-...@vger.kernel.org
> ---
>  arch/x86/syscalls/syscall_32.tbl  |1 +
>  arch/x86/syscalls/syscall_64.tbl  |1 +
>  include/linux/syscalls.h  |2 ++
>  include/uapi/asm-generic/unistd.h |4 ++-
>  include/uapi/linux/seccomp.h  |4 +++
>  kernel/seccomp.c  |   63 
> -
>  kernel/sys_ni.c   |3 ++
>  7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl 
> b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
>  351i386sched_setattr   sys_sched_setattr
>  352i386sched_getattr   sys_sched_getattr
>  353i386renameat2   sys_renameat2
> +354i386seccomp sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl 
> b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
>  314common  sched_setattr   sys_sched_setattr
>  315common  sched_getattr   sys_sched_getattr
>  316common  renameat2   sys_renameat2
> +317common  seccomp sys_seccomp
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>  unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
> flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and 
 the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'
>>>
>>> Eww.  If the operation doesn't imply the type, then I think we've
>>> totally screwed up.
>>>
 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.
>>>
>>> I see no reason to bring nl_attr into this.
>>>
>>> Admittedly, I've never dealt with nl_attr, but everything
>>> netlink-related I've even been involved in has involved some sort of
>>> API atrocity.
>>
>> netlink has a lot of legacy and there is genetlink which is not pretty
>> either because of extra socket creation, binding, dealing with packet
>> loss issues, but the key concept of variable length encoding is sound.
>> Right now seccomp has two commands and they already don't fit
>> into single syscall neatly. Are you saying there should be two syscalls
>> here? What about another seccomp related command? Another syscall?
>> imo all seccomp related commands needs to be mux/demux-ed under
>> one 

Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Kees Cook
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski  wrote:
> On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov  wrote:
>> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski  wrote:
>>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov  
>>> wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>
> Signed-off-by: Kees Cook 
> Cc: linux-...@vger.kernel.org
> ---
>  arch/x86/syscalls/syscall_32.tbl  |1 +
>  arch/x86/syscalls/syscall_64.tbl  |1 +
>  include/linux/syscalls.h  |2 ++
>  include/uapi/asm-generic/unistd.h |4 ++-
>  include/uapi/linux/seccomp.h  |4 +++
>  kernel/seccomp.c  |   63 
> -
>  kernel/sys_ni.c   |3 ++
>  7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl 
> b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
>  351i386sched_setattr   sys_sched_setattr
>  352i386sched_getattr   sys_sched_getattr
>  353i386renameat2   sys_renameat2
> +354i386seccomp sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl 
> b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
>  314common  sched_setattr   sys_sched_setattr
>  315common  sched_getattr   sys_sched_getattr
>  316common  renameat2   sys_renameat2
> +317common  seccomp sys_seccomp
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>  unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
> flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and 
 the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'
>>>
>>> Eww.  If the operation doesn't imply the type, then I think we've
>>> totally screwed up.
>>>
 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.
>>>
>>> I see no reason to bring nl_attr into this.
>>>
>>> Admittedly, I've never dealt with nl_attr, but everything
>>> netlink-related I've even been involved in has involved some sort of
>>> API atrocity.
>>
>> netlink has a lot of legacy and there is genetlink which is not pretty
>> either because of extra socket creation, binding, dealing with packet
>> loss issues, but the key concept of variable length encoding is sound.
>> Right now seccomp has two commands and they already don't fit
>> into single syscall neatly. Are you saying there should be two syscalls
>> here? What about another seccomp related command? Another syscall?
>> imo all seccomp related commands needs to be mux/demux-ed under
>> one 

Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Kees Cook
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov  wrote:
> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>> parameter for future expansion. The third argument is a pointer value,
>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>
>> Signed-off-by: Kees Cook 
>> Cc: linux-...@vger.kernel.org
>> ---
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index b0881a0ed322..1713977ee26f 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>  unsigned long idx1, unsigned long idx2);
>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
>> flags);
>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>> +   const char __user *uargs);
>
> It looks odd to add 'flags' argument to syscall that is not even used.

FWIW, "flags" is given use in the next patch to support the tsync option.

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Andy Lutomirski
On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov  wrote:
> On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski  wrote:
>> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov  
>> wrote:
>>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
 This adds the new "seccomp" syscall with both an "operation" and "flags"
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook 
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);
>>>
>>> It looks odd to add 'flags' argument to syscall that is not even used.
>>> It don't think it will be extensible this way.
>>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>>> but rather 'struct sock_fprog __user *'
>>> I think it makes more sense to define only first argument as 'int op' and 
>>> the
>>> rest as variable length array.
>>> Something like:
>>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>>> then different commands can interpret 'attrs' differently.
>>> if op == mode_strict, then attrs == NULL, len == 0
>>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>>> and nla_data(attrs) is 'struct sock_fprog'
>>
>> Eww.  If the operation doesn't imply the type, then I think we've
>> totally screwed up.
>>
>>> If we decide to add new types of filters or new commands, the syscall 
>>> prototype
>>> won't need to change. New commands can be added preserving backward
>>> compatibility.
>>> The basic TLV concept has been around forever in netlink world. imo makes
>>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>>> is the thing
>>> of the past. TLV style is more extensible. Fields of structures can become
>>> optional in the future, new fields added, etc.
>>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>>> to user land.
>>
>> I see no reason to bring nl_attr into this.
>>
>> Admittedly, I've never dealt with nl_attr, but everything
>> netlink-related I've even been involved in has involved some sort of
>> API atrocity.
>
> netlink has a lot of legacy and there is genetlink which is not pretty
> either because of extra socket creation, binding, dealing with packet
> loss issues, but the key concept of variable length encoding is sound.
> Right now seccomp has two commands and they already don't fit
> into single syscall neatly. Are you saying there should be two syscalls
> here? What about another seccomp related command? Another syscall?
> imo all seccomp related commands needs to be mux/demux-ed under
> one syscall. What is the way to mux/demux potentially very different
> commands under one syscall? I cannot think of anything better than
> TLV style. 'struct 

Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Alexei Starovoitov
On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski  wrote:
> On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov  wrote:
>> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
>>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>>> parameter for future expansion. The third argument is a pointer value,
>>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>>
>>> Signed-off-by: Kees Cook 
>>> Cc: linux-...@vger.kernel.org
>>> ---
>>>  arch/x86/syscalls/syscall_32.tbl  |1 +
>>>  arch/x86/syscalls/syscall_64.tbl  |1 +
>>>  include/linux/syscalls.h  |2 ++
>>>  include/uapi/asm-generic/unistd.h |4 ++-
>>>  include/uapi/linux/seccomp.h  |4 +++
>>>  kernel/seccomp.c  |   63 
>>> -
>>>  kernel/sys_ni.c   |3 ++
>>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/arch/x86/syscalls/syscall_32.tbl 
>>> b/arch/x86/syscalls/syscall_32.tbl
>>> index d6b867921612..7527eac24122 100644
>>> --- a/arch/x86/syscalls/syscall_32.tbl
>>> +++ b/arch/x86/syscalls/syscall_32.tbl
>>> @@ -360,3 +360,4 @@
>>>  351i386sched_setattr   sys_sched_setattr
>>>  352i386sched_getattr   sys_sched_getattr
>>>  353i386renameat2   sys_renameat2
>>> +354i386seccomp sys_seccomp
>>> diff --git a/arch/x86/syscalls/syscall_64.tbl 
>>> b/arch/x86/syscalls/syscall_64.tbl
>>> index ec255a1646d2..16272a6c12b7 100644
>>> --- a/arch/x86/syscalls/syscall_64.tbl
>>> +++ b/arch/x86/syscalls/syscall_64.tbl
>>> @@ -323,6 +323,7 @@
>>>  314common  sched_setattr   sys_sched_setattr
>>>  315common  sched_getattr   sys_sched_getattr
>>>  316common  renameat2   sys_renameat2
>>> +317common  seccomp sys_seccomp
>>>
>>>  #
>>>  # x32-specific system call numbers start at 512 to avoid cache impact
>>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>>> index b0881a0ed322..1713977ee26f 100644
>>> --- a/include/linux/syscalls.h
>>> +++ b/include/linux/syscalls.h
>>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>>  unsigned long idx1, unsigned long idx2);
>>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
>>> flags);
>>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>>> +   const char __user *uargs);
>>
>> It looks odd to add 'flags' argument to syscall that is not even used.
>> It don't think it will be extensible this way.
>> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
>> but rather 'struct sock_fprog __user *'
>> I think it makes more sense to define only first argument as 'int op' and the
>> rest as variable length array.
>> Something like:
>> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
>> then different commands can interpret 'attrs' differently.
>> if op == mode_strict, then attrs == NULL, len == 0
>> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
>> and nla_data(attrs) is 'struct sock_fprog'
>
> Eww.  If the operation doesn't imply the type, then I think we've
> totally screwed up.
>
>> If we decide to add new types of filters or new commands, the syscall 
>> prototype
>> won't need to change. New commands can be added preserving backward
>> compatibility.
>> The basic TLV concept has been around forever in netlink world. imo makes
>> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
>> is the thing
>> of the past. TLV style is more extensible. Fields of structures can become
>> optional in the future, new fields added, etc.
>> 'struct nlattr' brings the same benefits to kernel api as protobuf did
>> to user land.
>
> I see no reason to bring nl_attr into this.
>
> Admittedly, I've never dealt with nl_attr, but everything
> netlink-related I've even been involved in has involved some sort of
> API atrocity.

netlink has a lot of legacy and there is genetlink which is not pretty
either because of extra socket creation, binding, dealing with packet
loss issues, but the key concept of variable length encoding is sound.
Right now seccomp has two commands and they already don't fit
into single syscall neatly. Are you saying there should be two syscalls
here? What about another seccomp related command? Another syscall?
imo all seccomp related commands needs to be mux/demux-ed under
one syscall. What is the way to mux/demux potentially very different
commands under one syscall? I cannot think of anything better than
TLV style. 'struct nlattr' is what we have today and I think it works fine.
I'm not suggesting to bring the whole netlink into the picture, but rather
TLV style of encoding different arguments for 

Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Andy Lutomirski
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov  wrote:
> On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
>> This adds the new "seccomp" syscall with both an "operation" and "flags"
>> parameter for future expansion. The third argument is a pointer value,
>> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
>> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>>
>> Signed-off-by: Kees Cook 
>> Cc: linux-...@vger.kernel.org
>> ---
>>  arch/x86/syscalls/syscall_32.tbl  |1 +
>>  arch/x86/syscalls/syscall_64.tbl  |1 +
>>  include/linux/syscalls.h  |2 ++
>>  include/uapi/asm-generic/unistd.h |4 ++-
>>  include/uapi/linux/seccomp.h  |4 +++
>>  kernel/seccomp.c  |   63 
>> -
>>  kernel/sys_ni.c   |3 ++
>>  7 files changed, 69 insertions(+), 9 deletions(-)
>>
>> diff --git a/arch/x86/syscalls/syscall_32.tbl 
>> b/arch/x86/syscalls/syscall_32.tbl
>> index d6b867921612..7527eac24122 100644
>> --- a/arch/x86/syscalls/syscall_32.tbl
>> +++ b/arch/x86/syscalls/syscall_32.tbl
>> @@ -360,3 +360,4 @@
>>  351i386sched_setattr   sys_sched_setattr
>>  352i386sched_getattr   sys_sched_getattr
>>  353i386renameat2   sys_renameat2
>> +354i386seccomp sys_seccomp
>> diff --git a/arch/x86/syscalls/syscall_64.tbl 
>> b/arch/x86/syscalls/syscall_64.tbl
>> index ec255a1646d2..16272a6c12b7 100644
>> --- a/arch/x86/syscalls/syscall_64.tbl
>> +++ b/arch/x86/syscalls/syscall_64.tbl
>> @@ -323,6 +323,7 @@
>>  314common  sched_setattr   sys_sched_setattr
>>  315common  sched_getattr   sys_sched_getattr
>>  316common  renameat2   sys_renameat2
>> +317common  seccomp sys_seccomp
>>
>>  #
>>  # x32-specific system call numbers start at 512 to avoid cache impact
>> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
>> index b0881a0ed322..1713977ee26f 100644
>> --- a/include/linux/syscalls.h
>> +++ b/include/linux/syscalls.h
>> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>>  unsigned long idx1, unsigned long idx2);
>>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
>> flags);
>> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
>> +   const char __user *uargs);
>
> It looks odd to add 'flags' argument to syscall that is not even used.
> It don't think it will be extensible this way.
> 'uargs' is used only in 2nd command as well and it's not 'char __user *'
> but rather 'struct sock_fprog __user *'
> I think it makes more sense to define only first argument as 'int op' and the
> rest as variable length array.
> Something like:
> long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
> then different commands can interpret 'attrs' differently.
> if op == mode_strict, then attrs == NULL, len == 0
> if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
> and nla_data(attrs) is 'struct sock_fprog'

Eww.  If the operation doesn't imply the type, then I think we've
totally screwed up.

> If we decide to add new types of filters or new commands, the syscall 
> prototype
> won't need to change. New commands can be added preserving backward
> compatibility.
> The basic TLV concept has been around forever in netlink world. imo makes
> sense to use it with new syscalls. Passing 'struct xxx' into syscalls
> is the thing
> of the past. TLV style is more extensible. Fields of structures can become
> optional in the future, new fields added, etc.
> 'struct nlattr' brings the same benefits to kernel api as protobuf did
> to user land.

I see no reason to bring nl_attr into this.

Admittedly, I've never dealt with nl_attr, but everything
netlink-related I've even been involved in has involved some sort of
API atrocity.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Alexei Starovoitov
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).
>
> Signed-off-by: Kees Cook 
> Cc: linux-...@vger.kernel.org
> ---
>  arch/x86/syscalls/syscall_32.tbl  |1 +
>  arch/x86/syscalls/syscall_64.tbl  |1 +
>  include/linux/syscalls.h  |2 ++
>  include/uapi/asm-generic/unistd.h |4 ++-
>  include/uapi/linux/seccomp.h  |4 +++
>  kernel/seccomp.c  |   63 
> -
>  kernel/sys_ni.c   |3 ++
>  7 files changed, 69 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/syscalls/syscall_32.tbl 
> b/arch/x86/syscalls/syscall_32.tbl
> index d6b867921612..7527eac24122 100644
> --- a/arch/x86/syscalls/syscall_32.tbl
> +++ b/arch/x86/syscalls/syscall_32.tbl
> @@ -360,3 +360,4 @@
>  351i386sched_setattr   sys_sched_setattr
>  352i386sched_getattr   sys_sched_getattr
>  353i386renameat2   sys_renameat2
> +354i386seccomp sys_seccomp
> diff --git a/arch/x86/syscalls/syscall_64.tbl 
> b/arch/x86/syscalls/syscall_64.tbl
> index ec255a1646d2..16272a6c12b7 100644
> --- a/arch/x86/syscalls/syscall_64.tbl
> +++ b/arch/x86/syscalls/syscall_64.tbl
> @@ -323,6 +323,7 @@
>  314common  sched_setattr   sys_sched_setattr
>  315common  sched_getattr   sys_sched_getattr
>  316common  renameat2   sys_renameat2
> +317common  seccomp sys_seccomp
>
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index b0881a0ed322..1713977ee26f 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
>  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
>  unsigned long idx1, unsigned long idx2);
>  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
> flags);
> +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
> +   const char __user *uargs);

It looks odd to add 'flags' argument to syscall that is not even used.
It don't think it will be extensible this way.
'uargs' is used only in 2nd command as well and it's not 'char __user *'
but rather 'struct sock_fprog __user *'
I think it makes more sense to define only first argument as 'int op' and the
rest as variable length array.
Something like:
long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
then different commands can interpret 'attrs' differently.
if op == mode_strict, then attrs == NULL, len == 0
if op == mode_filter, then attrs->nla_type == seccomp_bpf_filter
and nla_data(attrs) is 'struct sock_fprog'
If we decide to add new types of filters or new commands, the syscall prototype
won't need to change. New commands can be added preserving backward
compatibility.
The basic TLV concept has been around forever in netlink world. imo makes
sense to use it with new syscalls. Passing 'struct xxx' into syscalls
is the thing
of the past. TLV style is more extensible. Fields of structures can become
optional in the future, new fields added, etc.
'struct nlattr' brings the same benefits to kernel api as protobuf did
to user land.

>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h 
> b/include/uapi/asm-generic/unistd.h
> index 333640608087..65acbf0e2867 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
>  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
>  #define __NR_renameat2 276
>  __SYSCALL(__NR_renameat2, sys_renameat2)
> +#define __NR_seccomp 277
> +__SYSCALL(__NR_seccomp, sys_seccomp)
>
>  #undef __NR_syscalls
> -#define __NR_syscalls 277
> +#define __NR_syscalls 278
>
>  /*
>   * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index ac2dc9f72973..b258878ba754 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -10,6 +10,10 @@
>  #define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
>  #define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
>
> +/* Valid operations for seccomp syscall. */
> +#define SECCOMP_SET_MODE_STRICT0
> +#define SECCOMP_SET_MODE_FILTER1
> +
>  /*
>   * All BPF programs must return a 32-bit value.
>   * The bottom 16-bits are for optional return data.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 39d32c2904fc..c0cafa9e84af 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
>  

Re: [PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-13 Thread Andy Lutomirski
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook  wrote:
> This adds the new "seccomp" syscall with both an "operation" and "flags"
> parameter for future expansion. The third argument is a pointer value,
> used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
> be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

Question for the linux-abi people:

What's the preferred way to do this these days?  This syscall is a
general purpose "adjust the seccomp state" thing.  The alternative
would be a specific new syscall to add a filter with a flags argument.

--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Andy Lutomirski
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

Question for the linux-abi people:

What's the preferred way to do this these days?  This syscall is a
general purpose adjust the seccomp state thing.  The alternative
would be a specific new syscall to add a filter with a flags argument.

--Andy
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Alexei Starovoitov
On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

It looks odd to add 'flags' argument to syscall that is not even used.
It don't think it will be extensible this way.
'uargs' is used only in 2nd command as well and it's not 'char __user *'
but rather 'struct sock_fprog __user *'
I think it makes more sense to define only first argument as 'int op' and the
rest as variable length array.
Something like:
long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
then different commands can interpret 'attrs' differently.
if op == mode_strict, then attrs == NULL, len == 0
if op == mode_filter, then attrs-nla_type == seccomp_bpf_filter
and nla_data(attrs) is 'struct sock_fprog'
If we decide to add new types of filters or new commands, the syscall prototype
won't need to change. New commands can be added preserving backward
compatibility.
The basic TLV concept has been around forever in netlink world. imo makes
sense to use it with new syscalls. Passing 'struct xxx' into syscalls
is the thing
of the past. TLV style is more extensible. Fields of structures can become
optional in the future, new fields added, etc.
'struct nlattr' brings the same benefits to kernel api as protobuf did
to user land.

  #endif
 diff --git a/include/uapi/asm-generic/unistd.h 
 b/include/uapi/asm-generic/unistd.h
 index 333640608087..65acbf0e2867 100644
 --- a/include/uapi/asm-generic/unistd.h
 +++ b/include/uapi/asm-generic/unistd.h
 @@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
  __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
  #define __NR_renameat2 276
  __SYSCALL(__NR_renameat2, sys_renameat2)
 +#define __NR_seccomp 277
 +__SYSCALL(__NR_seccomp, sys_seccomp)

  #undef __NR_syscalls
 -#define __NR_syscalls 277
 +#define __NR_syscalls 278

  /*
   * All syscalls below here should go away really,
 diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
 index ac2dc9f72973..b258878ba754 100644
 --- a/include/uapi/linux/seccomp.h
 +++ b/include/uapi/linux/seccomp.h
 @@ -10,6 +10,10 @@
  #define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
  #define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */

 +/* Valid operations for seccomp syscall. */
 +#define SECCOMP_SET_MODE_STRICT0
 +#define SECCOMP_SET_MODE_FILTER1
 +
  /*
   * All BPF programs must return a 32-bit value.
   * The bottom 16-bits are for optional return data.
 diff --git a/kernel/seccomp.c b/kernel/seccomp.c
 index 39d32c2904fc..c0cafa9e84af 100644
 --- a/kernel/seccomp.c
 +++ b/kernel/seccomp.c
 @@ -19,6 +19,7 @@
  #include linux/sched.h
  #include linux/seccomp.h
  #include 

Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Andy Lutomirski
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov a...@plumgrid.com wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs-nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'

Eww.  If the operation doesn't imply the type, then I think we've
totally screwed up.

 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.

I see no reason to bring nl_attr into this.

Admittedly, I've never dealt with nl_attr, but everything
netlink-related I've even been involved in has involved some sort of
API atrocity.

--Andy
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Alexei Starovoitov
On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov a...@plumgrid.com wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs-nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'

 Eww.  If the operation doesn't imply the type, then I think we've
 totally screwed up.

 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.

 I see no reason to bring nl_attr into this.

 Admittedly, I've never dealt with nl_attr, but everything
 netlink-related I've even been involved in has involved some sort of
 API atrocity.

netlink has a lot of legacy and there is genetlink which is not pretty
either because of extra socket creation, binding, dealing with packet
loss issues, but the key concept of variable length encoding is sound.
Right now seccomp has two commands and they already don't fit
into single syscall neatly. Are you saying there should be two syscalls
here? What about another seccomp related command? Another syscall?
imo all seccomp related commands needs to be mux/demux-ed under
one syscall. What is the way to mux/demux potentially very different
commands under one syscall? I cannot think of anything better than
TLV style. 'struct nlattr' is what we have today and I think it works fine.
I'm not suggesting to bring the whole netlink into the picture, but rather
TLV style of encoding different arguments for different commands.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More 

Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Andy Lutomirski
On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov a...@plumgrid.com wrote:
 On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov a...@plumgrid.com 
 wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and 
 the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs-nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'

 Eww.  If the operation doesn't imply the type, then I think we've
 totally screwed up.

 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.

 I see no reason to bring nl_attr into this.

 Admittedly, I've never dealt with nl_attr, but everything
 netlink-related I've even been involved in has involved some sort of
 API atrocity.

 netlink has a lot of legacy and there is genetlink which is not pretty
 either because of extra socket creation, binding, dealing with packet
 loss issues, but the key concept of variable length encoding is sound.
 Right now seccomp has two commands and they already don't fit
 into single syscall neatly. Are you saying there should be two syscalls
 here? What about another seccomp related command? Another syscall?
 imo all seccomp related commands needs to be mux/demux-ed under
 one syscall. What is the way to mux/demux potentially very different
 commands under one syscall? I cannot think of anything better than
 TLV style. 'struct nlattr' is what we have today and I think it works fine.
 I'm not suggesting to bring the whole netlink into the picture, but rather
 TLV style of encoding different arguments for different commands.

I'm unconvinced.  These are simple 

Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Kees Cook
On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov a...@plumgrid.com wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.

FWIW, flags is given use in the next patch to support the tsync option.

-Kees

-- 
Kees Cook
Chrome OS Security
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Kees Cook
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov a...@plumgrid.com wrote:
 On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov a...@plumgrid.com 
 wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and 
 the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs-nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'

 Eww.  If the operation doesn't imply the type, then I think we've
 totally screwed up.

 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.

 I see no reason to bring nl_attr into this.

 Admittedly, I've never dealt with nl_attr, but everything
 netlink-related I've even been involved in has involved some sort of
 API atrocity.

 netlink has a lot of legacy and there is genetlink which is not pretty
 either because of extra socket creation, binding, dealing with packet
 loss issues, but the key concept of variable length encoding is sound.
 Right now seccomp has two commands and they already don't fit
 into single syscall neatly. Are you saying there should be two syscalls
 here? What about another seccomp related command? Another syscall?
 imo all seccomp related commands needs to be mux/demux-ed under
 one syscall. What is the way to mux/demux potentially very different
 commands under one syscall? I cannot think of anything better than
 TLV style. 'struct nlattr' is what we have today and I think it works fine.
 I'm not suggesting to bring the whole netlink into the picture, but rather
 TLV style of encoding 

Re: [PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-13 Thread Alexei Starovoitov
On Fri, Jun 13, 2014 at 2:42 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Jun 13, 2014 at 2:37 PM, Alexei Starovoitov a...@plumgrid.com wrote:
 On Fri, Jun 13, 2014 at 2:25 PM, Andy Lutomirski l...@amacapital.net wrote:
 On Fri, Jun 13, 2014 at 2:22 PM, Alexei Starovoitov a...@plumgrid.com 
 wrote:
 On Tue, Jun 10, 2014 at 8:25 PM, Kees Cook keesc...@chromium.org wrote:
 This adds the new seccomp syscall with both an operation and flags
 parameter for future expansion. The third argument is a pointer value,
 used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
 be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

 Signed-off-by: Kees Cook keesc...@chromium.org
 Cc: linux-...@vger.kernel.org
 ---
  arch/x86/syscalls/syscall_32.tbl  |1 +
  arch/x86/syscalls/syscall_64.tbl  |1 +
  include/linux/syscalls.h  |2 ++
  include/uapi/asm-generic/unistd.h |4 ++-
  include/uapi/linux/seccomp.h  |4 +++
  kernel/seccomp.c  |   63 
 -
  kernel/sys_ni.c   |3 ++
  7 files changed, 69 insertions(+), 9 deletions(-)

 diff --git a/arch/x86/syscalls/syscall_32.tbl 
 b/arch/x86/syscalls/syscall_32.tbl
 index d6b867921612..7527eac24122 100644
 --- a/arch/x86/syscalls/syscall_32.tbl
 +++ b/arch/x86/syscalls/syscall_32.tbl
 @@ -360,3 +360,4 @@
  351i386sched_setattr   sys_sched_setattr
  352i386sched_getattr   sys_sched_getattr
  353i386renameat2   sys_renameat2
 +354i386seccomp sys_seccomp
 diff --git a/arch/x86/syscalls/syscall_64.tbl 
 b/arch/x86/syscalls/syscall_64.tbl
 index ec255a1646d2..16272a6c12b7 100644
 --- a/arch/x86/syscalls/syscall_64.tbl
 +++ b/arch/x86/syscalls/syscall_64.tbl
 @@ -323,6 +323,7 @@
  314common  sched_setattr   sys_sched_setattr
  315common  sched_getattr   sys_sched_getattr
  316common  renameat2   sys_renameat2
 +317common  seccomp sys_seccomp

  #
  # x32-specific system call numbers start at 512 to avoid cache impact
 diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
 index b0881a0ed322..1713977ee26f 100644
 --- a/include/linux/syscalls.h
 +++ b/include/linux/syscalls.h
 @@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
  asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
  unsigned long idx1, unsigned long idx2);
  asmlinkage long sys_finit_module(int fd, const char __user *uargs, int 
 flags);
 +asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
 +   const char __user *uargs);

 It looks odd to add 'flags' argument to syscall that is not even used.
 It don't think it will be extensible this way.
 'uargs' is used only in 2nd command as well and it's not 'char __user *'
 but rather 'struct sock_fprog __user *'
 I think it makes more sense to define only first argument as 'int op' and 
 the
 rest as variable length array.
 Something like:
 long sys_seccomp(unsigned int op, struct nlattr *attrs, int len);
 then different commands can interpret 'attrs' differently.
 if op == mode_strict, then attrs == NULL, len == 0
 if op == mode_filter, then attrs-nla_type == seccomp_bpf_filter
 and nla_data(attrs) is 'struct sock_fprog'

 Eww.  If the operation doesn't imply the type, then I think we've
 totally screwed up.

 If we decide to add new types of filters or new commands, the syscall 
 prototype
 won't need to change. New commands can be added preserving backward
 compatibility.
 The basic TLV concept has been around forever in netlink world. imo makes
 sense to use it with new syscalls. Passing 'struct xxx' into syscalls
 is the thing
 of the past. TLV style is more extensible. Fields of structures can become
 optional in the future, new fields added, etc.
 'struct nlattr' brings the same benefits to kernel api as protobuf did
 to user land.

 I see no reason to bring nl_attr into this.

 Admittedly, I've never dealt with nl_attr, but everything
 netlink-related I've even been involved in has involved some sort of
 API atrocity.

 netlink has a lot of legacy and there is genetlink which is not pretty
 either because of extra socket creation, binding, dealing with packet
 loss issues, but the key concept of variable length encoding is sound.
 Right now seccomp has two commands and they already don't fit
 into single syscall neatly. Are you saying there should be two syscalls
 here? What about another seccomp related command? Another syscall?
 imo all seccomp related commands needs to be mux/demux-ed under
 one syscall. What is the way to mux/demux potentially very different
 commands under one syscall? I cannot think of anything better than
 TLV style. 'struct nlattr' is what we have today and I think it works fine.
 I'm not suggesting to bring the whole netlink into the picture, but rather
 TLV style of encoding 

[PATCH v6 6/9] seccomp: add "seccomp" syscall

2014-06-10 Thread Kees Cook
This adds the new "seccomp" syscall with both an "operation" and "flags"
parameter for future expansion. The third argument is a pointer value,
used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

Signed-off-by: Kees Cook 
Cc: linux-...@vger.kernel.org
---
 arch/x86/syscalls/syscall_32.tbl  |1 +
 arch/x86/syscalls/syscall_64.tbl  |1 +
 include/linux/syscalls.h  |2 ++
 include/uapi/asm-generic/unistd.h |4 ++-
 include/uapi/linux/seccomp.h  |4 +++
 kernel/seccomp.c  |   63 -
 kernel/sys_ni.c   |3 ++
 7 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..7527eac24122 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@
 351i386sched_setattr   sys_sched_setattr
 352i386sched_getattr   sys_sched_getattr
 353i386renameat2   sys_renameat2
+354i386seccomp sys_seccomp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..16272a6c12b7 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
 314common  sched_setattr   sys_sched_setattr
 315common  sched_getattr   sys_sched_getattr
 316common  renameat2   sys_renameat2
+317common  seccomp sys_seccomp
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b0881a0ed322..1713977ee26f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
+   const char __user *uargs);
 #endif
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index 333640608087..65acbf0e2867 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
 __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
 #define __NR_renameat2 276
 __SYSCALL(__NR_renameat2, sys_renameat2)
+#define __NR_seccomp 277
+__SYSCALL(__NR_seccomp, sys_seccomp)
 
 #undef __NR_syscalls
-#define __NR_syscalls 277
+#define __NR_syscalls 278
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ac2dc9f72973..b258878ba754 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -10,6 +10,10 @@
 #define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
 #define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
 
+/* Valid operations for seccomp syscall. */
+#define SECCOMP_SET_MODE_STRICT0
+#define SECCOMP_SET_MODE_FILTER1
+
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 39d32c2904fc..c0cafa9e84af 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* #define SECCOMP_DEBUG 1 */
 
@@ -301,8 +302,8 @@ free_prog:
  *
  * Returns filter on success and ERR_PTR otherwise.
  */
-static
-struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
 {
struct sock_fprog fprog;
struct seccomp_filter *filter = ERR_PTR(-EFAULT);
@@ -325,19 +326,25 @@ out:
 
 /**
  * seccomp_attach_filter: validate and attach filter
+ * @flags:  flags to change filter behavior
  * @filter: seccomp filter to add to the current process
  *
  * Caller must be holding current->sighand->siglock lock.
  *
  * Returns 0 on success, -ve on error.
  */
-static long seccomp_attach_filter(struct seccomp_filter *filter)
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
 {
unsigned long total_insns;
struct seccomp_filter *walker;
 
BUG_ON(!spin_is_locked(>sighand->siglock));
 
+   /* Validate flags. */
+   if (flags != 0)
+   return -EINVAL;
+
/* Validate resulting filter length. */
total_insns = filter->len;
for (walker = current->seccomp.filter; walker; walker = filter->prev)
@@ -541,6 +548,7 @@ out:
 #ifdef CONFIG_SECCOMP_FILTER
 /**
  * seccomp_set_mode_filter: 

[PATCH v6 6/9] seccomp: add seccomp syscall

2014-06-10 Thread Kees Cook
This adds the new seccomp syscall with both an operation and flags
parameter for future expansion. The third argument is a pointer value,
used with the SECCOMP_SET_MODE_FILTER operation. Currently, flags must
be 0. This is functionally equivalent to prctl(PR_SET_SECCOMP, ...).

Signed-off-by: Kees Cook keesc...@chromium.org
Cc: linux-...@vger.kernel.org
---
 arch/x86/syscalls/syscall_32.tbl  |1 +
 arch/x86/syscalls/syscall_64.tbl  |1 +
 include/linux/syscalls.h  |2 ++
 include/uapi/asm-generic/unistd.h |4 ++-
 include/uapi/linux/seccomp.h  |4 +++
 kernel/seccomp.c  |   63 -
 kernel/sys_ni.c   |3 ++
 7 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d6b867921612..7527eac24122 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -360,3 +360,4 @@
 351i386sched_setattr   sys_sched_setattr
 352i386sched_getattr   sys_sched_getattr
 353i386renameat2   sys_renameat2
+354i386seccomp sys_seccomp
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1646d2..16272a6c12b7 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -323,6 +323,7 @@
 314common  sched_setattr   sys_sched_setattr
 315common  sched_getattr   sys_sched_getattr
 316common  renameat2   sys_renameat2
+317common  seccomp sys_seccomp
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index b0881a0ed322..1713977ee26f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -866,4 +866,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid,
 asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
 unsigned long idx1, unsigned long idx2);
 asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
+asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
+   const char __user *uargs);
 #endif
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index 333640608087..65acbf0e2867 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -699,9 +699,11 @@ __SYSCALL(__NR_sched_setattr, sys_sched_setattr)
 __SYSCALL(__NR_sched_getattr, sys_sched_getattr)
 #define __NR_renameat2 276
 __SYSCALL(__NR_renameat2, sys_renameat2)
+#define __NR_seccomp 277
+__SYSCALL(__NR_seccomp, sys_seccomp)
 
 #undef __NR_syscalls
-#define __NR_syscalls 277
+#define __NR_syscalls 278
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index ac2dc9f72973..b258878ba754 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -10,6 +10,10 @@
 #define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
 #define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
 
+/* Valid operations for seccomp syscall. */
+#define SECCOMP_SET_MODE_STRICT0
+#define SECCOMP_SET_MODE_FILTER1
+
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 39d32c2904fc..c0cafa9e84af 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -19,6 +19,7 @@
 #include linux/sched.h
 #include linux/seccomp.h
 #include linux/slab.h
+#include linux/syscalls.h
 
 /* #define SECCOMP_DEBUG 1 */
 
@@ -301,8 +302,8 @@ free_prog:
  *
  * Returns filter on success and ERR_PTR otherwise.
  */
-static
-struct seccomp_filter *seccomp_prepare_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
 {
struct sock_fprog fprog;
struct seccomp_filter *filter = ERR_PTR(-EFAULT);
@@ -325,19 +326,25 @@ out:
 
 /**
  * seccomp_attach_filter: validate and attach filter
+ * @flags:  flags to change filter behavior
  * @filter: seccomp filter to add to the current process
  *
  * Caller must be holding current-sighand-siglock lock.
  *
  * Returns 0 on success, -ve on error.
  */
-static long seccomp_attach_filter(struct seccomp_filter *filter)
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
 {
unsigned long total_insns;
struct seccomp_filter *walker;
 
BUG_ON(!spin_is_locked(current-sighand-siglock));
 
+   /* Validate flags. */
+   if (flags != 0)
+   return -EINVAL;
+
/* Validate resulting filter length. */
total_insns = filter-len;
for (walker = current-seccomp.filter; walker; walker = filter-prev)
@@ -541,6 +548,7