[PATCH] net/sunrpc: Make rpc_auth_create_args a const

2018-07-05 Thread Sargun Dhillon
This marks rpc_auth_create_args as const, since it is passed unmodified
through the auth stack.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/sunrpc/auth.h    | 5 +++--
 net/sunrpc/auth.c              | 2 +-
 net/sunrpc/auth_gss/auth_gss.c | 9 +
 net/sunrpc/auth_null.c         | 2 +-
 net/sunrpc/auth_unix.c         | 2 +-
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index d9af474a857d..58a6765c1c5e 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -125,7 +125,8 @@ struct rpc_authops {
	struct module		*owner;
	rpc_authflavor_t	au_flavor;	/* flavor (RPC_AUTH_*) */
	char *			au_name;
-	struct rpc_auth *	(*create)(struct rpc_auth_create_args *, struct rpc_clnt *);
+   struct rpc_auth *   (*create)(const struct rpc_auth_create_args *,
+ struct rpc_clnt *);
	void			(*destroy)(struct rpc_auth *);
 
int (*hash_cred)(struct auth_cred *, unsigned int);
@@ -174,7 +175,7 @@ struct rpc_cred *	rpc_lookup_generic_cred(struct auth_cred *, int, gfp_t);
 struct rpc_cred *  rpc_lookup_machine_cred(const char *service_name);
int			rpcauth_register(const struct rpc_authops *);
int			rpcauth_unregister(const struct rpc_authops *);
-struct rpc_auth *  rpcauth_create(struct rpc_auth_create_args *,
+struct rpc_auth *  rpcauth_create(const struct rpc_auth_create_args *,
struct rpc_clnt *);
 void   rpcauth_release(struct rpc_auth *);
 rpc_authflavor_t   rpcauth_get_pseudoflavor(rpc_authflavor_t,
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index d2623b9f23d6..661e2277f468 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -253,7 +253,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size)
 EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
 
 struct rpc_auth *
-rpcauth_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
struct rpc_auth *auth;
const struct rpc_authops *ops;
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index be8f103d22fd..21a19a9f0e33 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -985,7 +985,7 @@ static void gss_pipe_free(struct gss_pipe *p)
  * parameters based on the input flavor (which must be a pseudoflavor)
  */
 static struct gss_auth *
-gss_create_new(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
rpc_authflavor_t flavor = args->pseudoflavor;
struct gss_auth *gss_auth;
@@ -1132,7 +1132,7 @@ gss_destroy(struct rpc_auth *auth)
  * (which is guaranteed to last as long as any of its descendants).
  */
 static struct gss_auth *
-gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args,
+gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args,
struct rpc_clnt *clnt,
struct gss_auth *new)
 {
@@ -1169,7 +1169,8 @@ gss_auth_find_or_add_hashed(struct rpc_auth_create_args *args,
 }
 
 static struct gss_auth *
-gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create_hashed(const struct rpc_auth_create_args *args,
+ struct rpc_clnt *clnt)
 {
struct gss_auth *gss_auth;
struct gss_auth *new;
@@ -1188,7 +1189,7 @@ gss_create_hashed(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 }
 
 static struct rpc_auth *
-gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+gss_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
struct gss_auth *gss_auth;
	struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch);
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index 75d72e109a04..4b48228ee8c7 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -19,7 +19,7 @@ static struct rpc_auth null_auth;
 static struct rpc_cred null_cred;
 
 static struct rpc_auth *
-nul_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
	atomic_inc(&null_auth.au_count);
	return &null_auth;
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index dafd6b870ba3..185e56d4f9ae 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -30,7 +30,7 @@ static struct rpc_auth	unix_auth;
 static const struct rpc_credops	unix_credops;
 
 static struct rpc_auth *
-unx_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
+unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
 {
dprint

How should NFS, SUNRPC interact with Network and User Namespaces?

2018-06-29 Thread Sargun Dhillon
Today, sunrpc lives in net/sunrpc. As far as I can tell, the primary
production consumer of it is NFS. The RPC clients have the concept of
being tied back to a network namespace. On the other hand, NFS has its
own superblock with its own user namespace.

When sunrpc converts kuids to UIDs to send over the wire, should it use
the user namespace of the network namespace that the RPC client is
associated with? This is required for auth_unix (UID based).
Alternatively, should the sunrpc RPC client use the user namespace
associated with the NFS superblock?
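
A minimal sketch of the first option, assuming the existing helpers
rpc_net_ns() and from_kuid_munged() behave as their names suggest (the
wrapper itself is hypothetical):

	/*
	 * Hypothetical: map a kuid to an on-the-wire AUTH_UNIX UID using
	 * the user namespace that owns the client's network namespace.
	 */
	static uid_t wire_uid_from_netns(struct rpc_clnt *clnt, kuid_t kuid)
	{
		struct net *net = rpc_net_ns(clnt);

		return from_kuid_munged(net->user_ns, kuid);
	}

The alternative would substitute the user namespace captured by the NFS
superblock at mount time.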


Re: Userspace helper design

2018-03-06 Thread Sargun Dhillon
On Tue, Mar 6, 2018 at 12:31 PM, Daniel Borkmann <dan...@iogearbox.net> wrote:
> On 03/06/2018 05:02 PM, Sargun Dhillon wrote:
>> On Tue, Mar 6, 2018 at 3:26 AM, Daniel Borkmann <dan...@iogearbox.net> wrote:
>>> On 03/06/2018 02:53 AM, Alexei Starovoitov wrote:
>>>> On Mon, Mar 05, 2018 at 05:46:51PM -0800, Sargun Dhillon wrote:
>>>>> On Mon, Mar 5, 2018 at 5:45 PM, Alexei Starovoitov
>>>>> <alexei.starovoi...@gmail.com> wrote:
>>>>>> On Tue, Mar 06, 2018 at 01:34:51AM +, Sargun Dhillon wrote:
>>>>>>>
>>>>>>> I want to get y'all's thoughts.
>>>>>>>
>>>>>>> In seccomp, we need a mechanism to delegate some functionality to
>>>>>>> userspace programs. Ideally, we build this mechanism in a generic
>>>>>>> way to notify userspace "monitors" from a BPF program.
>>>>>>
>>>>>> why perf_event_output helper is not enough?
>>>>>> It seems it can already do everything as you described.
>>>
>>> perf RB would just be unidirectional, though. :-/
>>>
>>>>> Because Tycho wants it to be synchronous. The BPF program can't
>>>>> wait on the response from userspace with perf_event_output.
>>>>
>>>> bpf in kernel execution will never wait on user space process.
>>>
>>> Potentially, such daemon could be hooked into kernel via the module
>>> loader approach, I think that might be interesting. Given this would
>>> sleep, it would probably need to be a verdict that then does the
>>> push into user space waiting for the result. Other approach could be
>>> via helper and you'd have a per-cpu refcount that makes sure the
>>> BPF prog (or better the seccomp struct holding it) doesn't evict
>>> during that time when you drop / reacquire RCU read lock from within
>>> the helper.
>> How do you deal with map values and such? Those can also be released
>> if we give up rcu read lock.
>
> Agree, that would be a problem. Any such access would need to be
> invalidated by the verifier.
>
>> I think a terminal "helper" would be better. The problem I see with a
>> terminal helper is how do you deal with errors?
>
> E.g. in XDP, we have a return verdict XDP_ABORTED. Potentially you
> could also have an 'exception' return verdict for seccomp that would
> eventually be the same as behavior as SECCOMP_RET_KILL, but would
> still allow for some sort of introspection such that it can be detected
> that the daemon triggered such error. Whether tracepoint might be the
> appropriate choice as well in that case ... probably not though.

So, would this verifier just ensure that no maps are accessed prior to
the helper being called, XOR ensure that after the helper is called,
no map values are accessed?
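
To make the terminal-helper idea concrete, a hypothetical sketch
(bpf_seccomp_notify_user() does not exist; it stands in for whatever
blocking helper would be added):

	SEC("seccomp")
	int filter(struct seccomp_data *ctx)
	{
		if (ctx->nr != __NR_mount)
			return SECCOMP_RET_ALLOW;

		/*
		 * Terminal call: the verdict comes from the user-space
		 * daemon. Because the RCU read lock is dropped while it
		 * sleeps, no map value may be live across this point,
		 * hence the verifier question above.
		 */
		return bpf_seccomp_notify_user(ctx);
	}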


Re: [net-next v3 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-03-05 Thread Sargun Dhillon
On Mon, Mar 5, 2018 at 8:10 AM, Tycho Andersen <ty...@tycho.ws> wrote:
> Hi Andy,
>
> On Thu, Mar 01, 2018 at 10:05:47PM +, Andy Lutomirski wrote:
>> But Tycho: would hooking user notifiers in right here work for you?
>> As I see it, this would be the best justification for seccomp eBPF.
>
> Sorry for the delay; Sargun had declared on irc that he was going to
> implement it, so I didn't look into it. I think the basics will work,
> but I haven't invested the time to look into it given the above.
>
> Sargun, are you still planning to look at this? What's your timeline?
>
> Cheers,
>
> Tycho
Still working on this. I don't really have a timeline. I think I'll
get to share a prototype by the end of the week. I'm trying to come up
with a common mechanism to do this for multiple types of filters.


Re: [net-next v3 0/2] eBPF seccomp filters

2018-03-01 Thread Sargun Dhillon
On Thu, Mar 1, 2018 at 1:59 PM, Andy Lutomirski <l...@kernel.org> wrote:
> On Thu, Mar 1, 2018 at 9:51 PM, Sargun Dhillon <sar...@sargun.me> wrote:
>> On Thu, Mar 1, 2018 at 9:44 AM, Andy Lutomirski <l...@amacapital.net> wrote:
>>> On Wed, Feb 28, 2018 at 7:56 PM, Daniel Borkmann <dan...@iogearbox.net> 
>>> wrote:
>>>> On 02/28/2018 12:55 AM, chris hyser wrote:
>>>>>> On 02/27/2018 04:58 PM, Daniel Borkmann wrote: >> On 02/27/2018 05:59 
>>>>>> PM, chris hyser wrote:
>>>>>>>> On 02/27/2018 11:00 AM, Kees Cook wrote:
>>>>>>>>> On Tue, Feb 27, 2018 at 6:53 AM, chris hyser <chris.hy...@oracle.com> 
>>>>>>>>> wrote:
>>>>>>>>>> On 02/26/2018 11:38 PM, Kees Cook wrote:
>>>>>>>>>>> On Mon, Feb 26, 2018 at 8:19 PM, Andy Lutomirski 
>>>>>>>>>>> <l...@amacapital.net>
>>>>>>>>>>> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> 3. Straight-up bugs.  Those are exactly as problematic as verifier
>>>>>>>>>>>> bugs in any other unprivileged eBPF program type, right?  I don't 
>>>>>>>>>>>> see
>>>>>>>>>>>> why seccomp is special here.
>>>>>>>>>>>
>>>>>>>>>>> My concern is more about unintended design mistakes or other feature
>>>>>>>>>>> creep with side-effects, especially when it comes to privileges and
>>>>>>>>>>> synchronization. Getting no-new-privs done correctly, for example,
>>>>>>>>>>> took some careful thought and discussion, and I'm shy from how 
>>>>>>>>>>> painful
>>>>>>>>>>> TSYNC was on the process locking side, and eBPF has had some rather
>>>>>>>>>>> ugly flaws in the past (and recently: it was nice to be able to say
>>>>>>>>>>> for Spectre that seccomp filters couldn't be constructed to make
>>>>>>>>>>> attacks but eBPF could). Adding the complexity needs to be worth the
>>>>>>>
>>>>>>> Well, not really. One part of all the Spectre mitigations that went 
>>>>>>> upstream
>>>>>>> from BPF side was to have an option to remove interpreter entirely and 
>>>>>>> that
>>>>>>> also relates to seccomp eventually. But other than that an attacker 
>>>>>>> might
>>>>>>> potentially find as well useful gadgets inside seccomp or any other code
>>>>>>> that is inside the kernel, so it's not a strict necessity either.
>>>>>>>
>>>>>>>>>>> gain. I'm on board for doing it, I just want to be careful. :)
>>>>>>>>>>
>>>>>>>>>> Another option might be to remove c/eBPF from the equation all 
>>>>>>>>>> together.
>>>>>>>>>> c/eBPF allows flexibility and that almost always comes at the cost of
>>>>>>>>>> additional security risk. Seccomp is for enhanced security yes? How 
>>>>>>>>>> about a
>>>>>>>>>> new seccomp mode that passes in something like a bit vector or 
>>>>>>>>>> hashmap for
>>>>>>>>>> "simple" white/black list checks validated by kernel code, versus 
>>>>>>>>>> user
>>>>>>>>>> provided interpreted code? Of course this removes a fair number of 
>>>>>>>>>> things
>>>>>>>>>> you can currently do or would be able to do with eBPF. Of course, 
>>>>>>>>>> restated
>>>>>>>>>> from a security point of view, this removes a fair number of things 
>>>>>>>>>> an
>>>>>>>>>> _attacker_ can do. Presumably the performance improvement would also 
>>>>>>>>>> be
>>>>>>>>>> significant.
>>>>>>>
>>>>>>> Good luck with not breaking existing applications relying on seccomp out
>>>>>>> there.
>>>>>>
>>>>>> This wasn't in the conte

Re: [net-next v3 0/2] eBPF seccomp filters

2018-03-01 Thread Sargun Dhillon
On Thu, Mar 1, 2018 at 9:44 AM, Andy Lutomirski <l...@amacapital.net> wrote:
> On Wed, Feb 28, 2018 at 7:56 PM, Daniel Borkmann <dan...@iogearbox.net> wrote:
>> On 02/28/2018 12:55 AM, chris hyser wrote:
 On 02/27/2018 04:58 PM, Daniel Borkmann wrote: >> On 02/27/2018 05:59 PM, chris hyser wrote:
>> On 02/27/2018 11:00 AM, Kees Cook wrote:
>>> On Tue, Feb 27, 2018 at 6:53 AM, chris hyser <chris.hy...@oracle.com> wrote:
 On 02/26/2018 11:38 PM, Kees Cook wrote:
> On Mon, Feb 26, 2018 at 8:19 PM, Andy Lutomirski <l...@amacapital.net> wrote:
>>
>> 3. Straight-up bugs.  Those are exactly as problematic as verifier
>> bugs in any other unprivileged eBPF program type, right?  I don't see
>> why seccomp is special here.
>
> My concern is more about unintended design mistakes or other feature
> creep with side-effects, especially when it comes to privileges and
> synchronization. Getting no-new-privs done correctly, for example,
> took some careful thought and discussion, and I'm shy from how painful
> TSYNC was on the process locking side, and eBPF has had some rather
> ugly flaws in the past (and recently: it was nice to be able to say
> for Spectre that seccomp filters couldn't be constructed to make
> attacks but eBPF could). Adding the complexity needs to be worth the
>
> Well, not really. One part of all the Spectre mitigations that went 
> upstream
> from BPF side was to have an option to remove interpreter entirely and 
> that
> also relates to seccomp eventually. But other than that an attacker might
> potentially find as well useful gadgets inside seccomp or any other code
> that is inside the kernel, so it's not a strict necessity either.
>
> gain. I'm on board for doing it, I just want to be careful. :)

 Another option might be to remove c/eBPF from the equation all 
 together.
 c/eBPF allows flexibility and that almost always comes at the cost of
 additional security risk. Seccomp is for enhanced security yes? How 
 about a
 new seccomp mode that passes in something like a bit vector or hashmap 
 for
 "simple" white/black list checks validated by kernel code, versus user
 provided interpreted code? Of course this removes a fair number of 
 things
 you can currently do or would be able to do with eBPF. Of course, 
 restated
 from a security point of view, this removes a fair number of things an
 _attacker_ can do. Presumably the performance improvement would also be
 significant.
>
> Good luck with not breaking existing applications relying on seccomp out
> there.

 This wasn't in the context of an implementation proposal, but the 
 assumption would be to add this in addition to the old way. Now, does that 
 make sense to do? That is the discussion.
>>
>> I see; didn't read that out from the above when you also mentioned removing
>> cBPF, but fair enough.
>>
 Is this an idea worth prototyping?
>>>
>>> That was the original prototype for seccomp-filter. :) The discussion
>>> around that from years ago basically boiled down to it being
>>> inflexible. Given all the things people want to do at syscall time,
>>> that continues to be true. So true, in fact, that here we are now,
>>> trying to move to eBPF from cBPF. ;)
>
> Right, agree. cBPF is also pretty much frozen these days and aside from
> that, seccomp/BPF also just uses a proper subset of it. I wouldn't mind
> doing something similar for eBPF side as long as this is reasonably
> maintainable and not making BPF core more complex, but most of it can
> already be set in the verifier anyway based on prog type. Note, that
> performance of seccomp/BPF is definitely a demand as well which is why
> people still extend the old remaining cBPF JITs today such that it can
> be JITed also from there.
>
>> I will try to find that discussion. As someone pointed out here though, 
>> eBPF is being used by more and more people in areas where security is 
>> not the primary concern. Differing objectives will make this a long term 
>> continuing issue. We ourselves were looking at eBPF simply as a means to 
>> use a hashmap for a white/blacklist, i.e. performance not flexibility.
>
> Not really, security of verifier and BPF infra in general is on the top
> of the list, it's fundamental to the underlying concept and just because
> it is heavily used also in tracing and networking, it only shows that the
> concept is highly flexible that it can be applied in multiple areas.
>>>
>>> If you're implying that because seccomp would have it's own verifier and 
>>> could therefore restrict itself to a subset of 

Re: [net-next v3 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-26 Thread Sargun Dhillon
On Mon, Feb 26, 2018 at 7:57 PM, Tycho Andersen <ty...@tycho.ws> wrote:
> On Mon, Feb 26, 2018 at 07:49:48PM -0800, Sargun Dhillon wrote:
>> On Mon, Feb 26, 2018 at 4:54 PM, Tycho Andersen <ty...@tycho.ws> wrote:
>> > On Mon, Feb 26, 2018 at 07:27:05AM +, Sargun Dhillon wrote:
>> >> +config SECCOMP_FILTER_EXTENDED
>> >> + bool "Extended BPF seccomp filters"
>> >> + depends on SECCOMP_FILTER && BPF_SYSCALL
>> >> + depends on !CHECKPOINT_RESTORE
>> >
>> > Why not just give -EINVAL or something in case one of these is
>> > requested, instead of making them incompatible at compile time?
>> >
>> > Tycho
>> There's already code to return -EMEDIUMTYPE if it's a non-classic, or
>> non-saved filter. Under the normal case, with CHECKPOINT_RESTORE
>> enabled, you should never be able to get that. I think it makes sense
>> to preserve this behaviour.
>
> Oh, right. So can't we just drop this, and the existing code will
> DTRT, i.e. give you -EMEDIUMTYPE because the new filters aren't
> supported, until they are?
>
> Tycho
My suggestion is we merge this as is, so we don't break checkpoint /
restore, and I will try to get the filter-dumping patch in within the same
development cycle, as it comes at minimal risk. Otherwise, we risk
introducing a feature which could break checkpoint/restore, even in
unprivileged containers since anyone can load a BPF Seccomp filter.


Re: [net-next v3 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-26 Thread Sargun Dhillon
On Mon, Feb 26, 2018 at 4:54 PM, Tycho Andersen <ty...@tycho.ws> wrote:
> On Mon, Feb 26, 2018 at 07:27:05AM +0000, Sargun Dhillon wrote:
>> +config SECCOMP_FILTER_EXTENDED
>> + bool "Extended BPF seccomp filters"
>> + depends on SECCOMP_FILTER && BPF_SYSCALL
>> + depends on !CHECKPOINT_RESTORE
>
> Why not just give -EINVAL or something in case one of these is
> requested, instead of making them incompatible at compile time?
>
> Tycho
There's already code to return -EMEDIUMTYPE if it's a non-classic, or
non-saved filter. Under the normal case, with CHECKPOINT_RESTORE
enabled, you should never be able to get that. I think it makes sense
to preserve this behaviour.

My rough plan is to introduce a mechanism to dump filters like you can
with cBPF filters. If you look at my v1, there was a patch that did this.
Once this gets in, I can prepare that patch, and we can lift this
restriction.


Re: [net-next v3 0/2] eBPF seccomp filters

2018-02-26 Thread Sargun Dhillon
On Mon, Feb 26, 2018 at 5:01 PM, Tycho Andersen <ty...@tycho.ws> wrote:
> On Mon, Feb 26, 2018 at 03:20:15PM -0800, Kees Cook wrote:
>> On Mon, Feb 26, 2018 at 3:04 PM, Alexei Starovoitov
>> <alexei.starovoi...@gmail.com> wrote:
>> > On Mon, Feb 26, 2018 at 07:26:54AM +, Sargun Dhillon wrote:
>> >> This patchset enables seccomp filters to be written in eBPF. Although, 
>> >> this
>> >> [...]
>> > The main statement I want to hear from seccomp maintainers before
>> > proceeding any further on this that enabling eBPF in seccomp won't lead
>> > to seccomp folks arguing against changes in bpf core (like verifier)
>> > just because it's used by seccomp.
>> > It must be spelled out in the commit log with explicit Ack.
>>
>> The primary thing I'm concerned about with eBPF and seccomp is
>> side-effects from eBPF programs running at syscall time. This is an
>> extremely sensitive area, and I want to be sure there won't be
>> feature-creep here that leads to seccomp getting into a bad state.
>>
>> As long as seccomp can continue have its own verifier,
>
> I guess these patches should introduce some additional restrictions in
> kernel/seccomp.c then? Based on my reading now, it's whatever the eBPF
> verifier allows.
>
Like what? The helpers allowed are listed in seccomp.c. You have the
same restrictions as the traditional eBPF verifier (no unsafe memory
access, no backwards jumps, etc.). I'm not sure which built-in eBPF
functionality presents risk.
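
For reference, the whitelist works roughly like this (a sketch; only
get_current_uid_gid is confirmed by the hunks quoted elsewhere in this
thread, the rest of the list lives in seccomp.c):

	static const struct bpf_func_proto *
	seccomp_func_proto(enum bpf_func_id func_id)
	{
		switch (func_id) {
		case BPF_FUNC_get_current_uid_gid:
			return &bpf_get_current_uid_gid_proto;
		/* ... a handful of other safe helpers ... */
		default:
			return NULL;	/* every other helper is rejected */
		}
	}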

>> I *think* this will be fine, though, again I remain concerned about
>> maps, etc. I'm still reviewing these patches and how they might
>> provide overlap with Tycho's needs too, etc.
>
> Yes, it's on my TODO list to take a look at how to do it as suggested
> by Alexi on top of this set before posting a v2. Haven't had time
> recently, though.
>
> Cheers,
>
> Tycho

There's a lot of interest (in general) of having a mechanism to do
notifications to userspace processes from eBPF for a variety of use
cases. I think that this would be valuable for more than just seccomp,
if it's implemented in a general purpose manner.


Re: [net-next v3 0/2] eBPF seccomp filters

2018-02-26 Thread Sargun Dhillon
On Mon, Feb 26, 2018 at 3:04 PM, Alexei Starovoitov
<alexei.starovoi...@gmail.com> wrote:
> On Mon, Feb 26, 2018 at 07:26:54AM +0000, Sargun Dhillon wrote:
>> This patchset enables seccomp filters to be written in eBPF. Although this
>> patchset doesn't introduce much of the functionality enabled by eBPF, it lays
>> the groundwork for it. Currently, you have to disable CHECKPOINT_RESTORE
>> support in order to utilize eBPF seccomp filters, as eBPF filters cannot be
>> retrieved via the ptrace GET_FILTER API.
>
> this was discussed multiple times in the past.
> In eBPF land it's practically impossible to do checkpoint/restore
> of the whole bpf program/map graph.
>
>> Any user can load a bpf seccomp filter program, and it can be pinned and
>> reused without requiring access to the bpf syscalls. A user only requires
>> the traditional permissions of either being cap_sys_admin, or having
>> no_new_privs set in order to install their rule.
>>
>> The primary reason for not adding maps support in this patchset is
>> to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
>> If we have a map that the BPF program can read, it can potentially
>> "change" privileges after running. It seems like doing writes only
>> is safe, because it can be pure, and side effect free, and therefore
>> not negatively effect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
>> to an agreement, this can be in a follow-up patchset.
>
> readonly maps already exist. See BPF_F_RDONLY.
> Is that not enough?
>
With BPF_F_RDONLY, is there a mechanism to populate a prog_array, and
then mark it rd_only?
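
For reference, a minimal sketch of the flag at map-creation time (as I
understand it, the flag constrains the returned fd, which is exactly why
populate-then-freeze is unclear):

	union bpf_attr attr = {
		.map_type    = BPF_MAP_TYPE_PROG_ARRAY,
		.key_size    = sizeof(__u32),
		.value_size  = sizeof(__u32),	/* prog fd */
		.max_entries = 512,		/* assumed size */
		.map_flags   = BPF_F_RDONLY,
	};
	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
	/* BPF_MAP_UPDATE_ELEM through map_fd now fails, leaving no
	 * obvious window in which to install the per-syscall programs. */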

>> A benchmark of this patchset is as follows for a very standard eBPF filter:
>>
>> Given this test program:
>> for (i = 10; i < ; i++) syscall(__NR_getpid);
>>
>> If I implement an eBPF filter with PROG_ARRAYs with a program per syscall,
>> and tail call, the numbers are such:
>> ebpf JIT 12.3% slower than native
>> ebpf no JIT 13.6% slower than native
>> seccomp JIT 17.6% slower than native
>> seccomp no JIT 37% slower than native
>
> the perf gains are misleading, since patches don't enable bpf_tail_call.
>
> The main statement I want to hear from seccomp maintainers before
> proceeding any further on this that enabling eBPF in seccomp won't lead
> to seccomp folks arguing against changes in bpf core (like verifier)
> just because it's used by seccomp.
> It must be spelled out in the commit log with explicit Ack.
>


[net-next v3 2/2] bpf: Add eBPF seccomp sample programs

2018-02-25 Thread Sargun Dhillon
This adds a sample program that uses seccomp-eBPF, called
test_seccomp. It shows how simply seccomp filters can be
written in C.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile|  5 +
 samples/bpf/bpf_load.c  |  9 ++--
 samples/bpf/test_seccomp_kern.c | 41 
 samples/bpf/test_seccomp_user.c | 46 +
 4 files changed, 99 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/test_seccomp_kern.c
 create mode 100644 samples/bpf/test_seccomp_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d88e87..05f21988775f 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
+hostprogs-y += test_seccomp
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +94,8 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+test_seccomp-objs := bpf_load.o $(LIBBPF) test_seccomp_user.o
+
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -144,6 +147,7 @@ always += xdp_monitor_kern.o
 always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
+always += test_seccomp_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -188,6 +192,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
+HOSTLOADLIBES_test_seccomp += -lelf
 
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
#  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 69806d74fa53..856bc8b93916 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
bool is_sockops = strncmp(event, "sockops", 7) == 0;
bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+   bool is_seccomp = strncmp(event, "seccomp", 7) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
@@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_SOCK_OPS;
} else if (is_sk_skb) {
prog_type = BPF_PROG_TYPE_SK_SKB;
+   } else if (is_seccomp) {
+   prog_type = BPF_PROG_TYPE_SECCOMP;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -110,7 +113,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
prog_fd[prog_cnt++] = fd;
 
-   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
+   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk ||
+   is_seccomp)
return 0;
 
if (is_socket || is_sockops || is_sk_skb) {
@@ -589,7 +593,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0 ||
memcmp(shname, "sockops", 7) == 0 ||
-   memcmp(shname, "sk_skb", 6) == 0) {
+   memcmp(shname, "sk_skb", 6) == 0 ||
+   memcmp(shname, "seccomp", 7) == 0) {
ret = load_and_attach(shname, data->d_buf,
  data->d_size);
if (ret != 0)
diff --git a/samples/bpf/test_seccomp_kern.c b/samples/bpf/test_seccomp_kern.c
new file mode 100644
index ..a0dd39b4ba16
--- /dev/null
+++ b/samples/bpf/test_seccomp_kern.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+#include 
+
+#if defined(__x86_64__)
+#define ARCH   AUDIT_ARCH_X86_64
+#elif defined(__i386__)
+#define ARCH   AUDIT_ARCH_I386
+#else
+#endif
+
+#ifdef ARCH
+/* Returns EPERM when trying to close fd 999 */
+SEC("seccomp")
+int bpf_prog1(struct seccomp_data *ctx)
+{
+   /*
+* Make sure this BPF program is being run on the same architecture it
+* was compiled on.
+*/
+   if (ctx->arch != ARCH)
+   return SECCOMP_RET_ERRNO | EPERM;
+   

[net-next v3 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-25 Thread Sargun Dhillon
This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
to be used for seccomp filters as an alternative to cBPF filters. The
program type has relatively limited capabilities in terms of helpers,
but that can be extended later on.

The eBPF code loading is separated from attachment of the filter, so
a privileged user can load the filter, and pass it back to an
unprivileged user who can attach it and use it at a later time.

In order to attach the filter itself, you need to supply a flag to the
seccomp syscall indicating that an eBPF filter is being attached, as
opposed to a cBPF one. Verification occurs at program load time,
so the user should only receive errors related to attachment.
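
A user-space sketch of that attach step (hedged: passing a pointer to the
prog fd as the seccomp argument is my reading of the description above,
not confirmed by this excerpt):

	int attach_ebpf_filter(int prog_fd)
	{
		/* the usual unprivileged-attach requirement */
		if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
			return -1;

		return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
			       SECCOMP_FILTER_FLAG_EXTENDED, &prog_fd);
	}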

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 arch/Kconfig |   8 +++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |   3 +-
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |   7 +-
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 159 ---
 7 files changed, 156 insertions(+), 27 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 76c0b54443b1..8490d35e59d6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -401,6 +401,14 @@ config SECCOMP_FILTER
 
  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_EXTENDED
+   bool "Extended BPF seccomp filters"
+   depends on SECCOMP_FILTER && BPF_SYSCALL
+   depends on !CHECKPOINT_RESTORE
+   help
+ Enables seccomp filters to be written in eBPF, as opposed
+ to just cBPF filters.
+
 config HAVE_GCC_PLUGINS
bool
help
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 19b8349a3809..945c65c4e461 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index c723a5c4e3ff..a7df3ba6cf25 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -5,7 +5,8 @@
 #include 
 
 #define SECCOMP_FILTER_FLAG_MASK   (SECCOMP_FILTER_FLAG_TSYNC | \
-SECCOMP_FILTER_FLAG_LOG)
+SECCOMP_FILTER_FLAG_LOG | \
+SECCOMP_FILTER_FLAG_EXTENDED)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index db6bdc375126..5f96cb7ed954 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  *
@@ -133,6 +134,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
+   BPF_PROG_TYPE_SECCOMP,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..730af6c7ec2e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,10 +16,11 @@
 #define SECCOMP_SET_MODE_FILTER1
 #define SECCOMP_GET_ACTION_AVAIL   2
 
-/* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC  1
-#define SECCOMP_FILTER_FLAG_LOG2
 
+/* Valid flags for SECCOMP_SET_MODE_FILTER */
+#define SECCOMP_FILTER_FLAG_TSYNC  (1 << 0)
+#define SECCOMP_FILTER_FLAG_LOG(1 << 1)
+#define SECCOMP_FILTER_FLAG_EXTENDED   (1 << 2)
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..86d6ec8b916d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
+   type != BPF_PROG_TYPE_SECCOMP &&
!capable(CAP_SYS_ADMIN))
return -EPERM;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index dc77548167ef..d95c24181a6c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
 
-   /*
-* Installing a seccomp filter requires t

[net-next v3 0/2] eBPF seccomp filters

2018-02-25 Thread Sargun Dhillon
This patchset enables seccomp filters to be written in eBPF. Although this
patchset doesn't introduce much of the functionality enabled by eBPF, it lays
the groundwork for it. Currently, you have to disable CHECKPOINT_RESTORE
support in order to utilize eBPF seccomp filters, as eBPF filters cannot be
retrieved via the ptrace GET_FILTER API.

Any user can load a bpf seccomp filter program, and it can be pinned and
reused without requiring access to the bpf syscalls. A user only requires
the traditional permissions of either being cap_sys_admin, or have
no_new_privs set in order to install their rule.

The primary reason for not adding maps support in this patchset is
to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
If we have a map that the BPF program can read, it can potentially
"change" privileges after running. It seems like doing writes only
is safe, because it can be pure, and side effect free, and therefore
not negatively effect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
to an agreement, this can be in a follow-up patchset.

A benchmark of this patchset is as follows for a very standard eBPF filter:

Given this test program:
for (i = 10; i < ; i++) syscall(__NR_getpid);

If I implement an eBPF filter with PROG_ARRAYs with a program per syscall,
and tail call, the numbers are such:
ebpf JIT 12.3% slower than native
ebpf no JIT 13.6% slower than native
seccomp JIT 17.6% slower than native
seccomp no JIT 37% slower than native

The cost of a traditional seccomp filter grows O(n) with the number of
syscalls that have discrete rulesets, whereas the eBPF approach stays O(1)
no matter how many syscalls are filtered.
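
A sketch of the dispatch layout behind those numbers (assumed structure,
not the benchmark source; note Alexei's point elsewhere in the thread
that the posted patches don't yet enable bpf_tail_call):

	struct bpf_map_def SEC("maps") per_syscall_progs = {
		.type        = BPF_MAP_TYPE_PROG_ARRAY,
		.key_size    = sizeof(u32),
		.value_size  = sizeof(u32),
		.max_entries = 512,		/* assumed bound */
	};

	SEC("seccomp")
	int dispatch(struct seccomp_data *ctx)
	{
		/* Jump to the program installed for this syscall, if any. */
		bpf_tail_call(ctx, &per_syscall_progs, ctx->nr);
		/* Fallthrough: no per-syscall program installed. */
		return SECCOMP_RET_ALLOW;
	}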

Changes since v2:
  * Rename sample
  * Code cleanup
Changes since v1:
  * Use a flag to indicate loading an eBPF filter, not a separate command
  * Remove printk helper
  * Remove ptrace patch / restore filter / sample
  * Add some safe helpers

Sargun Dhillon (2):
  bpf, seccomp: Add eBPF filter capabilities
  bpf: Add eBPF seccomp sample programs

 arch/Kconfig|   8 ++
 include/linux/bpf_types.h   |   3 +
 include/linux/seccomp.h |   3 +-
 include/uapi/linux/bpf.h|   2 +
 include/uapi/linux/seccomp.h|   7 +-
 kernel/bpf/syscall.c|   1 +
 kernel/seccomp.c| 159 ++--
 samples/bpf/Makefile|   5 ++
 samples/bpf/bpf_load.c  |   9 ++-
 samples/bpf/test_seccomp_kern.c |  41 +++
 samples/bpf/test_seccomp_user.c |  46 
 11 files changed, 255 insertions(+), 29 deletions(-)
 create mode 100644 samples/bpf/test_seccomp_kern.c
 create mode 100644 samples/bpf/test_seccomp_user.c

-- 
2.14.1



Re: [net-next v2 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-20 Thread Sargun Dhillon
On Mon, Feb 19, 2018 at 4:00 PM, Daniel Borkmann <dan...@iogearbox.net> wrote:
> On 02/19/2018 05:22 PM, Sargun Dhillon wrote:
>> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
>> to be used for seccomp filters as an alternative to cBPF filters. The
>> program type has relatively limited capabilities in terms of helpers,
>> but that can be extended later on.
>>
>> The eBPF code loading is separated from attachment of the filter, so
>> a privileged user can load the filter, and pass it back to an
>> unprivileged user who can attach it and use it at a later time.
>>
>> In order to attach the filter itself, you need to supply a flag to the
>> seccomp syscall indicating that an eBPF filter is being attached, as
>> opposed to a cBPF one. Verification occurs at program load time,
>> so the user should only receive errors related to attachment.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> [...]
>> @@ -867,7 +924,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>
>>   spin_lock_irq(&current->sighand->siglock);
>>
>> - if (!seccomp_may_assign_mode(seccomp_mode))
>> + if (!seccomp_may_assign_mode(filter_mode))
>>   goto out;
>>
>>   ret = seccomp_attach_filter(flags, prepared);
>> @@ -876,7 +933,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>   /* Do not free the successfully attached filter. */
>>   prepared = NULL;
>>
>> - seccomp_assign_mode(current, seccomp_mode);
>> + seccomp_assign_mode(current, filter_mode);
>>  out:
>>   spin_unlock_irq(&current->sighand->siglock);
>>   if (flags & SECCOMP_FILTER_FLAG_TSYNC)
>> @@ -1040,8 +1097,7 @@ long seccomp_get_filter(struct task_struct *task, 
>> unsigned long filter_off,
>>   if (IS_ERR(filter))
>>   return PTR_ERR(filter);
>>
>> - fprog = filter->prog->orig_prog;
>> - if (!fprog) {
>> + if (!bpf_prog_was_classic(filter->prog)) {
>
> This is actually a bug, see f8e529ed941b ("seccomp, ptrace: add support for
> dumping seccomp filters") and would cause a NULL ptr deref in case the filter
> was created with bpf_prog_create_from_user() with save_orig as false, so the
> if (!fprog) test for cBPF cannot be removed from here.
>
Isn't this function within:
#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
#endif

And, above, where bpf_prog_create_from_user is called, save_orig is derived from:
const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

Are there any other places this can be loaded, or this function can be
exposed, if CONFIG_CHECKPOINT_RESTORE = n?


>>   /* This must be a new non-cBPF filter, since we save
>>* every cBPF filter's orig_prog above when
>>* CONFIG_CHECKPOINT_RESTORE is enabled.
>> @@ -1050,6 +1106,7 @@ long seccomp_get_filter(struct task_struct *task, 
>> unsigned long filter_off,
>>   goto out;
>>   }
>>
>> + fprog = filter->prog->orig_prog;
>>   ret = fprog->len;
>
> (See above.)
>
>>   if (!data)
>>   goto out;
>> @@ -1239,6 +1296,58 @@ static int seccomp_actions_logged_handler(struct 
>> ctl_table *ro_table, int write,
>>   return 0;
>>  }
>>
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +static bool seccomp_is_valid_access(int off, int size,
>> + enum bpf_access_type type,
>> + struct bpf_insn_access_aux *info)
>> +{
>> + if (type != BPF_READ)
>> + return false;
>> +
>> + if (off < 0 || off + size > sizeof(struct seccomp_data))
>> + return false;
>
> if (off % size != 0)
> return false;
>
>> + switch (off) {
>> + case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
>> + return (size == sizeof(__u64));
>> + case bpf_ctx_range(struct seccomp_data, nr):
>> + return (size == FIELD_SIZEOF(struct seccomp_data, nr));
>> + case bpf_ctx_range(struct seccomp_data, arch):
>> + return (size == FIELD_SIZEOF(struct seccomp_data, arch));
>> + case bpf_ctx_range(struct seccomp_data, instruction_pointer):
>> + return (size == FIELD_SIZEOF(struct seccomp_data,
>> +  instruction_pointer));
>
> default:
> return false;
>
> [...]
>> +static const struct bpf_func_prot

Re: [net-next v2 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-20 Thread Sargun Dhillon
On Mon, Feb 19, 2018 at 4:00 PM, Daniel Borkmann <dan...@iogearbox.net> wrote:
> On 02/19/2018 05:22 PM, Sargun Dhillon wrote:
>> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
>> to be used for seccomp filters as an alternative to cBPF filters. The
>> program type has relatively limited capabilities in terms of helpers,
>> but that can be extended later on.
>>
>> The eBPF code loading is separated from attachment of the filter, so
>> a privileged user can load the filter, and pass it back to an
>> unprivileged user who can attach it and use it at a later time.
>>
>> In order to attach the filter itself, you need to supply a flag to the
>> seccomp syscall indicating that an eBPF filter is being attached, as
>> opposed to a cBPF one. Verification occurs at program load time,
>> so the user should only receive errors related to attachment.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> [...]
>> @@ -867,7 +924,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>
>>   spin_lock_irq(&current->sighand->siglock);
>>
>> - if (!seccomp_may_assign_mode(seccomp_mode))
>> + if (!seccomp_may_assign_mode(filter_mode))
>>   goto out;
>>
>>   ret = seccomp_attach_filter(flags, prepared);
>> @@ -876,7 +933,7 @@ static long seccomp_set_mode_filter(unsigned int flags,
>>   /* Do not free the successfully attached filter. */
>>   prepared = NULL;
>>
>> - seccomp_assign_mode(current, seccomp_mode);
>> + seccomp_assign_mode(current, filter_mode);
>>  out:
>>   spin_unlock_irq(&current->sighand->siglock);
>>   if (flags & SECCOMP_FILTER_FLAG_TSYNC)
>> @@ -1040,8 +1097,7 @@ long seccomp_get_filter(struct task_struct *task, 
>> unsigned long filter_off,
>>   if (IS_ERR(filter))
>>   return PTR_ERR(filter);
>>
>> - fprog = filter->prog->orig_prog;
>> - if (!fprog) {
>> + if (!bpf_prog_was_classic(filter->prog)) {
>
> This is actually a bug, see f8e529ed941b ("seccomp, ptrace: add support for
> dumping seccomp filters") and would cause a NULL ptr deref in case the filter
> was created with bpf_prog_create_from_user() with save_orig as false, so the
> if (!fprog) test for cBPF cannot be removed from here.
>
>>   /* This must be a new non-cBPF filter, since we save
>>* every cBPF filter's orig_prog above when
>>* CONFIG_CHECKPOINT_RESTORE is enabled.
>> @@ -1050,6 +1106,7 @@ long seccomp_get_filter(struct task_struct *task, 
>> unsigned long filter_off,
>>   goto out;
>>   }
>>
>> + fprog = filter->prog->orig_prog;
>>   ret = fprog->len;
>
> (See above.)
>
>>   if (!data)
>>   goto out;
>> @@ -1239,6 +1296,58 @@ static int seccomp_actions_logged_handler(struct 
>> ctl_table *ro_table, int write,
>>   return 0;
>>  }
>>
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +static bool seccomp_is_valid_access(int off, int size,
>> + enum bpf_access_type type,
>> + struct bpf_insn_access_aux *info)
>> +{
>> + if (type != BPF_READ)
>> + return false;
>> +
>> + if (off < 0 || off + size > sizeof(struct seccomp_data))
>> + return false;
>
> if (off % size != 0)
> return false;
>
Won't this break access to the instruction pointer, and args if
sizeof(int) != 4? I don't know if any architectures fall under that.

>> + switch (off) {
>> + case bpf_ctx_range_till(struct seccomp_data, args[0], args[5]):
>> + return (size == sizeof(__u64));
>> + case bpf_ctx_range(struct seccomp_data, nr):
>> + return (size == FIELD_SIZEOF(struct seccomp_data, nr));
>> + case bpf_ctx_range(struct seccomp_data, arch):
>> + return (size == FIELD_SIZEOF(struct seccomp_data, arch));
>> + case bpf_ctx_range(struct seccomp_data, instruction_pointer):
>> + return (size == FIELD_SIZEOF(struct seccomp_data,
>> +  instruction_pointer));
>
> default:
> return false;
>
> [...]
>> +static const struct bpf_func_proto *
>> +seccomp_func_proto(enum bpf_func_id func_id)
>> +{
>> + switch (func_id) {
>> + case BPF_FUNC_get_current_uid_gid:
>> + return &bpf_get_current_uid_gid_proto;
>> + case BPF_FUNC_k

[net-next v2 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-19 Thread Sargun Dhillon
This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
to be used for seccomp filters as an alternative to cBPF filters. The
program type has relatively limited capabilities in terms of helpers,
but that can be extended later on.

The eBPF code loading is separated from attachment of the filter, so
a privileged user can load the filter, and pass it back to an
unprivileged user who can attach it and use it at a later time.

In order to attach the filter itself, you need to supply a flag to the
seccomp syscall indicating that a eBPF filter is being attached, as
opposed to a cBPF one. Verification occurs at program load time,
so the user should only receive errors related to attachment.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 arch/Kconfig |   8 +++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |   3 +-
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |   7 ++-
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 145 +--
 7 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 76c0b54443b1..8490d35e59d6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -401,6 +401,14 @@ config SECCOMP_FILTER
 
  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_EXTENDED
+   bool "Extended BPF seccomp filters"
+   depends on SECCOMP_FILTER && BPF_SYSCALL
+   depends on !CHECKPOINT_RESTORE
+   help
+ Enables seccomp filters to be written in eBPF, as opposed
+ to just cBPF filters.
+
 config HAVE_GCC_PLUGINS
bool
help
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 19b8349a3809..945c65c4e461 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index c723a5c4e3ff..a7df3ba6cf25 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -5,7 +5,8 @@
 #include 
 
 #define SECCOMP_FILTER_FLAG_MASK   (SECCOMP_FILTER_FLAG_TSYNC | \
-SECCOMP_FILTER_FLAG_LOG)
+SECCOMP_FILTER_FLAG_LOG | \
+SECCOMP_FILTER_FLAG_EXTENDED)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index db6bdc375126..5f96cb7ed954 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  *
@@ -133,6 +134,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
+   BPF_PROG_TYPE_SECCOMP,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..730af6c7ec2e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,10 +16,11 @@
 #define SECCOMP_SET_MODE_FILTER1
 #define SECCOMP_GET_ACTION_AVAIL   2
 
-/* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC  1
-#define SECCOMP_FILTER_FLAG_LOG2
 
+/* Valid flags for SECCOMP_SET_MODE_FILTER */
+#define SECCOMP_FILTER_FLAG_TSYNC  (1 << 0)
+#define SECCOMP_FILTER_FLAG_LOG(1 << 1)
+#define SECCOMP_FILTER_FLAG_EXTENDED   (1 << 2)
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..86d6ec8b916d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
+   type != BPF_PROG_TYPE_SECCOMP &&
!capable(CAP_SYS_ADMIN))
return -EPERM;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 940fa408a288..f8ddc4af1135 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
 
-   /*
-* Installing a seccomp filter requires t

Re: [net-next v2 2/2] bpf: Add eBPF seccomp sample programs

2018-02-18 Thread Sargun Dhillon
On Sat, Feb 17, 2018 at 9:58 AM, Randy Dunlap <rdun...@infradead.org> wrote:
> On 02/16/2018 11:36 PM, Sargun Dhillon wrote:
>> + close(111);
>> + assert(errno == EBADF);
>> + close(999);
>> + assert(errno = EPERM);
>
> should that be   == ?
>
Woops. Embarrassing. Will fix that in the next re-spin.
>> +
>> + return 0;
>> +}
>
>
> --
> ~Randy


[net-next v2 1/2] bpf, seccomp: Add eBPF filter capabilities

2018-02-16 Thread Sargun Dhillon
This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
to be used for seccomp filters as an alternative to cBPF filters. The
program type has relatively limited capabilities in terms of helpers,
but that can be extended later on.

The eBPF code loading is separated from attachment of the filter, so
a privileged user can load the filter, and pass it back to an
unprivileged user who can attach it and use it at a later time.

In order to attach the filter itself, you need to supply a flag to the
seccomp syscall indicating that an eBPF filter is being attached, as
opposed to a cBPF one. Verification occurs at program load time,
so the user should only receive errors related to attachment.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 arch/Kconfig |   8 +++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |   3 +-
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |   7 ++-
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 145 +--
 7 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 76c0b54443b1..8490d35e59d6 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -401,6 +401,14 @@ config SECCOMP_FILTER
 
  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_EXTENDED
+   bool "Extended BPF seccomp filters"
+   depends on SECCOMP_FILTER && BPF_SYSCALL
+   depends on !CHECKPOINT_RESTORE
+   help
+ Enables seccomp filters to be written in eBPF, as opposed
+ to just cBPF filters.
+
 config HAVE_GCC_PLUGINS
bool
help
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 19b8349a3809..945c65c4e461 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index c723a5c4e3ff..a7df3ba6cf25 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -5,7 +5,8 @@
 #include 
 
 #define SECCOMP_FILTER_FLAG_MASK   (SECCOMP_FILTER_FLAG_TSYNC | \
-SECCOMP_FILTER_FLAG_LOG)
+SECCOMP_FILTER_FLAG_LOG | \
+SECCOMP_FILTER_FLAG_EXTENDED)
 
 #ifdef CONFIG_SECCOMP
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index db6bdc375126..5f96cb7ed954 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  *
@@ -133,6 +134,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
+   BPF_PROG_TYPE_SECCOMP,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..730af6c7ec2e 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -16,10 +16,11 @@
 #define SECCOMP_SET_MODE_FILTER1
 #define SECCOMP_GET_ACTION_AVAIL   2
 
-/* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC  1
-#define SECCOMP_FILTER_FLAG_LOG2
 
+/* Valid flags for SECCOMP_SET_MODE_FILTER */
+#define SECCOMP_FILTER_FLAG_TSYNC  (1 << 0)
+#define SECCOMP_FILTER_FLAG_LOG(1 << 1)
+#define SECCOMP_FILTER_FLAG_EXTENDED   (1 << 2)
 /*
  * All BPF programs must return a 32-bit value.
  * The bottom 16-bits are for optional return data.
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..86d6ec8b916d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
+   type != BPF_PROG_TYPE_SECCOMP &&
!capable(CAP_SYS_ADMIN))
return -EPERM;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 940fa408a288..f8ddc4af1135 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
 
-   /*
-* Installing a seccomp filter requires t

[net-next v2 2/2] bpf: Add eBPF seccomp sample programs

2018-02-16 Thread Sargun Dhillon
This adds a sample program that uses seccomp-eBPF, called
seccomp1. It shows how simply seccomp filters can be
written in C.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile|  5 +
 samples/bpf/bpf_load.c  |  9 +++--
 samples/bpf/seccomp1_kern.c | 43 +++
 samples/bpf/seccomp1_user.c | 45 +
 4 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/seccomp1_kern.c
 create mode 100644 samples/bpf/seccomp1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d88e87..264838846f71 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,7 @@ hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
+hostprogs-y += seccomp1
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +94,8 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+seccomp1-objs := bpf_load.o $(LIBBPF) seccomp1_user.o
+
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -144,6 +147,7 @@ always += xdp_monitor_kern.o
 always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
+always += seccomp1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -188,6 +192,7 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
+HOSTLOADLIBES_seccomp1 += -lelf
 
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
#  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 69806d74fa53..856bc8b93916 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
bool is_sockops = strncmp(event, "sockops", 7) == 0;
bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+   bool is_seccomp = strncmp(event, "seccomp", 7) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
@@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_SOCK_OPS;
} else if (is_sk_skb) {
prog_type = BPF_PROG_TYPE_SK_SKB;
+   } else if (is_seccomp) {
+   prog_type = BPF_PROG_TYPE_SECCOMP;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -110,7 +113,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
prog_fd[prog_cnt++] = fd;
 
-   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
+   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk ||
+   is_seccomp)
return 0;
 
if (is_socket || is_sockops || is_sk_skb) {
@@ -589,7 +593,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0 ||
memcmp(shname, "sockops", 7) == 0 ||
-   memcmp(shname, "sk_skb", 6) == 0) {
+   memcmp(shname, "sk_skb", 6) == 0 ||
+   memcmp(shname, "seccomp", 7) == 0) {
ret = load_and_attach(shname, data->d_buf,
  data->d_size);
if (ret != 0)
diff --git a/samples/bpf/seccomp1_kern.c b/samples/bpf/seccomp1_kern.c
new file mode 100644
index ..420e37eebd92
--- /dev/null
+++ b/samples/bpf/seccomp1_kern.c
@@ -0,0 +1,43 @@
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+#include 
+
+#if defined(__x86_64__)
+#define ARCH   AUDIT_ARCH_X86_64
+#elif defined(__i386__)
+#define ARCH   AUDIT_ARCH_I386
+#else
+#endif
+
+#ifdef ARCH
+/* Returns EPERM when trying to close fd 999 */
+SEC("seccomp")
+int bpf_prog1(struct seccomp_data *ctx)
+{
+   /*
+* Make sure this BPF program is being run on the same architecture it
+* was compiled on.
+*/
+   if (ctx->arch != ARCH)
+   return SECCOMP_RET_ERRNO | EPERM;
+   if (ctx->nr == __NR_close && ctx->args[0] == 999)
+       return SECCOMP_RET_ERRNO | EPERM;
+   return SECCOMP_RET_ALLOW;
+}
+#endif
+char _license[] SEC("license") = "GPL";

[net-next v2 0/2] eBPF Seccomp filters

2018-02-16 Thread Sargun Dhillon
This patchset enables seccomp filters to be written in eBPF. Although this
patchset doesn't introduce much of the functionality enabled by eBPF, it lays
the groundwork for it. Currently, you have to disable CHECKPOINT_RESTORE
support in order to utilize eBPF seccomp filters, as eBPF filters cannot be
retrieved via the ptrace GET_FILTER API.

Any user can load a bpf seccomp filter program, and it can be pinned and
reused without requiring access to the bpf syscalls. A user only requires
the traditional permissions of either having CAP_SYS_ADMIN or having
no_new_privs set in order to install their rule.
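As a sketch of the pin-and-reuse flow (the pin path is illustrative;
bpf_obj_pin() and bpf_obj_get() are the existing BPF_OBJ_PIN/BPF_OBJ_GET
wrappers in tools/lib/bpf):

#include <bpf/bpf.h>	/* tools/lib/bpf syscall wrappers */

/* A loader with bpf() access verifies and pins the program once... */
static int pin_filter(int prog_fd)
{
	return bpf_obj_pin(prog_fd, "/sys/fs/bpf/seccomp_filter");
}

/* ...then any task can fetch it by path and install it by fd,
 * without needing any other bpf() commands. */
static int get_pinned_filter(void)
{
	return bpf_obj_get("/sys/fs/bpf/seccomp_filter");
}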

The primary reason for not adding maps support in this patchset is
to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
If we have a map that the BPF program can read, it can potentially
"change" privileges after running. It seems like doing writes only
is safe, because they can be pure and side-effect free, and therefore
not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
to an agreement, this can be in a follow-up patchset.

A benchmark of this patchset is as follows for a very standard eBPF filter:

Given this test program:
for (i = 10; i < ; i++) syscall(__NR_getpid);

If I implement an eBPF filter with PROG_ARRAYs with a program per syscall,
and tail call, the numbers are such:
ebpf JIT 12.3% slower than native
ebpf no JIT 13.6% slower than native
seccomp JIT 17.6% slower than native
seccomp no JIT 37% slower than native

The overhead of the traditional seccomp filter grows O(n) with the number
of syscalls with discrete rulesets, whereas the eBPF filter is O(1), given
any number of syscall filters.
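For reference, the O(1) shape is the usual PROG_ARRAY dispatch; a minimal
sketch, assuming this series' BPF_PROG_TYPE_SECCOMP and the samples'
bpf_helpers.h conventions:

#include <uapi/linux/bpf.h>
#include <uapi/linux/seccomp.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") per_syscall = {
	.type = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 512,
};

SEC("seccomp")
int dispatch(struct seccomp_data *ctx)
{
	/* One tail call regardless of how many slots are populated */
	bpf_tail_call(ctx, &per_syscall, ctx->nr);
	/* Only reached when no per-syscall program is installed */
	return SECCOMP_RET_ALLOW;
}

char _license[] SEC("license") = "GPL";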

Changes since v1:
  * Use a flag to indicate loading an eBPF filter, not a separate command
  * Remove printk helper
  * Remove ptrace patch / restore filter / sample
  * Add some safe helpers

Sargun Dhillon (2):
  bpf, seccomp: Add eBPF filter capabilities
  bpf: Add eBPF seccomp sample programs

 arch/Kconfig |   8 +++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |   3 +-
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |   7 ++-
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 145 +--
 samples/bpf/Makefile |   5 ++
 samples/bpf/bpf_load.c   |   9 ++-
 samples/bpf/seccomp1_kern.c  |  43 +
 samples/bpf/seccomp1_user.c  |  45 ++
 11 files changed, 247 insertions(+), 24 deletions(-)
 create mode 100644 samples/bpf/seccomp1_kern.c
 create mode 100644 samples/bpf/seccomp1_user.c

-- 
2.14.1



Re: [PATCH net-next 1/3] bpf, seccomp: Add eBPF filter capabilities

2018-02-16 Thread Sargun Dhillon
On Tue, Feb 13, 2018 at 12:34 PM, Kees Cook <keesc...@chromium.org> wrote:
> On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sar...@sargun.me> wrote:
>> From: Sargun Dhillon <sar...@netflix.com>
>>
>> This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
>> to be used for seccomp filters as an alternative to cBPF filters. The
>> program type has relatively limited capabilities in terms of helpers,
>> but that can be extended later on.
>>
>> It also introduces a new mechanism to attach these filters via the
>> prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
>> SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>> ---
>>  arch/Kconfig |   7 ++
>>  include/linux/bpf_types.h|   3 +
>>  include/uapi/linux/bpf.h |   2 +
>>  include/uapi/linux/seccomp.h |  15 +++--
>>  kernel/bpf/syscall.c |   1 +
>>  kernel/seccomp.c | 148 +--
>>  6 files changed, 150 insertions(+), 26 deletions(-)
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 76c0b54443b1..db675888577c 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -401,6 +401,13 @@ config SECCOMP_FILTER
>>
>>   See Documentation/prctl/seccomp_filter.txt for details.
>>
>> +config SECCOMP_FILTER_EXTENDED
>> +   bool "Extended BPF seccomp filters"
>> +   depends on SECCOMP_FILTER && BPF_SYSCALL
>> +   help
>> + Enables seccomp filters to be written in eBPF, as opposed
>> + to just cBPF filters.
>> +
>>  config HAVE_GCC_PLUGINS
>> bool
>> help
>> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
>> index 19b8349a3809..945c65c4e461 100644
>> --- a/include/linux/bpf_types.h
>> +++ b/include/linux/bpf_types.h
>> @@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
>>  #ifdef CONFIG_CGROUP_BPF
>>  BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
>>  #endif
>> +#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
>> +BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
>> +#endif
>>
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
>>  BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index db6bdc375126..5f96cb7ed954 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1,3 +1,4 @@
>> +
>>  /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>>  /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
>>   *
>> @@ -133,6 +134,7 @@ enum bpf_prog_type {
>> BPF_PROG_TYPE_SOCK_OPS,
>> BPF_PROG_TYPE_SK_SKB,
>> BPF_PROG_TYPE_CGROUP_DEVICE,
>> +   BPF_PROG_TYPE_SECCOMP,
>>  };
>>
>>  enum bpf_attach_type {
>> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
>> index 2a0bd9dd104d..7da8b39f2a6a 100644
>> --- a/include/uapi/linux/seccomp.h
>> +++ b/include/uapi/linux/seccomp.h
>> @@ -7,14 +7,17 @@
>>
>>
>>  /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, ) */
>> -#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
>> -#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
>> -#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
>> +#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
>> +#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
>> +#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */
>
> This MODE flag isn't needed: it's still using a filter, and the
> interface changes should be sufficient with
> SECCOMP_SET_MODE_FILTER_EXTENDED below.
>
>>  /* Valid operations for seccomp syscall. */
>> -#define SECCOMP_SET_MODE_STRICT0
>> -#define SECCOMP_SET_MODE_FILTER1
>> -#define SECCOMP_GET_ACTION_AVAIL   2
>> +#define SECCOMP_SET_MODE_STRICT0
>> +#define SECCOMP_SET_MODE_FILTER1
>> +#define SECCOMP_GET_ACTION_AVAIL   2
>> +#define SECCOMP_SET_MODE_FILTER_EXTENDED   3
>
> It seems like this should be a FILTER flag, not a syscall op change?
>
>> +
>>
>>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
>>  #define SECCOMP_FILTER_FLAG_TSYNC  1
>

Re: [PATCH net-next 0/3] eBPF Seccomp filters

2018-02-16 Thread Sargun Dhillon
On Wed, Feb 14, 2018 at 8:30 PM, Alexei Starovoitov wrote:
> On Wed, Feb 14, 2018 at 10:32:22AM -0700, Tycho Andersen wrote:
>> > >
>> > > What's the reason for adding eBPF support? seccomp shouldn't need it,
>> > > and it only makes the code more complex. I'd rather stick with cBPF
>> > > until we have an overwhelmingly good reason to use eBPF as a "native"
>> > > seccomp filter language.
>> > >
>> >
>> > I can think of two fairly strong use cases for eBPF's ability to call
>> > functions: logging and Tycho's user notifier thing.
>>
>> Worth noting that there is one additional thing that I didn't
>> implement, but which would be nice and is probably not possible with
>> eBPF (at least, not without a bunch of additional infrastructure):
>> passing fds back to the tracee from the manager if you intercept
>> socket(), or accept() or something.
>>
>> This could again be accomplished via other means, though it would be a
>> lot nicer to have a primitive for it.
>
> there is bpf_perf_event_output() interface that allows to stream
> arbitrary data from kernel into user space via perf ring buffer.
> User space can epoll on it. We use this in both tracing and networking
> for notifications and streaming data transfers.
> I suspect this can be used for 'logging' too, since it's cheap and fast.
>
> Specifically for android we added bpf_lsm hooks, cookie/uid helpers,
> and read-only maps.
> Lorenzo,
> there was a claim in this thread that bpf is disabled on android.
> Can you please clarify?
> If it's actually disabled and there is no intent to enable it,
> I'd rather not add any more android specific features to bpf.
>
> What I think is important to understand is that BPF goes through
> very active development. The verifier is constantly getting smarter.
> There is work to add bounded loops, lock/unlock, get/put tracking,
> global/percpu variables, dynamic linking and so on.
> Most of the features are available to root only and unpriv
> has very limited set. Like getting bpf_perf_event_output() to work
> for unpriv will likely require additional verifier work.
>
> So all cool bits will not be usable by seccomp+eBPF and unpriv
> on day one. It's not a lot of work either, but once it's done
> I'd hate to see arguments against adding more verifier features
> just because eBPF is used by seccomp/landlock/other_security_thing.
>
> Also I think the argument that seccomp+eBPF will be faster than
> seccomp+cBPF is a weak one. I bet kpti on/off makes no difference
> under seccomp, since _all_ syscalls are already slow for sandboxed app.
> Instead of making seccomp 5% faster with eBPF, I think it's
> worth looking into extending LSM hooks to cover all syscalls and
> have programmable (bpf or whatever) filtering applied per syscall.
> Like we can have a white list syscall table covered by lsm hooks
> and any other syscall will get into old seccomp-style
> filtering category automatically.
> lsm+bpf would need to follow process hierarchy. It shouldn't be
> a runtime check at syscall entry either, but compile time
> extra branch in SYSCALL_DEFINE for non-whitelisted syscalls.
> There are bunch of other things to figure out, but I think
> the perf win will be bigger than replacing cBPF with eBPF in
> existing seccomp.
>
Given this test program:
for (i = 10; i < ; i++) syscall(__NR_getpid);

If I implement an eBPF filter with PROG_ARRAYs, and tail call, the
numbers are such:
ebpf JIT 12.3% slower than native
ebpf no JIT 13.6% slower than native
seccomp JIT 17.6% slower than native
seccomp no JIT 37% slower than native

This is using libseccomp for the standard seccomp BPF program. There's
no reasonable way for our workload to know which syscalls come
"earlier", so we can't take that optimization. Potentially, libseccomp
can be smarter about ordering cases (using ranges), and use an
O(log(n)) search algorithm, but both of these are microptimizations
that scale with the number of syscalls and per-syscall rules. The
nicety of using a PROG_ARRAY means that adding additional filters
(syscalls) comes at no cost, whereas there's a tradeoff any time you
add another rule in traditional seccomp filters.
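Concretely, adding one more syscall's rule is just a slot update in the
PROG_ARRAY; a sketch, assuming the fds are obtained as in the other samples:

#include <linux/bpf.h>
#include "libbpf.h"	/* sample wrappers; provides bpf_map_update_elem() */

/* prog_array_fd: the PROG_ARRAY the dispatcher tail-calls into.
 * rule_prog_fd: a loaded BPF_PROG_TYPE_SECCOMP program for syscall nr. */
static int add_syscall_rule(int prog_array_fd, unsigned int nr,
			    int rule_prog_fd)
{
	return bpf_map_update_elem(prog_array_fd, &nr, &rule_prog_fd, BPF_ANY);
}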

This was tested on an Amazon M4.16XL running with pcid, and KPTI.


Re: [PATCH net-next 0/3] eBPF Seccomp filters

2018-02-13 Thread Sargun Dhillon
On Tue, Feb 13, 2018 at 9:02 AM, Jessie Frazelle <m...@jessfraz.com> wrote:
> On Tue, Feb 13, 2018 at 11:29 AM, Sargun Dhillon <sar...@sargun.me> wrote:
>> On Tue, Feb 13, 2018 at 7:47 AM, Kees Cook <keesc...@chromium.org> wrote:
>>> On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sar...@sargun.me> wrote:
>>>> This patchset enables seccomp filters to be written in eBPF. Although
>>>> this patchset doesn't introduce much of the functionality enabled by
>>>> eBPF, it lays the groundwork for it.
>>>>
>>>> It also introduces the capability to dump eBPF filters via the PTRACE
>>>> API in order to make it so that CHECKPOINT_RESTORE will be satisfied.
>>>> In the attached samples, there's an example of this. One can then use
>>>> BPF_OBJ_GET_INFO_BY_FD in order to get the actual code of the program,
>>>> and use that at reload time.
>>>>
>>>> The primary reason for not adding maps support in this patchset is
>>>> to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
>>>> If we have a map that the BPF program can read, it can potentially
>>>> "change" privileges after running. It seems like doing writes only
>>>> is safe, because they can be pure and side-effect free, and therefore
>>>> not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
>>>> to an agreement, this can be in a follow-up patchset.
>>>
>>> What's the reason for adding eBPF support? seccomp shouldn't need it,
>>> and it only makes the code more complex. I'd rather stick with cBPF
>>> until we have an overwhelmingly good reason to use eBPF as a "native"
>>> seccomp filter language.
>>>
>>> -Kees
>>>
>> Three reasons:
>> 1) The userspace tooling for eBPF is much better than the user space
>> tooling for cBPF. Our use case is specifically to optimize Docker
>> policies. This is roughly what their seccomp policy looks like:
>> https://github.com/moby/moby/blob/master/profiles/seccomp/default.json.
>> It would be much nicer to be able to leverage eBPF to write this in C,
>> or any of the other languages targeting eBPF. In addition, if we
>> have write-only maps, we can exfiltrate information from seccomp, like
>> arguments, and errors in a relatively cheap way compared to cBPF, and
>> then extract this via the bcc stack. Writing cBPF via C macros is a
>> pain, and the off the shelf cBPF libraries are getting no love. The
>> eBPF community is *exploding* with contributions.
>
> Is stage two of this getting runc to support eBPF and docker to change
> the default to be written as eBPF, because I foresee that being a
> problem mainly with the kernel versions people use. The point of that
> patch was to help the most people and as your point in (2) is made
> about performance, that is a trade-off I would be willing to make in
> order to have this functionality on more kernel versions.
>
> The other alternative would be to have docker translate to use eBPF if
> the kernel supported it, but that amount of complexity seems a bit
> unnecessary for a feature that was trying to also be "simple".
>
> Or do you plan on wrapping filters onto processes tangentially from
> the runtime, in which case, that should be totally fine :)
>
> Anyways this is kinda a tangent from the main point of getting it in
> the kernel, just I would hate to see someone having to maintain this
> without there being a path to getting it upstream elsewhere.
>
We (me) intend to do the work to get it into Docker / Moby /
Containerd / Runc / Whatever the kids call it these days. It already
has the idea of multiple security modules, like seccomp, apparmor,
etc.. I can imagine that the first approach would be just to let
people pass eBPF filters as code, in the same way. Afterwards, there
could be more sophisticated approaches in order to transparently
upgrade people's filters, and give them performance upgrades.

A really naive approach is to take the JSON seccomp policy document
and convert it to plain old C with switch / case statements. Then
we can just push that through LLVM and we're in business. Although,
for some reason, I don't think the folks will want to take a hard dep
on llvm at runtime, so maybe there's some mechanism where it first
tries llvm, then tries to create an eBPF application naively, and then
falls back to cBPF. My primary fear with the first two approaches is
that given how the policies are written today, it's not conducive to
the eBPF instruction limit.
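To make the naive approach concrete, the translator's output could be
nothing more than a switch over ctx->nr; the syscalls below are placeholders,
not Docker's actual list:

#include <uapi/linux/bpf.h>
#include <uapi/linux/seccomp.h>
#include <uapi/linux/unistd.h>
#include <uapi/linux/errno.h>
#include "bpf_helpers.h"

SEC("seccomp")
int docker_default(struct seccomp_data *ctx)
{
	switch (ctx->nr) {
	case __NR_read:		/* whitelist entries from the JSON policy */
	case __NR_write:
	case __NR_close:
		return SECCOMP_RET_ALLOW;
	default:		/* everything else errors out */
		return SECCOMP_RET_ERRNO | EPERM;
	}
}

char _license[] SEC("license") = "GPL";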

Our initial approach for this internally, since we use Docker 1.13.1,
and backporting this can be a bit of a 

Re: [PATCH net-next 0/3] eBPF Seccomp filters

2018-02-13 Thread Sargun Dhillon
On Tue, Feb 13, 2018 at 7:47 AM, Kees Cook <keesc...@chromium.org> wrote:
> On Tue, Feb 13, 2018 at 7:42 AM, Sargun Dhillon <sar...@sargun.me> wrote:
>> This patchset enables seccomp filters to be written in eBPF. Although
>> this patchset doesn't introduce much of the functionality enabled by
>> eBPF, it lays the groundwork for it.
>>
>> It also introduces the capability to dump eBPF filters via the PTRACE
>> API in order to make it so that CHECKPOINT_RESTORE will be satisfied.
>> In the attached samples, there's an example of this. One can then use
>> BPF_OBJ_GET_INFO_BY_FD in order to get the actual code of the program,
>> and use that at reload time.
>>
>> The primary reason for not adding maps support in this patchset is
>> to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
>> If we have a map that the BPF program can read, it can potentially
>> "change" privileges after running. It seems like doing writes only
>> is safe, because they can be pure and side-effect free, and therefore
>> not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
>> to an agreement, this can be in a follow-up patchset.
>
> What's the reason for adding eBPF support? seccomp shouldn't need it,
> and it only makes the code more complex. I'd rather stick with cBPF
> until we have an overwhelmingly good reason to use eBPF as a "native"
> seccomp filter language.
>
> -Kees
>
Three reasons:
1) The userspace tooling for eBPF is much better than the user space
tooling for cBPF. Our use case is specifically to optimize Docker
policies. This is roughly what their seccomp policy looks like:
https://github.com/moby/moby/blob/master/profiles/seccomp/default.json.
It would be much nicer to be able to leverage eBPF to write this in C,
or any of the other languages targeting eBPF. In addition, if we
have write-only maps, we can exfiltrate information from seccomp, like
arguments, and errors in a relatively cheap way compared to cBPF, and
then extract this via the bcc stack. Writing cBPF via C macros is a
pain, and the off the shelf cBPF libraries are getting no love. The
eBPF community is *exploding* with contributions.
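A sketch of the write-only exfiltration idea; it assumes map-write helpers
would be permitted for this program type, which is exactly the open question:

#include <uapi/linux/bpf.h>
#include <uapi/linux/seccomp.h>
#include <uapi/linux/unistd.h>
#include <uapi/linux/errno.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") denials = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u32),	/* syscall number */
	.value_size = sizeof(u64),	/* first argument */
	.max_entries = 1024,
};

SEC("seccomp")
int log_and_deny(struct seccomp_data *ctx)
{
	u32 nr = ctx->nr;
	u64 arg0 = ctx->args[0];

	if (nr == __NR_mount) {
		/* write-only from the filter's point of view */
		bpf_map_update_elem(&denials, &nr, &arg0, BPF_ANY);
		return SECCOMP_RET_ERRNO | EPERM;
	}
	return SECCOMP_RET_ALLOW;
}

char _license[] SEC("license") = "GPL";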

2) In my testing, which thus so far has been very rudimentary, with
rewriting the policy that libseccomp generates from the Docker policy
to use eBPF, and eBPF maps performs much better than cBPF. The
specific case tested was to use a bpf array to lookup rules for a
particular syscall. In a super trivial test, this was about 5% lower
latency than using traditional branches. If you need more evidence of
this, I can work a little bit more on the maps related patches, and
see if I can get some more benchmarking. From my understanding, we
would need to add "sealing" support for maps, in which they can be
marked as read-only, and only at that point should an eBPF seccomp
program be able to read from them.
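The array-lookup variant benchmarked here is roughly the following sketch;
the "sealed" read-only property is assumed, not implemented:

#include <uapi/linux/bpf.h>
#include <uapi/linux/seccomp.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") verdicts = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),	/* a SECCOMP_RET_* value */
	.max_entries = 512,
};

SEC("seccomp")
int lookup_verdict(struct seccomp_data *ctx)
{
	u32 nr = ctx->nr;
	u32 *verdict = bpf_map_lookup_elem(&verdicts, &nr);

	if (verdict)
		return *verdict;
	return SECCOMP_RET_ALLOW;	/* no rule for this syscall */
}

char _license[] SEC("license") = "GPL";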

3) Eventually, I'd like to use some more advanced capabilities of
eBPF, like being able to rewrite arguments safely (not things referred
to by pointers, but just plain old arguments).

>>
>>
>> Sargun Dhillon (3):
>>   bpf, seccomp: Add eBPF filter capabilities
>>   seccomp, ptrace: Add a mechanism to retrieve attached eBPF seccomp
>> filters
>>   bpf: Add eBPF seccomp sample programs
>>
>>  arch/Kconfig |   7 ++
>>  include/linux/bpf_types.h|   3 +
>>  include/linux/seccomp.h  |  12 +++
>>  include/uapi/linux/bpf.h |   2 +
>>  include/uapi/linux/ptrace.h  |   5 +-
>>  include/uapi/linux/seccomp.h |  15 ++--
>>  kernel/bpf/syscall.c |   1 +
>>  kernel/ptrace.c  |   3 +
>>  kernel/seccomp.c | 185 ++-
>>  samples/bpf/Makefile |   9 +++
>>  samples/bpf/bpf_load.c   |   9 ++-
>>  samples/bpf/seccomp1_kern.c  |  17 
>>  samples/bpf/seccomp1_user.c  |  34 
>>  samples/bpf/seccomp2_kern.c  |  24 ++
>>  samples/bpf/seccomp2_user.c  |  66 +++
>>  15 files changed, 362 insertions(+), 30 deletions(-)
>>  create mode 100644 samples/bpf/seccomp1_kern.c
>>  create mode 100644 samples/bpf/seccomp1_user.c
>>  create mode 100644 samples/bpf/seccomp2_kern.c
>>  create mode 100644 samples/bpf/seccomp2_user.c
>>
>> --
>> 2.14.1
>>
>
>
>
> --
> Kees Cook
> Pixel Security


[PATCH net-next 3/3] bpf: Add eBPF seccomp sample programs

2018-02-13 Thread Sargun Dhillon
From: Sargun Dhillon <sar...@netflix.com>

This adds two sample programs:
seccomp1: A simple eBPF seccomp filter
seccomp2: A program which installs an eBPF filter
  and then retrieves it via ptrace to show
  checkpoint / restore capability.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile|  9 +++
 samples/bpf/bpf_load.c  |  9 +--
 samples/bpf/seccomp1_kern.c | 17 
 samples/bpf/seccomp1_user.c | 34 +++
 samples/bpf/seccomp2_kern.c | 24 +
 samples/bpf/seccomp2_user.c | 66 +
 6 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/seccomp1_kern.c
 create mode 100644 samples/bpf/seccomp1_user.c
 create mode 100644 samples/bpf/seccomp2_kern.c
 create mode 100644 samples/bpf/seccomp2_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index ec3fc8d88e87..f1ba5fa18db7 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -43,6 +43,8 @@ hostprogs-y += xdp_redirect_cpu
 hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
+hostprogs-y += seccomp1
+hostprogs-y += seccomp2
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -93,6 +95,9 @@ xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
 xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
+seccomp1-objs := bpf_load.o $(LIBBPF) seccomp1_user.o
+seccomp2-objs := bpf_load.o $(LIBBPF) seccomp2_user.o
+
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -144,6 +149,8 @@ always += xdp_monitor_kern.o
 always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
+always += seccomp1_kern.o
+always += seccomp2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -188,6 +195,8 @@ HOSTLOADLIBES_xdp_redirect_cpu += -lelf
 HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
+HOSTLOADLIBES_seccomp1 += -lelf
+HOSTLOADLIBES_seccomp2 += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 69806d74fa53..856bc8b93916 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -67,6 +67,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
bool is_cgroup_sk = strncmp(event, "cgroup/sock", 11) == 0;
bool is_sockops = strncmp(event, "sockops", 7) == 0;
bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
+   bool is_seccomp = strncmp(event, "seccomp", 7) == 0;
size_t insns_cnt = size / sizeof(struct bpf_insn);
enum bpf_prog_type prog_type;
char buf[256];
@@ -96,6 +97,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
prog_type = BPF_PROG_TYPE_SOCK_OPS;
} else if (is_sk_skb) {
prog_type = BPF_PROG_TYPE_SK_SKB;
+   } else if (is_seccomp) {
+   prog_type = BPF_PROG_TYPE_SECCOMP;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -110,7 +113,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
prog_fd[prog_cnt++] = fd;
 
-   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
+   if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk ||
+   is_seccomp)
return 0;
 
if (is_socket || is_sockops || is_sk_skb) {
@@ -589,7 +593,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
memcmp(shname, "socket", 6) == 0 ||
memcmp(shname, "cgroup/", 7) == 0 ||
memcmp(shname, "sockops", 7) == 0 ||
-   memcmp(shname, "sk_skb", 6) == 0) {
+   memcmp(shname, "sk_skb", 6) == 0 ||
+   memcmp(shname, "seccomp", 7) == 0) {
ret = load_and_attach(shname, data->d_buf,
  data->d_size);
if (ret != 0)
diff --git a/samples/bpf/seccomp1_kern.c b/samples/bpf/seccomp1_kern.c
new file mode 100644
index 000000000000..7fcbd48fa69a
--- /dev/null
+++ b/samples/bpf/seccomp1_kern.c
@@ -0,0 +1,17 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/seccomp.h>
+#include <uapi/linux/unistd.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/errno.h>
+
+/* Returns EPERM when trying to close fd 999 */
+SEC("seccomp")
+int bpf_prog1(struct seccomp_data *ctx)
+{
+   if (ctx->nr == __NR_close && ctx->args[0] == 999)
+       return SECCOMP_RET_ERRNO | EPERM;
+   return SECCOMP_RET_ALLOW;
+}
+char _license[] SEC("license") = "GPL";

[PATCH net-next 0/3] eBPF Seccomp filters

2018-02-13 Thread Sargun Dhillon
This patchset enables seccomp filters to be written in eBPF. Although,
this patchset doesn't introduce much of the functionality enabled by
eBPF, it lays the ground work for it.

It also introduces the capability to dump eBPF filters via the PTRACE
API in order to make it so that CHECKPOINT_RESTORE will be satisfied.
In the attached samples, there's an example of this. One can then use
BPF_OBJ_GET_INFO_BY_FD in order to get the actual code of the program,
and use that at reload time.
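The retrieval side might look like this sketch (the
PTRACE_SECCOMP_GET_FILTER_EXTENDED request and its 0x420e value come from
patch 2 of this series; the buffer sizing is illustrative):

#include <unistd.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>	/* tools/lib/bpf */

#ifndef PTRACE_SECCOMP_GET_FILTER_EXTENDED
#define PTRACE_SECCOMP_GET_FILTER_EXTENDED	0x420e	/* from this series */
#endif

/* Dump the n-th attached eBPF filter of pid into insns for reload. */
static int dump_ebpf_filter(pid_t pid, unsigned long n,
			    struct bpf_insn *insns, __u32 size)
{
	struct bpf_prog_info info = {};
	__u32 info_len = sizeof(info);
	long fd;
	int err;

	fd = ptrace(PTRACE_SECCOMP_GET_FILTER_EXTENDED, pid, n, NULL);
	if (fd < 0)
		return fd;

	info.xlated_prog_len = size;
	info.xlated_prog_insns = (__u64)(unsigned long)insns;
	err = bpf_obj_get_info_by_fd(fd, &info, &info_len);
	close(fd);
	return err;
}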

The primary reason for not adding maps support in this patchset is
to avoid introducing new complexities around PR_SET_NO_NEW_PRIVS.
If we have a map that the BPF program can read, it can potentially
"change" privileges after running. It seems like doing writes only
is safe, because they can be pure and side-effect free, and therefore
not negatively affect PR_SET_NO_NEW_PRIVS. Nonetheless, if we come
to an agreement, this can be in a follow-up patchset.


Sargun Dhillon (3):
  bpf, seccomp: Add eBPF filter capabilities
  seccomp, ptrace: Add a mechanism to retrieve attached eBPF seccomp
filters
  bpf: Add eBPF seccomp sample programs

 arch/Kconfig |   7 ++
 include/linux/bpf_types.h|   3 +
 include/linux/seccomp.h  |  12 +++
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/ptrace.h  |   5 +-
 include/uapi/linux/seccomp.h |  15 ++--
 kernel/bpf/syscall.c |   1 +
 kernel/ptrace.c  |   3 +
 kernel/seccomp.c | 185 ++-
 samples/bpf/Makefile |   9 +++
 samples/bpf/bpf_load.c   |   9 ++-
 samples/bpf/seccomp1_kern.c  |  17 
 samples/bpf/seccomp1_user.c  |  34 
 samples/bpf/seccomp2_kern.c  |  24 ++
 samples/bpf/seccomp2_user.c  |  66 +++
 15 files changed, 362 insertions(+), 30 deletions(-)
 create mode 100644 samples/bpf/seccomp1_kern.c
 create mode 100644 samples/bpf/seccomp1_user.c
 create mode 100644 samples/bpf/seccomp2_kern.c
 create mode 100644 samples/bpf/seccomp2_user.c

-- 
2.14.1



[PATCH net-next 1/3] bpf, seccomp: Add eBPF filter capabilities

2018-02-13 Thread Sargun Dhillon
From: Sargun Dhillon <sar...@netflix.com>

This introduces the BPF_PROG_TYPE_SECCOMP bpf program type. It is meant
to be used for seccomp filters as an alternative to cBPF filters. The
program type has relatively limited capabilities in terms of helpers,
but that can be extended later on.

It also introduces a new mechanism to attach these filters via the
prctl and seccomp syscalls -- SECCOMP_MODE_FILTER_EXTENDED, and
SECCOMP_SET_MODE_FILTER_EXTENDED respectively.
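For illustration, the attach path from userspace might look like the sketch
below; the exact uargs layout (a pointer to an eBPF program fd, per the uapi
comment in this patch) is an assumption:

#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/seccomp.h>

#ifndef SECCOMP_SET_MODE_FILTER_EXTENDED
#define SECCOMP_SET_MODE_FILTER_EXTENDED	3	/* from this patch */
#endif

static int seccomp_attach_ebpf(int prog_fd)
{
	/* unprivileged callers must set no_new_privs first */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return -1;

	return syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER_EXTENDED,
		       0, &prog_fd);
}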

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 arch/Kconfig |   7 ++
 include/linux/bpf_types.h|   3 +
 include/uapi/linux/bpf.h |   2 +
 include/uapi/linux/seccomp.h |  15 +++--
 kernel/bpf/syscall.c |   1 +
 kernel/seccomp.c | 148 +--
 6 files changed, 150 insertions(+), 26 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 76c0b54443b1..db675888577c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -401,6 +401,13 @@ config SECCOMP_FILTER
 
  See Documentation/prctl/seccomp_filter.txt for details.
 
+config SECCOMP_FILTER_EXTENDED
+   bool "Extended BPF seccomp filters"
+   depends on SECCOMP_FILTER && BPF_SYSCALL
+   help
+ Enables seccomp filters to be written in eBPF, as opposed
+ to just cBPF filters.
+
 config HAVE_GCC_PLUGINS
bool
help
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 19b8349a3809..945c65c4e461 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -22,6 +22,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
 #endif
+#ifdef CONFIG_SECCOMP_FILTER_EXTENDED
+BPF_PROG_TYPE(BPF_PROG_TYPE_SECCOMP, seccomp)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index db6bdc375126..5f96cb7ed954 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1,3 +1,4 @@
+
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
  *
@@ -133,6 +134,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SOCK_OPS,
BPF_PROG_TYPE_SK_SKB,
BPF_PROG_TYPE_CGROUP_DEVICE,
+   BPF_PROG_TYPE_SECCOMP,
 };
 
 enum bpf_attach_type {
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 2a0bd9dd104d..7da8b39f2a6a 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -7,14 +7,17 @@
 
 
 /* Valid values for seccomp.mode and prctl(PR_SET_SECCOMP, ) */
-#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
-#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
-#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
+#define SECCOMP_MODE_DISABLED  0 /* seccomp is not in use. */
+#define SECCOMP_MODE_STRICT1 /* uses hard-coded filter. */
+#define SECCOMP_MODE_FILTER2 /* uses user-supplied filter. */
+#define SECCOMP_MODE_FILTER_EXTENDED   3 /* uses eBPF filter from fd */
 
 /* Valid operations for seccomp syscall. */
-#define SECCOMP_SET_MODE_STRICT0
-#define SECCOMP_SET_MODE_FILTER1
-#define SECCOMP_GET_ACTION_AVAIL   2
+#define SECCOMP_SET_MODE_STRICT0
+#define SECCOMP_SET_MODE_FILTER1
+#define SECCOMP_GET_ACTION_AVAIL   2
+#define SECCOMP_SET_MODE_FILTER_EXTENDED   3
+
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
 #define SECCOMP_FILTER_FLAG_TSYNC  1
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e24aa3241387..86d6ec8b916d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1202,6 +1202,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
+   type != BPF_PROG_TYPE_SECCOMP &&
!capable(CAP_SYS_ADMIN))
return -EPERM;
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 940fa408a288..b30dd25c1cb8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -37,6 +37,7 @@
 #include <linux/security.h>
 #include <linux/tracehook.h>
 #include <linux/uaccess.h>
+#include <linux/bpf.h>
 
 /**
  * struct seccomp_filter - container for seccomp BPF programs
@@ -367,17 +368,6 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 
BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
 
-   /*
-* Installing a seccomp filter requires that the task has
-* CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
-* This avoids scenarios where unprivileged tasks can affect the
-* behavior of privileged children.
-*/
-   if (!task_no_new_privs(current) &&
-   security_capable_noaudit(current_cred(), current_user_ns(),
-                            CAP_SYS_ADMIN) != 0)
-       return ERR_PTR(-EACCES);

[PATCH net-next 2/3] seccomp, ptrace: Add a mechanism to retrieve attached eBPF seccomp filters

2018-02-13 Thread Sargun Dhillon
From: Sargun Dhillon <sar...@netflix.com>

This extends the ptrace API to allow fetching eBPF seccomp filters
attached to processes. This is to enable checkpoint / restore cases.
The user will have to use the traditional PTRACE_SECCOMP_GET_FILTER
API call, and if they get an invalid medium type error they can switch
over to the eBPF variant of the API -- PTRACE_SECCOMP_GET_FILTER_EXTENDED.
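A userspace sketch of that fallback (request values as defined in this
series' uapi changes; error handling trimmed):

#include <errno.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <linux/filter.h>	/* struct sock_filter */

#ifndef PTRACE_SECCOMP_GET_FILTER
#define PTRACE_SECCOMP_GET_FILTER		0x420c
#endif
#ifndef PTRACE_SECCOMP_GET_FILTER_EXTENDED
#define PTRACE_SECCOMP_GET_FILTER_EXTENDED	0x420e	/* this patch */
#endif

/* cBPF: returns the insn count and fills buf.
 * eBPF: the classic request fails with EMEDIUMTYPE; the extended
 * request instead returns an fd to the underlying bpf_prog. */
static long fetch_filter(pid_t pid, unsigned long n, struct sock_filter *buf)
{
	long ret = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, n, buf);

	if (ret < 0 && errno == EMEDIUMTYPE)
		ret = ptrace(PTRACE_SECCOMP_GET_FILTER_EXTENDED, pid, n, NULL);
	return ret;
}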

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/seccomp.h | 12 
 include/uapi/linux/ptrace.h |  5 +++--
 kernel/ptrace.c |  3 +++
 kernel/seccomp.c| 37 +
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index c723a5c4e3ff..97fdbcffacc2 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -110,4 +110,16 @@ static inline long seccomp_get_metadata(struct task_struct *task,
return -EINVAL;
 }
 #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
+#if defined(CONFIG_SECCOMP_FILTER_EXTENDED) && defined(CONFIG_CHECKPOINT_RESTORE)
+extern long seccomp_get_filter_extended(struct task_struct *task,
+   unsigned long n,
+   void __user *data);
+#else
+static inline long seccomp_get_filter_extended(struct task_struct *task,
+  unsigned long n,
+  void __user *data)
+{
+   return -EINVAL;
+}
+#endif /* CONFIG_SECCOMP_FILTER_EXTENDED && CONFIG_CHECKPOINT_RESTORE */
 #endif /* _LINUX_SECCOMP_H */
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index e46d82b91166..c619eb46b9d9 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -65,8 +65,9 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_GETSIGMASK  0x420a
 #define PTRACE_SETSIGMASK  0x420b
 
-#define PTRACE_SECCOMP_GET_FILTER  0x420c
-#define PTRACE_SECCOMP_GET_METADATA0x420d
+#define PTRACE_SECCOMP_GET_FILTER  0x420c
+#define PTRACE_SECCOMP_GET_METADATA0x420d
+#define PTRACE_SECCOMP_GET_FILTER_EXTENDED 0x420e
 
 struct seccomp_metadata {
unsigned long filter_off;   /* Input: which filter */
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 21fec73d45d4..90c62f9e1a55 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1096,6 +1096,9 @@ int ptrace_request(struct task_struct *child, long request,
ret = seccomp_get_metadata(child, addr, datavp);
break;
 
+   case PTRACE_SECCOMP_GET_FILTER_EXTENDED:
+       ret = seccomp_get_filter_extended(child, addr, datavp);
+       break;
+
default:
break;
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index b30dd25c1cb8..931a13a8cd63 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1155,6 +1155,43 @@ long seccomp_get_metadata(struct task_struct *task,
 }
 #endif
 
+#if defined(CONFIG_SECCOMP_FILTER_EXTENDED) && defined(CONFIG_CHECKPOINT_RESTORE)
+long seccomp_get_filter_extended(struct task_struct *task,
+unsigned long filter_off,
+void __user *data)
+{
+   struct seccomp_filter *filter;
+   struct bpf_prog *prog;
+   long ret;
+
+   if (!capable(CAP_SYS_ADMIN) ||
+   current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+   return -EACCES;
+   }
+
+   filter = get_nth_filter(task, filter_off);
+   if (IS_ERR(filter))
+   return PTR_ERR(filter);
+
+   if (bpf_prog_was_classic(filter->prog)) {
+   ret = -EMEDIUMTYPE;
+   goto out;
+   }
+   prog = bpf_prog_inc_not_zero(filter->prog);
+   if (IS_ERR(prog)) {
+   ret = PTR_ERR(prog);
+   goto out;
+   }
+
+   ret = bpf_prog_new_fd(prog);
+   if (ret < 0)
+   bpf_prog_put(prog);
+out:
+   __put_seccomp_filter(filter);
+   return ret;
+}
+#endif
+
 #ifdef CONFIG_SYSCTL
 
 /* Human readable action names for friendly sysctl interaction */
-- 
2.14.1



Re: bpf bounded loops. Was: [flamebait] xdp

2016-12-03 Thread Sargun Dhillon
On Fri, Dec 2, 2016 at 4:20 PM, Alexei Starovoitov wrote:
> On Fri, Dec 02, 2016 at 11:42:15AM -0800, John Fastabend wrote:
>> >> As far as pattern search for DNS packets...
>> >> it was requested by Cloudflare guys back in March:
>> >> https://github.com/iovisor/bcc/issues/471
>> >> and it is useful for several tracing use cases as well.
>> >> Unfortunately no one had time to implement it yet.
>> >
>> > The string operations you proposed on the other hand, which would count
>> > as one eBPF instructions, would give a lot more flexibility and allow
>> > more cycles to burn, but don't help parsing binary protocols like IPv6
>> > extension headers.
>
> these are two separate things. we need pattern search regardless
> of bounded loops. bpf program shouldn't be doing any complicated
> algorithms. The main reasons to have loops are:
> - speed up execution (smaller I-cache footprint)
> - avoid forcing compiler to unroll loops (easier for users)
> - support loops where unroll is not possible (like example below)
>
>> My rough thinking on this was the verifier had to start looking for loop
>> invariants and to guarantee termination. Sounds scary in general but
>> LLVM could put these in some normal form for us and the verifier could
>> only accept decreasing loops, the invariants could be required to be
>> integers, etc. By simplifying the loop enough the problem becomes
>> tractable.
>
> yep. I think what Hannes was proposing earlier is straighforward
> to implement for a compiler guy. The following:
> for (int i = 0; i < (var & 0xff); i++)
>   sum += map->value[i];  /* map value_size >= 0xff */
> is obviously bounded and dataflow analysis can easily prove
> that all memory operations are valid.
> Static analysis tools do way way more than this.
>
>> I think this would be better than new instructions and/or multiple
>> verifiers.
>
> agree that it's better than new instructions that would have
> required JIT changes. Though there are pros to new insns too :)
>
Has there been any thought to adding a map or foldl helper, a la the
tail call helper? Although you'd want to allocate an accumulator of
kinds for the foldl, I imagine this could be bounded in size quite
small for things like binary parsing operations -- we could reasonably
allow the accumulator to be updated, and return a special value to
exit the loop. I also started working on a map function a while ago
which would call a bpf program for each set cell in an arraymap, and
each set key/value in a hash map.

My intent was to intentionally make it so I could do this on the
context itself, so I could do encryption in BPF. I wanted to be able
to fold over the packet 16, or 32 bytes at a time, and (1) modify the
content, and (2) generate the authentication tag.

Any opinions on that approach?
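To make the proposal concrete, a purely hypothetical signature sketch;
nothing like this exists in the tree:

/*
 * long bpf_fold(void *ctx, u32 off, u32 stride,
 *               void *acc, u32 acc_size, void *cb_map, u32 cb_idx);
 *
 * The verifier would bound the walk by the context length and
 * acc_size; the callback (a program in cb_map, as with tail calls)
 * may update acc and return a sentinel value to stop the loop
 * early, mirroring the early-exit described above.
 */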


[PATCH net-next 1/2] samples, bpf: Refactor test_current_task_under_cgroup - separate out helpers

2016-12-02 Thread Sargun Dhillon
This patch modifies test_current_task_under_cgroup_user. The test has
several helpers around creating a temporary environment for cgroup
testing, and moving the current task around cgroups. This set of
helpers can then be used in other tests.
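A sketch of the intended reuse, assuming the helpers declared in
cgroup_helpers.h (create_and_get_cgroup() and cleanup_cgroup_environment()
as used later in this series); error paths trimmed:

#include "cgroup_helpers.h"

/* Build the scratch cgroup env, create and join a cgroup, run the
 * check, then tear everything down again. */
static int with_test_cgroup(int (*testfn)(int cg_fd))
{
	int cg_fd, ret;

	if (setup_cgroup_environment())
		return 1;

	cg_fd = create_and_get_cgroup("/test");
	if (!cg_fd || join_cgroup("/test"))
		return 1;

	ret = testfn(cg_fd);
	cleanup_cgroup_environment();
	return ret;
}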

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile  |   2 +-
 samples/bpf/cgroup_helpers.c  | 177 ++
 samples/bpf/cgroup_helpers.h  |  16 ++
 samples/bpf/test_current_task_under_cgroup_user.c | 108 +++--
 4 files changed, 218 insertions(+), 85 deletions(-)
 create mode 100644 samples/bpf/cgroup_helpers.c
 create mode 100644 samples/bpf/cgroup_helpers.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 22b6407e..3c805af 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -54,7 +54,7 @@ test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
-test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
+test_current_task_under_cgroup-objs := bpf_load.o libbpf.o cgroup_helpers.o \
   test_current_task_under_cgroup_user.o
 trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
 sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c
new file mode 100644
index 000..9d1be94
--- /dev/null
+++ b/samples/bpf/cgroup_helpers.c
@@ -0,0 +1,177 @@
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ftw.h>
+
+
+#include "cgroup_helpers.h"
+
+/*
+ * To avoid relying on the system setup, when setup_cgroup_env is called
+ * we create a new mount namespace, and cgroup namespace. The cgroup2
+ * root is mounted at CGROUP_MOUNT_PATH
+ *
+ * Unfortunately, most people don't have cgroupv2 enabled at this point in time.
+ * It's easier to create our own mount namespace and manage it ourselves.
+ *
+ * We assume /mnt exists.
+ */
+
+#define WALK_FD_LIMIT  16
+#define CGROUP_MOUNT_PATH  "/mnt"
+#define CGROUP_WORK_DIR"/cgroup-test-work-dir"
+#define format_cgroup_path(buf, path) \
+   snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \
+CGROUP_WORK_DIR, path)
+
+/**
+ * setup_cgroup_environment() - Setup the cgroup environment
+ *
+ * After calling this function, cleanup_cgroup_environment should be called
+ * once testing is complete.
+ *
+ * This function will print an error to stderr and return 1 if it is unable
+ * to setup the cgroup environment. If setup is successful, 0 is returned.
+ */
+int setup_cgroup_environment(void)
+{
+   char cgroup_workdir[PATH_MAX + 1];
+
+   format_cgroup_path(cgroup_workdir, "");
+
+   if (unshare(CLONE_NEWNS)) {
+   log_err("unshare");
+   return 1;
+   }
+
+   if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
+   log_err("mount fakeroot");
+   return 1;
+   }
+
+   if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
+   log_err("mount cgroup2");
+   return 1;
+   }
+
+   /* Cleanup existing failed runs, now that the environment is setup */
+   cleanup_cgroup_environment();
+
+   if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) {
+   log_err("mkdir cgroup work dir");
+   return 1;
+   }
+
+   return 0;
+}
+
+static int nftwfunc(const char *filename, const struct stat *statptr,
+   int fileflags, struct FTW *pfwt)
+{
+   if ((fileflags & FTW_D) && rmdir(filename))
+   log_err("Removing cgroup: %s", filename);
+   return 0;
+}
+
+
+static int join_cgroup_from_top(char *cgroup_path)
+{
+   char cgroup_procs_path[PATH_MAX + 1];
+   pid_t pid = getpid();
+   int fd, rc = 0;
+
+   snprintf(cgroup_procs_path, sizeof(cgroup_procs_path),
+"%s/cgroup.procs", cgroup_path);
+
+   fd = open(cgroup_procs_path, O_WRONLY);
+   if (fd < 0) {
+   log_err("Opening Cgroup Procs: %s", cgroup_procs_path);
+   return 1;
+   }
+
+   if (dprintf(fd, "%d\n", pid) < 0) {
+   log_err("Joining Cgroup");
+   rc = 1;
+   }
+
+   close(fd);
+   return rc;
+}
+
+/**
+ * join_cgroup() - Join a cgroup
+ * @path: The cgroup path, relative to the workdir, to join
+ *
+ * This function expects a cgroup to already be created, relative to the cgroup
+ * work dir, and it joins it. For example, passing "/my-cgroup" as the path
+ * would actually

[PATCH net-next 2/2] samples, bpf: Add automated test for cgroup filter attachments

2016-12-02 Thread Sargun Dhillon
This patch adds the sample program test_cgrp2_attach2. This program is
similar to test_cgrp2_attach, but it performs automated testing of the
cgroupv2 BPF attached filters. It runs the following checks:
* Simple filter attachment
* Application of filters to child cgroups
* Overriding filters on child cgroups
* Checking that this still works when the parent filter is removed

The filters that are used here are simply allow all / deny all filters, so
it isn't checking the actual functionality of the filters, but rather
the behaviour around detachment / attachment. If net_cls is enabled,
this test will fail.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile |   2 +
 samples/bpf/test_cgrp2_attach2.c | 132 +++
 2 files changed, 134 insertions(+)
 create mode 100644 samples/bpf/test_cgrp2_attach2.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3c805af..8892d7c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -23,6 +23,7 @@ hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += test_cgrp2_attach
+hostprogs-y += test_cgrp2_attach2
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
@@ -51,6 +52,7 @@ map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
 test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
+test_cgrp2_attach2-objs := libbpf.o test_cgrp2_attach2.o cgroup_helpers.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
new file mode 100644
index 000..ddfac42
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -0,0 +1,132 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ *   The eBPF program accesses the map passed in to store two pieces of
+ *   information. The number of invocations of the program, which maps
+ *   to the number of packets received, is stored to key 0. Key 1 is
+ *   incremented on each iteration by the number of bytes stored in
+ *   the skb.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ *   packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <errno.h>
+
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+#include "cgroup_helpers.h"
+
+#define FOO"/foo"
+#define BAR"/foo/bar/"
+#define PING_CMD   "ping -c1 -w1 127.0.0.1"
+
+static int prog_load(int verdict)
+{
+   int ret;
+   struct bpf_insn prog[] = {
+   BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+   BPF_EXIT_INSN(),
+   };
+
+   ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB,
+prog, sizeof(prog), "GPL", 0);
+
+   if (ret < 0) {
+   log_err("Loading program");
+   printf("Output from verifier:\n%s\n---\n", bpf_log_buf);
+   return 0;
+   }
+   return ret;
+}
+
+
+int main(int argc, char **argv)
+{
+   int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0;
+
+   allow_prog = prog_load(1);
+   if (!allow_prog)
+   goto err;
+
+   drop_prog = prog_load(0);
+   if (!drop_prog)
+   goto err;
+
+   if (setup_cgroup_environment())
+   goto err;
+
+   /* Create cgroup /foo, get fd, and join it */
+   foo = create_and_get_cgroup(FOO);
+   if (!foo)
+   goto err;
+
+   if (join_cgroup(FOO))
+   goto err;
+
+   if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) {
+   log_err("Attaching prog to /foo");
+   goto err;
+   }
+
+   assert(system(PING_CMD) != 0);
+
+   /* Create cgroup /foo/bar, get fd, and join it */
+   bar = create_and_get_cgroup(BAR);
+   if (!bar)
+   goto err;
+
+   if (join_cgroup(BAR))
+   goto err;
+
+   assert(system(PING_CMD) != 0);
+
+   if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) {
+   log_err("Attaching prog to /foo/bar");
+   goto err;
+   }
+
+   assert(system(PING_CMD) == 0);
+
+
+   if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) {
+   log_err("Detaching program from /foo/bar");
+   goto err;
+   }
+
+   assert(system(PING_CMD) != 0);
+
+   if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) {
+

[PATCH net-next 0/2] samples, bpf: Refactor; Add automated tests for cgroups

2016-12-02 Thread Sargun Dhillon
These two patches refactor some old, reusable code out of the existing
test_current_task_under_cgroup_user test, and add a new, automated test.

There is some generic cgroupsv2 setup & cleanup code, given that most
environments still don't have it set up by default. With this code, we're able
to pretty easily add an automated test for future cgroupsv2 functionality.

Sargun Dhillon (2):
  samples, bpf: Refactor test_current_task_under_cgroup - separate out
helpers
  samples, bpf: Add automated test for cgroup filter attachments

 samples/bpf/Makefile  |   4 +-
 samples/bpf/cgroup_helpers.c  | 177 ++
 samples/bpf/cgroup_helpers.h  |  16 ++
 samples/bpf/test_cgrp2_attach2.c  | 132 
 samples/bpf/test_current_task_under_cgroup_user.c | 108 +++--
 5 files changed, 352 insertions(+), 85 deletions(-)
 create mode 100644 samples/bpf/cgroup_helpers.c
 create mode 100644 samples/bpf/cgroup_helpers.h
 create mode 100644 samples/bpf/test_cgrp2_attach2.c

-- 
2.7.4



Re: [net-next 1/1] samples: bpf: Refactor test_cgrp2_attach -- use getopt, and add mode

2016-11-28 Thread Sargun Dhillon
On Mon, Nov 28, 2016 at 7:50 PM, Alexei Starovoitov
<alexei.starovoi...@gmail.com> wrote:
> On Mon, Nov 28, 2016 at 02:52:42PM -0800, Sargun Dhillon wrote:
>> This patch modifies test_cgrp2_attach to use getopt so we can use standard
>> command line parsing.
>>
>> It also adds an option to run the program in detach only mode. This does
>> not attach a new filter at the cgroup, but only runs the detach command.
>>
>> Lastly, it changes the attach code to not detach and then attach. It relies
>> on the 'hotswap' behaviour of CGroup BPF programs to be able to change
>> in-place. If detach-then-attach behaviour needs to be tested, the example
>> can be run in detach only mode prior to attachment.
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>
> looks fine to me.
> I'd really prefer this example to become an automated test eventually.
I can do that. As far as test cases:

1. create /foo
2. enter foo
3. attach drop filter to foo
4. try to ping 127.0.0.1 (make sure it returns 0 replies)
5. create /foo/bar
6. enter /foo/bar
7. try to ping 127.0.0.1 (make sure it returns 0 replies)
8. attach passthrough filter to foo/bar
9. try to ping 127.0.0.1 (make sure it returns 1 replies)
10. Detach filter from foo/bar
11. try to ping 127.0.0.1 (make sure it returns 0 replies)
Reasonable?


>
> Acked-by: Alexei Starovoitov <a...@kernel.org>
>


[net-next 1/1] samples: bpf: Refactor test_cgrp2_attach -- use getopt, and add mode

2016-11-28 Thread Sargun Dhillon
This patch modifies test_cgrp2_attach to use getopt so we can use standard
command line parsing.

It also adds an option to run the program in detach only mode. This does
not attach a new filter at the cgroup, but only runs the detach command.

Lastly, it changes the attach code to not detach and then attach. It relies
on the 'hotswap' behaviour of CGroup BPF programs to be able to change
in-place. If detach-then-attach behaviour needs to be tested, the example
can be run in detach only mode prior to attachment.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/test_cgrp2_attach.c | 80 +
 1 file changed, 50 insertions(+), 30 deletions(-)

diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c
index 63ef208..a19484c 100644
--- a/samples/bpf/test_cgrp2_attach.c
+++ b/samples/bpf/test_cgrp2_attach.c
@@ -10,8 +10,6 @@
  *   incremented on each iteration by the number of bytes stored in
  *   the skb.
  *
- * - Detaches any eBPF program previously attached to the cgroup
- *
  * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
  *
  * - Every second, reads map[0] and map[1] to see how many bytes and
@@ -75,35 +73,16 @@ static int prog_load(int map_fd, int verdict)
 
 static int usage(const char *argv0)
 {
-   printf("Usage: %s  <egress|ingress> [drop]\n", argv0);
+   printf("Usage: %s [-d] [-D]  <egress|ingress>\n", argv0);
+   printf("-d  Drop Traffic\n");
+   printf("-D  Detach filter, and exit\n");
return EXIT_FAILURE;
 }
 
-int main(int argc, char **argv)
+static int attach_filter(int cg_fd, int type, int verdict)
 {
-   int cg_fd, map_fd, prog_fd, key, ret;
+   int prog_fd, map_fd, ret, key;
long long pkt_cnt, byte_cnt;
-   enum bpf_attach_type type;
-   int verdict = 1;
-
-   if (argc < 3)
-   return usage(argv[0]);
-
-   if (strcmp(argv[2], "ingress") == 0)
-   type = BPF_CGROUP_INET_INGRESS;
-   else if (strcmp(argv[2], "egress") == 0)
-   type = BPF_CGROUP_INET_EGRESS;
-   else
-   return usage(argv[0]);
-
-   if (argc > 3 && strcmp(argv[3], "drop") == 0)
-   verdict = 0;
-
-   cg_fd = open(argv[1], O_DIRECTORY | O_RDONLY);
-   if (cg_fd < 0) {
-   printf("Failed to open cgroup path: '%s'\n", strerror(errno));
-   return EXIT_FAILURE;
-   }
 
map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY,
sizeof(key), sizeof(byte_cnt),
@@ -121,16 +100,12 @@ int main(int argc, char **argv)
return EXIT_FAILURE;
}
 
-   ret = bpf_prog_detach(cg_fd, type);
-   printf("bpf_prog_detach() returned '%s' (%d)\n", strerror(errno), 
errno);
-
ret = bpf_prog_attach(prog_fd, cg_fd, type);
if (ret < 0) {
printf("Failed to attach prog to cgroup: '%s'\n",
   strerror(errno));
return EXIT_FAILURE;
}
-
while (1) {
key = MAP_KEY_PACKETS;
assert(bpf_lookup_elem(map_fd, &key, &pkt_cnt) == 0);
@@ -145,3 +120,48 @@ int main(int argc, char **argv)
 
return EXIT_SUCCESS;
 }
+
+int main(int argc, char **argv)
+{
+   int detach_only = 0, verdict = 1;
+   enum bpf_attach_type type;
+   int opt, cg_fd, ret;
+
+   while ((opt = getopt(argc, argv, "Dd")) != -1) {
+   switch (opt) {
+   case 'd':
+   verdict = 0;
+   break;
+   case 'D':
+   detach_only = 1;
+   break;
+   default:
+   return usage(argv[0]);
+   }
+   }
+
+   if (argc - optind < 2)
+   return usage(argv[0]);
+
+   if (strcmp(argv[optind + 1], "ingress") == 0)
+   type = BPF_CGROUP_INET_INGRESS;
+   else if (strcmp(argv[optind + 1], "egress") == 0)
+   type = BPF_CGROUP_INET_EGRESS;
+   else
+   return usage(argv[0]);
+
+   cg_fd = open(argv[optind], O_DIRECTORY | O_RDONLY);
+   if (cg_fd < 0) {
+   printf("Failed to open cgroup path: '%s'\n", strerror(errno));
+   return EXIT_FAILURE;
+   }
+
+   if (detach_only) {
+   ret = bpf_prog_detach(cg_fd, type);
+   printf("bpf_prog_detach() returned '%s' (%d)\n",
+  strerror(errno), errno);
+   } else
+   ret = attach_filter(cg_fd, type, verdict);
+
+   return ret;
+}
-- 
2.7.4



Re: [RFC v4 00/18] Landlock LSM: Unprivileged sandboxing

2016-11-14 Thread Sargun Dhillon
>> ...information from current process to
>> another one (e.g. through maps) to not reproduce the same security sensitive
>> behavior as ptrace.
>>
>> This design does not seem too intrusive but is flexible enough to allow a
>> powerful sandbox mechanism accessible by any process on Linux. The use of
>> seccomp and Landlock is more suitable with the help of a userland library 
>> (e.g.
>> libseccomp) that could help to specify a high-level language to express a
>> security policy instead of raw eBPF programs. Moreover, thanks to LLVM, it is
>> possible to express an eBPF program with a subset of C.
>>
>>
>> # FAQ
>>
>> ## Why is seccomp-bpf not enough?
>>
>> A seccomp filter can access raw syscall arguments, which means that it is not
>> possible to filter according to pointed data such as a file path. As the first
>> version of this patch series demonstrated, filtering at the syscall level is
>> complicated (e.g. need to take care of race conditions). This is mainly because
>> the access control checkpoints of the kernel are not at this high-level but
>> more underneath, at LSM hooks level. The LSM hooks are designed to handle this
>> kind of checks. This series uses this approach to leverage the ability of
>> unprivileged users to limit themselves.
>>
>> Cf. "What it isn't?" in Documentation/prctl/seccomp_filter.txt
>>
>>
>> ## Why use the seccomp(2) syscall?
>>
>> Landlock uses the same semantics as seccomp to apply access rule restrictions.
>> It adds a new layer of security for the current process which is inherited by
>> its children. It makes sense to use a unique access-restricting syscall (that
>> should be allowed by seccomp-bpf rules) which can only drop privileges.
>> Moreover, a Landlock eBPF program could come from outside a process (e.g.
>> passed through a UNIX socket). It is then useful to differentiate the
>> creation/load of Landlock eBPF programs via bpf(2), from rule enforcement via
>> seccomp(2).
>>
>>
>> ## Why use cgroups?
>>
>> cgroups are designed to handle groups of processes. One use case is to manage
>> containers. Sandboxing based on process hierarchy (seccomp) is designed to handle
>> immutable security policies, which is a good security property but does not
>> match all use cases. A user can attach Landlock rules to a cgroup. Doing so,
>> all the processes in that cgroup will be subject to the security policy.
>> However, if the user is allowed to manage this cgroup, it could dynamically
>> move this group of processes to a cgroup with another security policy (or
>> none). Landlock rules can be applied either on a process hierarchy (e.g.
>> application with built-in sandboxing) or a group of processes (e.g. container
>> sandboxing). Both approaches can be combined for the same process.
>>
>>
>> ## Can Landlock limit network access or other resources?
>>
>> Limiting network access is obviously in the scope of Landlock but it is not
>> yet implemented. The main goal now is to get feedback about the whole concept,
>> the API and the file access control part. More access control types could be
>> implemented in the future.
>>
>> Sargun Dhillon sent an RFC (Checmate) [4] to deal with network manipulation.
>> This could be implemented on top of the Landlock framework.
>>
>>
>> ## Why a new LSM? Are SELinux, AppArmor, Smack or Tomoyo not good enough?
>>
>> The current access control LSMs are fine for their purpose, which is to give
>> the *root* user the ability to enforce a security policy for the *system*.
>> What is missing is a way to enforce a security policy for any application by
>> its developer and an *unprivileged user*, as seccomp can do for raw syscall
>> filtering.
>> Moreover, Landlock handles stacked hook programs from different users. It 
>> must
>> then ensure there is no possible malicious interactions between these 
>> programs.
>>
>> Differences with other (access control) LSMs:
>> * not only dedicated to administrators (i.e. no_new_priv);
>> * limited kernel attack surface (e.g. policy parsing);
>> * helpers to compare complex objects (path/FD), no access to internal kernel
>>   data (do not leak addresses);
>> * constrained policy rules/programs (no DoS: deterministic execution time);
>> * do not leak more information than the loader process can legitimately have
>>   access to (minimize metadata inference): must compare from an already 
>> allowed

Re: [RFC v3 18/22] cgroup,landlock: Add CGRP_NO_NEW_PRIVS to handle unprivileged hooks

2016-09-19 Thread Sargun Dhillon
On Thu, Sep 15, 2016 at 09:41:33PM +0200, Mickaël Salaün wrote:
> 
> On 15/09/2016 06:48, Alexei Starovoitov wrote:
> > On Wed, Sep 14, 2016 at 09:38:16PM -0700, Andy Lutomirski wrote:
> >> On Wed, Sep 14, 2016 at 9:31 PM, Alexei Starovoitov
> >>  wrote:
> >>> On Wed, Sep 14, 2016 at 09:08:57PM -0700, Andy Lutomirski wrote:
>  On Wed, Sep 14, 2016 at 9:00 PM, Alexei Starovoitov
>   wrote:
> > On Wed, Sep 14, 2016 at 07:27:08PM -0700, Andy Lutomirski wrote:
> >
> > This RFC handles both cgroup and seccomp approaches in a similar way. I
> > don't see why building on top of cgroup v2 is a problem. Are there
> > security issues with delegation?
> 
>  What I mean is: cgroup v2 delegation has a functionality problem.
>  Tejun says [1]:
> 
>  We haven't had to face this decision because cgroup has never 
>  properly
>  supported delegating to applications and the in-use setups where this
>  happens are custom configurations where there is no boundary between
>  system and applications and adhoc trial-and-error is good enough a 
>  way
>  to find a working solution.  That wiggle room goes away once we
>  officially open this up to individual applications.
> 
>  Unless and until that changes, I think that landlock should stay away
>  from cgroups.  Others could reasonably disagree with me.
> >>>
> >>> Ours and Sargun's use cases for cgroup+lsm+bpf are not for security
> >>> and not for sandboxing. So the above doesn't matter in such contexts.
> >>> lsm hooks + cgroups provide convenient scope and existing entry points.
> >>> Please see the checmate examples for how it's used.
> >>>
> >>
> >> To be clear: I'm not arguing at all that there shouldn't be
> >> bpf+lsm+cgroup integration.  I'm arguing that the unprivileged
> >> landlock interface shouldn't expose any cgroup integration, at least
> >> until the cgroup situation settles down a lot.
> >
> > ahh. yes. we're perfectly in agreement here.
> > I'm suggesting that the next RFC shouldn't include unpriv
> > and seccomp at all. Once bpf+lsm+cgroup is merged, we can
> > argue about unpriv with cgroups and even unpriv as a whole,
> > since it's not a given. Seccomp integration is also questionable.
> > I'd rather not have seccomp as a gate keeper for this lsm.
> > lsm and seccomp are orthogonal hook points. Syscalls and lsm hooks
> > don't have one to one relationship, so mixing them up is only
> > asking for trouble further down the road.
> > If we really need to carry some information from seccomp to lsm+bpf,
> > it's easier to add eBPF support to seccomp and let bpf side deal
> > with passing whatever information.
> >
> 
>  As an argument for keeping seccomp (or an extended seccomp) as the
>  interface for an unprivileged bpf+lsm: seccomp already checks off most
>  of the boxes for safely letting unprivileged programs sandbox
>  themselves.
> >>>
> >>> you mean the attach part of seccomp syscall that deals with no_new_priv?
> >>> sure, that's reusable.
> >>>
>  Furthermore, to the extent that there are use cases for
>  unprivileged bpf+lsm that *aren't* expressible within the seccomp
>  hierarchy, I suspect that syscall filters have exactly the same
>  problem and that we should fix seccomp to cover it.
> >>>
> >>> not sure what you mean by 'seccomp hierarchy'. The normal process
> >>> hierarchy ?
> >>
> >> Kind of.  I mean the filter layers that are inherited across fork(),
> >> the TSYNC mechanism, etc.
> >>
> >>> imo the main deficiency of seccomp is inability to look into arguments.
> >>> One can argue that it's a blessing, since composite args
> >>> are not yet copied into the kernel memory.
> >>> But in a lot of cases the seccomp arguments are FDs pointing
> >>> to kernel objects and if programs could examine those objects
> >>> the sandboxing scope would be more precise.
> >>> lsm+bpf solves that part and I'd still argue that it's
> >>> orthogonal to seccomp's pass/reject flow.
> >>> I mean if seccomp says 'ok' the syscall should continue executing
> >>> as normal and whatever LSM hooks were triggered by it may have
> >>> their own lsm+bpf verdicts.
> >>
> >> I agree with all of this...
> >>
> >>> Furthermore in the process hierarchy different children
> >>> should be able to set their own lsm+bpf filters that are not
> >>> related to parallel seccomp+bpf hierarchy of programs.
> >>> seccomp syscall can be an interface to attach programs
> >>> to lsm hooks, but nothing more than that.
> >>
> >> I'm not sure what you mean.  I mean that, logically, I think we should
> >> be able to do:
> >>
> >> seccomp(attach a syscall filter);
> >> fork();
> >> child does seccomp(attach some lsm filters);
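
For reference, that layering already works with plain seccomp today; a
runnable illustration (mine, with trivial deny-one-syscall filters standing
in for the hypothetical lsm filters):

#include <errno.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

static int deny_syscall(int nr)
{
	struct sock_filter filter[] = {
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, nr, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};

	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
}

int main(void)
{
	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	deny_syscall(SYS_uname);		/* layer 1: inherited by children */
	if (fork() == 0) {
		deny_syscall(SYS_getpriority);	/* layer 2: this child only */
		/* Both layers are now enforced in this process. */
		_exit(0);
	}
	wait(NULL);
	return 0;
}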
> 

Re: lsm naming dilemma. Re: [RFC v3 07/22] landlock: Handle file comparisons

2016-09-19 Thread Sargun Dhillon
I'm fine giving up the Checmate name. Landlock seems easy enough to
Google. I haven't gotten a chance to look through the entire patchset
yet, but it does seem like they are somewhat similar.

On Mon, Sep 19, 2016 at 5:12 PM, Alexei Starovoitov
 wrote:
> On Thu, Sep 15, 2016 at 11:25:10PM +0200, Mickaël Salaün wrote:
>> >> Agreed. With this RFC, the Checmate features (i.e. network helpers)
>> >> should be able to sit on top of Landlock.
>> >
>> > I think neither of them should be called fancy names for no technical 
>> > reason.
>> > We will have only one bpf based lsm. That's it and it doesn't
>> > need an obscure name. Directory name can be security/bpf/..stuff.c
>>
>> I disagree on an LSM named "BPF". I first started with the "seccomp LSM"
>> name (first RFC) but I later realized that it is confusing because
>> seccomp is associated with its syscall and the underlying features. The same
>> thing goes for BPF. It is also artificially hard to grep for a name that is
>> used too widely in the kernel source tree.
>> Making an association between the generic eBPF mechanism and a security
>> centric approach (i.e. LSM) seems a bit reductive (for BPF). Moreover,
>> the seccomp interface [1] can still be used.
>
> agree with above.
>
>> Landlock is a nice name to depict a sandbox as an enclave (i.e. a
>> landlocked country/state). I want to keep this name, which is simple,
>> expresses the goal of Landlock nicely, and is comparable to other sandbox
>> mechanisms such as Seatbelt or Pledge.
>> Landlock should not be confused with the underlying eBPF implementation.
>> Landlock could use more than only eBPF in the future and eBPF could be
>> used in other LSMs as well.
>
> there will not be two bpf based LSMs.
> Therefore unless you can convince Sargun to give up his 'checmate' name,
> nothing goes in.
> The features you both need are 90% the same, so they must be done
> as part of a single LSM, whatever you both agree to call it.
>


Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-19 Thread Sargun Dhillon
On Mon, Sep 19, 2016 at 06:34:28PM +0200, Daniel Mack wrote:
> Hi,
> 
> On 09/16/2016 09:57 PM, Sargun Dhillon wrote:
> > On Wed, Sep 14, 2016 at 01:13:16PM +0200, Daniel Mack wrote:
> 
> >> I have no idea what makes you think this is limited to systemd. As I
> >> said, I provided an example for userspace that works from the command
> >> line. The same limitations apply as for all other users of cgroups.
> >>
> > So, at least in my work, we have Mesos, but on nearly every machine that 
> > Mesos 
> > runs, people also have systemd. Now, there's recently been a bit of a 
> > battle 
> > of ownership of things like cgroups on these machines. We can usually solve 
> > it 
> > by nesting under systemd cgroups, and thus so far we've avoided making too 
> > many 
> > systemd-specific concessions.
> > 
> > The reason this works (mostly) is because everything we touch has a sense 
> > of 
> > nesting, where we can apply policy at a place lower in the hierarchy, and 
> > yet 
> > systemd's monitoring and policy still stays in place. 
> > 
> > Now, with this patch, we don't have that, but I think we can reasonably add 
> > some 
> > flag like "no override" when applying policies, or alternatively something 
> > like 
> > "no new privileges", to prevent children from applying policies that 
> > override 
> > top-level policy.
> 
> Yes, but the API is already guarded by CAP_NET_ADMIN. Take that
> capability away from your children, and they can't tamper with the
> policy. Does that work for you?
> 
No. This can be addressed in a follow-on patch, but the use-case is that I have 
a container orchestrator (Docker, or Mesos), and systemd. The sysadmin controls 
systemd, and Docker is controlled by devs. Typically, the system owner wants 
some system-level statistics and filtering, and then we want to do 
per-container filtering.

We really want to be able to do nesting with userspace tools that are 
oblivious, 
and we want to delegate a level of the cgroup hierarchy to the tool that 
created 
it. I do not see Docker integrating with systemd any time soon, and that's 
really the only other alternative.

> > I realize there is a speed concern as well, but I think for 
> > people who want nested policy, we're willing to make the tradeoff. The cost
> > of traversing a few extra pointers still outweighs the overhead of network
> > namespaces, iptables, etc.. for many of us. 
> 
> Not sure. Have you tried it?
> 
Tried nested policies? Yes. I tried nested policy execution with syscalls, and 
I 
tested with bind and connect. The performance overhead was pretty minimal, but 
latency increased by 100+ microseconds once the number of BPF hooks increased 
beyond 30. The BPF programs were trivial, and essentially did a map lookup, and 
returned 0.

I don't think that it's just raw cycles / execution time, but I didn't spend 
enough time digging into it to determine the performance hit. I'm waiting
for your patchset to land, and then I plan to work off of it.

> > What do you think Daniel?
> 
> I think we should look at an implementation once we really need it, and
> then revisit the performance impact. In any case, this can be changed
> under the hood, without touching the userspace API (except for adding
> flags if we need them).
> 
+1
> >> Not necessarily. You can as well do it the inetd way, and pass the
> >> socket to a process that is launched on demand, but do SO_ATTACH_FILTER
> >> + SO_LOCK_FILTER  in the middle. What happens with payload on the socket
> >> is not transparent to the launched binary at all. The proposed cgroup
> >> eBPF solution implements a very similar behavior in that regard.
> >
> > It would be nice to be able to see whether or not a filter is attached to a 
> > cgroup, but given this is going through syscalls, at least introspection
> > is possible as opposed to something like netlink.
> 
> Sure, there are many ways. I implemented the bpf cgroup logic using an
> own cgroup controller once, which made it possible to read out the
> status. But as we agreed on attaching programs through the bpf(2) system
> call, I moved back to the implementation that directly stores the
> pointers in the cgroup.
> 
> First enabling the controller through the fs-backed cgroup interface,
> then come back through the bpf(2) syscall and then go back to the fs
> interface to read out status values is a bit weird.
> 
Hrm, that makes sense. With the BPF syscall, would there be a way to get the
file descriptor of the currently attached BPF program?

> >> And FWIW, I agree with Thomas - there is nothing wrong with having
> >> multiple

Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-18 Thread Sargun Dhillon
On Fri, Sep 16, 2016 at 12:57:29PM -0700, Sargun Dhillon wrote:
> On Wed, Sep 14, 2016 at 01:13:16PM +0200, Daniel Mack wrote:
> > Hi Pablo,
> > 
> > On 09/13/2016 07:24 PM, Pablo Neira Ayuso wrote:
> > > On Tue, Sep 13, 2016 at 03:31:20PM +0200, Daniel Mack wrote:
> > >> On 09/13/2016 01:56 PM, Pablo Neira Ayuso wrote:
> > >>> On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:
> > >>>> This is v5 of the patch set to allow eBPF programs for network
> > >>>> filtering and accounting to be attached to cgroups, so that they apply
> > >>>> to all sockets of all tasks placed in that cgroup. The logic also
> > >>>> allows it to be extended for other cgroup-based eBPF logic.
> > >>>
> > >>> 1) This infrastructure can only be useful to systemd, or any similar
> > >>>orchestration daemon. Look, you can only apply filtering policies
> > >>>to processes that are launched by systemd, so this only works
> > >>>for server processes.
> > >>
> > >> Sorry, but both statements aren't true. The eBPF policies apply to every
> > >> process that is placed in a cgroup, and my example program in 6/6 shows
> > >> how that can be done from the command line.
> > > 
> > > Then you have to explain to me how anyone other than systemd can use this
> > > infrastructure?
> > 
> > I have no idea what makes you think this is limited to systemd. As I
> > said, I provided an example for userspace that works from the command
> > line. The same limitations apply as for all other users of cgroups.
> > 
> So, at least in my work, we have Mesos, but on nearly every machine that 
> Mesos 
> runs, people also have systemd. Now, there's recently been a bit of a 
> battle 
> of ownership of things like cgroups on these machines. We can usually solve 
> it 
> by nesting under systemd cgroups, and thus so far we've avoided making too 
> many 
> systemd-specific concessions.
> 
> The reason this works (mostly) is because everything we touch has a sense of 
> nesting, where we can apply policy at a place lower in the hierarchy, and yet 
> systemd's monitoring and policy still stays in place. 
> 
> Now, with this patch, we don't have that, but I think we can reasonably add 
> some 
> flag like "no override" when applying policies, or alternatively something 
> like 
> "no new privileges", to prevent children from applying policies that override 
> top-level policy. I realize there is a speed concern as well, but I think for 
> people who want nested policy, we're willing to make the tradeoff. The cost
> of traversing a few extra pointers still outweighs the overhead of network
> namespaces, iptables, etc.. for many of us. 
> 
> What do you think Daniel?
> 
> > > My main point is that those processes *need* to be launched by the
> > > orchestrator, which is what I was referring to as 'server processes'.
> > 
> > Yes, that's right. But as I said, this rule applies to many other kernel
> > concepts, so I don't see any real issue.
> >
> Also, cgroups have become such a big part of how applications are managed
> that many of us have solved this problem.
> 
> > >> That's a limitation that applies to many more control mechanisms in the
> > >> kernel, and it's something that can easily be solved with fork+exec.
> > > 
> > > As long as you have control to launch the processes yes, but this
> > > will not work in other scenarios. Just like cgroup net_cls and friends
> > > are broken for filtering for things that you have no control to
> > > fork+exec.
> > 
> > Probably, but that's only solvable with rules that store the full cgroup
> > path then, and do a string comparison (!) for each packet flying by.
> >
> > >> That's just as transparent as SO_ATTACH_FILTER. What kind of
> > >> introspection mechanism do you have in mind?
> > > 
> > > SO_ATTACH_FILTER is called from the process itself, so this is a local
> > > filtering policy that you apply to your own process.
> > 
> > Not necessarily. You can as well do it the inetd way, and pass the
> > socket to a process that is launched on demand, but do SO_ATTACH_FILTER
> > + SO_LOCK_FILTER  in the middle. What happens with payload on the socket
> > is not transparent to the launched binary at all. The proposed cgroup
> > eBPF solution implements a very similar behavior in that regard.
> > 
> It would be nice to be able to see whether or not a filter is attached to a 

Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-16 Thread Sargun Dhillon
On Wed, Sep 14, 2016 at 01:13:16PM +0200, Daniel Mack wrote:
> Hi Pablo,
> 
> On 09/13/2016 07:24 PM, Pablo Neira Ayuso wrote:
> > On Tue, Sep 13, 2016 at 03:31:20PM +0200, Daniel Mack wrote:
> >> On 09/13/2016 01:56 PM, Pablo Neira Ayuso wrote:
> >>> On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:
>  This is v5 of the patch set to allow eBPF programs for network
>  filtering and accounting to be attached to cgroups, so that they apply
>  to all sockets of all tasks placed in that cgroup. The logic also
>  allows it to be extended for other cgroup-based eBPF logic.
> >>>
> >>> 1) This infrastructure can only be useful to systemd, or any similar
> >>>orchestration daemon. Look, you can only apply filtering policies
> >>>to processes that are launched by systemd, so this only works
> >>>for server processes.
> >>
> >> Sorry, but both statements aren't true. The eBPF policies apply to every
> >> process that is placed in a cgroup, and my example program in 6/6 shows
> >> how that can be done from the command line.
> > 
> > Then you have to explain to me how anyone other than systemd can use this
> > infrastructure?
> 
> I have no idea what makes you think this is limited to systemd. As I
> said, I provided an example for userspace that works from the command
> line. The same limitations apply as for all other users of cgroups.
> 
So, at least in my work, we have Mesos, but on nearly every machine that Mesos 
runs, people also have systemd. Now, there's recently been a bit of a battle 
of ownership of things like cgroups on these machines. We can usually solve it 
by nesting under systemd cgroups, and thus so far we've avoided making too many 
systemd-specific concessions.

The reason this works (mostly) is because everything we touch has a sense of 
nesting, where we can apply policy at a place lower in the hierarchy, and yet 
systemd's monitoring and policy still stays in place. 

Now, with this patch, we don't have that, but I think we can reasonably add 
some 
flag like "no override" when applying policies, or alternatively something like 
"no new privileges", to prevent children from applying policies that override 
top-level policy. I realize there is a speed concern as well, but I think for 
people who want nested policy, we're willing to make the tradeoff. The cost
of traversing a few extra pointers still outweighs the overhead of network
namespaces, iptables, etc.. for many of us. 

What do you think Daniel?

> > My main point is that those processes *need* to be launched by the
> > orchestrator, which is what I was referring to as 'server processes'.
> 
> Yes, that's right. But as I said, this rule applies to many other kernel
> concepts, so I don't see any real issue.
>
Also, cgroups have become such a big part of how applications are managed
that many of us have solved this problem.

> >> That's a limitation that applies to many more control mechanisms in the
> >> kernel, and it's something that can easily be solved with fork+exec.
> > 
> > As long as you have control to launch the processes yes, but this
> > will not work in other scenarios. Just like cgroup net_cls and friends
> > are broken for filtering for things that you have no control to
> > fork+exec.
> 
> Probably, but that's only solvable with rules that store the full cgroup
> path then, and do a string comparison (!) for each packet flying by.
>
> >> That's just as transparent as SO_ATTACH_FILTER. What kind of
> >> introspection mechanism do you have in mind?
> > 
> > SO_ATTACH_FILTER is called from the process itself, so this is a local
> > filtering policy that you apply to your own process.
> 
> Not necessarily. You can as well do it the inetd way, and pass the
> socket to a process that is launched on demand, but do SO_ATTACH_FILTER
> + SO_LOCK_FILTER  in the middle. What happens with payload on the socket
> is not transparent to the launched binary at all. The proposed cgroup
> eBPF solution implements a very similar behavior in that regard.
> 
It would be nice to be able to see whether or not a filter is attached to a 
cgroup, but given this is going through syscalls, at least introspection
is possible as opposed to something like netlink.

> >> It's about filtering outgoing network packets of applications, and
> >> providing them with L2 information for filtering purposes. I don't think
> >> that's a very specific use-case.
> >>
> >> When the feature is not used at all, the added costs on the output path
> >> are close to zero, due to the use of static branches.
> > 
> > *You're proposing a socket filtering facility that hooks layer 2
> > output path*!
> 
> As I said, I'm open to discussing that. In order to make it work for L3,
> the LL_OFF issues need to be solved, as Daniel explained. Daniel,
> Alexei, any idea how much work that would be?
> 
> > That is only a rough ~30 lines kernel patchset to support this in
> > netfilter and only one extra input hook, with potential access to
> > conntrack and 

Re: [PATCH v3 2/6] cgroup: add support for eBPF programs

2016-09-05 Thread Sargun Dhillon
On Mon, Sep 05, 2016 at 04:49:26PM +0200, Daniel Mack wrote:
> Hi,
> 
> On 08/30/2016 01:04 AM, Sargun Dhillon wrote:
> > On Fri, Aug 26, 2016 at 09:58:48PM +0200, Daniel Mack wrote:
> >> This patch adds two sets of eBPF program pointers to struct cgroup.
> >> One for such that are directly pinned to a cgroup, and one for such
> >> that are effective for it.
> >>
> >> To illustrate the logic behind that, assume the following example
> >> cgroup hierarchy.
> >>
> >>   A - B - C
> >> \ D - E
> >>
> >> If only B has a program attached, it will be effective for B, C, D
> >> and E. If D then attaches a program itself, that will be effective for
> >> both D and E, and the program in B will only affect B and C. Only one
> >> program of a given type is effective for a cgroup.
> >>
> > How does this work when running an orchestrator within an orchestrator?
> > The Docker in Docker / Mesos in Mesos use case, where the top-level
> > orchestrator is observing the traffic, and there is an orchestrator
> > within that which also needs to run it.
> > 
> > In this case, I'd like to run E's filter, then if it returns 0, D's, and 
> > B's, 
> > and so on.
> 
> Running multiple programs was an idea I had in one of my earlier drafts,
> but after some discussion, I refrained from it again because potentially
> walking the cgroup hierarchy on every packet is just too expensive.
>
I think you're correct here. Maybe this is something I do with the LSM-attached 
filters, and not for skb filters. Do you think there might be a way to opt in
to this option?

> > Is it possible to allow this, either by flattening out the
> > datastructure (copy a ref to the bpf programs to C and E) or
> > something similar?
> 
> That would mean we carry a list of eBPF program pointers of dynamic
> size. IOW, the deeper inside the cgroup hierarchy, the bigger the list,
> so it can store a reference to all programs of all of its ancestors.
> 
> While I think that would be possible, even at some later point, I'd
> really like to avoid it for the sake of simplicity.
> 
> Is there any reason why this can't be done in userspace? Compile a
> program X for A, and overload it with Y, with Y doing the same than X
> but add some extra checks? Note that all users of the bpf(2) syscall API
> will need CAP_NET_ADMIN anyway, so there is no delegation to
> unprivileged sub-orchestrators or anything like that, really.

One of the use-cases that's becoming more and more common is
containers-in-containers. In this, you have a privileged container that's
running something like build orchestration, and you want to do macro-isolation
(say, limit access to only that tenant's infrastructure). Then, when the build
orchestrator runs a build, it may want to monitor, and further isolate, the
tasks that run in the build job. This is a side-effect of composing different
container technologies. Typically you use one system for images, then another
for orchestration, and the actual program running inside of it can also
leverage containerization.

Example:
K8s->Docker->Jenkins Agent->Jenkins Build Job

There's also a differentiation of ownership in each of these systems. I would
really rather not require a middleware system that all my software has to talk
to, because sometimes I'm taking off-the-shelf software (Jenkins) and porting
it to containers. I think one of the pieces that's led to the success of
cgroups is the straightforward API and ease of use (and it's getting even
easier in v2).

It's perfectly fine to give the lower level tasks CAP_NET_ADMIN, because we use 
something like seccomp-bpf plus some of the work I've been doing with the LSM 
to 
prevent the sub-orchestrators from accidentally blowing away the system. 
Usually, we trust these orchestrators (internal users), so it's more of a 
precautionary measure as opposed to a true security measure.

Also, rewriting BPF programs, although pretty straightforward, sounds like a
pain to do in userspace, even with a helper. If we were to take people's
programs and chain them together via tail calls, or similar, I can imagine
where rewriting a program might push you over the instruction limit.
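
For context, the cooperative chaining mentioned above is what bpf_tail_call()
and BPF_MAP_TYPE_PROG_ARRAY already provide. A generic samples/bpf-style
sketch, not Checmate-specific; a tail call replaces the current program rather
than consuming its instruction budget:

#include <uapi/linux/bpf.h>
#include <linux/types.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") next_filter = {
	.type		= BPF_MAP_TYPE_PROG_ARRAY,
	.key_size	= sizeof(u32),
	.value_size	= sizeof(u32),
	.max_entries	= 4,
};

SEC("socket0")
int filter0(struct __sk_buff *skb)
{
	if (skb->len > 1500)
		return 0;			/* this filter's own verdict */
	bpf_tail_call(skb, &next_filter, 1);	/* jump to filter 1, if any */
	return 1;				/* slot 1 empty: accept */
}

char _license[] SEC("license") = "GPL";
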
> 
> 
> Thanks,
> Daniel
> 


Re: [net-next RFC v2 4/9] bpf, security: Add Checmate security LSM and BPF program type

2016-08-29 Thread Sargun Dhillon
On Mon, Aug 29, 2016 at 02:49:17PM -0700, Alexei Starovoitov wrote:
> On 8/29/16 12:24 PM, Tejun Heo wrote:
> >Hello, Sargun.
> >
> >On Mon, Aug 29, 2016 at 11:49:07AM -0700, Sargun Dhillon wrote:
> >>It would be a separate hook per LSM hook. Why wouldn't we want a separate 
> >>bpf
> >>hook per lsm hook? I think if one program has to handle them all, the first
> >>program would be looking up the hook program in a bpf prog array. If you 
> >>think
> >>it's better to have this logic in the BPF program, that makes sense.
> >>
> >>I had a version of this patch that allowed you to attach a prog array 
> >>instead,
> >>but I think that it's cleaner attaching a program per lsm hook. In addition,
> >>there's a performance impact that comes from these hooks, so I wouldn't 
> >>want to
> >>execute unnecessary code if it's avoidable.
> >
> >Hmm... it doesn't really matter how the backend part looks like and if
> >we need to implement per-call hooks to lower runtime overhead, sure.
> >I was mostly worried about the approach propagating through the
> >userland visible interface.
> >
> >>The prog array approach also makes stacking filters difficult. If people 
> >>want
> >>multiple filters per hook, the orchestrator would have to rewrite the 
> >>existing
> >>filters to be cooperative.
> >
> >I'm not really sure "stacking" in the kernel side is a good idea.
> >Please see below.
> >
> >>>I'm not convinced about the approach.  It's an approach which pretty
> >>>much requires future extensions while being rigid.  Not a good
> >>>combination.
> >>
> >>Do you have an alternative recommendation? Maybe just a set of 5 u64s
> >>as the context object along with the hook ID?
> >
> >cgroup fs doesn't seem like the right interface for this but if it
> >were I'd go for named hook IDs instead of opaque numbers.
> >
> >>>Unless this is properly delegatable, IOW, it's safe to fully delegate
> >>>to a lesser security domain for all operations including program
> >>>loading and assignment (I can't see how that'd be the case), making it
> >>>an explicit controller doesn't work in terms of userland interface.
> >>>It's fine for bpf / lsm / whatever to attach to cgroups by extending
> >>>struct cgroup itself or implementing an implicit controller but to be
> >>>visible as an explicit controller it must be able to follow cgroup
> >>>interface rules including delegation.  If not, it's best to come
> >>>through the interface which enforces the required permission checks
> >>>and then talk to cgroup from there.  This was also an issue with
> >>>network cgroup bpf programs that Daniel Mack is working on.  Please
> >>>chat with him.
> >>
> >>Program assignment is possible by lesser security domains. Program loading 
> >>is
> >>limited to CAP_SYS_ADMIN in init_user_ns. We could make it accessible to
> >>CAP_SYS_ADMIN in any userns, but the reasoning behind this is that 
> >>Checmate
> >>BPF programs can leak kernel pointers.
> >
> >That doesn't make much sense to me.  Delegation doesn't mean much if a
> >delegatee can't load its own program (and I don't see how one can
> >delegate kernel pointer access to !root).  Also, unless there's
> >per-program fine control on who can load it, it seems pretty dangerous
> >to let anyone load any program.
> >
> >>Could we potentially restrict it to only CAP_MAC_OVERRIDE, while still 
> >>meeting
> >>cgroup delegation requirements?
> >
> >Wouldn't it make far more sense to pass cgroup fd to bpf syscall so
> >that "load this program" and "attach this program to the cgroup
> >identified by this fd" through the same interface and permission
> >checks?  cgroup participating in bpf operations is all fine but
> >splitting the userland interface across two domains seems like a bad
> >idea.
> >
> >>Filters which are higher up in the heirarchy will still be enforced during
> >>delegation. This was an explicit design, as the "Orchestrator in 
> >>Orchestrator"
> >>use case needs to be supported.
> >
> >Given that program loading is restricted to root, wouldn't it be an a
> >lot more efficient approach to let userland multiplex multiple
> >programs?  Walking the tree executing bpf programs each time one of
> >these operations runs can be pretty expensive.  Imagine a tree

Re: [PATCH v3 2/6] cgroup: add support for eBPF programs

2016-08-29 Thread Sargun Dhillon
On Fri, Aug 26, 2016 at 09:58:48PM +0200, Daniel Mack wrote:
> This patch adds two sets of eBPF program pointers to struct cgroup.
> One for such that are directly pinned to a cgroup, and one for such
> that are effective for it.
> 
> To illustrate the logic behind that, assume the following example
> cgroup hierarchy.
> 
>   A - B - C
> \ D - E
> 
> If only B has a program attached, it will be effective for B, C, D
> and E. If D then attaches a program itself, that will be effective for
> both D and E, and the program in B will only affect B and C. Only one
> program of a given type is effective for a cgroup.
> 
How does this work when running an orchestrator within an orchestrator? The
Docker in Docker / Mesos in Mesos use case, where the top-level orchestrator is
observing the traffic, and there is an orchestrator within that which also
needs to run it.

In this case, I'd like to run E's filter, then if it returns 0, D's, and B's, 
and so on. Is it possible to allow this, either by flattening out the
data structure (copy a ref to the bpf programs to C and E) or something similar?
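
To make the question concrete, a hypothetical variant of the run-filter path
that walks the hierarchy instead of using the single effective pointer (my
sketch against the structures in this patch, not something the series
implements):

/* Run the directly-pinned program of every cgroup from the task's own (E)
 * up to the root (A); any program may veto. This trades per-packet pointer
 * chasing for nesting support. */
static int cgroup_bpf_run_hierarchy(struct cgroup *cgrp, struct sk_buff *skb,
				    enum bpf_attach_type type)
{
	struct bpf_prog *prog;

	for (; cgrp; cgrp = cgroup_parent(cgrp)) {
		prog = cgrp->bpf.prog[type];
		if (prog && BPF_PROG_RUN(prog, skb) != 1)
			return -EPERM;
	}
	return 0;
}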


> Attaching and detaching programs will be done through the bpf(2)
> syscall. For now, ingress and egress inet socket filtering are the
> only supported use-cases.
> 
> Signed-off-by: Daniel Mack 
> ---
>  include/linux/bpf-cgroup.h  |  70 +++
>  include/linux/cgroup-defs.h |   4 ++
>  init/Kconfig|  12 
>  kernel/bpf/Makefile |   1 +
>  kernel/bpf/cgroup.c | 165 
> 
>  kernel/cgroup.c |  18 +
>  6 files changed, 270 insertions(+)
>  create mode 100644 include/linux/bpf-cgroup.h
>  create mode 100644 kernel/bpf/cgroup.c
> 
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> new file mode 100644
> index 000..a5a25c1
> --- /dev/null
> +++ b/include/linux/bpf-cgroup.h
> @@ -0,0 +1,70 @@
> +#ifndef _BPF_CGROUP_H
> +#define _BPF_CGROUP_H
> +
> +#include 
> +#include 
> +
> +struct sock;
> +struct cgroup;
> +struct sk_buff;
> +
> +#ifdef CONFIG_CGROUP_BPF
> +
> +extern struct static_key_false cgroup_bpf_enabled_key;
> +#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
> +
> +struct cgroup_bpf {
> + /*
> +  * Store two sets of bpf_prog pointers, one for programs that are
> +  * pinned directly to this cgroup, and one for those that are effective
> +  * when this cgroup is accessed.
> +  */
> + struct bpf_prog *prog[__MAX_BPF_ATTACH_TYPE];
> + struct bpf_prog *effective[__MAX_BPF_ATTACH_TYPE];
> +};
> +
> +void cgroup_bpf_put(struct cgroup *cgrp);
> +void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
> +
> +void __cgroup_bpf_update(struct cgroup *cgrp,
> +  struct cgroup *parent,
> +  struct bpf_prog *prog,
> +  enum bpf_attach_type type);
> +
> +/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
> +void cgroup_bpf_update(struct cgroup *cgrp,
> +struct bpf_prog *prog,
> +enum bpf_attach_type type);
> +
> +int __cgroup_bpf_run_filter(struct sock *sk,
> + struct sk_buff *skb,
> + enum bpf_attach_type type);
> +
> +/* Wrapper for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled */
> +static inline int cgroup_bpf_run_filter(struct sock *sk,
> + struct sk_buff *skb,
> + enum bpf_attach_type type)
> +{
> + if (cgroup_bpf_enabled)
> + return __cgroup_bpf_run_filter(sk, skb, type);
> +
> + return 0;
> +}
> +
> +#else
> +
> +struct cgroup_bpf {};
> +static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
> +static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
> +   struct cgroup *parent) {}
> +
> +static inline int cgroup_bpf_run_filter(struct sock *sk,
> + struct sk_buff *skb,
> + enum bpf_attach_type type)
> +{
> + return 0;
> +}
> +
> +#endif /* CONFIG_CGROUP_BPF */
> +
> +#endif /* _BPF_CGROUP_H */
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 5b17de6..861b467 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -16,6 +16,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #ifdef CONFIG_CGROUPS
>  
> @@ -300,6 +301,9 @@ struct cgroup {
>   /* used to schedule release agent */
>   struct work_struct release_agent_work;
>  
> + /* used to store eBPF programs */
> + struct cgroup_bpf bpf;
> +
>   /* ids of the ancestors at each level including self */
>   int ancestor_ids[];
>  };
> diff --git a/init/Kconfig b/init/Kconfig
> index cac3f09..5a89c83 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1144,6 +1144,18 @@ config CGROUP_PERF
>  
> 

Re: [PATCH v3 5/6] net: core: run cgroup eBPF egress programs

2016-08-29 Thread Sargun Dhillon
On Tue, Aug 30, 2016 at 12:03:23AM +0200, Daniel Borkmann wrote:
> On 08/26/2016 09:58 PM, Daniel Mack wrote:
> >If the cgroup associated with the receiving socket has an eBPF
> >programs installed, run them from __dev_queue_xmit().
> >
> >eBPF programs used in this context are expected to either return 1 to
> >let the packet pass, or != 1 to drop them. The programs have access to
> >the full skb, including the MAC headers.
> >
> >Note that cgroup_bpf_run_filter() is stubbed out as static inline nop
> >for !CONFIG_CGROUP_BPF, and is otherwise guarded by a static key if
> >the feature is unused.
> >
> >Signed-off-by: Daniel Mack 
> >---
> >  net/core/dev.c | 6 ++
> >  1 file changed, 6 insertions(+)
> >
> >diff --git a/net/core/dev.c b/net/core/dev.c
> >index a75df86..17484e6 100644
> >--- a/net/core/dev.c
> >+++ b/net/core/dev.c
> >@@ -141,6 +141,7 @@
> >  #include 
> >  #include 
> >  #include 
> >+#include 
> >
> >  #include "net-sysfs.h"
> >
> >@@ -3329,6 +3330,11 @@ static int __dev_queue_xmit(struct sk_buff *skb, void 
> >*accel_priv)
> > if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
> > __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
> >
> >+rc = cgroup_bpf_run_filter(skb->sk, skb,
> >+   BPF_ATTACH_TYPE_CGROUP_INET_EGRESS);
> >+if (rc)
> >+return rc;
> 
> This would leak the whole skb by the way.
> 
> Apart from that, could this be modeled w/o affecting the forwarding path (at 
> some
> local output point where we know to have a valid socket)? Then you could also 
> drop
> the !sk and sk->sk_family tests, and we wouldn't need to replicate parts of 
> what
> clsact is doing as well. Hmm, maybe access to src/dst mac could be handled to 
> be
> just zeroes since not available at that point?
> 
> > /* Disable soft irqs for various locks below. Also
> >  * stops preemption for RCU.
> >  */
> >
Given this patchset only affects AF_INET and AF_INET6, why not put the hooks
at ip_output and ip6_output?
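
Roughly, that placement could look like the following sketch (mine), reusing
the cgroup_bpf_run_filter() wrapper from patch 2; freeing the skb on a deny
verdict also addresses the leak Daniel Borkmann points out above:

/* Called from ip_output()/ip6_output(), where skb->sk is known valid. */
static inline int cgroup_egress_filter(struct sock *sk, struct sk_buff *skb)
{
	int rc = cgroup_bpf_run_filter(sk, skb,
				       BPF_ATTACH_TYPE_CGROUP_INET_EGRESS);

	if (rc) {
		kfree_skb(skb);		/* drop instead of leaking the skb */
		return rc;
	}
	return 0;			/* continue to ip_finish_output() */
}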


Re: [net-next RFC v2 4/9] bpf, security: Add Checmate security LSM and BPF program type

2016-08-29 Thread Sargun Dhillon
On Mon, Aug 29, 2016 at 01:01:18PM -0400, Tejun Heo wrote:
> Hello,
> 
> On Mon, Aug 29, 2016 at 04:47:07AM -0700, Sargun Dhillon wrote:
> > This patch adds a minor LSM, Checmate. Checmate is a flexible, programmable,
> > extensible minor LSM that's coupled with cgroups and BPF. It is designed to
> > enforce container-specific policies. It is also a cgroupv2 controller. By
> > itself, it doesn't do very much, but in conjunction with an orchestrator,
> > complex policies can be installed on the cgroup hierarchy.
> > 
> > These cgroup programs are tied to the kernel ABI version. If one tries
> > to load a BPF program compiled against a different kernel version,
> > an error will be thrown.
> 
> First of all, please talk with people working on network cgroup bpf
> and landlock.  I don't think it's a good idea to have N different ways
> to implement cgroup-aware bpf mechanism.  There can be multiple
> consumers but there gotta be a common mechanism instead of several
> independent controllers.
>
I've talked to Daniel Mack, and Alexei. I agree with you that it makes sense 
not 
to have an infinite number of these cgroup + bpf + lsm subsystems in the 
kernel. 
I think that making sure we don't sacrifice capability is important.

> > diff --git a/include/linux/checmate.h b/include/linux/checmate.h
> > new file mode 100644
> > index 000..4c4db4a
> > --- /dev/null
> > +++ b/include/linux/checmate.h
> > @@ -0,0 +1,108 @@
> > +#ifndef _LINUX_CHECMATE_H_
> > +#define _LINUX_CHECMATE_H_ 1
> > +#include 
> > +
> > +enum checmate_hook_num {
> > +   /* CONFIG_SECURITY_NET hooks */
> > +   CHECMATE_HOOK_UNIX_STREAM_CONNECT,
> > +   CHECMATE_HOOK_UNIX_MAY_SEND,
> > +   CHECMATE_HOOK_SOCKET_CREATE,
> > +   CHECMATE_HOOK_SOCKET_POST_CREATE,
> > +   CHECMATE_HOOK_SOCKET_BIND,
> > +   CHECMATE_HOOK_SOCKET_CONNECT,
> > +   CHECMATE_HOOK_SOCKET_LISTEN,
> > +   CHECMATE_HOOK_SOCKET_ACCEPT,
> > +   CHECMATE_HOOK_SOCKET_SENDMSG,
> > +   CHECMATE_HOOK_SOCKET_RECVMSG,
> > +   CHECMATE_HOOK_SOCKET_GETSOCKNAME,
> > +   CHECMATE_HOOK_SOCKET_GETPEERNAME,
> > +   CHECMATE_HOOK_SOCKET_GETSOCKOPT,
> > +   CHECMATE_HOOK_SOCKET_SETSOCKOPT,
> > +   CHECMATE_HOOK_SOCKET_SHUTDOWN,
> > +   CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
> > +   CHECMATE_HOOK_SK_FREE_SECURITY,
> > +   __CHECMATE_HOOK_MAX,
> > +};
> 
> Do we really want a separate hook for each call?  A logical extension
> of this would be having a separate hook per syscall which feels kinda
> horrible.
> 
It would be a separate hook per LSM hook. Why wouldn't we want a separate bpf 
hook per lsm hook? I think if one program has to handle them all, the first 
program would be looking up the hook program in a bpf prog array. If you think 
it's better to have this logic in the BPF program, that makes sense. 

I had a version of this patch that allowed you to attach a prog array instead, 
but I think that it's cleaner attaching a program per lsm hook. In addition, 
there's a performance impact that comes from these hooks, so I wouldn't want to 
execute unnecessary code if it's avoidable.

The prog array approach also makes stacking filters difficult. If people want 
multiple filters per hook, the orchestrator would have to rewrite the existing 
filters to be cooperative.

> > +/* CONFIG_SECURITY_NET contexts */
> > +struct checmate_unix_stream_connect_ctx {
> > +   struct sock *sock;
> > +   struct sock *other;
> > +   struct sock *newsk;
> > +};
> ...
> > +struct checmate_sk_free_security_ctx {
> > +   struct sock *sk;
> > +};
> ...
> > +struct checmate_ctx {
> > +   int hook;
> > +   union {
> > +/* CONFIG_SECURITY_NET contexts */
> > +   struct checmate_unix_stream_connect_ctx unix_stream_connect;
> > +   struct checmate_unix_may_send_ctx   unix_may_send;
> > +   struct checmate_socket_create_ctx   socket_create;
> > +   struct checmate_socket_bind_ctx socket_bind;
> > +   struct checmate_socket_connect_ctx  socket_connect;
> > +   struct checmate_socket_listen_ctx   socket_listen;
> > +   struct checmate_socket_accept_ctx   socket_accept;
> > +   struct checmate_socket_sendmsg_ctx  socket_sendmsg;
> > +   struct checmate_socket_recvmsg_ctx  socket_recvmsg;
> > +   struct checmate_socket_sock_rcv_skb_ctx socket_sock_rcv_skb;
> > +   struct checmate_sk_free_security_ctxsk_free_security;
> > +   };
> > +};
> 
> I'm not convinced about the approach.  It's an approach which pretty
> much requires future extensions while being rigid.

[net-next RFC v2 7/9] samples/bpf: Split out helper code from test_current_task_under_cgroup_user

2016-08-29 Thread Sargun Dhillon
This splits out the cgroup helper code from
test_current_task_under_cgroup_user.c. This code can be used to test any
program that needs to set up a cgroup v2 hierarchy temporarily and put
itself into said hierarchy. It also includes some functions that make
moving around in the hierarchy a bit easier.

This patch is used in follow-on samples.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile  |   2 +-
 samples/bpf/cgroup_helpers.c  | 103 ++
 samples/bpf/cgroup_helpers.h  |  15 
 samples/bpf/test_current_task_under_cgroup_user.c |  72 +++
 4 files changed, 129 insertions(+), 63 deletions(-)
 create mode 100644 samples/bpf/cgroup_helpers.c
 create mode 100644 samples/bpf/cgroup_helpers.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index db3cb06..5d2c178 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -50,7 +50,7 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
-test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
+test_current_task_under_cgroup-objs := bpf_load.o libbpf.o cgroup_helpers.o \
   test_current_task_under_cgroup_user.o
 
 # Tell kbuild to always build the programs
diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c
new file mode 100644
index 000..e465497
--- /dev/null
+++ b/samples/bpf/cgroup_helpers.c
@@ -0,0 +1,103 @@
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "cgroup_helpers.h"
+
+#define CGROUP_MOUNT_PATH "/mnt"
+
+int add_controller(char *controller)
+{
+   int fd, rc = 0;
+
+   fd = open("cgroup.subtree_control", O_WRONLY);
+   if (fd < 0) {
+   log_err("Unable to open subtree_control");
+   return 1;
+   }
+   if (dprintf(fd, "+%s\n", controller) < 0) {
+   log_err("Adding Controller");
+   rc = 1;
+   }
+   close(fd);
+   return rc;
+}
+
+int mkdirp(char *path)
+{
+   int rc;
+
+   rc = mkdir(path, 0777);
+   if (rc && errno == EEXIST)
+   return 0;
+   return rc;
+}
+
+/*
+ * This is to avoid interfering with existing cgroups. Unfortunately,
+ * most people don't have cgroupv2 enabled at this point in time.
+ * It's easier to create our own mount namespace and manage it
+ * ourselves. This function drops you into the top of that cgroup2
+ * mount point, so make sure you call load_bpf before calling this.
+ */
+int setup_cgroups(void)
+{
+   if (unshare(CLONE_NEWNS)) {
+   log_err("unshare");
+   return 1;
+   }
+
+   if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
+   log_err("mount fakeroot");
+   return 1;
+   }
+
+   if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
+   log_err("mount cgroup2");
+   return 1;
+   }
+
+   if (chdir(CGROUP_MOUNT_PATH)) {
+   log_err("chdir");
+   return 1;
+   }
+
+   return 0;
+}
+
+int join_cgroup(char *path)
+{
+   char cgroup_path[1024];
+   pid_t pid = getpid();
+   int fd, rc = 0;
+
+   snprintf(cgroup_path, sizeof(cgroup_path), "%s/cgroup.procs", path);
+
+   fd = open(cgroup_path, O_WRONLY);
+   if (fd < 0) {
+   log_err("Opening Cgroup");
+   return 1;
+   }
+
+   if (dprintf(fd, "%d\n", pid) < 0) {
+   log_err("Joining Cgroup");
+   rc = 1;
+   }
+   close(fd);
+   return rc;
+}
+
+int reset_bpf_hook(int fd)
+{
+   if (dprintf(fd, "0\n") < 0) {
+   log_err("Unable to reset BPF hook");
+   return 1;
+   }
+   return 0;
+}
diff --git a/samples/bpf/cgroup_helpers.h b/samples/bpf/cgroup_helpers.h
new file mode 100644
index 000..f9f1bdf
--- /dev/null
+++ b/samples/bpf/cgroup_helpers.h
@@ -0,0 +1,15 @@
+#ifndef __CGROUP_HELPERS_H
+#define __CGROUP_HELPERS_H
+#include 
+
+#define clean_errno() (errno == 0 ? "None" : strerror(errno))
+#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
+   __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
+
+int mkdirp(char *path);
+int setup_cgroups(void);
+int join_cgroup(char *path);
+int reset_bpf_hook(int fd);
+int add_controller(char *controller);
+
+#endif
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c 
b/samples/bpf/test_current_task_under_cgroup_user.c
index 30b0bce..752a254
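
(The diff is truncated above.) For orientation, a minimal caller of these
helpers might look like this; my illustration, not part of the patch:

#include <stdio.h>
#include "cgroup_helpers.h"

int main(void)
{
	/* Private mount namespace plus a fresh cgroup2 mount, then a child
	 * group that this process joins; BPF programs can afterwards be
	 * attached via the group's control files. */
	if (setup_cgroups())
		return 1;
	if (mkdirp("sandbox"))
		return 1;
	if (join_cgroup("sandbox"))
		return 1;
	printf("now running inside the 'sandbox' cgroup\n");
	return 0;
}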

[net-next RFC v2 8/9] samples/bpf: Add limit_connections, remap_bind checmate examples / tests

2016-08-29 Thread Sargun Dhillon
1) limit_connections
This program performs connection limiting using a probabilistic
data structure. It ensures that for a given 2-tuple, there will never be
more than 10 connections. The parameters themselves are adjustable
to allow for trading off memory usage vs. collision likelihood. The
reason for not refcounting 2-tuples using atomic counters is the lack of
a safe free mechanism.

In order to run this program, you may need to bump your ulimit -l.

2) remap_bind
This program rewrites binds from 6789 to 12345. It is meant to mimic
the usage of DNAT.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile  |  10 ++
 samples/bpf/bpf_helpers.h |   2 +
 samples/bpf/bpf_load.c|  11 +-
 samples/bpf/checmate_limit_connections_kern.c | 146 ++
 samples/bpf/checmate_limit_connections_user.c | 113 
 samples/bpf/checmate_remap_bind_kern.c|  28 +
 samples/bpf/checmate_remap_bind_user.c|  82 +++
 7 files changed, 389 insertions(+), 3 deletions(-)
 create mode 100644 samples/bpf/checmate_limit_connections_kern.c
 create mode 100644 samples/bpf/checmate_limit_connections_user.c
 create mode 100644 samples/bpf/checmate_remap_bind_kern.c
 create mode 100644 samples/bpf/checmate_remap_bind_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 5d2c178..ee5de8c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -25,6 +25,8 @@ hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
 hostprogs-y += test_current_task_under_cgroup
+hostprogs-y += checmate_remap_bind
+hostprogs-y += checmate_limit_connections
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -52,6 +54,10 @@ xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
 test_current_task_under_cgroup-objs := bpf_load.o libbpf.o cgroup_helpers.o \
   test_current_task_under_cgroup_user.o
+checmate_remap_bind-objs := bpf_load.o libbpf.o cgroup_helpers.o \
+   checmate_remap_bind_user.o
+checmate_limit_connections-objs := bpf_load.o libbpf.o cgroup_helpers.o \
+  checmate_limit_connections_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -79,6 +85,8 @@ always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
 always += test_current_task_under_cgroup_kern.o
+always += checmate_remap_bind_kern.o
+always += checmate_limit_connections_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -103,6 +111,8 @@ HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
 HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
+HOSTLOADLIBES_checmate_remap_bind += -lelf
+HOSTLOADLIBES_checmate_limit_connections += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on 
cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index bbdf62a..da97ced 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -55,6 +55,8 @@ static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int 
size) =
(void *) BPF_FUNC_skb_get_tunnel_opt;
 static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) =
(void *) BPF_FUNC_skb_set_tunnel_opt;
+static int (*bpf_probe_write_checmate)(void *ctx, void *dst, void *src, int 
len) =
+   (void *) BPF_FUNC_probe_write_checmate;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 0cfda23..e12460a 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
bool is_xdp = strncmp(event, "xdp", 3) == 0;
+   bool is_checmate = strncmp(event, "checmate", 8) == 0;
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
@@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
prog_type = BPF_PROG_TYPE_TRACEPOINT;
} else if (is_xdp) {
prog_type = BPF_PROG_TYPE_XDP;
+   } else if (is_checmate) {
+   prog_type = BPF_PROG_TYPE_CHECMATE;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
 
prog_fd[prog_cnt++] = fd;

[net-next RFC v2 9/9] doc: Add LSM / BPF Checmate docs

2016-08-29 Thread Sargun Dhillon
This adds documentation on how to operate, and develop against, the
Checmate LSM and cgroup controller.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 Documentation/security/Checmate.txt | 54 +
 1 file changed, 54 insertions(+)
 create mode 100644 Documentation/security/Checmate.txt

diff --git a/Documentation/security/Checmate.txt 
b/Documentation/security/Checmate.txt
new file mode 100644
index 000..d409785
--- /dev/null
+++ b/Documentation/security/Checmate.txt
@@ -0,0 +1,54 @@
+--- What is Checmate? ---
+
+Checmate is a flexible, programmable, extensible minor LSM that's coupled with
+cgroups and BPF. It is designed to enforce container-specific policies. By
+default, it does not enforce any policies. It is selectable at build time
+with CONFIG_SECURITY_CHECMATE, and it is controlled through the unified cgroups
+controller hierarchy.
+
+# How to use Checmate
+In order to use Checmate, you have to enable the controller on the cgroup2
+hierarchy. In order to prevent a centralized configuration daemon from mounting
+Checmate on the V1 hierarchy you may want to add 'cgroup_no_v1=checmate' to 
your
+boot command line.
+
+Enabling the controller:
+   mount -t cgroup2 none $MOUNT_POINT
+   cd $MOUNT_POINT
+   echo +checmate > cgroup.subtree_control
+
+Once you do this, immediate children of this node on the hierarchy will have a
+number of control files that begin with 'checmate.'. Each of these is mapped
+to an LSM hook by the same name. If you read the file, it will return the
+number of filters attached to that given hook. Details of the hooks can be
+found in lsm_hooks.h.
+
+All tasks which are members of a cgroup will have not only the checmate filters
+at that level enforced, but those of all levels above as well. If there is a
+need to exempt a specific sub-cgroup, a program can use
+current_task_under_cgroup along with a bpf map.
+
+## Adding filters:
+If you would like to add a filter, you must compile a BPF_PROG_TYPE_CHECMATE 
BPF
+program. You can then write the '%d\n' formatted version of the BPF program
+file descriptor to the relevant control file.
+
+## Removing filters:
+If you would like to remove a specific filter, you can write the negative file
+descriptor of the BPF program to the control file (a la '-%d\n'). If you would
+like to do this, then it is recommended that you pin your programs.
+
+If you would like to remove all filters from a specific hook, simply write '0'
+to the control file. During normal operation, the bpf syscall shouldn't
+return '0' as a program file descriptor, but please take proper precautions
+to work around this.
+
+# Caveats
+## Hook Limit:
+Each hook is limited to having MAX_CHECMATE_INSTANCES (32) hooks per level
+in the hierarchy. The write call will return ENOSPC if you hit this condition.
+
+## CGroup v2 interaction with CGroup v1:
+Because the cgroups subsystem is in transition, using the net_prio or the
+net_classid v1 cgroups will render Checmate inoperable on all network
+hooks that inspect sockets.
\ No newline at end of file
-- 
2.7.4
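
To complement the attach/detach protocol described in the document above, a
small illustrative userspace helper (mine, not from the patch):

#include <stdio.h>

/* Write "%d\n" to attach a program fd, "-%d\n" to detach it, and "0\n" to
 * clear every filter on a hook's control file, e.g.
 * "<cgroup>/checmate.socket_bind". */
static int checmate_ctl(const char *hook_file, int value)
{
	FILE *f = fopen(hook_file, "w");

	if (!f)
		return -1;
	if (fprintf(f, "%d\n", value) < 0) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}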



[net-next RFC v2 6/9] bpf: Share current_task_under_cgroup helper and expose to Checmate programs

2016-08-29 Thread Sargun Dhillon
This patch exposes the current_task_under_cgroup helper to Checmate
programs. It can be used to implement exemptions for certain policies
when using Checmate programs by wrapping a pre-compiled policy
in a tail call along with this helper.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/bpf.h  |  1 +
 kernel/bpf/helpers.c | 29 +
 kernel/trace/bpf_trace.c | 28 
 security/checmate/checmate_bpf.c |  2 ++
 4 files changed, 32 insertions(+), 28 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4e1fa57..5c5ed16 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -316,6 +316,7 @@ extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_current_task_proto;
 extern const struct bpf_func_proto bpf_probe_read_proto;
+extern const struct bpf_func_proto bpf_current_task_under_cgroup_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index c439afc..ffaaa4b 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -212,6 +213,34 @@ static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, 
u64 r5)
return ret;
 }
 
+static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 
r5)
+{
+   struct bpf_map *map = (struct bpf_map *)(long)r1;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+   struct cgroup *cgrp;
+   u32 idx = (u32)r2;
+
+   if (unlikely(in_interrupt()))
+   return -EINVAL;
+
+   if (unlikely(idx >= array->map.max_entries))
+   return -E2BIG;
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return -EAGAIN;
+
+   return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+   .func   = bpf_current_task_under_cgroup,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_CONST_MAP_PTR,
+   .arg2_type  = ARG_ANYTHING,
+};
+
 const struct bpf_func_proto bpf_probe_read_proto = {
.func   = bpf_probe_read,
.gpl_only   = true,
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index cb96eda..3725df2 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,34 +343,6 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void 
*meta, u64 meta_size,
return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
-static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 
r5)
-{
-   struct bpf_map *map = (struct bpf_map *)(long)r1;
-   struct bpf_array *array = container_of(map, struct bpf_array, map);
-   struct cgroup *cgrp;
-   u32 idx = (u32)r2;
-
-   if (unlikely(in_interrupt()))
-   return -EINVAL;
-
-   if (unlikely(idx >= array->map.max_entries))
-   return -E2BIG;
-
-   cgrp = READ_ONCE(array->ptrs[idx]);
-   if (unlikely(!cgrp))
-   return -EAGAIN;
-
-   return task_under_cgroup_hierarchy(current, cgrp);
-}
-
-static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
-   .func   = bpf_current_task_under_cgroup,
-   .gpl_only   = false,
-   .ret_type   = RET_INTEGER,
-   .arg1_type  = ARG_CONST_MAP_PTR,
-   .arg2_type  = ARG_ANYTHING,
-};
-
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
switch (func_id) {
diff --git a/security/checmate/checmate_bpf.c b/security/checmate/checmate_bpf.c
index 24d6935..37ea609 100644
--- a/security/checmate/checmate_bpf.c
+++ b/security/checmate/checmate_bpf.c
@@ -91,6 +91,8 @@ checmate_prog_func_proto(enum bpf_func_id func_id)
	return &bpf_probe_write_user_proto;
case BPF_FUNC_trace_printk:
return bpf_get_trace_printk_proto();
+   case BPF_FUNC_current_task_under_cgroup:
+   return &bpf_current_task_under_cgroup_proto;
default:
return NULL;
}
-- 
2.7.4



[net-next RFC v2 5/9] bpf: Add bpf_probe_write_checmate helper

2016-08-29 Thread Sargun Dhillon
This patch adds bpf_probe_write_checmate. This is a specific type of
helper for the Checmate subsystem. It allows safe writes to kernel
memory based on inspection of the Checmate ctx.

In this patch, it only allows writes to the sockaddr during the
socket_bind and socket_connect hooks. It can be used to implement
behaviour that's common in containerization platforms that either
perform DNAT/SNAT or proxying to veil the true location of the
address-to-service mapping.

Since the rewrite occurs only once, as opposed to per packet, it allows
for much higher performance. Better still, the standard BSD
API for introspecting sockets (getpeername) keeps working.
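
As a rough sketch of the intended use (the addresses and the program
name are illustrative, and the helper declaration simply mirrors the
samples/bpf bpf_helpers.h convention):

#include <linux/types.h>
#include <uapi/linux/bpf.h>
#include <uapi/linux/in.h>
#include <linux/checmate.h>
#include "bpf_helpers.h"

static int (*bpf_probe_write_checmate)(void *ctx, void *dst, void *src,
					int size) =
	(void *) BPF_FUNC_probe_write_checmate;

/* Sketch: rewrite connect()s aimed at the virtual IP 192.0.2.1:80 to a
 * backend at 127.0.0.1:8080; getpeername() then reports the backend the
 * socket actually connected to. */
SEC("checmate")
int remap_connect(struct checmate_ctx *ctx)
{
	struct sockaddr_in addr = {};

	if (ctx->hook != CHECMATE_HOOK_SOCKET_CONNECT)
		return 0;

	bpf_probe_read(&addr, sizeof(addr), ctx->socket_connect.address);
	if (addr.sin_family != AF_INET ||
	    addr.sin_addr.s_addr != __constant_htonl(0xc0000201) ||
	    addr.sin_port != __constant_htons(80))
		return 0;

	addr.sin_addr.s_addr = __constant_htonl(0x7f000001);
	addr.sin_port = __constant_htons(8080);
	bpf_probe_write_checmate(ctx, ctx->socket_connect.address,
				 &addr, sizeof(addr));
	return 0;
}

char _license[] SEC("license") = "GPL";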

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/uapi/linux/bpf.h | 11 
 security/checmate/checmate_bpf.c | 55 
 2 files changed, 66 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91bc92f..3971456 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -398,6 +398,17 @@ enum bpf_func_id {
 */
BPF_FUNC_skb_change_tail,
 
+   /**
+* bpf_probe_write_checmate(ctx, void *dst, void *src, int len)
+* safely attempt to write to memory pointed to by a Checmate context
+* @ctx: struct checmate_ctx*
+* @dst: destination address inside the Checmate ctx's memory
+* @src: source address on stack
+* @len: number of bytes to copy
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_probe_write_checmate,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/security/checmate/checmate_bpf.c b/security/checmate/checmate_bpf.c
index 001225c..24d6935 100644
--- a/security/checmate/checmate_bpf.c
+++ b/security/checmate/checmate_bpf.c
@@ -12,6 +12,59 @@
 #include 
 #include 
 
+static int probe_write_socket_bind(struct checmate_socket_bind_ctx *ctx,
+  void *unsafe_ptr, void *src, int size)
+{
+   if (unsafe_ptr < (void *)ctx->address ||
+   (unsafe_ptr + size) > ((void *)ctx->address + ctx->addrlen))
+   return -EPERM;
+
+   memcpy(unsafe_ptr, src, size);
+
+   return 0;
+}
+
+static int probe_write_socket_connect(struct checmate_socket_connect_ctx *ctx,
+ void *unsafe_ptr, void *src, int size)
+{
+   if (unsafe_ptr < (void *)ctx->address ||
+   (unsafe_ptr + size) > ((void *)ctx->address + ctx->addrlen))
+   return -EPERM;
+
+   memcpy(unsafe_ptr, src, size);
+
+   return 0;
+}
+
+static u64 bpf_probe_write_checmate(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   struct checmate_ctx *ctx = (struct checmate_ctx *) (long) (r1);
+   void *unsafe_ptr = (void *) (long) r2;
+   void *src = (void *) (long) r3;
+   int size = (int) r4;
+
+   switch (ctx->hook) {
+   case CHECMATE_HOOK_SOCKET_BIND:
+   return probe_write_socket_bind(&ctx->socket_bind, unsafe_ptr,
+  src, size);
+   case CHECMATE_HOOK_SOCKET_CONNECT:
+   return probe_write_socket_connect(&ctx->socket_connect,
+ unsafe_ptr, src, size);
+   }
+
+   return -EPERM;
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+   .func   = bpf_probe_write_checmate,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+   .arg2_type  = ARG_ANYTHING,
+   .arg3_type  = ARG_PTR_TO_STACK,
+   .arg4_type  = ARG_CONST_STACK_SIZE,
+};
+
 static const struct bpf_func_proto *
 checmate_prog_func_proto(enum bpf_func_id func_id)
 {
@@ -34,6 +87,8 @@ checmate_prog_func_proto(enum bpf_func_id func_id)
	return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_current_comm:
	return &bpf_get_current_comm_proto;
+   case BPF_FUNC_probe_write_checmate:
+   return &bpf_probe_write_user_proto;
case BPF_FUNC_trace_printk:
return bpf_get_trace_printk_proto();
default:
-- 
2.7.4



[net-next RFC v2 4/9] bpf, security: Add Checmate security LSM and BPF program type

2016-08-29 Thread Sargun Dhillon
This patch adds a minor LSM, Checmate. Checmate is a flexible, programmable,
extensible minor LSM that's coupled with cgroups and BPF. It is designed to
enforce container-specific policies. It is also a cgroupv2 controller. By
itself, it doesn't do very much, but in conjunction with an orchestrator,
complex policies can be installed on the cgroup hierarchy.

These cgroup programs are tied to the kernel ABI version. If one tries
to load a BPF program compiled against a different kernel version,
the kernel will reject it.
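
To give a feel for the intended flow, attaching a policy might look
roughly like this from userspace (a sketch; the control-file path is
hypothetical, and the real names are defined by the docs patch in this
series):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Write a BPF program fd into a per-hook cgroup control file. */
static int attach_policy(int prog_fd)
{
	char buf[16];
	ssize_t ret;
	int cg;

	cg = open("/sys/fs/cgroup/mycontainer/checmate.socket_connect",
		  O_WRONLY);
	if (cg < 0)
		return -1;
	snprintf(buf, sizeof(buf), "%d", prog_fd);
	/* Fails with ENOSPC once 32 programs sit at this level. */
	ret = write(cg, buf, strlen(buf));
	close(cg);
	return ret < 0 ? -1 : 0;
}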

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/cgroup_subsys.h|   4 +
 include/linux/checmate.h | 108 +++
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/syscall.c |   2 +-
 security/Kconfig |   1 +
 security/Makefile|   2 +
 security/checmate/Kconfig|  11 +
 security/checmate/Makefile   |   3 +
 security/checmate/checmate_bpf.c |  68 +
 security/checmate/checmate_lsm.c | 610 +++
 10 files changed, 809 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/checmate.h
 create mode 100644 security/checmate/Kconfig
 create mode 100644 security/checmate/Makefile
 create mode 100644 security/checmate/checmate_bpf.c
 create mode 100644 security/checmate/checmate_lsm.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 0df0336a..fbb7aa7 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -56,6 +56,10 @@ SUBSYS(hugetlb)
 SUBSYS(pids)
 #endif
 
+#if IS_ENABLED(CONFIG_SECURITY_CHECMATE)
+SUBSYS(checmate)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/checmate.h b/include/linux/checmate.h
new file mode 100644
index 000..4c4db4a
--- /dev/null
+++ b/include/linux/checmate.h
@@ -0,0 +1,108 @@
+#ifndef _LINUX_CHECMATE_H_
+#define _LINUX_CHECMATE_H_ 1
+#include 
+
+enum checmate_hook_num {
+   /* CONFIG_SECURITY_NET hooks */
+   CHECMATE_HOOK_UNIX_STREAM_CONNECT,
+   CHECMATE_HOOK_UNIX_MAY_SEND,
+   CHECMATE_HOOK_SOCKET_CREATE,
+   CHECMATE_HOOK_SOCKET_POST_CREATE,
+   CHECMATE_HOOK_SOCKET_BIND,
+   CHECMATE_HOOK_SOCKET_CONNECT,
+   CHECMATE_HOOK_SOCKET_LISTEN,
+   CHECMATE_HOOK_SOCKET_ACCEPT,
+   CHECMATE_HOOK_SOCKET_SENDMSG,
+   CHECMATE_HOOK_SOCKET_RECVMSG,
+   CHECMATE_HOOK_SOCKET_GETSOCKNAME,
+   CHECMATE_HOOK_SOCKET_GETPEERNAME,
+   CHECMATE_HOOK_SOCKET_GETSOCKOPT,
+   CHECMATE_HOOK_SOCKET_SETSOCKOPT,
+   CHECMATE_HOOK_SOCKET_SHUTDOWN,
+   CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
+   CHECMATE_HOOK_SK_FREE_SECURITY,
+   __CHECMATE_HOOK_MAX,
+};
+
+/* CONFIG_SECURITY_NET contexts */
+struct checmate_unix_stream_connect_ctx {
+   struct sock *sock;
+   struct sock *other;
+   struct sock *newsk;
+};
+
+struct checmate_unix_may_send_ctx {
+   struct socket *sock;
+   struct socket *other;
+};
+
+struct checmate_socket_create_ctx {
+   int family;
+   int type;
+   int protocol;
+   int kern;
+};
+
+struct checmate_socket_bind_ctx {
+   struct socket *sock;
+   struct sockaddr *address;
+   int addrlen;
+};
+
+struct checmate_socket_connect_ctx {
+   struct socket *sock;
+   struct sockaddr *address;
+   int addrlen;
+};
+
+struct checmate_socket_listen_ctx {
+   struct socket *sock;
+   int backlog;
+};
+
+struct checmate_socket_accept_ctx {
+   struct socket *sock;
+   struct socket *newsock;
+};
+
+struct checmate_socket_sendmsg_ctx {
+   struct socket *sock;
+   struct msghdr *msg;
+   int size;
+};
+
+struct checmate_socket_recvmsg_ctx {
+   struct socket *sock;
+   struct msghdr *msg;
+   int size;
+   int flags;
+};
+
+struct checmate_socket_sock_rcv_skb_ctx {
+   struct sock *sk;
+   struct sk_buff *skb;
+};
+
+struct checmate_sk_free_security_ctx {
+   struct sock *sk;
+};
+
+struct checmate_ctx {
+   int hook;
+   union {
+/* CONFIG_SECURITY_NET contexts */
+   struct checmate_unix_stream_connect_ctx unix_stream_connect;
+   struct checmate_unix_may_send_ctx   unix_may_send;
+   struct checmate_socket_create_ctx   socket_create;
+   struct checmate_socket_bind_ctx socket_bind;
+   struct checmate_socket_connect_ctx  socket_connect;
+   struct checmate_socket_listen_ctx   socket_listen;
+   struct checmate_socket_accept_ctx   socket_accept;
+   struct checmate_socket_sendmsg_ctx  socket_sendmsg;
+   struct checmate_socket_recvmsg_ctx  socket_recvmsg;
+   struct checmate_socket_sock_rcv_skb_ctx socket_sock_rcv_skb;
+   struct checmate_sk_free_security_ctxsk_free_security;
+   };
+};
+
+#endif /* _LINUX_CHECMATE_H_ */
diff --git a/include/uapi/linux/b

[net-next RFC v2 1/9] net: Make cgroup sk data present when calling security_sk_(alloc/free)

2016-08-29 Thread Sargun Dhillon
This patch changes the order of allocations / calls so that the
sock_cgroup_data is allocated before calling security_sk_alloc. In
addition, it reorders the deallocation when calling security_sk_free
so that LSMs can examine the cgroup that a particular sk belongs to.
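
With the reorder, a hook like the sketch below (illustrative, not part
of this series; assumes <net/sock.h> and <linux/cgroup-defs.h>) sees
valid cgroup data at allocation time:

/* Sketch: sk_cgrp_data is now populated before this LSM hook runs. */
static int example_sk_alloc_security(struct sock *sk, int family,
				     gfp_t priority)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

	/* ... look up per-cgroup policy state for cgrp ... */
	return 0;
}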

Signed-off-by: Sargun Dhillon 
---
 net/core/sock.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 51a7304..3f12f4d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1333,6 +1333,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 
if (sk != NULL) {
kmemcheck_annotate_bitfield(sk, flags);
+   cgroup_sk_alloc(&sk->sk_cgrp_data);
 
if (security_sk_alloc(sk, family, priority))
goto out_free;
@@ -1340,7 +1341,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
-   cgroup_sk_alloc(&sk->sk_cgrp_data);
}
 
return sk;
@@ -1348,6 +1348,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 out_free_sec:
security_sk_free(sk);
 out_free:
+   cgroup_sk_free(&sk->sk_cgrp_data);
if (slab != NULL)
kmem_cache_free(slab, sk);
else
@@ -1363,8 +1364,8 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
owner = prot->owner;
slab = prot->slab;
 
-   cgroup_sk_free(&sk->sk_cgrp_data);
security_sk_free(sk);
+   cgroup_sk_free(&sk->sk_cgrp_data);
if (slab != NULL)
kmem_cache_free(slab, sk);
else
-- 
2.7.4



[net-next RFC v2 2/9] cgroups: move helper cgroup_parent to cgroup.h

2016-08-29 Thread Sargun Dhillon
This patch moves cgroup_parent into cgroup.h as a static inline helper
function so that others can use it. Although this pattern is easy
to implement, having it in one place simplifies the creation
of new cgroup controllers.
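
For instance, a new controller can now walk a cgroup's ancestry without
open-coding the parent lookup (a tiny sketch; the function name is made
up):

/* Visit cgrp and each of its ancestors up to the root. */
static void for_each_ancestor(struct cgroup *cgrp)
{
	while (cgrp) {
		/* ... evaluate per-level state for cgrp here ... */
		cgrp = cgroup_parent(cgrp);
	}
}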

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/cgroup.h | 16 
 kernel/cgroup.c|  9 -
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a4414a1..b84eb6e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -571,6 +571,22 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
 }
 
+/**
+ * cgroup_parent - Get the parent of a specific cgroup
+ * @cgrp: target cgroup
+ *
+ * If the cgroup does not have a parent (top level), then this function
+ * returns NULL. Otherwise, it'll return a pointer to the parent cgroup.
+ */
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+   struct cgroup_subsys_state *parent_css = cgrp->self.parent;
+
+   if (parent_css)
+   return container_of(parent_css, struct cgroup, self);
+   return NULL;
+}
+
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d1c51b7..1ec1a4e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -358,15 +358,6 @@ static void cgroup_idr_remove(struct idr *idr, int id)
	spin_unlock_bh(&cgroup_idr_lock);
 }
 
-static struct cgroup *cgroup_parent(struct cgroup *cgrp)
-{
-   struct cgroup_subsys_state *parent_css = cgrp->self.parent;
-
-   if (parent_css)
-   return container_of(parent_css, struct cgroup, self);
-   return NULL;
-}
-
 /* subsystems visibly enabled on a cgroup */
 static u16 cgroup_control(struct cgroup *cgrp)
 {
-- 
2.7.4



[net-next RFC v2 3/9] bpf: move tracing helpers (probe_read, get_current_task) to shared helpers

2016-08-29 Thread Sargun Dhillon
This patch moves bpf_probe_read and bpf_get_current_task into the shared
helper infrastructure. These helpers are useful outside of the context of
just tracing; they also allow inspection of a process's memory during
security policy enforcement.
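
For a sense of the non-tracing use (a sketch against the Checmate
program type added elsewhere in this series; the program and section
names are illustrative):

#include <linux/sched.h>
#include <uapi/linux/bpf.h>
#include <linux/checmate.h>
#include "bpf_helpers.h"

/* Sketch: read current->comm from a security-policy program via the
 * now-shared helpers. */
SEC("checmate")
int inspect_task(struct checmate_ctx *ctx)
{
	struct task_struct *task;
	char comm[16] = {};	/* TASK_COMM_LEN */

	task = (struct task_struct *)bpf_get_current_task();
	bpf_probe_read(&comm, sizeof(comm), task->comm);
	/* ... base a policy decision on comm ... */
	return 0;
}

char _license[] SEC("license") = "GPL";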

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/helpers.c | 34 ++
 kernel/trace/bpf_trace.c | 33 -
 3 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1113423..4e1fa57 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -314,6 +314,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_current_task_proto;
+extern const struct bpf_func_proto bpf_probe_read_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afb..c439afc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -186,3 +187,36 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type  = ARG_PTR_TO_RAW_STACK,
.arg2_type  = ARG_CONST_STACK_SIZE,
 };
+
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   return (long) current;
+}
+
+const struct bpf_func_proto bpf_get_current_task_proto = {
+   .func   = bpf_get_current_task,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+};
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   void *dst = (void *) (long) r1;
+   int ret, size = (int) r2;
+   void *unsafe_ptr = (void *) (long) r3;
+
+   ret = probe_kernel_read(dst, unsafe_ptr, size);
+   if (unlikely(ret < 0))
+   memset(dst, 0, size);
+
+   return ret;
+}
+
+const struct bpf_func_proto bpf_probe_read_proto = {
+   .func   = bpf_probe_read,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_RAW_STACK,
+   .arg2_type  = ARG_CONST_STACK_SIZE,
+   .arg3_type  = ARG_ANYTHING,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ad35213..cb96eda 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -59,28 +59,6 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
-static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-   void *dst = (void *) (long) r1;
-   int ret, size = (int) r2;
-   void *unsafe_ptr = (void *) (long) r3;
-
-   ret = probe_kernel_read(dst, unsafe_ptr, size);
-   if (unlikely(ret < 0))
-   memset(dst, 0, size);
-
-   return ret;
-}
-
-static const struct bpf_func_proto bpf_probe_read_proto = {
-   .func   = bpf_probe_read,
-   .gpl_only   = true,
-   .ret_type   = RET_INTEGER,
-   .arg1_type  = ARG_PTR_TO_RAW_STACK,
-   .arg2_type  = ARG_CONST_STACK_SIZE,
-   .arg3_type  = ARG_ANYTHING,
-};
-
 static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
void *unsafe_ptr = (void *) (long) r1;
@@ -365,17 +343,6 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
	return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
-static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-   return (long) current;
-}
-
-static const struct bpf_func_proto bpf_get_current_task_proto = {
-   .func   = bpf_get_current_task,
-   .gpl_only   = true,
-   .ret_type   = RET_INTEGER,
-};
-
static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
struct bpf_map *map = (struct bpf_map *)(long)r1;
-- 
2.7.4



[net-next RFC v2 0/9] Add Checmate: BPF-driven minor LSM

2016-08-29 Thread Sargun Dhillon
I've begun building out the skeleton of a Linux Security Module, and I'd like to
get feedback on it. It's a skeleton, and I've only populated a few hooks, so I'm
mostly looking for input on the general proposal, interest, and design. It's a
minor LSM. My particular use case is one in which containers are being
dynamically deployed to machines by internal developers in a different group.
The point of Checmate is to act as an extensible bed for _safe_, complex
security policies. It's nice to enable dynamic security policies that can be
defined in C, and change as necessary, without ever having to patch or rebuild
the kernel.

This is the second reroll of this patchset, and it's quite different from the 
first approach. Instead of being totally independent of the cgroups code, it is 
now a cgroups controller. It relies on the LSM API to hook into points in the 
kernel, and cgroups APIs to determine which policy to enforce. 

Right now, it's meant to be applied to containers. It is expected that it'd be 
configured by some kind of central management system. It's also expected that 
the central management system would have a set of policies that ship as binary 
images, and are controlled by BPF maps. Using this, one can have fairly complex 
filters, without requiring an entire toolchain. Although the patchset currently 
locks BPF programs to only working against the kernel they were compiled with, 
there is nothing that prevents us from changing this in the future.

To start, it only hooks into a subset of the LSM network API. The primary
reason behind this is simplicity: rather than building out the full
infrastructure, I wanted to start the comment process early. Also, there have
been a number of patches (Landlock, the network cgroups controller, Daniel
Mack's BPF filters on cgroups) that are similar, and this set of hooks solves
many of the same problems.


Although, at first, much of this sounds like seccomp, it's quite different.
First, you have access to kernel pointers, which allows you to dereference and
read data like sockaddrs safely. Since the data has been copied into
kernelspace, you don't have to worry about TOCTOU attacks.

The user-facing bits of the API are detailed in "Add LSM / BPF Checmate docs",
but a short summary is that Checmate is a cgroups controller. You can enable
it, and then write your BPF FDs to special control files. Once you do this,
the programs are enforced on all processes in that cgroup, and below it.

To answer the question of why not use iptables: oftentimes there is an
unacceptable overhead to using a second network namespace. Not because
network namespaces are inherently expensive, but because many of us leverage
infrastructure that cannot handle multiple IPs, and therefore we have to do
"weird" tricks to get multiple network NSs to work (NAT, mirroring, etc.).

Open Questions:

1) Performance: 

Right now, the patches aren't really performance optimized. For the task hooks,
it's cheap enough because it's one dereference from task->cgroup, and then a
matter of walking up the hierarchy. On the other hand, for sk hooks it can be
considerably more expensive.

I am thinking that maybe it makes sense to add the security hook dynamically the
first time that someone writes a BPF program to that controller. This way, you
can have filters on syscalls that happen rarely, like bind, but you avoid
paying the cost on expensive hooks like rcv_skb.
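
A sketch of that idea (purely hypothetical, against the 4.8-era LSM hook
API; the function and array names are made up):

static struct security_hook_list checmate_rcv_skb_hook[] = {
	LSM_HOOK_INIT(socket_sock_rcv_skb, checmate_sock_rcv_skb),
};

/* Register the expensive hook only on first attach to the controller. */
static void checmate_enable_rcv_skb(void)
{
	static atomic_t registered;

	if (!atomic_xchg(&registered, 1))
		security_add_hooks(checmate_rcv_skb_hook,
				   ARRAY_SIZE(checmate_rcv_skb_hook));
}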

It would be really nice if sock_cgroup_data included pointers to the CSSs that 
were effective for a given sock.

Also, a minor point: the way that the Checmate structs are packed, we lose 4
bytes for every hook because of alignment. If we moved the counts into the top
level datastructure, we could work around this. I'd prefer not to do that.

2) API

The API right now tightly ties programs to the kernel version. I don't see a 
good way around this unless we decide that a subset of the LSM hooks API is 
immutable. That's a question for the LSM maintainers. 

Thanks to Alexei, Daniel B, Daniel Mack, and Tejun for input. I would love 
to know what y'all think.


Sargun Dhillon (9):
  net: Make cgroup sk data present when calling security_sk_(alloc/free)
  cgroups: move helper cgroup_parent to cgroup.h
  bpf: move tracing helpers (probe_read, get_current_task) to shared
helpers
  bpf, security: Add Checmate security LSM and BPF program type
  bpf: Add bpf_probe_write_checmate helper
  bpf: Share current_task_under_cgroup helper and expose to Checmate
programs
  samples/bpf: Split out helper code from
test_current_task_under_cgroup_user
  samples/bpf: Add limit_connections, remap_bind checmate examples /
tests
  doc: Add LSM / BPF Checmate docs

 Documentation/security/Checmate.txt   |  54 ++
 include/linux/bpf.h   |   3 +
 include/linux/cgroup.h|  16 +
 include/linux/cgroup_subsys.h |   4 +
 i

Re: [RFC PATCH 0/5] Add eBPF hooks for cgroups

2016-08-23 Thread Sargun Dhillon
On Tue, Aug 23, 2016 at 10:27:28AM +0200, Daniel Mack wrote:
> On 08/22/2016 07:20 PM, Sargun Dhillon wrote:
> > On Mon, Aug 22, 2016 at 06:22:20PM +0200, Daniel Mack wrote:
> >> On 08/22/2016 06:06 PM, Pablo Neira Ayuso wrote:
> 
> >>> This patchset also needs an extra egress hook, not yet known where to
> >>> be placed, so two hooks in the network stacks in the end, 
> >>
> >> That should be solvable, I'm sure. I can as well leave egress out for
> >> the next version so it can be added later on.
> >>
> > Any idea where you might put that yet? Does dev_xmit seem like a
> > reasonable place?
> 
> Ah, yes. Thanks for the pointer, that seems to work fine.
> 
Daniel pointed out to me that there's already a BPF program that's used there 
for tc matches, so it should work fine. I would just verify that you can call 
programs from IRQs, and that rcu_bh plays well with it.

Alternatively, if you want to filter only IP traffic, ip_output and ip6_output 
are fairly good places. I'm planning on putting some LSM hooks there soon. It's 
a bit simpler.

I also suggest you use verdicts rather than trimming for simplicity's sake.

> > If someone uses the netprio, or the net classid controllers, skcd matches
> > no longer work.
> 
> Yes, sock_cgroup_ptr() will fall back to the v2 root in this case.
> 
> > Ideally, we should fix up these controllers to make them
> > more v2 friendly.
> 
> These controllers do not exist for v2, that's why sock_cgroup_ptr()
> behaves that way. What's your idea to fix that up?
I think that we should just add another pointer to the end of sock_cgroup_data 
while we're in this state of transition, and nudge people to disable 
CONFIG_CGROUP_NET_PRIO and CONFIG_CGROUP_NET_CLASSID over time.

Alternatively, we add these controllers for v2, and we have some kind of marker 
whether or not they're on v2 in the skcd. If they are, we can find the cgroup, 
and get the prioidx, and classid from the css. Although the comment in 
cgroup-defs.h suggests that v2 and classid should never be used concurrently, I 
can't help but disagree, given there's legacy infrastructure that leverages 
classid.

> 
> 
> Thanks,
> Daniel
> 

Looking forward to seeing these patches,
-Sargun


Re: [RFC PATCH 0/5] Add eBPF hooks for cgroups

2016-08-22 Thread Sargun Dhillon
On Mon, Aug 22, 2016 at 06:22:20PM +0200, Daniel Mack wrote:
> On 08/22/2016 06:06 PM, Pablo Neira Ayuso wrote:
> > On Fri, Aug 19, 2016 at 07:07:39PM +0200, Thomas Graf wrote:
> 
> >> You brought up multiple tables which reflect the cumulative approach.
> >> This sometimes works but has its issues as well. Users must be aware
> >> of each other and anticipate what rules other users might inject
> >> before or after their own tables. The very existence of firewalld which
> >> aims at democratizing this collaboration proves this point.
> > 
> > Firewalld, was really required in the iptables predefined tables
> > model, in nft last time we talked about this during NFWS'15, future
> > plans for firewalld were not clear yet.
> > 
> > Moreover, in nft, different users can indeed dump the ruleset and it
> > would be possible to validate if one policy is being shadowed by
> > another coming later on. The bpf bytecode dump cannot be taken to the
> > original representation.
> 
> But as Thomas said - both things address different use-cases. For
> container setups, there is no administrator involved to use cli tools,
> so I don't think that's really much of an argument.
> 
> >> So in that sense I would very much like for both models to be made
> >> available to users. nftables+cgroups for a cumulative approach as
> >> well as BPF+cgroups for the delegation approach.  I don't see why the
> >> cgroups based filtering capability should not be made available to both.
> > 
> > This patchset also needs an extra egress hook, not yet known where to
> > be placed, so two hooks in the network stacks in the end, 
> 
> That should be solvable, I'm sure. I can as well leave egress out for
> the next version so it can be added later on.
> 
Any idea where you might put that yet? Does dev_xmit seem like a reasonable 
place?

> > and this only works for cgroups version 2.
> 
> I don't see a problem with that, as v1 and v2 hierarchies can peacefully
> coexist.
> 
If someone uses the netprio, or the net classid controllers, skcd matches
no longer work. Ideally, we should fix up these controllers to make them
more v2 friendly.

> > Last time we talked about this, main concerns were that this was too
> > specific, but this approach seems even more specific to me.
> 
> Hmm, I disagree - bpf programs that are associated with cgroups are
> rather something that can be extended a lot in the future, for instance
> for handling port binding permissions etc. Unlike the proposed network
> cgroup controller with all sorts of complicated knobs to control ranges
> of ports etc, a bpf program that take care of that in a much more
> versatile way.
> 
> I also strongly believe we can have both, a cgroup controller that has
> bpf programs for socket filtering and other things, _and_ a "post socket
> lookup netfilter" table type. Both will have their individual use-cases.
> 
> 
> Thanks,
> Daniel
> 


Re: [RFC PATCH 4/5] net: filter: run cgroup eBPF programs

2016-08-21 Thread Sargun Dhillon
On Wed, Aug 17, 2016 at 04:00:47PM +0200, Daniel Mack wrote:
> If CONFIG_CGROUP_BPF is enabled, and the cgroup associated with the
> receiving socket has an eBPF programs installed, run them from
> sk_filter_trim_cap().
> 
> eBPF programs used in this context are expected to either return 1 to
> let the packet pass, or != 1 to drop them. The programs have access to
> the full skb, including the MAC headers.
> 
> This patch only implements the call site for ingress packets.
> 
> Signed-off-by: Daniel Mack 
> ---
>  net/core/filter.c | 44 
>  1 file changed, 44 insertions(+)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index c5d8332..a1dd94b 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -52,6 +52,44 @@
>  #include 
>  #include 
>  
> +#ifdef CONFIG_CGROUP_BPF
> +static int sk_filter_cgroup_bpf(struct sock *sk, struct sk_buff *skb,
> + enum bpf_attach_type type)
> +{
> + struct sock_cgroup_data *skcd = &sk->sk_cgrp_data;
> + struct cgroup *cgrp = sock_cgroup_ptr(skcd);
> + struct bpf_prog *prog;
> + int ret = 0;
> +
> + rcu_read_lock();
> +
> + switch (type) {
> + case BPF_ATTACH_TYPE_CGROUP_EGRESS:
> + prog = rcu_dereference(cgrp->bpf_egress);
> + break;
> + case BPF_ATTACH_TYPE_CGROUP_INGRESS:
> + prog = rcu_dereference(cgrp->bpf_ingress);
> + break;
> + default:
> + WARN_ON_ONCE(1);
> + ret = -EINVAL;
> + break;
> + }
> +
> + if (prog) {
> + unsigned int offset = skb->data - skb_mac_header(skb);
> +
> + __skb_push(skb, offset);
> + ret = bpf_prog_run_clear_cb(prog, skb) > 0 ? 0 : -EPERM;
> + __skb_pull(skb, offset);
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +#endif /* !CONFIG_CGROUP_BPF */
> +
>  /**
>   *   sk_filter_trim_cap - run a packet through a socket filter
>   *   @sk: sock associated with &sk_buff
> @@ -78,6 +116,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
>   if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
>   return -ENOMEM;
>  
> +#ifdef CONFIG_CGROUP_BPF
> + err = sk_filter_cgroup_bpf(sk, skb, BPF_ATTACH_TYPE_CGROUP_INGRESS);
> + if (err)
> + return err;
> +#endif
> +
>   err = security_sock_rcv_skb(sk, skb);
>   if (err)
>   return err;
> -- 
> 2.5.5
> 

So, casually looking at this patch, it looks like you're relying on 
sock_cgroup_data, which only points to the default hierarchy. If someone uses 
net_prio or net_classid, cgroup_sk_alloc_disable is called, and this won't work 
anymore. 

Any ideas on how to work around that? Does it make sense to add another pointer 
to sock_cgroup_data, or at least a warning when allocation is disabled?


Re: [RFC 0/4] RFC: Add Checmate, BPF-driven minor LSM

2016-08-15 Thread Sargun Dhillon
On Mon, Aug 15, 2016 at 12:59:13PM +0200, Mickaël Salaün wrote:
> 
> On 15/08/2016 05:09, Sargun Dhillon wrote:
> > On Mon, Aug 15, 2016 at 12:57:44AM +0200, Mickaël Salaün wrote:
> >> Our approaches have some common points (i.e. use eBPF in an LSM, stacked 
> >> filters like seccomp) but I'm focused on a kind of unprivileged LSM (i.e. 
> >> no 
> >> CAP_SYS_ADMIN), to make standalone sandboxes, which brings more 
> >> constraints 
> >> (e.g. no use of unsafe functions like bpf_probe_read(), take care of 
> >> privacy, 
> >> SUID exec, stable ABI…). However, I don't want to handle resource limits, 
> >> which should be the job of cgroups.
> >>
> > Kind of. Sometimes describing these resource limits is difficult. For 
> > example, I 
> > have a customer who is trying to restrict containers from burning up all 
> > the 
> > ephemeral ports on the machine. In this, they have an incredibly elaborate 
> > chain 
> > of wiring to prevent a given container from connecting to the same (proto, 
> > destip, destport) more than 1000 times.
> > 
> > I'm unsure of how you'd model that in a cgroup. 
> 
> This looks like a Netfilter rule. Have you tried applying this limitation 
> with the connlimit module?
> 
> 
I could do this by adding a new Netfilter match, but with the existing matches, 
the only ones that "select" by cgroup2 don't have the ability to connlimit by 
cgroup. Potentially, I could wire up something with the cgroup2 match, but this 
comes with a lot of overhead. If you know of a low-overhead way of doing this, 
I'd love to hear.

Have you ever used Kubernetes (http://kubernetes.io/docs/whatisk8s/)? You 
usually have a bunch of independent systems running together under what's called 
a "Pod". You can think of this as an old style "lxc" container, or a VM, and in 
each of these pods there is nesting where you want to not only limit the pod's 
resources, but also limit the resources of each application. Doing this without 
some layer of programmability in the resource management layer can be difficult.

> > 
> >> For now, I'm focusing on file-system access control which is one of the 
> >> more 
> >> complex system to properly filter. I also plan to support basic network 
> >> access 
> >> control.
> >>
> >> What you are trying to accomplish seems more related to a Netfilter 
> >> extension 
> >> (something like ipset but with eBPF maybe?).
> >>
> > I don't only want to do network access control, I also want to write to the 
> > value once it's copied into kernel space. There are a lot of benefits of 
> > doing 
> > this at the syscall level, but the two primary ones are performance, and 
> > capability. 
> > 
> > One of the biggest complaints with our current approach to filtering & load 
> > balancing (iptables) is that it hides information. When people connect 
> > through 
> > the load balancer, they want to find out who they connected to, and without 
> > some 
> > high application-level mechanism, this isn't possible. On the other hand, 
> > if we 
> > just rewrite the destination address in the connect hook, we can pretty 
> > easily
> > allow them to do getpeername.
> 
> What exactly is not doable with Netfilter (e.g. REDIRECT or TPROXY)?
> 
> 
Is there a way to "load balance" or "proxy" a connection where getpeername() 
tells you the real IP of the node you're connected to?

> > 
> > I'm curious about your filesystem access limiter. Do you have a way to make 
> > it so
> > that a given container can only write, say, 100mb of data to disk? 
> 
> It's a filesystem access control. It doesn't deal with quota and is not 
> focused on container but process hierarchies (which is more generic).
> 
> What is not doable with a quota mount option? It may be more appropriate to 
> enhance the VFS (or overlayfs) to apply this kind of limitation, if needed.
> 
Your overlayfs suggestion is on point. Since a lot of my containers look similar 
to Kubernetes though, quota isn't very well aligned with them (within a Pod, 
there are usually a bunch of independent things that need their usage limited). 
I think quota / overlayfs with labeling that comes from an LSM, or some other 
smart classifier would be ideal.


Re: [RFC 0/4] RFC: Add Checmate, BPF-driven minor LSM

2016-08-14 Thread Sargun Dhillon
On Mon, Aug 15, 2016 at 12:57:44AM +0200, Mickaël Salaün wrote:
> Hi,
> 
> I've been working on an extension to seccomp-bpf since last year and 
> published 
> a first RFC about it [1]. I'm working on a second RFC/PoC which use eBPF 
> instead of cBPF and is more close to a common LSM than the first RFC. I plan 
> to publish this second RFC by the end of the month.
> 
Interesting. I plan on dropping another RFC close to the end of the month as 
well.

> Our approaches have some common points (i.e. use eBPF in an LSM, stacked 
> filters like seccomp) but I'm focused on a kind of unprivileged LSM (i.e. no 
> CAP_SYS_ADMIN), to make standalone sandboxes, which brings more constraints 
> (e.g. no use of unsafe functions like bpf_probe_read(), take care of privacy, 
> SUID exec, stable ABI…). However, I don't want to handle resource limits, 
> which should be the job of cgroups.
> 
Kind of. Sometimes describing these resource limits is difficult. For example, 
I 
have a customer who is trying to restrict containers from burning up all the 
ephemeral ports on the machine. In this, they have an incredibly elaborate 
chain 
of wiring to prevent a given container from connecting to the same (proto, 
destip, destport) more than 1000 times.

I'm unsure of how you'd model that in a cgroup. 

> For now, I'm focusing on file-system access control which is one of the more 
> complex system to properly filter. I also plan to support basic network 
> access 
> control.
> 
> What you are trying to accomplish seems more related to a Netfilter extension 
> (something like ipset but with eBPF maybe?).
> 
I don't only want to do network access control, I also want to write to the 
value once it's copied into kernel space. There are a lot of benefits of doing 
this at the syscall level, but the two primary ones are performance, and 
capability. 

One of the biggest complaints with our current approach to filtering & load 
balancing (iptables) is that it hides information. When people connect through 
the load balancer, they want to find out who they connected to, and without 
some 
high application-level mechanism, this isn't possible. On the other hand, if we 
just rewrite the destination address in the connect hook, we can pretty easily
allow them to do getpeername.

I'm curious about your filesystem access limiter. Do you have a way to make it 
so
that a given container can only write, say, 100mb of data to disk? 


>  Mickaël
> 
> 
> [1] http://www.openwall.com/lists/kernel-hardening/2016/03/24/2
> 
> 
> 

> On 09/08/2016 02:22, Kees Cook wrote:
> > On Mon, Aug 8, 2016 at 5:00 PM, Sargun Dhillon <sar...@sargun.me> wrote:
> >> On Mon, Aug 08, 2016 at 04:44:02PM -0700, Kees Cook wrote:
> >>> On Thu, Aug 4, 2016 at 12:11 AM, Sargun Dhillon <sar...@sargun.me> wrote:
> >>>> I distributed this patchset to linux-security-mod...@vger.kernel.org 
> >>>> earlier,
> >>>> but based on the fact that the archive is down, and this is a fairly
> >>>> broad-sweeping proposal, I figured I'd grow the audience a little bit. 
> >>>> Sorry
> >>>> if you received this multiple times.
> >>>>
> >>>> I've begun building out the skeleton of a Linux Security Module, and I'd 
> >>>> like to
> >>>> get feedback on it. It's a skeleton, and I've only populated a few 
> >>>> hooks, so I'm
> >>>> mostly looking for input on the general proposal, interest, and design. 
> >>>> It's a
> >>>> minor LSM. My particular use case is one in which containers are being
> >>>> dynamically deployed to machines by internal developers in a different 
> >>>> group.
> >>>> The point of Checmate is to act as an extensible bed for _safe_, complex
> >>>> security policies. It's nice to enable dynamic security policies that 
> >>>> can be
>>>> defined in C, and change as necessary, without ever having to patch, or 
> >>>> rebuild
> >>>> the kernel.
> >>>>
> >>>> For many of these containers, the security policies can be fairly 
> >>>> nuanced. One
> >>>> particular one to take into account is network security. Often times,
> >>>> administrators want to prevent ingress, and egress connectivity except 
> >>>> from a
> >>>> few select IPs. Egress filtering can be managed using net_cls, but 
> >>>> without
> >>>> modifying running software, it's non-trivial to attach a filter to all 
> >>>> sockets
> >>>> being created within a container. The inet_conn_request, 

[PATCH net-next v5 0/3] Add test_current_task_under_cgroup bpf helper and test

2016-08-12 Thread Sargun Dhillon
This patchset includes a helper and an example to determine whether the probe is
currently executing in the context of a specific cgroup based on a cgroup bpf
map / array. The helper checks the cgroupsv2 hierarchy based on the handle in
the map, and matches if the current cgroup is equal to it or a descendant of it.
The helper was tested with the example program, and it was verified that the
correct behaviour occurs in the interrupt context.

In an earlier version of this patchset I had added an "opensnoop"-like tool, and
I realized I was basically reimplementing a lot of the code that already exists
in the bcc repo. So, instead I decided to write a test that creates a new mount
namespace, mounts up the cgroupv2 hierarchy, and does some basic tests.  I used
the sync syscall as a canary for these tests because it's a simple, 0-arg
syscall. Once this patch is accepted, adding support to opensnoop will be easy.

I also added a task_under_cgroup_hierarchy function in cgroup.h, as this
pattern is used in a couple of places. Converting those can be done in a later
patchset.

Thanks to Alexei, Tejun, and Daniel for providing review.

v1->v2: Clean up
v2->v3: Move around ifdefs out of *.c files, add an "integration" test
v3->v4: De-genercize arraymap fetching function;
rename helper from in_cgroup to under_cgroup (makes much more sense)
Split adding cgroups task_under_cgroup_hierarchy function
v4->v5: Fix formatting

Sargun Dhillon (3):
  cgroup: Add task_under_cgroup_hierarchy cgroup inline function to
headers
  bpf: Add bpf_current_task_under_cgroup helper
  samples/bpf: Add test_current_task_under_cgroup test

 include/linux/cgroup.h|  23 
 include/uapi/linux/bpf.h  |  11 ++
 kernel/bpf/arraymap.c |   2 +-
 kernel/bpf/verifier.c |   4 +-
 kernel/trace/bpf_trace.c  |  30 +
 samples/bpf/Makefile  |   5 +
 samples/bpf/bpf_helpers.h |   2 +
 samples/bpf/test_current_task_under_cgroup_kern.c |  43 +++
 samples/bpf/test_current_task_under_cgroup_user.c | 145 ++
 9 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/test_current_task_under_cgroup_kern.c
 create mode 100644 samples/bpf/test_current_task_under_cgroup_user.c

-- 
2.7.4



[PATCH net-next v5 3/3] samples/bpf: Add test_current_task_under_cgroup test

2016-08-12 Thread Sargun Dhillon
This test has a BPF program which writes to a map the pid of the last
task within a given cgroup to call the sync syscall.

The user mode program creates its own mount namespace, and mounts the
cgroupsv2 hierarchy in there, as on all current test systems
(Ubuntu 16.04, Debian), the cgroupsv2 vfs is unmounted by default.
Once it does this, it proceeds to test.

The test checks for positive and negative conditions. It ensures that
when it's part of a given cgroup, its pid is captured in the map,
and that when it leaves the cgroup, this doesn't happen.

It populates a cgroup arraymap prior to execution in userspace. This means
that the program must be run in the same cgroup namespace as the programs
that are being traced.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
---
 samples/bpf/Makefile  |   5 +
 samples/bpf/bpf_helpers.h |   2 +
 samples/bpf/test_current_task_under_cgroup_kern.c |  43 +++
 samples/bpf/test_current_task_under_cgroup_user.c | 145 ++
 4 files changed, 195 insertions(+)
 create mode 100644 samples/bpf/test_current_task_under_cgroup_kern.c
 create mode 100644 samples/bpf/test_current_task_under_cgroup_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..eb582c6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += test_current_task_under_cgroup
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -49,6 +50,8 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
+test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
+  test_current_task_under_cgroup_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -74,6 +77,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += test_current_task_under_cgroup_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -97,6 +101,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
 
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
#  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index cbc52df..5e4c41e 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
(void *) BPF_FUNC_get_stackid;
 static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
(void *) BPF_FUNC_probe_write_user;
+static int (*bpf_current_task_under_cgroup)(void *map, int index) =
+   (void *) BPF_FUNC_current_task_under_cgroup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
new file mode 100644
index 000..86b28d7
--- /dev/null
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+
+struct bpf_map_def SEC("maps") cgroup_map = {
+   .type   = BPF_MAP_TYPE_CGROUP_ARRAY,
+   .key_size   = sizeof(u32),
+   .value_size = sizeof(u32),
+   .max_entries= 1,
+};
+
+struct bpf_map_def SEC("maps") perf_map = {
+   .type   = BPF_MAP_TYPE_ARRAY,
+   .key_size   = sizeof(u32),
+   .value_size = sizeof(u64),
+   .max_entries= 1,
+};
+
+/* Writes the last PID that called sync to a map at index 0 */
+SEC("kprobe/sys_sync")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   u64 pid = bpf_get_current_pid_tgid();
+   int idx = 0;
+
+   if (!bpf_current_task_under_cgroup(&cgroup_map, 0))
+   return 0;
+
+   bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version")

[PATCH net-next v5 2/3] bpf: Add bpf_current_task_under_cgroup helper

2016-08-12 Thread Sargun Dhillon
This adds a bpf helper that's similar to the skb_in_cgroup helper to check
whether the probe is currently executing in the context of a specific
subset of the cgroupsv2 hierarchy. It does this based on a membership test
for a cgroup arraymap. It is invalid to call this in an interrupt, and
it'll return an error. The helper is primarily to be used in debugging
activities for containers, where you may have multiple programs running in
a given top-level "container".

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
Acked-by: Tejun Heo <t...@kernel.org>
Acked-by: Alexei Starovoitov <a...@kernel.org>
---
 include/uapi/linux/bpf.h | 11 +++
 kernel/bpf/arraymap.c|  2 +-
 kernel/bpf/verifier.c|  4 +++-
 kernel/trace/bpf_trace.c | 30 ++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..bea0c4e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
 
+   /**
+* bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*< 0 error
+*/
+   BPF_FUNC_current_task_under_cgroup,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650..a2ac051 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
 }
 late_initcall(register_perf_event_array_map);
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
 static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 struct file *map_file /* not used */,
 int fd)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7094c69..d504722 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
-   if (func_id != BPF_FUNC_skb_in_cgroup)
+   if (func_id != BPF_FUNC_skb_in_cgroup &&
+   func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
default:
@@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+   case BPF_FUNC_current_task_under_cgroup:
case BPF_FUNC_skb_in_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438f..6b794d6 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,6 +376,34 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {
.ret_type   = RET_INTEGER,
 };
 
+static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   struct bpf_map *map = (struct bpf_map *)(long)r1;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+   struct cgroup *cgrp;
+   u32 idx = (u32)r2;
+
+   if (unlikely(in_interrupt()))
+   return -EINVAL;
+
+   if (unlikely(idx >= array->map.max_entries))
+   return -E2BIG;
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return -EAGAIN;
+
+   return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+   .func   = bpf_current_task_under_cgroup,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_CONST_MAP_PTR,
+   .arg2_type  = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
switch (func_id) {
@@ -407,6 +435,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
	return &bpf_perf_event_read_proto;
case BPF_FUNC_probe_write_user:
return bpf_get_probe_write_proto();
+   case BPF_FUNC_current_task_under_cgroup:
+   return &bpf_current_task_under_cgroup_proto;
default:
return NULL;
}
-- 
2.7.4



[PATCH net-next v5 1/3] cgroup: Add task_under_cgroup_hierarchy cgroup inline function to headers

2016-08-12 Thread Sargun Dhillon
This commit adds an inline function to cgroup.h to check whether a given
task is under a given cgroup hierarchy. This is to avoid having to put
ifdefs in .c files to gate access to cgroups. When cgroups are disabled
this always returns true.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
Acked-by: Tejun Heo <t...@kernel.org>
---
 include/linux/cgroup.h | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 984f73b..a4414a1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -497,6 +497,23 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
+  struct cgroup *ancestor)
+{
+   struct css_set *cset = task_css_set(task);
+
+   return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
+}
+
 /* no synchronization, the result can only be used as a hint */
 static inline bool cgroup_is_populated(struct cgroup *cgrp)
 {
@@ -557,6 +574,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
+struct cgroup;
 
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
@@ -574,6 +592,11 @@ static inline void cgroup_free(struct task_struct *p) {}
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 
+static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
+  struct cgroup *ancestor)
+{
+   return true;
+}
 #endif /* !CONFIG_CGROUPS */
 
 /*
-- 
2.7.4



Re: [PATCH net-next v4 2/3] bpf: Add bpf_current_task_under_cgroup helper

2016-08-12 Thread Sargun Dhillon
On Fri, Aug 12, 2016 at 09:16:07AM +0200, Daniel Borkmann wrote:
> On 08/12/2016 06:50 AM, Sargun Dhillon wrote:
> >I realize that in_cgroup is more consistent, but under_cgroup makes
> >far more sense to me. I think it's more intuitive.
> >
> >On Thu, Aug 11, 2016 at 9:48 PM, Alexei Starovoitov
> ><alexei.starovoi...@gmail.com> wrote:
> >>On Thu, Aug 11, 2016 at 08:14:56PM -0700, Sargun Dhillon wrote:
> >>>This adds a bpf helper that's similar to the skb_in_cgroup helper to check
> >>>whether the probe is currently executing in the context of a specific
> >>>subset of the cgroupsv2 hierarchy. It does this based on membership test
> >>>for a cgroup arraymap. It is invalid to call this in an interrupt, and
> >>>it'll return an error. The helper is primarily to be used in debugging
> >>>activities for containers, where you may have multiple programs running in
> >>>a given top-level "container".
> >>>
> >>>Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> >>>Cc: Alexei Starovoitov <a...@kernel.org>
> >>>Cc: Daniel Borkmann <dan...@iogearbox.net>
> >>>Cc: Tejun Heo <t...@kernel.org>
> >>>---
> >>>+ /**
> >>>+  * bpf_current_task_under_cgroup(map, index) - Check cgroup2 
> >>>membership of current task
> >>>+  * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
> >>>+  * @index: index of the cgroup in the bpf_map
> >>>+  * Return:
> >>>+  *   == 0 current failed the cgroup2 descendant test
> >>>+  *   == 1 current succeeded the cgroup2 descendant test
> >>>+  *< 0 error
> >>>+  */
> >>>+ BPF_FUNC_current_task_under_cgroup,
> >>..
> >>>   case BPF_MAP_TYPE_CGROUP_ARRAY:
> >>>- if (func_id != BPF_FUNC_skb_in_cgroup)
> >>>+ if (func_id != BPF_FUNC_skb_in_cgroup &&
> >>>+ func_id != BPF_FUNC_current_task_under_cgroup)
> >>>   goto error;
> >>...
> >>>+ case BPF_FUNC_current_task_under_cgroup:
> >>>   case BPF_FUNC_skb_in_cgroup:
> >>
> >>Tejun,
> >>do you feel strongly about 'under' ?
> >>It just looks inconsistent vs existing skb_in_cgroup...
> >>"in cgroup" - 4k google hits
> >>"under cgroup" - 2k google hits
> 
> Alternative could be that we take "BPF_FUNC_current_in_cgroup" as a
> helper enum to keep consistency with what we have wrt skb helper, but
> for the cgroup header have the suggested task_under_cgroup_hierarchy()
> name.

I actually wish we could rename skb_in_cgroup to skb_under_cgroup. If we ever 
introduced a check for absolute membership versus ancestral membership, what 
would we call that?


Re: [PATCH net-next v4 2/3] bpf: Add bpf_current_task_under_cgroup helper

2016-08-11 Thread Sargun Dhillon
I realize that in_cgroup is more consistent, but under_cgroup makes
far more sense to me. I think it's more intuitive.

On Thu, Aug 11, 2016 at 9:48 PM, Alexei Starovoitov
<alexei.starovoi...@gmail.com> wrote:
> On Thu, Aug 11, 2016 at 08:14:56PM -0700, Sargun Dhillon wrote:
>> This adds a bpf helper that's similar to the skb_in_cgroup helper to check
>> whether the probe is currently executing in the context of a specific
>> subset of the cgroupsv2 hierarchy. It does this based on membership test
>> for a cgroup arraymap. It is invalid to call this in an interrupt, and
>> it'll return an error. The helper is primarily to be used in debugging
>> activities for containers, where you may have multiple programs running in
>> a given top-level "container".
>>
>> Signed-off-by: Sargun Dhillon <sar...@sargun.me>
>> Cc: Alexei Starovoitov <a...@kernel.org>
>> Cc: Daniel Borkmann <dan...@iogearbox.net>
>> Cc: Tejun Heo <t...@kernel.org>
>> ---
>> + /**
>> +  * bpf_current_task_under_cgroup(map, index) - Check cgroup2 
>> membership of current task
>> +  * @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
>> +  * @index: index of the cgroup in the bpf_map
>> +  * Return:
>> +  *   == 0 current failed the cgroup2 descendant test
>> +  *   == 1 current succeeded the cgroup2 descendant test
>> +  *< 0 error
>> +  */
>> + BPF_FUNC_current_task_under_cgroup,
> ..
>>   case BPF_MAP_TYPE_CGROUP_ARRAY:
>> - if (func_id != BPF_FUNC_skb_in_cgroup)
>> + if (func_id != BPF_FUNC_skb_in_cgroup &&
>> + func_id != BPF_FUNC_current_task_under_cgroup)
>>   goto error;
> ...
>> + case BPF_FUNC_current_task_under_cgroup:
>>   case BPF_FUNC_skb_in_cgroup:
>
> Tejun,
> do you feel strongly about 'under' ?
> It just looks inconsistent vs existing skb_in_cgroup...
> "in cgroup" - 4k google hits
> "under cgroup" - 2k google hits
>


[PATCH net-next v4 3/3] samples/bpf: Add test_current_task_under_cgroup test

2016-08-11 Thread Sargun Dhillon
This test has a BPF program which writes the last known pid to call the
sync syscall within a given cgroup to a map.

The user mode program creates its own mount namespace, and mounts the
cgroupsv2 hierarchy in there, as on all current test systems
(Ubuntu 16.04, Debian), the cgroupsv2 vfs is unmounted by default.
Once it does this, it proceeds to test.

The test checks for positive and negative conditions. It ensures that
when it's part of a given cgroup, its pid is captured in the map,
and that when it leaves the cgroup, this doesn't happen.

It populates a cgroup arraymap prior to execution in userspace. This means
that the program must be run in the same cgroups namespace as the programs
that are being traced.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
---
 samples/bpf/Makefile  |   5 +
 samples/bpf/bpf_helpers.h |   2 +
 samples/bpf/test_current_task_under_cgroup_kern.c |  43 +++
 samples/bpf/test_current_task_under_cgroup_user.c | 145 ++
 4 files changed, 195 insertions(+)
 create mode 100644 samples/bpf/test_current_task_under_cgroup_kern.c
 create mode 100644 samples/bpf/test_current_task_under_cgroup_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..eb582c6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += test_current_task_under_cgroup
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -49,6 +50,8 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
+test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
+  test_current_task_under_cgroup_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -74,6 +77,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += test_current_task_under_cgroup_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -97,6 +101,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index cbc52df..5e4c41e 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -45,6 +45,8 @@ static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
(void *) BPF_FUNC_get_stackid;
 static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
(void *) BPF_FUNC_probe_write_user;
+static int (*bpf_current_task_under_cgroup)(void *map, int index) =
+   (void *) BPF_FUNC_current_task_under_cgroup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/test_current_task_under_cgroup_kern.c b/samples/bpf/test_current_task_under_cgroup_kern.c
new file mode 100644
index 000..86b28d7
--- /dev/null
+++ b/samples/bpf/test_current_task_under_cgroup_kern.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/utsname.h>
+
+struct bpf_map_def SEC("maps") cgroup_map = {
+   .type   = BPF_MAP_TYPE_CGROUP_ARRAY,
+   .key_size   = sizeof(u32),
+   .value_size = sizeof(u32),
+   .max_entries= 1,
+};
+
+struct bpf_map_def SEC("maps") perf_map = {
+   .type   = BPF_MAP_TYPE_ARRAY,
+   .key_size   = sizeof(u32),
+   .value_size = sizeof(u64),
+   .max_entries= 1,
+};
+
+/* Writes the last PID that called sync to a map at index 0 */
+SEC("kprobe/sys_sync")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   u64 pid = bpf_get_current_pid_tgid();
+   int idx = 0;
+
+   if (!bpf_current_task_under_cgroup(&cgroup_map, 0))
+   return 0;
+
+   bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
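
The user-mode half of this test is not reproduced in the archive. As a
rough sketch of the map-population step the commit message describes,
assuming the samples' libbpf.h bpf_update_elem() wrapper, the map_fd[]
array filled in by bpf_load, and an invented cgroup2 path:

#include <fcntl.h>
#include <unistd.h>
#include <linux/bpf.h>
#include "libbpf.h"
#include "bpf_load.h"

/* Store an fd for the cgroup2 directory in slot 0 of cgroup_map so the
 * kprobe program above can test membership against it.
 */
static int populate_cgroup_map(void)
{
	int key = 0, ret;
	int cg_fd = open("/mnt/cgroup2/test-cg", O_RDONLY); /* invented path */

	if (cg_fd < 0)
		return -1;
	/* map_fd[0] is the first map defined in the _kern.c: cgroup_map */
	ret = bpf_update_elem(map_fd[0], &key, &cg_fd, BPF_ANY);
	close(cg_fd);
	return ret;
}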

[PATCH net-next v4 2/3] bpf: Add bpf_current_task_under_cgroup helper

2016-08-11 Thread Sargun Dhillon
This adds a bpf helper that's similar to the skb_in_cgroup helper to check
whether the probe is currently executing in the context of a specific
subset of the cgroupsv2 hierarchy. It does this based on membership test
for a cgroup arraymap. It is invalid to call this in an interrupt, and
it'll return an error. The helper is primarily to be used in debugging
activities for containers, where you may have multiple programs running in
a given top-level "container".

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
---
 include/uapi/linux/bpf.h | 11 +++
 kernel/bpf/arraymap.c|  2 +-
 kernel/bpf/verifier.c|  4 +++-
 kernel/trace/bpf_trace.c | 30 ++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..bea0c4e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
 
+   /**
+* bpf_current_task_under_cgroup(map, index) - Check cgroup2 membership of current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*    < 0 error
+*/
+   BPF_FUNC_current_task_under_cgroup,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650..a2ac051 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
 }
 late_initcall(register_perf_event_array_map);
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
 static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 struct file *map_file /* not used */,
 int fd)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7094c69..d504722 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
-   if (func_id != BPF_FUNC_skb_in_cgroup)
+   if (func_id != BPF_FUNC_skb_in_cgroup &&
+   func_id != BPF_FUNC_current_task_under_cgroup)
goto error;
break;
default:
@@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+   case BPF_FUNC_current_task_under_cgroup:
case BPF_FUNC_skb_in_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438f..e85f183 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,6 +376,34 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {
.ret_type   = RET_INTEGER,
 };
 
+static u64 bpf_current_task_under_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   u32 idx = (u32)r2;
+   struct cgroup *cgrp;
+   struct bpf_map *map = (struct bpf_map *)(long)r1;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+   if (unlikely(in_interrupt()))
+   return -EINVAL;
+
+   if (unlikely(idx >= array->map.max_entries))
+   return -E2BIG;
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return -EAGAIN;
+
+   return task_under_cgroup_hierarchy(current, cgrp);
+}
+
+static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
+   .func   = bpf_current_task_under_cgroup,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_CONST_MAP_PTR,
+   .arg2_type  = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
switch (func_id) {
@@ -407,6 +435,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
	return &bpf_perf_event_read_proto;
case BPF_FUNC_probe_write_user:
return bpf_get_probe_write_proto();
+   case BPF_FUNC_current_task_under_cgroup:
+   return &bpf_current_task_under_cgroup_proto;
default:
return NULL;
}
-- 
2.7.4
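
A minimal sketch of how a tracing program consumes the tri-state return
documented above, assuming the cgroup_map definition and the
bpf_helpers.h prototypes from the samples patch in this series:

SEC("kprobe/sys_sync")
int on_sync(struct pt_regs *ctx)
{
	char fmt[] = "current is under the stored cgroup\n";
	int ret = bpf_current_task_under_cgroup(&cgroup_map, 0);

	if (ret < 0)	/* -EINVAL (in interrupt), -E2BIG, or -EAGAIN */
		return 0;
	if (ret == 1)	/* the cgroup2 descendant test passed */
		bpf_trace_printk(fmt, sizeof(fmt));
	return 0;
}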



[PATCH net-next v4 1/3] cgroup: Add task_under_cgroup_hierarchy cgroup inline function to headers

2016-08-11 Thread Sargun Dhillon
This commit adds an inline function to cgroup.h to check whether a given
task is under a given cgroup hierarchy. This is to avoid having to put
ifdefs in .c files to gate access to cgroups. When cgroups are disabled
this always returns true.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
---
 include/linux/cgroup.h | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 984f73b..a4414a1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -497,6 +497,23 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
+  struct cgroup *ancestor)
+{
+   struct css_set *cset = task_css_set(task);
+
+   return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
+}
+
 /* no synchronization, the result can only be used as a hint */
 static inline bool cgroup_is_populated(struct cgroup *cgrp)
 {
@@ -557,6 +574,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
+struct cgroup;
 
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
@@ -574,6 +592,11 @@ static inline void cgroup_free(struct task_struct *p) {}
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 
+static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
+  struct cgroup *ancestor)
+{
+   return true;
+}
 #endif /* !CONFIG_CGROUPS */
 
 /*
-- 
2.7.4
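
For comparison, a sketch of the open-coded form this inline replaces in
callers (locking elided; the ancestor pointer name is illustrative):

	/* before: open-code the default-hierarchy walk */
	struct css_set *cset = task_css_set(current);

	if (!cgroup_is_descendant(cset->dfl_cgrp, ancestor))
		return 0;

	/* after: the same test behind one ifdef-free call */
	if (!task_under_cgroup_hierarchy(current, ancestor))
		return 0;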



[PATCH net-next v4 0/3]

2016-08-11 Thread Sargun Dhillon
This patchset includes a helper and an example to determine whether the probe is
currently executing in the context of a specific cgroup based on a cgroup bpf
map / array. The helper checks the cgroupsv2 hierarchy based on the handle in
the map and if the current cgroup is equal to it, or a descendant of it. The
helper was tested with the example program, and it was verified that the correct
behaviour occurs in the interrupt context.

In an earlier version of this patchset I had added an "opensnoop"-like tool, and
I realized I was basically reimplementing a lot of the code that already exists
in the bcc repo. So, instead I decided to write a test that creates a new mount
namespace, mounts up the cgroupv2 hierarchy, and does some basic tests. I used
the sync syscall as a canary for these tests because it's a simple, 0-arg
syscall. Once this patch is accepted, adding support to opensnoop will be easy.

I also added a task_under_cgroup_hierarchy function in cgroups.h, as this 
pattern is used in a couple places. Converting those can be done in a later 
patchset.

Thanks to Alexei, Tejun, and Daniel for providing review.

v1->v2: Clean up
v2->v3: Move around ifdefs out of *.c files, add an "integration" test
v3->v4: De-genercize arraymap fetching function;
rename helper from in_cgroup to under_cgroup (makes much more sense)
Split adding cgroups task_under_cgroup_hierarchy function

Sargun Dhillon (3):
  cgroup: Add task_under_cgroup_hierarchy cgroup inline function to
headers
  bpf: Add bpf_current_task_under_cgroup helper
  samples/bpf: Add test_current_task_under_cgroup test

 include/linux/cgroup.h|  23 
 include/uapi/linux/bpf.h  |  11 ++
 kernel/bpf/arraymap.c |   2 +-
 kernel/bpf/verifier.c |   4 +-
 kernel/trace/bpf_trace.c  |  30 +
 samples/bpf/Makefile  |   5 +
 samples/bpf/bpf_helpers.h |   2 +
 samples/bpf/test_current_task_under_cgroup_kern.c |  43 +++
 samples/bpf/test_current_task_under_cgroup_user.c | 145 ++
 9 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/test_current_task_under_cgroup_kern.c
 create mode 100644 samples/bpf/test_current_task_under_cgroup_user.c

-- 
2.7.4
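
A rough sketch of the mount-namespace setup the cover letter describes,
with an invented mount point and error handling trimmed:

#define _GNU_SOURCE
#include <sched.h>
#include <sys/mount.h>

static int setup_cgroup2(const char *mnt)	/* e.g. "/mnt/cgroup2" */
{
	if (unshare(CLONE_NEWNS))
		return -1;
	/* keep mounts private so the cgroup2 mount stays in this namespace */
	if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL))
		return -1;
	return mount("none", mnt, "cgroup2", 0, NULL);
}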



[PATCH net-next v3 2/2] samples/bpf: Add test_current_task_in_cgroup test

2016-08-11 Thread Sargun Dhillon
This test has a BPF program which writes the last known pid to call the
sync syscall within a given cgroup to a map.

The user mode program creates its own mount namespace, and mounts the
cgroupsv2 hierarchy in there, as on all current test systems
(Ubuntu 16.04, CoreOS 1053), the cgroupsv2 vfs is unmounted by default.
Once it does this, it proceeds to test.

The test checks for positive and negative conditions. It ensures that
when it's part of a given cgroup, its pid is captured in the map,
and that when it leaves the cgroup, this doesn't happen.

It populates a cgroup arraymap prior to execution in userspace. This means
that the program must be run in the same cgroups namespace as the programs
that are being traced.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
---
 samples/bpf/Makefile   |   4 +
 samples/bpf/bpf_helpers.h  |   2 +
 samples/bpf/test_current_task_in_cgroup_kern.c |  43 
 samples/bpf/test_current_task_in_cgroup_user.c | 145 +
 4 files changed, 194 insertions(+)
 create mode 100644 samples/bpf/test_current_task_in_cgroup_kern.c
 create mode 100644 samples/bpf/test_current_task_in_cgroup_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..45aeb1c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += test_current_task_in_cgroup
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -49,6 +50,7 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
+test_current_task_in_cgroup-objs := bpf_load.o libbpf.o test_current_task_in_cgroup_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -74,6 +76,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += test_current_task_in_cgroup_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -97,6 +100,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_test_current_task_in_cgroup += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index cbc52df..e6bc5936 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -45,6 +45,8 @@ static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
(void *) BPF_FUNC_get_stackid;
 static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
(void *) BPF_FUNC_probe_write_user;
+static int (*bpf_current_task_in_cgroup)(void *map, int index) =
+   (void *) BPF_FUNC_current_task_in_cgroup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/test_current_task_in_cgroup_kern.c b/samples/bpf/test_current_task_in_cgroup_kern.c
new file mode 100644
index 000..2a77bd7
--- /dev/null
+++ b/samples/bpf/test_current_task_in_cgroup_kern.c
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+#include <uapi/linux/utsname.h>
+
+struct bpf_map_def SEC("maps") cgroup_map = {
+   .type   = BPF_MAP_TYPE_CGROUP_ARRAY,
+   .key_size   = sizeof(u32),
+   .value_size = sizeof(u32),
+   .max_entries= 1,
+};
+
+struct bpf_map_def SEC("maps") perf_map = {
+   .type   = BPF_MAP_TYPE_ARRAY,
+   .key_size   = sizeof(u32),
+   .value_size = sizeof(u64),
+   .max_entries= 1,
+};
+
+/* Writes the last PID that called sync to a map at index 0 */
+SEC("kprobe/sys_sync")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   u64 pid = bpf_get_current_pid_tgid();
+   int idx = 0;
+
+   if (!bpf_current_task_in_cgroup(&cgroup_map, 0))
+   return 0;
+
+   bpf_map_update_elem(&perf_map, &idx, &pid, BPF_ANY);
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_current_task_in_cgroup_user.c b/samples/bp

[PATCH net-next v3 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-11 Thread Sargun Dhillon
This adds a bpf helper that's similar to the skb_in_cgroup helper to check
whether the probe is currently executing in the context of a specific
subset of the cgroupsv2 hierarchy. It does this based on membership test
for a cgroup arraymap. It is invalid to call this in an interrupt, and
it'll return an error. The helper is primarily to be used in debugging
activities for containers, where you may have multiple programs running in
a given top-level "container".

This patch also genericizes some of the arraymap fetching logic between the
skb_in_cgroup helper and this new helper.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Cc: Tejun Heo <t...@kernel.org>
---
 include/linux/bpf.h  | 22 ++
 include/linux/cgroup.h   | 23 +++
 include/uapi/linux/bpf.h | 11 +++
 kernel/bpf/arraymap.c|  2 +-
 kernel/bpf/verifier.c|  4 +++-
 kernel/trace/bpf_trace.c | 27 +++
 net/core/filter.c| 11 ---
 7 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1113423..6c01ab1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,4 +319,26 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+/* Helper to fetch a cgroup pointer based on index.
+ * @map: a cgroup arraymap
+ * @idx: index of the item you want to fetch
+ *
+ * Returns pointer on success,
+ * Error code if item not found, or out-of-bounds access
+ */
+static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map *map, int idx)
+{
+   struct cgroup *cgrp;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+   if (unlikely(idx >= array->map.max_entries))
+   return ERR_PTR(-E2BIG);
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return ERR_PTR(-EAGAIN);
+
+   return cgrp;
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 984f73b..d4e173d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -497,6 +497,23 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
return cgrp->ancestor_ids[ancestor->level] == ancestor->id;
 }
 
+/**
+ * task_in_cgroup_hierarchy - test task's membership of cgroup ancestry
+ * @task: the task to be tested
+ * @ancestor: possible ancestor of @task's cgroup
+ *
+ * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
+ * It follows all the same rules as cgroup_is_descendant, and only applies
+ * to the default hierarchy.
+ */
+static inline bool task_in_cgroup_hierarchy(struct task_struct *task,
+   struct cgroup *ancestor)
+{
+   struct css_set *cset = task_css_set(task);
+
+   return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
+}
+
 /* no synchronization, the result can only be used as a hint */
 static inline bool cgroup_is_populated(struct cgroup *cgrp)
 {
@@ -557,6 +574,7 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
+struct cgroup;
 
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
@@ -574,6 +592,11 @@ static inline void cgroup_free(struct task_struct *p) {}
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 
+static inline bool task_in_cgroup_hierarchy(struct task_struct *task,
+   struct cgroup *ancestor)
+{
+   return false;
+}
 #endif /* !CONFIG_CGROUPS */
 
 /*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..64b1a07 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
 
+   /**
+* bpf_current_task_in_cgroup(map, index) - Check cgroup2 membership of current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*    < 0 error
+*/
+   BPF_FUNC_current_task_in_cgroup,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650..a2ac051 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
 }
 late_initcall(register_perf_event_array_map);
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS

[PATCH net-next v3 0/2] Add bpf current_task_in_cgroup helper & opensnoop example

2016-08-11 Thread Sargun Dhillon
This patchset includes a helper and an example to determine whether the probe is
currently executing in the context of a specific cgroup based on a cgroup bpf
map / array. The helper checks the cgroupsv2 hierarchy based on the handle in
the map and if the current cgroup is equal to it, or a descendant of it. The
helper was tested with the example program, and it was verified that the correct
behaviour occurs in the interrupt context.

In an earlier version of this patchset I had added an "opensnoop"-like tool, and
I realized I was basically reimplementing a lot of the code that already exists
in the bcc repo. So, instead I decided to write a test that creates a new mount
namespace, mounts up the cgroupv2 hierarchy, and does some basic tests. I used
the sync syscall as a canary for these tests because it's a simple, 0-arg
syscall. Once this patch is accepted, adding support to opensnoop will be easy.

I also added a task_in_cgroup_hierarchy function in cgroups.h, as this pattern
is used in a couple places. Converting those can be done in a later patchset.

Thanks to Alexei for providing review.

v1->v2: Clean up
v2->v3: Move around ifdefs out of *.c files, add an "integration" test

Sargun Dhillon (2):
  bpf: Add bpf_current_task_in_cgroup helper
  samples/bpf: Add test_current_task_in_cgroup test

 include/linux/bpf.h|  22 
 include/linux/cgroup.h |  23 
 include/uapi/linux/bpf.h   |  11 ++
 kernel/bpf/arraymap.c  |   2 +-
 kernel/bpf/verifier.c  |   4 +-
 kernel/trace/bpf_trace.c   |  27 +
 net/core/filter.c  |  11 +-
 samples/bpf/Makefile   |   4 +
 samples/bpf/bpf_helpers.h  |   2 +
 samples/bpf/test_current_task_in_cgroup_kern.c |  43 
 samples/bpf/test_current_task_in_cgroup_user.c | 145 +
 11 files changed, 285 insertions(+), 9 deletions(-)
 create mode 100644 samples/bpf/test_current_task_in_cgroup_kern.c
 create mode 100644 samples/bpf/test_current_task_in_cgroup_user.c

-- 
2.7.4



Re: [net-next v2 v2 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-09 Thread Sargun Dhillon
On Tue, Aug 09, 2016 at 08:52:01PM -0700, Alexei Starovoitov wrote:
> On Tue, Aug 09, 2016 at 08:40:05PM -0700, Sargun Dhillon wrote:
> > On Tue, Aug 09, 2016 at 08:27:32PM -0700, Alexei Starovoitov wrote:
> > > On Tue, Aug 09, 2016 at 06:26:37PM -0700, Sargun Dhillon wrote:
> > > > On Tue, Aug 09, 2016 at 06:02:34PM -0700, Alexei Starovoitov wrote:
> > > > > On Tue, Aug 09, 2016 at 05:55:26PM -0700, Sargun Dhillon wrote:
> > > > > > On Tue, Aug 09, 2016 at 05:23:50PM -0700, Alexei Starovoitov wrote:
> > > > > > > On Tue, Aug 09, 2016 at 05:00:12PM -0700, Sargun Dhillon wrote:
> > > > > > > > This adds a bpf helper that's similar to the skb_in_cgroup 
> > > > > > > > helper to check
> > > > > > > > whether the probe is currently executing in the context of a 
> > > > > > > > specific
> > > > > > > > subset of the cgroupsv2 hierarchy. It does this based on 
> > > > > > > > membership test
> > > > > > > > for a cgroup arraymap. It is invalid to call this in an 
> > > > > > > > interrupt, and
> > > > > > > > it'll return an error. The helper is primarily to be used in 
> > > > > > > > debugging
> > > > > > > > activities for containers, where you may have multiple programs 
> > > > > > > > running in
> > > > > > > > a given top-level "container".
> > > > > > > > 
> > > > > > > > This patch also genericizes some of the arraymap fetching logic 
> > > > > > > > between the
> > > > > > > > skb_in_cgroup helper and this new helper.
> > > > > > > > 
> > > > > > > > Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> > > > > > > > Cc: Alexei Starovoitov <a...@kernel.org>
> > > > > > > > Cc: Daniel Borkmann <dan...@iogearbox.net>
> > > > > > > > ---
> > > > > > > >  include/linux/bpf.h  | 24 
> > > > > > > >  include/uapi/linux/bpf.h | 11 +++
> > > > > > > >  kernel/bpf/arraymap.c|  2 +-
> > > > > > > >  kernel/bpf/verifier.c|  4 +++-
> > > > > > > >  kernel/trace/bpf_trace.c | 34 
> > > > > > > > ++
> > > > > > > >  net/core/filter.c| 11 ---
> > > > > > > >  6 files changed, 77 insertions(+), 9 deletions(-)
> > > > > > > > 
> > > > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > > > index 1113423..9adf712 100644
> > > > > > > > --- a/include/linux/bpf.h
> > > > > > > > +++ b/include/linux/bpf.h
> > > > > > > > @@ -319,4 +319,28 @@ extern const struct bpf_func_proto 
> > > > > > > > bpf_get_stackid_proto;
> > > > > > > >  void bpf_user_rnd_init_once(void);
> > > > > > > >  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> > > > > > > >  
> > > > > > > > +#ifdef CONFIG_CGROUPS
> > > > > > > > +/* Helper to fetch a cgroup pointer based on index.
> > > > > > > > + * @map: a cgroup arraymap
> > > > > > > > + * @idx: index of the item you want to fetch
> > > > > > > > + *
> > > > > > > > + * Returns pointer on success,
> > > > > > > > + * Error code if item not found, or out-of-bounds access
> > > > > > > > + */
> > > > > > > > +static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map 
> > > > > > > > *map, int idx)
> > > > > > > > +{
> > > > > > > > +   struct cgroup *cgrp;
> > > > > > > > +   struct bpf_array *array = container_of(map, struct 
> > > > > > > > bpf_array, map);
> > > > > > > > +
> > > > > > > > +   if (unlikely(idx >= array->map.max_entries))
> > > > > > > > +   return ERR_PTR(-E2BIG);
> > > > > > > > +
> > > > > > > > +   cgrp = READ_ON

Re: [net-next v2 v2 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-09 Thread Sargun Dhillon
On Tue, Aug 09, 2016 at 08:27:32PM -0700, Alexei Starovoitov wrote:
> On Tue, Aug 09, 2016 at 06:26:37PM -0700, Sargun Dhillon wrote:
> > On Tue, Aug 09, 2016 at 06:02:34PM -0700, Alexei Starovoitov wrote:
> > > On Tue, Aug 09, 2016 at 05:55:26PM -0700, Sargun Dhillon wrote:
> > > > On Tue, Aug 09, 2016 at 05:23:50PM -0700, Alexei Starovoitov wrote:
> > > > > On Tue, Aug 09, 2016 at 05:00:12PM -0700, Sargun Dhillon wrote:
> > > > > > This adds a bpf helper that's similar to the skb_in_cgroup helper 
> > > > > > to check
> > > > > > whether the probe is currently executing in the context of a 
> > > > > > specific
> > > > > > subset of the cgroupsv2 hierarchy. It does this based on membership 
> > > > > > test
> > > > > > for a cgroup arraymap. It is invalid to call this in an interrupt, 
> > > > > > and
> > > > > > it'll return an error. The helper is primarily to be used in 
> > > > > > debugging
> > > > > > activities for containers, where you may have multiple programs 
> > > > > > running in
> > > > > > a given top-level "container".
> > > > > > 
> > > > > > This patch also genericizes some of the arraymap fetching logic 
> > > > > > between the
> > > > > > skb_in_cgroup helper and this new helper.
> > > > > > 
> > > > > > Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> > > > > > Cc: Alexei Starovoitov <a...@kernel.org>
> > > > > > Cc: Daniel Borkmann <dan...@iogearbox.net>
> > > > > > ---
> > > > > >  include/linux/bpf.h  | 24 
> > > > > >  include/uapi/linux/bpf.h | 11 +++
> > > > > >  kernel/bpf/arraymap.c|  2 +-
> > > > > >  kernel/bpf/verifier.c|  4 +++-
> > > > > >  kernel/trace/bpf_trace.c | 34 ++
> > > > > >  net/core/filter.c| 11 ---
> > > > > >  6 files changed, 77 insertions(+), 9 deletions(-)
> > > > > > 
> > > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > > > index 1113423..9adf712 100644
> > > > > > --- a/include/linux/bpf.h
> > > > > > +++ b/include/linux/bpf.h
> > > > > > @@ -319,4 +319,28 @@ extern const struct bpf_func_proto 
> > > > > > bpf_get_stackid_proto;
> > > > > >  void bpf_user_rnd_init_once(void);
> > > > > >  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> > > > > >  
> > > > > > +#ifdef CONFIG_CGROUPS
> > > > > > +/* Helper to fetch a cgroup pointer based on index.
> > > > > > + * @map: a cgroup arraymap
> > > > > > + * @idx: index of the item you want to fetch
> > > > > > + *
> > > > > > + * Returns pointer on success,
> > > > > > + * Error code if item not found, or out-of-bounds access
> > > > > > + */
> > > > > > +static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map 
> > > > > > *map, int idx)
> > > > > > +{
> > > > > > +   struct cgroup *cgrp;
> > > > > > +   struct bpf_array *array = container_of(map, struct bpf_array, 
> > > > > > map);
> > > > > > +
> > > > > > +   if (unlikely(idx >= array->map.max_entries))
> > > > > > +   return ERR_PTR(-E2BIG);
> > > > > > +
> > > > > > +   cgrp = READ_ONCE(array->ptrs[idx]);
> > > > > > +   if (unlikely(!cgrp))
> > > > > > +   return ERR_PTR(-EAGAIN);
> > > > > > +
> > > > > > +   return cgrp;
> > > > > > +}
> > > > > > +#endif /* CONFIG_CGROUPS */
> > > > > > +
> > > > > >  #endif /* _LINUX_BPF_H */
> > > > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > > > index da218fe..64b1a07 100644
> > > > > > --- a/include/uapi/linux/bpf.h
> > > > > > +++ b/include/uapi/linux/bpf.h
> > > > > > @@ -375,6 +375,17 @@ enum bpf_func_id {
> > > > > >  */
> > > > > > BPF_F

Re: [net-next v2 v2 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-09 Thread Sargun Dhillon
On Tue, Aug 09, 2016 at 06:02:34PM -0700, Alexei Starovoitov wrote:
> On Tue, Aug 09, 2016 at 05:55:26PM -0700, Sargun Dhillon wrote:
> > On Tue, Aug 09, 2016 at 05:23:50PM -0700, Alexei Starovoitov wrote:
> > > On Tue, Aug 09, 2016 at 05:00:12PM -0700, Sargun Dhillon wrote:
> > > > This adds a bpf helper that's similar to the skb_in_cgroup helper to 
> > > > check
> > > > whether the probe is currently executing in the context of a specific
> > > > subset of the cgroupsv2 hierarchy. It does this based on membership test
> > > > for a cgroup arraymap. It is invalid to call this in an interrupt, and
> > > > it'll return an error. The helper is primarily to be used in debugging
> > > > activities for containers, where you may have multiple programs running 
> > > > in
> > > > a given top-level "container".
> > > > 
> > > > This patch also genericizes some of the arraymap fetching logic between 
> > > > the
> > > > skb_in_cgroup helper and this new helper.
> > > > 
> > > > Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> > > > Cc: Alexei Starovoitov <a...@kernel.org>
> > > > Cc: Daniel Borkmann <dan...@iogearbox.net>
> > > > ---
> > > >  include/linux/bpf.h  | 24 
> > > >  include/uapi/linux/bpf.h | 11 +++
> > > >  kernel/bpf/arraymap.c|  2 +-
> > > >  kernel/bpf/verifier.c|  4 +++-
> > > >  kernel/trace/bpf_trace.c | 34 ++
> > > >  net/core/filter.c| 11 ---
> > > >  6 files changed, 77 insertions(+), 9 deletions(-)
> > > > 
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index 1113423..9adf712 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -319,4 +319,28 @@ extern const struct bpf_func_proto 
> > > > bpf_get_stackid_proto;
> > > >  void bpf_user_rnd_init_once(void);
> > > >  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> > > >  
> > > > +#ifdef CONFIG_CGROUPS
> > > > +/* Helper to fetch a cgroup pointer based on index.
> > > > + * @map: a cgroup arraymap
> > > > + * @idx: index of the item you want to fetch
> > > > + *
> > > > + * Returns pointer on success,
> > > > + * Error code if item not found, or out-of-bounds access
> > > > + */
> > > > +static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map *map, 
> > > > int idx)
> > > > +{
> > > > +   struct cgroup *cgrp;
> > > > +   struct bpf_array *array = container_of(map, struct bpf_array, 
> > > > map);
> > > > +
> > > > +   if (unlikely(idx >= array->map.max_entries))
> > > > +   return ERR_PTR(-E2BIG);
> > > > +
> > > > +   cgrp = READ_ONCE(array->ptrs[idx]);
> > > > +   if (unlikely(!cgrp))
> > > > +   return ERR_PTR(-EAGAIN);
> > > > +
> > > > +   return cgrp;
> > > > +}
> > > > +#endif /* CONFIG_CGROUPS */
> > > > +
> > > >  #endif /* _LINUX_BPF_H */
> > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > index da218fe..64b1a07 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -375,6 +375,17 @@ enum bpf_func_id {
> > > >  */
> > > > BPF_FUNC_probe_write_user,
> > > >  
> > > > +   /**
> > > > +* bpf_current_task_in_cgroup(map, index) - Check cgroup2 membership of current task
> > > > +* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
> > > > +* @index: index of the cgroup in the bpf_map
> > > > +* Return:
> > > > +*   == 0 current failed the cgroup2 descendant test
> > > > +*   == 1 current succeeded the cgroup2 descendant test
> > > > +*    < 0 error
> > > > +*/
> > > > +   BPF_FUNC_current_task_in_cgroup,
> > > > +
> > > > __BPF_FUNC_MAX_ID,
> > > >  };
> > > >  
> > > > diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> > > > index 633a650..a2ac051 100

Re: [net-next v2 v2 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-09 Thread Sargun Dhillon
On Tue, Aug 09, 2016 at 05:23:50PM -0700, Alexei Starovoitov wrote:
> On Tue, Aug 09, 2016 at 05:00:12PM -0700, Sargun Dhillon wrote:
> > This adds a bpf helper that's similar to the skb_in_cgroup helper to check
> > whether the probe is currently executing in the context of a specific
> > subset of the cgroupsv2 hierarchy. It does this based on membership test
> > for a cgroup arraymap. It is invalid to call this in an interrupt, and
> > it'll return an error. The helper is primarily to be used in debugging
> > activities for containers, where you may have multiple programs running in
> > a given top-level "container".
> > 
> > This patch also genericizes some of the arraymap fetching logic between the
> > skb_in_cgroup helper and this new helper.
> > 
> > Signed-off-by: Sargun Dhillon <sar...@sargun.me>
> > Cc: Alexei Starovoitov <a...@kernel.org>
> > Cc: Daniel Borkmann <dan...@iogearbox.net>
> > ---
> >  include/linux/bpf.h  | 24 
> >  include/uapi/linux/bpf.h | 11 +++
> >  kernel/bpf/arraymap.c|  2 +-
> >  kernel/bpf/verifier.c|  4 +++-
> >  kernel/trace/bpf_trace.c | 34 ++
> >  net/core/filter.c| 11 ---
> >  6 files changed, 77 insertions(+), 9 deletions(-)
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 1113423..9adf712 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -319,4 +319,28 @@ extern const struct bpf_func_proto 
> > bpf_get_stackid_proto;
> >  void bpf_user_rnd_init_once(void);
> >  u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
> >  
> > +#ifdef CONFIG_CGROUPS
> > +/* Helper to fetch a cgroup pointer based on index.
> > + * @map: a cgroup arraymap
> > + * @idx: index of the item you want to fetch
> > + *
> > + * Returns pointer on success,
> > + * Error code if item not found, or out-of-bounds access
> > + */
> > +static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map *map, int 
> > idx)
> > +{
> > +   struct cgroup *cgrp;
> > +   struct bpf_array *array = container_of(map, struct bpf_array, map);
> > +
> > +   if (unlikely(idx >= array->map.max_entries))
> > +   return ERR_PTR(-E2BIG);
> > +
> > +   cgrp = READ_ONCE(array->ptrs[idx]);
> > +   if (unlikely(!cgrp))
> > +   return ERR_PTR(-EAGAIN);
> > +
> > +   return cgrp;
> > +}
> > +#endif /* CONFIG_CGROUPS */
> > +
> >  #endif /* _LINUX_BPF_H */
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index da218fe..64b1a07 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -375,6 +375,17 @@ enum bpf_func_id {
> >  */
> > BPF_FUNC_probe_write_user,
> >  
> > +   /**
> > +* bpf_current_task_in_cgroup(map, index) - Check cgroup2 membership of current task
> > +* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
> > +* @index: index of the cgroup in the bpf_map
> > +* Return:
> > +*   == 0 current failed the cgroup2 descendant test
> > +*   == 1 current succeeded the cgroup2 descendant test
> > +*    < 0 error
> > +*/
> > +   BPF_FUNC_current_task_in_cgroup,
> > +
> > __BPF_FUNC_MAX_ID,
> >  };
> >  
> > diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> > index 633a650..a2ac051 100644
> > --- a/kernel/bpf/arraymap.c
> > +++ b/kernel/bpf/arraymap.c
> > @@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
> >  }
> >  late_initcall(register_perf_event_array_map);
> >  
> > -#ifdef CONFIG_SOCK_CGROUP_DATA
> > +#ifdef CONFIG_CGROUPS
> >  static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
> >  struct file *map_file /* not used */,
> >  int fd)
> > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> > index 7094c69..80efab8 100644
> > --- a/kernel/bpf/verifier.c
> > +++ b/kernel/bpf/verifier.c
> > @@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct 
> > bpf_map *map, int func_id)
> > goto error;
> > break;
> > case BPF_MAP_TYPE_CGROUP_ARRAY:
> > -   if (func_id != BPF_FUNC_skb_in_cgroup)
> > +   if (func_id != BPF_FUNC_skb_in_cgroup &&
> > +   func_id != BPF_FUNC_current_task_in_cgroup)
> >

[net-next v2 v2 2/2] samples/bpf: Add opensnoop example that uses current_task_in_cgroup helper

2016-08-09 Thread Sargun Dhillon
This example adds the trace_opensnoop BPF sample. The program prints all
file-open activity for every program in the provided cgroupsv2 cgroup and
its descendants in the cgroupv2 hierarchy.

It populates a cgroup arraymap prior to execution in userspace. This means
that the program must be run in the same cgroups namespace as the programs
that are being traced.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
---
 samples/bpf/Makefile   |  4 +++
 samples/bpf/bpf_helpers.h  |  2 ++
 samples/bpf/trace_opensnoop_kern.c | 35 +++
 samples/bpf/trace_opensnoop_user.c | 69 ++
 4 files changed, 110 insertions(+)
 create mode 100644 samples/bpf/trace_opensnoop_kern.c
 create mode 100644 samples/bpf/trace_opensnoop_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..d9c37a4 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += trace_opensnoop
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -49,6 +50,7 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
+trace_opensnoop-objs := bpf_load.o libbpf.o trace_opensnoop_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -74,6 +76,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += trace_opensnoop_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -97,6 +100,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_trace_opensnoop += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 217c8d5..d409cbb 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -43,6 +43,8 @@ static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
(void *) BPF_FUNC_get_stackid;
 static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
(void *) BPF_FUNC_probe_write_user;
+static int (*bpf_current_task_in_cgroup)(void *map, int index) =
+   (void *) BPF_FUNC_current_task_in_cgroup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/trace_opensnoop_kern.c b/samples/bpf/trace_opensnoop_kern.c
new file mode 100644
index 000..dade471
--- /dev/null
+++ b/samples/bpf/trace_opensnoop_kern.c
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") cgroup_map = {
+   .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+   .key_size = sizeof(u32),
+   .value_size = sizeof(u32),
+   .max_entries = 1,
+};
+
+SEC("kprobe/sys_open")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   const char *filename = (char *)PT_REGS_PARM1(ctx);
+   char fmt[] = "Opening file: %s\n";
+
+   if (!bpf_current_task_in_cgroup(&cgroup_map, 0))
+   return 0;
+
+   bpf_trace_printk(fmt, sizeof(fmt), filename);
+
+   return 1;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_opensnoop_user.c b/samples/bpf/trace_opensnoop_user.c
new file mode 100644
index 000..403664e
--- /dev/null
+++ b/samples/bpf/trace_opensnoop_user.c
@@ -0,0 +1,69 @@
+#include 
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void usage(char **argv)
+{
+   printf("Usage:  %s [...]\n", argv[0]);
+   printf("Prints the file opening activity of all processes under a given 
cgroupv2 hierarchy.\n");
+   printf("-v   Full path of the cgroup2\n");
+   printf("-h  Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+   char filename[256];
+   const char *cg2 = NULL;
+   int ret, opt, cg2_fd;
+   int array_index = 0
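
(The archive truncates the rest of the user program here.) One detail
worth noting about the kernel half above: PT_REGS_PARM1() hands back a
user-space pointer, so a sturdier variant copies the filename into BPF
stack memory before formatting it. A sketch of the probe body, using the
bpf_probe_read() prototype from the samples' bpf_helpers.h:

	char buf[64] = {};

	if (!bpf_current_task_in_cgroup(&cgroup_map, 0))
		return 0;

	/* copy the user pointer's contents before printing with %s */
	bpf_probe_read(buf, sizeof(buf), (void *)PT_REGS_PARM1(ctx));
	bpf_trace_printk(fmt, sizeof(fmt), buf);

A hypothetical invocation, following the usage() text above:

	./trace_opensnoop -v /mnt/cgroup2/mycontainer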

[net-next v2 v2 1/2] bpf: Add bpf_current_task_in_cgroup helper

2016-08-09 Thread Sargun Dhillon
This adds a bpf helper that's similar to the skb_in_cgroup helper to check
whether the probe is currently executing in the context of a specific
subset of the cgroupsv2 hierarchy. It does this based on membership test
for a cgroup arraymap. It is invalid to call this in an interrupt, and
it'll return an error. The helper is primarily to be used in debugging
activities for containers, where you may have multiple programs running in
a given top-level "container".

This patch also genericizes some of the arraymap fetching logic between the
skb_in_cgroup helper and this new helper.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
---
 include/linux/bpf.h  | 24 
 include/uapi/linux/bpf.h | 11 +++
 kernel/bpf/arraymap.c|  2 +-
 kernel/bpf/verifier.c|  4 +++-
 kernel/trace/bpf_trace.c | 34 ++
 net/core/filter.c| 11 ---
 6 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1113423..9adf712 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,4 +319,28 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#ifdef CONFIG_CGROUPS
+/* Helper to fetch a cgroup pointer based on index.
+ * @map: a cgroup arraymap
+ * @idx: index of the item you want to fetch
+ *
+ * Returns pointer on success,
+ * Error code if item not found, or out-of-bounds access
+ */
+static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map *map, int idx)
+{
+   struct cgroup *cgrp;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+   if (unlikely(idx >= array->map.max_entries))
+   return ERR_PTR(-E2BIG);
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return ERR_PTR(-EAGAIN);
+
+   return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..64b1a07 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
 
+   /**
+* bpf_current_task_in_cgroup(map, index) - Check cgroup2 membership of current task
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*    < 0 error
+*/
+   BPF_FUNC_current_task_in_cgroup,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650..a2ac051 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
 }
 late_initcall(register_perf_event_array_map);
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
 static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 struct file *map_file /* not used */,
 int fd)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7094c69..80efab8 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1053,7 +1053,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
-   if (func_id != BPF_FUNC_skb_in_cgroup)
+   if (func_id != BPF_FUNC_skb_in_cgroup &&
+   func_id != BPF_FUNC_current_task_in_cgroup)
goto error;
break;
default:
@@ -1075,6 +1076,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+   case BPF_FUNC_current_task_in_cgroup:
case BPF_FUNC_skb_in_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438f..39f0290 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,6 +376,36 @@ static const struct bpf_func_proto bpf_get_current_task_proto = {
.ret_type   = RET_INTEGER,
 };
 
+#ifdef CONFIG_CGROUPS
+static u64 bpf_current_task_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   struct bpf_map *map = (struct bpf_map *)(long)r1;
+   struct css_set *cset;
+   struct cgroup *cgrp;
+   u32 idx = (u32)r2;
+
+   if (unlikely(in_interrupt()))
+   return

[net-next v2 v2 0/2] Add bpf current_task_in_cgroup helper & opensnoop example

2016-08-09 Thread Sargun Dhillon
This patchset includes a helper and an example to determine whether the probe is
currently executing in the context of a specific cgroup based on a cgroup bpf
map / array. The helper checks the cgroupsv2 hierarchy based on the handle in
the map and if the current cgroup is equal to it, or a descendant of it. The
helper was tested with the example program, and it was verified that the correct
behaviour occurs in the interrupt context.

The example, on the other hand, "open snoop", is a much simplified version of that
in the iovisor/BCC project. In order to run it, you must supply a specific 
cgroup in the hierarchy, and it'll print out all files being opened under it.

v1->v2: Add better example code -- OpenSnoop, clean up

    

Sargun Dhillon (2):
  bpf: Add bpf_current_task_in_cgroup helper
  samples/bpf: Add opensnoop example that uses current_task_in_cgroup
helper

 include/linux/bpf.h| 24 +
 include/uapi/linux/bpf.h   | 11 ++
 kernel/bpf/arraymap.c  |  2 +-
 kernel/bpf/verifier.c  |  4 ++-
 kernel/trace/bpf_trace.c   | 34 ++
 net/core/filter.c  | 11 +++---
 samples/bpf/Makefile   |  4 +++
 samples/bpf/bpf_helpers.h  |  2 ++
 samples/bpf/trace_opensnoop_kern.c | 35 +++
 samples/bpf/trace_opensnoop_user.c | 70 ++
 10 files changed, 188 insertions(+), 9 deletions(-)
 create mode 100644 samples/bpf/trace_opensnoop_kern.c
 create mode 100644 samples/bpf/trace_opensnoop_user.c

-- 
2.7.4



Re: [RFC 0/4] RFC: Add Checmate, BPF-driven minor LSM

2016-08-08 Thread Sargun Dhillon
On Mon, Aug 08, 2016 at 04:44:02PM -0700, Kees Cook wrote:
> On Thu, Aug 4, 2016 at 12:11 AM, Sargun Dhillon <sar...@sargun.me> wrote:
> > I distributed this patchset to linux-security-mod...@vger.kernel.org 
> > earlier,
> > but based on the fact that the archive is down, and this is a fairly
> > broad-sweeping proposal, I figured I'd grow the audience a little bit. Sorry
> > if you received this multiple times.
> >
> > I've begun building out the skeleton of a Linux Security Module, and I'd 
> > like to
> > get feedback on it. It's a skeleton, and I've only populated a few hooks, 
> > so I'm
> > mostly looking for input on the general proposal, interest, and design. 
> > It's a
> > minor LSM. My particular use case is one in which containers are being
> > dynamically deployed to machines by internal developers in a different 
> > group.
> > The point of Checmate is to act as an extensible bed for _safe_, complex
> > security policies. It's nice to enable dynamic security policies that can be
> > defined in C, and change as necessary, without ever having to patch, or 
> > rebuild
> > the kernel.
> >
> > For many of these containers, the security policies can be fairly nuanced. 
> > One
> > particular one to take into account is network security. Often times,
> > administrators want to prevent ingress, and egress connectivity except from 
> > a
> > few select IPs. Egress filtering can be managed using net_cls, but without
> > modifying running software, it's non-trivial to attach a filter to all 
> > sockets
> > being created within a container. The inet_conn_request, socket_recvmsg,
> > socket_sock_rcv_skb hooks make this trivial to implement.
> >
> > Other times, containers need to be throttled in places where there's not 
> > really
> > a good place to impose that policy for software which isn't built in-house. 
> >  If
> > one wants to limit file creations/sec, or reject I/O under certain
> > characteristics, there's not a great place to do it now. This gives 
> > engineers a
> > mechanism to write those policies.
> >
> > This same flexibility can be used to take existing programs and enable safe 
> > BPF
> > helpers to modify memory to allow rules to pass. One example that I 
> > prototyped
> > was Docker's port mapping, which has an overhead (DNAT), and there's some 
> > loss
> > of fidelity in the BSD Socket API to identify what's going on. Instead, we 
> > can
> > just rewrite the port in a bind, based upon some data in a BPF map, and a 
> > cgroup
> > match.
> >
> > I can actually see other minor security modules being implemented in 
> > Checmate,
> > for example, Yama, or the recently proposed Hardchroot could be 
> > reimplemented in
> > BPF. Potentially, they could even be API compatible.
> >
> > Although, at first, much of this sounds like seccomp, it's quite different. 
> > For
> > one, what we can do in the security hooks is more complex (access to kernel
> > pointers). The other side of this is we can have effects on a system-wide,
> > or cgroup level. This also circumvents the need for CRIU-friendly policies.
> >
> > Lastly, the flexibility of this mechanism allows for prevention of security
> > vulnerabilities which are often complex in nature and require the 
> > interaction
> > of multiple hooks (CVE-2014-9717 is a good example), and although ksplice,
> > and livepatch exist, they're not always easy to use, as compared to loading
> > a single bpf program across all kernels.
> >
> > The user-facing API is exposed via prctl as it's meant to be very simple (at
> > least the kernel components). It only has three operations. For a given 
> > security
> > hook, you can attach a BPF program to it, which will add it to the set of
> > programs that are executed over when the hook is hit. You can reset a hook,
> > which removes all program associated with a given hook, and you can set a
> > deny_reset flag on a hook to prevent anyone from resetting it. It's likely 
> > that
> > an individual would want to set this in any production use case.
> 
> One fairly serious problem that seccomp had to overcome was dealing
> with exec+setuid in the face of an attacker. The main example is "what
> if we refuse to allow a program to drop privileges via a filter rule?"
> For seccomp, no-new-privs was introduced for non-root users of
> seccomp. Programmatic syscall (or LSM) filters need to deal with this,
> and it's a bit ungainly. :)
> 
Couldn't someone do the same with SELinux, or App

Re: [net-next 0/2] BPF, kprobes: Add current_in_cgroup helper

2016-08-08 Thread Sargun Dhillon
On Mon, Aug 08, 2016 at 11:27:32AM +0200, Daniel Borkmann wrote:
> On 08/08/2016 05:52 AM, Alexei Starovoitov wrote:
> >On Sun, Aug 07, 2016 at 08:08:19PM -0700, Sargun Dhillon wrote:
> >>Thanks for your feedback Alexei,
> >>I really appreciate it.
> >>
> >>On Sun, Aug 07, 2016 at 05:52:36PM -0700, Alexei Starovoitov wrote:
> >>>On Sat, Aug 06, 2016 at 09:56:06PM -0700, Sargun Dhillon wrote:
> >>>>On Sat, Aug 06, 2016 at 09:32:05PM -0700, Alexei Starovoitov wrote:
> >>>>>On Sat, Aug 06, 2016 at 09:06:53PM -0700, Sargun Dhillon wrote:
> >>>>>>This patchset includes a helper and an example to determine whether the 
> >>>>>>kprobe
> >>>>>>is currently executing in the context of a specific cgroup based on a 
> >>>>>>cgroup
> >>>>>>bpf map / array.
> >>>>>
> >>>>>description is too short to understand how this new helper is going to 
> >>>>>be used.
> >>>>>depending on kprobe current is not always valid.
> >>>>Anything not in in_interrupt() should have a current, right?
> >>>>
> >>>>>what are you trying to achieve?
> >>>>This is primarily to help troubleshoot containers (Docker, and now 
> >>>>systemd). A
> >>>>lot of the time we want to determine what's going on in a given container
> >>>>(opening files, connecting to systems, etc...). There's not really a 
> >>>>great way
> >>>>to restrict to containers except by manually walking datastructures to 
> >>>>check for
> >>>>the right cgroup. This seems like a better alternative.
> >>>
> >>>so it's about restricting or determining?
> >>>In other words if it's analytics/tracing that's one thing, but
> >>>enforcement/restriction is quite different.
> >>>For analytics one can walk task_css_set(current)->dfl_cgrp and remember
> >>>that pointer in a map or something for stats collections and similar.
> >>>If it's restricting apps in containers then kprobe approach
> >>>is not usable. I don't think you'd want to build an enforcement system
> >>>on an unstable api that can vary kernel-to-kernel.
> >>>
> >>The first real-world use case is to implement something like Sysdig. Often 
> >>the
> >>team running the team running the containers don't always know what's 
> >>inside of
> >>them, so they want to be able to view network, I/O, and other activity by
> >>container. Right now, the lowest common denominator between all of the
> >>containerization techniques is cgroups. We've seen examples of where a 
> >>admin is
> >>unsure of the workload, and would love to use opensnoop, but there are too 
> >>many
> >>workloads on the machine.
> >
> >Indeed it would be a useful feature to teach opensnoop to filter by a cgroup
> >and all descentants of it. If you can prepare a patch for it that would be
> >a strong use case for this bpf_current_in_cgroup helper and solid 
> >justification
> >to accept it in the kernel.
> >Something like cgroupv2 string path as an argument ?
> 
> How does this integrate with cgroup namespaces? Your current helper would only
> look at the cgroup in your current namespace, no? Or would the program 
> populating
> the map temporarily switch into other namespaces?
> 
The BPF program is namespace oblivious. If you had multiple cgroup namespaces,
you'd have to open an fd for the other namespace's cgroup to populate the map.
I see this as more of a userspace problem.
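
Roughly, the userspace side would look something like this (a sketch, assuming
the bpf_update_elem() wrapper from samples/bpf/libbpf.h; the cgroup path is
made up for illustration):

#include <fcntl.h>
#include <unistd.h>
#include "libbpf.h"

/* Store a cgroup2 directory fd into slot 0 of a BPF_MAP_TYPE_CGROUP_ARRAY.
 * The kernel resolves the fd to a cgroup pointer at update time, so the
 * caller just needs an fd on the (possibly foreign-namespace) cgroup dir.
 */
static int populate_cgroup_array(int array_fd, const char *cg2_path)
{
	int key = 0;
	int cg_fd = open(cg2_path, O_RDONLY);

	if (cg_fd < 0)
		return -1;
	if (bpf_update_elem(array_fd, &key, &cg_fd, 0) != 0) {
		close(cg_fd);
		return -1;
	}
	close(cg_fd);
	return 0;
}

e.g. populate_cgroup_array(map_fd[0], "/sys/fs/cgroup/unified/mycontainer");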

> What about cases where cgroup could be shared among other (net, ..) 
> namespaces,
> BPF program would still not be namespace aware to sort these things out?
> 
I'm not sure what you're getting at. It sounds like being "namespace aware" 
either means that during probe installation you restrict the probe to a given 
namespace, or you have another helper that allows you to check the namespace 
you're in. Would a second helper, and arraymap type address this? If so, I'd 
rather that be separate work.

> You'll also have the issue, for example, that bpf_perf_event_read() counters
> are global, combining them with cgroups helper in a program would lead to 
> false
> expectations (in the sense that they might also be assumed for that cgroup), 
> or
> do you have a way to tackle that as well (at least SW events, since HW should 
> not
> be possible)?
> 
> Btw, there's slightly related work from IBM folks (but to run it from within a

Re: [net-next 0/2] BPF, kprobes: Add current_in_cgroup helper

2016-08-07 Thread Sargun Dhillon
Thanks for your feedback Alexei,
I really appreciate it.

On Sun, Aug 07, 2016 at 05:52:36PM -0700, Alexei Starovoitov wrote:
> On Sat, Aug 06, 2016 at 09:56:06PM -0700, Sargun Dhillon wrote:
> > On Sat, Aug 06, 2016 at 09:32:05PM -0700, Alexei Starovoitov wrote:
> > > On Sat, Aug 06, 2016 at 09:06:53PM -0700, Sargun Dhillon wrote:
> > > > This patchset includes a helper and an example to determine whether the 
> > > > kprobe 
> > > > is currently executing in the context of a specific cgroup based on a 
> > > > cgroup
> > > > bpf map / array. 
> > > 
> > > description is too short to understand how this new helper is going to be 
> > > used.
> > > depending on kprobe current is not always valid.
> > Anything not in in_interrupt() should have a current, right?
> > 
> > > what are you trying to achieve?
> > This is primarily to help troubleshoot containers (Docker, and now 
> > systemd). A 
> > lot of the time we want to determine what's going on in a given container 
> > (opening files, connecting to systems, etc...). There's not really a great 
> > way 
> > to restrict to containers except by manually walking datastructures to 
> > check for 
> > the right cgroup. This seems like a better alternative.
> 
> so it's about restricting or determining?
> In other words if it's analytics/tracing that's one thing, but
> enforcement/restriction is quite different.
> For analytics one can walk task_css_set(current)->dfl_cgrp and remember
> that pointer in a map or something for stats collections and similar.
> If it's restricting apps in containers then kprobe approach
> is not usable. I don't think you'd want to build an enforcement system
> on an unstable api that can vary kernel-to-kernel.
> 
The first real-world use case is to implement something like Sysdig. Often the
team running the containers doesn't always know what's inside of them, so they
want to be able to view network, I/O, and other activity by container. Right
now, the lowest common denominator between all of the containerization
techniques is cgroups. We've seen examples where an admin is unsure of the
workload, and would love to use opensnoop, but there are too many workloads on
the machine.

Unfortunately, I don't think that it's possible just to check 
task_css_set(current)->dfl_cgrp in a bpf program. The container, especially 
containers with sidecars (what Kubernetes calls Pods, I believe?) tend to have 
multiple nested cgroups inside of them. If you had a way to convert cgroup 
array 
entries to pointers, I imagine you could write an unrolled loop to check for 
ownership within a limited range.
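
To make that concrete, a sketch of the unrolled check against a handful of
candidate cgroups (the map name and slot count are made up; this assumes the
helper from this patchset and the usual sample includes):

struct bpf_map_def SEC("maps") candidate_cgroups = {
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 3,
};

SEC("kprobe/sys_open")
int bpf_prog(struct pt_regs *ctx)
{
	/* manually unrolled: the verifier doesn't allow loops */
	if (bpf_current_in_cgroup(&candidate_cgroups, 0) == 1 ||
	    bpf_current_in_cgroup(&candidate_cgroups, 1) == 1 ||
	    bpf_current_in_cgroup(&candidate_cgroups, 2) == 1) {
		/* current is under one of the tracked cgroups */
	}
	return 0;
}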

I'm still looking for comments from the LSM folks on Checmate[1]. It appears
that there has been very little API-breaking churn in the LSM hooks API. For
many of the syscall hooks, they're closely tied to the syscall API, so they
can't really change too much. I think that with a toolkit like iovisor, or
another userland translation layer, these hooks could be very powerful. I
would love to hear feedback from the LSM folks.

My plan with those patches is to reimplement Yama, and Hardchroot in BPF 
programs to show off the potential capabilities of Checmate. I'd also like to 
create some example programs blocking CVEs that have popped up. I think of the 
idea like nftables for kernel syscalls, storage, and the network stack.

The other example I want to show is implementing Docker-bridge style network
isolation with Checmate. Most folks use it to map ports and to restrict
binding to specific ports, not for the dedicated network namespace or
loopback interface. It turns out that for some applications this comes at a
pretty significant hit[2][3], as well as awkward upper bounds based on
conntrack.

> > > This looks like an alternative to lsm patches submitted earlier?
> > No. But I would like to use this helper in the LSM patches I'm working on. 
> > For 
> > now, with those patches, and this helper, I can create a map sized 1, and 
> > add 
> > the cgroup I care about to it. Given I can add as many bpf programs to an 
> > LSM
> > hook as I want, I can use this mechanism to "attach BPF programs to cgroups" 
> > -- 
> > I put that in quotes because you're not really attaching it to a cgroup,
> > but just burning some instructions on checking it. 
> 
> how many cgroups will you need to check? The current bpf_skb_in_cgroup()
> suffers similar scaling issues.
> I think the proper restriction/enforcement could be done via attaching bpf
> program to a cgroup. These patches are being worked on Daniel Mack cc-ed.
> Then bpf program will be able to enforce networking behavior of applications
> in cgroups.
> For global

Re: [net-next 0/2] BPF, kprobes: Add current_in_cgroup helper

2016-08-06 Thread Sargun Dhillon
On Sat, Aug 06, 2016 at 09:32:05PM -0700, Alexei Starovoitov wrote:
> On Sat, Aug 06, 2016 at 09:06:53PM -0700, Sargun Dhillon wrote:
> > This patchset includes a helper and an example to determine whether the 
> > kprobe 
> > is currently executing in the context of a specific cgroup based on a cgroup
> > bpf map / array. 
> 
> description is too short to understand how this new helper is going to be 
> used.
> depending on kprobe current is not always valid.
Anything not in in_interrupt() should have a current, right?

> what are you trying to achieve?
This is primarily to help troubleshoot containers (Docker, and now systemd). A 
lot of the time we want to determine what's going on in a given container 
(opening files, connecting to systems, etc...). There's not really a great way 
to restrict to containers except by manually walking datastructures to check 
for 
the right cgroup. This seems like a better alternative.

> This looks like an alternative to lsm patches submitted earlier?
No. But I would like to use this helper in the LSM patches I'm working on. For
now, with those patches, and this helper, I can create a map sized 1, and add
the cgroup I care about to it. Given I can add as many bpf programs to an LSM
hook as I want, I can use this mechanism to "attach BPF programs to cgroups" --
I put that in quotes because you're not really attaching it to a cgroup,
but just burning some instructions on checking it.

In my mind it seems better than making cgroup-attachment a first-class part
of the checmate work since I still want to make globally available hooks
possible.

> btw net-next is closed and no new features accepted at the moment.
Sorry, I didn't realize that. I'd still love to get feedback.
> 


[net-next 2/2] samples/bpf: Add example using current_in_cgroup

2016-08-06 Thread Sargun Dhillon
This is a simple trace example that shows programs connecting,
but only if they're in a chosen cgroup.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
---
 samples/bpf/Makefile   |  4 ++
 samples/bpf/bpf_helpers.h  |  2 +
 samples/bpf/trace_current_in_cgroup_kern.c | 44 
 samples/bpf/trace_current_in_cgroup_user.c | 66 ++
 4 files changed, 116 insertions(+)
 create mode 100644 samples/bpf/trace_current_in_cgroup_kern.c
 create mode 100644 samples/bpf/trace_current_in_cgroup_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..61b0534 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += trace_current_in_cgroup
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -49,6 +50,7 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
+trace_current_in_cgroup-objs := bpf_load.o libbpf.o 
trace_current_in_cgroup_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -74,6 +76,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += trace_current_in_cgroup_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -97,6 +100,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_trace_current_in_cgroup += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on 
cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 217c8d5..080403c 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -43,6 +43,8 @@ static int (*bpf_get_stackid)(void *ctx, void *map, int 
flags) =
(void *) BPF_FUNC_get_stackid;
 static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
(void *) BPF_FUNC_probe_write_user;
+static int (*bpf_current_in_cgroup)(void *map, int index) =
+   (void *) BPF_FUNC_current_in_cgroup;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/trace_current_in_cgroup_kern.c 
b/samples/bpf/trace_current_in_cgroup_kern.c
new file mode 100644
index 000..7aafb86
--- /dev/null
+++ b/samples/bpf/trace_current_in_cgroup_kern.c
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include 
+
+struct bpf_map_def SEC("maps") test_current_in_cgroup_map = {
+   .type = BPF_MAP_TYPE_CGROUP_ARRAY,
+   .key_size = sizeof(u32),
+   .value_size = sizeof(u32),
+   .max_entries = 1,
+};
+
+SEC("kprobe/sys_connect")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   struct sockaddr_in addr = {};
+   void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx);
+   int sockaddr_len = (int)PT_REGS_PARM3(ctx);
+   char fmt[] = "Connection on port %d\n";
+
+   if (!bpf_current_in_cgroup(&test_current_in_cgroup_map, 0))
+   return 0;
+   if (sockaddr_len > sizeof(addr))
+   return 0;
+   if (bpf_probe_read(&addr, sizeof(addr), sockaddr_arg) != 0)
+   return 0;
+   if (addr.sin_family != AF_INET)
+   return 0;
+
+   bpf_trace_printk(fmt, sizeof(fmt), be16_to_cpu(addr.sin_port));
+
+   return 1;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_current_in_cgroup_user.c 
b/samples/bpf/trace_current_in_cgroup_user.c
new file mode 100644
index 000..be717bb
--- /dev/null
+++ b/samples/bpf/trace_current_in_cgroup_user.c
@@ -0,0 +1,66 @@
+#include 
+#include 
+#include 
+#include 
+#include "libbpf.h"
+#include "bpf_load.h"
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void usage(char **argv)
+{
+   printf("Usage:  %s [...]\n", argv[0]);
+   printf("-f   Full path of the cgroup2\n");
+   printf("-h  Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+   char filename[256];

[net-next 0/2] BPF, kprobes: Add current_in_cgroup helper

2016-08-06 Thread Sargun Dhillon
This patchset includes a helper and an example to determine whether the kprobe 
is currently executing in the context of a specific cgroup based on a cgroup
bpf map / array. 

Sargun Dhillon (2):
  bpf: Add bpf_current_in_cgroup helper
  samples/bpf: Add example using current_in_cgroup

 include/linux/bpf.h| 24 +++
 include/uapi/linux/bpf.h   | 11 +
 kernel/bpf/arraymap.c  |  2 +-
 kernel/bpf/verifier.c  |  4 +-
 kernel/trace/bpf_trace.c   | 34 +++
 net/core/filter.c  | 11 ++---
 samples/bpf/Makefile   |  4 ++
 samples/bpf/bpf_helpers.h  |  2 +
 samples/bpf/trace_current_in_cgroup_kern.c | 44 
 samples/bpf/trace_current_in_cgroup_user.c | 66 ++
 10 files changed, 193 insertions(+), 9 deletions(-)
 create mode 100644 samples/bpf/trace_current_in_cgroup_kern.c
 create mode 100644 samples/bpf/trace_current_in_cgroup_user.c

-- 
2.7.4



[net-next 1/2] bpf: Add bpf_current_in_cgroup helper

2016-08-06 Thread Sargun Dhillon
This adds a kprobe helper that's similar to the skb_in_cgroup helper. It
checks whether the probe is currently executing in the context of the
cgroup at the given index in a CGROUP_ARRAY.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
---
 include/linux/bpf.h  | 24 
 include/uapi/linux/bpf.h | 11 +++
 kernel/bpf/arraymap.c|  2 +-
 kernel/bpf/verifier.c|  4 +++-
 kernel/trace/bpf_trace.c | 34 ++
 net/core/filter.c| 11 ---
 6 files changed, 77 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1113423..9adf712 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -319,4 +319,28 @@ extern const struct bpf_func_proto bpf_get_stackid_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#ifdef CONFIG_CGROUPS
+/* Helper to fetch a cgroup pointer based on index.
+ * @map: a cgroup arraymap
+ * @idx: index of the item you want to fetch
+ *
+ * Returns pointer on success,
+ * Error code if item not found, or out-of-bounds access
+ */
+static inline struct cgroup *fetch_arraymap_ptr(struct bpf_map *map, int idx)
+{
+   struct cgroup *cgrp;
+   struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+   if (unlikely(idx >= array->map.max_entries))
+   return ERR_PTR(-E2BIG);
+
+   cgrp = READ_ONCE(array->ptrs[idx]);
+   if (unlikely(!cgrp))
+   return ERR_PTR(-EAGAIN);
+
+   return cgrp;
+}
+#endif /* CONFIG_CGROUPS */
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..23a5b99 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,17 @@ enum bpf_func_id {
 */
BPF_FUNC_probe_write_user,
 
+   /**
+* bpf_current_in_cgroup(map, index) - Check cgroup2 membership of current
+* @map: pointer to bpf_map in BPF_MAP_TYPE_CGROUP_ARRAY type
+* @index: index of the cgroup in the bpf_map
+* Return:
+*   == 0 current failed the cgroup2 descendant test
+*   == 1 current succeeded the cgroup2 descendant test
+*< 0 error
+*/
+   BPF_FUNC_current_in_cgroup,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 633a650..a2ac051 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -538,7 +538,7 @@ static int __init register_perf_event_array_map(void)
 }
 late_initcall(register_perf_event_array_map);
 
-#ifdef CONFIG_SOCK_CGROUP_DATA
+#ifdef CONFIG_CGROUPS
 static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
 struct file *map_file /* not used */,
 int fd)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f72f23b..e16559b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1052,7 +1052,8 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
goto error;
break;
case BPF_MAP_TYPE_CGROUP_ARRAY:
-   if (func_id != BPF_FUNC_skb_in_cgroup)
+   if (func_id != BPF_FUNC_skb_in_cgroup &&
+   func_id != BPF_FUNC_current_in_cgroup)
goto error;
break;
default:
@@ -1074,6 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map 
*map, int func_id)
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
break;
+   case BPF_FUNC_current_in_cgroup:
case BPF_FUNC_skb_in_cgroup:
if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
goto error;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438f..f2a6bc5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -376,6 +376,36 @@ static const struct bpf_func_proto 
bpf_get_current_task_proto = {
.ret_type   = RET_INTEGER,
 };
 
+#ifdef CONFIG_CGROUPS
+static u64 bpf_current_in_cgroup(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   struct bpf_map *map = (struct bpf_map *)(long)r1;
+   struct css_set *cset;
+   struct cgroup *cgrp;
+   u32 idx = (u32)r2;
+
+   if (unlikely(in_interrupt()))
+   return -EINVAL;
+
+   cgrp = fetch_arraymap_ptr(map, idx);
+
+   if (unlikely(IS_ERR(cgrp)))
+   return PTR_ERR(cgrp);
+
+   cset = task_css_set(current);
+
+   return cgroup_is_descendant(cset->dfl_cgrp, cgrp);
+}
+
+static const struct bpf_func_proto bpf_current_in_cgroup_proto = {
+   .func   = bpf_current_in_cgroup,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_ty

Re: [RFC 2/4] bpf, security: Add Checmate

2016-08-05 Thread Sargun Dhillon
On Thu, Aug 04, 2016 at 05:34:32PM +0800, zhuyj wrote:
>  Sure.
> Is it better to add
> #ifndef CONFIG_PREEMPT_RCU ?
> 
> On Thu, Aug 4, 2016 at 4:28 PM, Eric Dumazet  wrote:
> > Please do not top post
> >
> > On Thu, 2016-08-04 at 16:08 +0800, zhuyj wrote:
> >>  +void register_checmate_prog_ops(void);
> >> maybe it is extern void register_checmate_prog_ops(void);?
> >>
> >> +   preempt_disable();
> >> +   rcu_read_lock();
> >> IMHO, it is not necessary to use the above 2 since rcu_read_lock will
> >> call preempt_disable.
> >
> > You might double check if this claim is true if CONFIG_PREEMPT_RCU=y
> >
> >
> >
Thanks for your feedback zhuyj. Looking at the kernel documentation itself, it
looks like this is the preferred mechanism[1]. Their example:

 1 preempt_disable();
 2 rcu_read_lock();
 3 do_something();
 4 rcu_read_unlock();
 5 preempt_enable();

But, I think you're right. Do you know if there's a great benefit to doing
this? Or does it make sense to implement a new macro, a la
rcu_read_lock_and_preempt_disable()?
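
Something like this is what I have in mind (hypothetical; no such macros exist
in the kernel today, this just mirrors the documented pairing):

#define rcu_read_lock_and_preempt_disable()	\
	do {					\
		preempt_disable();		\
		rcu_read_lock();		\
	} while (0)

#define rcu_read_unlock_and_preempt_enable()	\
	do {					\
		rcu_read_unlock();		\
		preempt_enable();		\
	} while (0)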

[1] https://www.kernel.org/doc/Documentation/RCU/Design/Requirements/Requirements.html#Disabling%20Preemption%20Does%20Not%20Block%20Grace%20Periods


Re: [RFC 0/4] RFC: Add Checmate, BPF-driven minor LSM

2016-08-04 Thread Sargun Dhillon
On Thu, Aug 04, 2016 at 11:45:08AM +0200, Daniel Borkmann wrote:
> Hi Sargun,
> 
> On 08/04/2016 09:11 AM, Sargun Dhillon wrote:
> [...]
> >[It's a] minor LSM. My particular use case is one in which containers are 
> >being
> >dynamically deployed to machines by internal developers in a different group.
> [...]
> >For many of these containers, the security policies can be fairly nuanced. 
> >One
> >particular one to take into account is network security. Often times,
> >administrators want to prevent ingress, and egress connectivity except from a
> >few select IPs. Egress filtering can be managed using net_cls, but without
> >modifying running software, it's non-trivial to attach a filter to all 
> >sockets
> >being created within a container. The inet_conn_request, socket_recvmsg,
> >socket_sock_rcv_skb hooks make this trivial to implement.
> 
> I'm not too familiar with LSMs, but afaik, when you install such policies they
> are effectively global, right? How would you install/manage such policies per
> container?
> 
> On a quick glance, this would then be the job of the BPF proglet from the 
> global
> hook, no? If yes, then the BPF contexts the BPF prog works with seem rather 
> quite
> limited ...
You're right. They are global hooks. If you'd want the policy to be specific to
a given cgroup, or namespace, you'd have to introduce a level of indirection
through a prog_array, or some such. There are still cases (the CVE, and the
Docker bind example) where you want global isolation. The other big aspect is
being able to implement application-specific LSMs without requiring kmods (a
la hardchroot).

> 
> +struct checmate_file_open_ctx {
> + struct file *file;
> + const struct cred *cred;
> +};
> +
> +struct checmate_task_create_ctx {
> + unsigned long clone_flags;
> +};
> +
> +struct checmate_task_free_ctx {
> + struct task_struct *task;
> +};
> +
> +struct checmate_socket_connect_ctx {
> + struct socket *sock;
> + struct sockaddr *address;
> + int addrlen;
> +};
> 
> ... or are you using bpf_probe_read() in some way to walk 'current' to 
> retrieve
> a namespace from there somehow? Like via nsproxy? But how you make sense of 
> this
> for defining a per container policy?
In my prototype code, I'm using uts namespace + hostname, and I'm extracting 
that via the bpf_probe_read walk. You're right, that's less than awesome. In 
the 
longer-term, I'd hope we'd be able to add a helper like bpf_current_in_cgroup 
(a 
la bpf_skb_in_cgroup). The idea is that we'd add enough helpers to avoid this. 
I 
can submit some more example BPF programs if that'd help.  Off the top of my 
head:

* current_in_cgroup 
* introduce struct pid map 
* introduce helpers to inspect common datatypes passed to the hooks -- if you
  look at something like the net hooks, there aren't actually that many
  datatypes being passed around
* Introduce an example top-level cgroup that maps cgroup -> tail_call into
  other programs (a rough sketch follows below)
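
Something along these lines for that last item (entirely illustrative; it
assumes bpf_tail_call() and this patchset's helper are both usable from the
same program type, and that userspace keeps slot i of the two maps in sync):

struct bpf_map_def SEC("maps") cgrp_map = {
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 2,
};

struct bpf_map_def SEC("maps") prog_map = {
	.type = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 2,
};

SEC("kprobe/sys_connect")
int dispatch(struct pt_regs *ctx)
{
	/* bpf_tail_call() does not return on success */
	if (bpf_current_in_cgroup(&cgrp_map, 0) == 1)
		bpf_tail_call(ctx, &prog_map, 0);
	if (bpf_current_in_cgroup(&cgrp_map, 1) == 1)
		bpf_tail_call(ctx, &prog_map, 1);
	return 0;	/* no per-container program matched */
}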

> 
> Do you see a way where we don't need to define so many different ctx each 
> time?
> 
> My other concern from a security PoV is that when using things like 
> bpf_probe_read()
> we're dependent on kernel structs and there's a risk that when people migrate 
> such
> policies that expectations break due to underlying structs changed. I see 
> you've
> addressed that in patch 4 to place a small stone in the way, yeah kinda 
> works. It's
> mostly a reminder that this is not stable ABI.
> 
> Thanks,
> Daniel


Re: [RFC 0/4] RFC: Add Checmate, BPF-driven minor LSM

2016-08-04 Thread Sargun Dhillon
On Thu, Aug 04, 2016 at 10:41:17AM +0200, Richard Weinberger wrote:
> Sargun,
> 
> On Thu, Aug 4, 2016 at 9:11 AM, Sargun Dhillon <sar...@sargun.me> wrote:
> > I distributed this patchset to linux-security-mod...@vger.kernel.org 
> > earlier,
> > but based on the fact that the archive is down, and this is a fairly
> > broad-sweeping proposal, I figured I'd grow the audience a little bit. Sorry
> > if you received this multiple times.
> >
> > I've begun building out the skeleton of a Linux Security Module, and I'd 
> > like to
> > get feedback on it. It's a skeleton, and I've only populated a few hooks, 
> > so I'm
> > mostly looking for input on the general proposal, interest, and design. 
> > It's a
> > minor LSM. My particular use case is one in which containers are being
> > dynamically deployed to machines by internal developers in a different 
> > group.
> > The point of Checmate is to act as an extensible bed for _safe_, complex
> > security policies. It's nice to enable dynamic security policies that can be
> > defined in C, and change as necessary, without ever having to patch, or 
> > rebuild
> > the kernel.
> >
> > For many of these containers, the security policies can be fairly nuanced. 
> > One
> > particular one to take into account is network security. Often times,
> > administrators want to prevent ingress, and egress connectivity except from 
> > a
> > few select IPs. Egress filtering can be managed using net_cls, but without
> > modifying running software, it's non-trivial to attach a filter to all 
> > sockets
> > being created within a container. The inet_conn_request, socket_recvmsg,
> > socket_sock_rcv_skb hooks make this trivial to implement.
> 
> What is wrong with having firewall rules per container?
> Either by matching the container IP or an interface...
> 
This requires infrastructure that's not always available. For one, this
approach typically requires a network namespace per container, and therefore a
dedicated IP. It's pretty common [1][2] to not have an IP-per-container
solution, nor a network-namespace-per-container solution. The alternatives
that provide a network namespace without an IP per container typically involve
bifurcating traffic using TC mirred actions, and friends. This isn't really
great for debuggability. Twitter does this with their Mesos network
isolator [3]. Cgroups / net_cls is great for egress traffic, but not ingress.

> > Other times, containers need to be throttled in places where there's not 
> > really
> > a good place to impose that policy for software which isn't built in-house. 
> >  If
> > one wants to limit file creations/sec, or reject I/O under certain
> > characteristics, there's not a great place to do it now. This gives 
> > engineers a
> > mechanism to write those policies.
> 
> Hmm, not sure if resource control is something we want to do with an LSM.
> 
This is just an example I brought up. I know of a fairly large security vendor 
that has abuse "patterns", and locks software down if it looks "abusive". They 
do it for VMs, but it'd be nice to do something similar for containers.

> > This same flexibility can be used to take existing programs and enable safe 
> > BPF
> > helpers to modify memory to allow rules to pass. One example that I 
> > prototyped
> > was Docker's port mapping, which has an overhead (DNAT), and there's some 
> > loss
> > of fidelity in the BSD Socket API to identify what's going on. Instead, we 
> > can
> > just rewrite the port in a bind, based upon some data in a BPF map, and a 
> > cgroup
> > match.
> >
> > I can actually see other minor security modules being implemented in 
> > Checmate,
> > for example, Yama, or the recently proposed Hardchroot could be 
> > reimplemented in
> > BPF. Potentially, they could even be API compatible.
> >
> > Although, at first, much of this sounds like seccomp, it's quite different. 
> > For
> > one, what we can do in the security hooks is more complex (access to kernel
> > pointers). The other side of this is we can have effects on a system-wide,
> > or cgroup level. This also circumvents the need for CRIU-friendly policies.
> 
> It is like seccomp except that you have a single rule set and target LSM hooks
> instead of syscalls, right?
You're right, it's very similar. I like to think of Checmate as nftables for 
syscalls.

It turns out having this on LSM hooks is a very big difference. Since LSM hooks 
are executed after data is copied to the kernel, you can safely dereference 
pointers and inspect the user's intentions. In one of the attached patches, I 
bloc

[RFC 4/4] bpf: Restrict Checmate bpf programs to current kernel ABI

2016-08-04 Thread Sargun Dhillon
I think it makes sense to restrict Checmate to loading programs that have been 
compiled with the current kernel ABI. We can further stabilize the ABI, and 
perhaps lift this restriction later.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 kernel/bpf/syscall.c | 2 +-
 samples/bpf/checmate1_kern.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 228f962..2a37b4d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -741,7 +741,7 @@ static int bpf_prog_load(union bpf_attr *attr)
if (attr->insn_cnt >= BPF_MAXINSNS)
return -EINVAL;
 
-   if (type == BPF_PROG_TYPE_KPROBE &&
+   if ((type == BPF_PROG_TYPE_KPROBE || type == BPF_PROG_TYPE_CHECMATE) &&
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
 
diff --git a/samples/bpf/checmate1_kern.c b/samples/bpf/checmate1_kern.c
index f78b66b..d4ec1fa 100644
--- a/samples/bpf/checmate1_kern.c
+++ b/samples/bpf/checmate1_kern.c
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include "bpf_helpers.h"
+#include 
 
 SEC("checmate")
 int prog(struct checmate_ctx *ctx)
@@ -24,4 +25,4 @@ int prog(struct checmate_ctx *ctx)
 }
 
 char _license[] SEC("license") = "GPL";
-
+u32 _version SEC("version") = LINUX_VERSION_CODE;
-- 
2.7.4



[RFC 0/4] RFC: Add Checmate, BPF-driven minor LSM

2016-08-04 Thread Sargun Dhillon
I distributed this patchset to linux-security-mod...@vger.kernel.org earlier, 
but based on the fact that the archive is down, and this is a fairly 
broad-sweeping proposal, I figured I'd grow the audience a little bit. Sorry
if you received this multiple times.

I've begun building out the skeleton of a Linux Security Module, and I'd like 
to 
get feedback on it. It's a skeleton, and I've only populated a few hooks, so 
I'm 
mostly looking for input on the general proposal, interest, and design. It's a 
minor LSM. My particular use case is one in which containers are being 
dynamically deployed to machines by internal developers in a different group. 
The point of Checmate is to act as an extensible bed for _safe_, complex 
security policies. It's nice to enable dynamic security policies that can be 
defined in C, and change as necessary, without ever having to patch, or 
rebuild 
the kernel.

For many of these containers, the security policies can be fairly nuanced. One 
particular one to take into account is network security. Often times, 
administrators want to prevent ingress, and egress connectivity except from a 
few select IPs. Egress filtering can be managed using net_cls, but without 
modifying running software, it's non-trivial to attach a filter to all sockets 
being created within a container. The inet_conn_request, socket_recvmsg, 
socket_sock_rcv_skb hooks make this trivial to implement. 

Other times, containers need to be throttled in places where there's not really 
a good place to impose that policy for software which isn't built in-house.  If 
one wants to limit file creations/sec, or reject I/O under certain 
characteristics, there's not a great place to do it now. This gives engineers a 
mechanism to write those policies. 

This same flexibility can be used to take existing programs and enable safe BPF 
helpers to modify memory to allow rules to pass. One example that I prototyped 
was Docker's port mapping, which has an overhead (DNAT), and there's some loss 
of fidelity in the BSD Socket API to identify what's going on. Instead, we can 
just rewrite the port in a bind, based upon some data in a BPF map, and a 
cgroup 
match.

I can actually see other minor security modules being implemented in Checmate, 
for example, Yama, or the recently proposed Hardchroot could be reimplemented 
in 
BPF. Potentially, they could even be API compatible.

Although, at first, much of this sounds like seccomp, it's quite different. For
one, what we can do in the security hooks is more complex (access to kernel
pointers). The other side of this is we can have effects on a system-wide,
or cgroup level. This also circumvents the need for CRIU-friendly policies.

Lastly, the flexibility of this mechanism allows for prevention of security
vulnerabilities which are often complex in nature and require the interaction
of multiple hooks (CVE-2014-9717 is a good example), and although ksplice,
and livepatch exist, they're not always easy to use, as compared to loading
a single bpf program across all kernels.

The user-facing API is exposed via prctl as it's meant to be very simple (at 
least the kernel components). It only has three operations. For a given 
security 
hook, you can attach a BPF program to it, which will add it to the set of 
programs that are executed over when the hook is hit. You can reset a hook, 
which removes all program associated with a given hook, and you can set a 
deny_reset flag on a hook to prevent anyone from resetting it. It's likely that 
an individual would want to set this in any production use case.
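
For illustration, the userspace side could look roughly like this (PR_CHECMATE
and the exact argument layout are assumptions for this sketch; only the
operation and hook constants come from the patches):

#include <sys/prctl.h>
#include <linux/checmate.h>

static int lock_down_connect(int bpf_prog_fd)
{
	/* append the program to the socket_connect hook ... */
	if (prctl(PR_CHECMATE, CHECMATE_INSTALL_HOOK,
		  CHECMATE_HOOK_SOCKET_CONNECT, bpf_prog_fd))
		return -1;
	/* ... and prevent anyone from resetting that hook afterwards */
	return prctl(PR_CHECMATE, CHECMATE_DENY_RESET,
		     CHECMATE_HOOK_SOCKET_CONNECT);
}

/* during development, CHECMATE_RESET would drop all programs on a hook:
 *	prctl(PR_CHECMATE, CHECMATE_RESET, CHECMATE_HOOK_SOCKET_CONNECT);
 */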

On the BPF side of it, all that's involved in the work in progress is to
move some of the tracing helpers into the shared helpers. For example,
it's very valuable to have access to current when enforcing a hook.
BPF programs also have access to maps, which somewhat works around
the need for security blobs in some cases.
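
As a sketch of the maps-instead-of-blobs point (whether all of these helpers
end up exposed to the Checmate program type, and the deny return-value
convention, are assumptions here):

struct bpf_map_def SEC("maps") task_state = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u32),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

SEC("checmate")
int prog(struct checmate_ctx *ctx)
{
	u32 tgid = bpf_get_current_pid_tgid() >> 32;
	u32 *flags = bpf_map_lookup_elem(&task_state, &tgid);

	/* a previously flagged task gets denied, everything else passes */
	if (flags && (*flags & 1))
		return -1;	/* surfaced as -EPERM, by assumption */
	return 0;
}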

I would love to know what y'all think.

Sargun Dhillon (4):
  bpf: move tracing helpers to shared helpers
  bpf, security: Add Checmate
  security/checmate: Add Checmate sample
  bpf: Restrict Checmate bpf programs to current kernel ABI

 include/linux/bpf.h  |   2 +
 include/linux/checmate.h |  38 +
 include/uapi/linux/Kbuild|   1 +
 include/uapi/linux/bpf.h |   1 +
 include/uapi/linux/checmate.h|  65 +
 include/uapi/linux/prctl.h   |   3 +
 kernel/bpf/helpers.c |  34 +
 kernel/bpf/syscall.c |   2 +-
 kernel/trace/bpf_trace.c |  33 -
 samples/bpf/Makefile |   4 +
 samples/bpf/bpf_load.c   |  11 +-
 samples/bpf/checmate1_kern.c |  28 
 samples/bpf/checmate1_user.c |  54 +++
 security/Kconfig |   1 +
 security/Makefile|   2 +
 security/checmate/Kconfig|   6 +
 security/checmate/Makefile   |   3 +
 security/checmate/checmate_bpf.c |  67 +
 security/checmate/checmate_lsm.c | 304

[RFC 2/4] bpf, security: Add Checmate

2016-08-04 Thread Sargun Dhillon
This adds the minor LSM Checmate. The purpose of Checmate is to act as an
extensible LSM in which you can load security modules. The module has a
simple API, as it's meant to have most of the logic in BPF hooks. It has
three APIs that are accessible via prctl.

As follows:
* Install hook: This appends a new BPF program to a given hook. Hook
programs themselves must be unique BPF programs.
* Reset hook: This detaches all bpf programs associated with a hook.
* Deny Reset: This locks a hook, preventing reset. In production
  operation, it's expected that the user would lock
  their hooks.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/checmate.h |  38 +
 include/uapi/linux/Kbuild|   1 +
 include/uapi/linux/bpf.h |   1 +
 include/uapi/linux/checmate.h|  65 +
 include/uapi/linux/prctl.h   |   3 +
 security/Kconfig |   1 +
 security/Makefile|   2 +
 security/checmate/Kconfig|   6 +
 security/checmate/Makefile   |   3 +
 security/checmate/checmate_bpf.c |  67 +
 security/checmate/checmate_lsm.c | 304 +++
 11 files changed, 491 insertions(+)
 create mode 100644 include/linux/checmate.h
 create mode 100644 include/uapi/linux/checmate.h
 create mode 100644 security/checmate/Kconfig
 create mode 100644 security/checmate/Makefile
 create mode 100644 security/checmate/checmate_bpf.c
 create mode 100644 security/checmate/checmate_lsm.c

diff --git a/include/linux/checmate.h b/include/linux/checmate.h
new file mode 100644
index 000..3e492b0
--- /dev/null
+++ b/include/linux/checmate.h
@@ -0,0 +1,38 @@
+#ifndef _LINUX_CHECMATE_H_
+#define _LINUX_CHECMATE_H_ 1
+#include 
+#include 
+
+/* Miscellaneous contexts */
+struct checmate_file_open_ctx {
+   struct file *file;
+   const struct cred *cred;
+};
+
+struct checmate_task_create_ctx {
+   unsigned long clone_flags;
+};
+
+struct checmate_task_free_ctx {
+   struct task_struct *task;
+};
+
+struct checmate_socket_connect_ctx {
+   struct socket *sock;
+   struct sockaddr *address;
+   int addrlen;
+};
+
+struct checmate_ctx {
+   int hook;
+   union {
+   /* Miscellaneous contexts */
+   struct checmate_file_open_ctx   file_open_ctx;
+   struct checmate_task_create_ctx task_create_ctx;
+   struct checmate_task_free_ctx   task_free_ctx;
+   /* CONFIG_SECURITY_NET contexts */
+   struct checmate_socket_connect_ctx  
socket_connect_ctx;
+   };
+};
+
+#endif
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ec10cfe..f8670a7 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -82,6 +82,7 @@ header-y += cciss_defs.h
 header-y += cciss_ioctl.h
 header-y += cdrom.h
 header-y += cgroupstats.h
+header-y += checmate.h
 header-y += chio.h
 header-y += cm4000_cs.h
 header-y += cn_proc.h
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index da218fe..6cafb58 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SCHED_ACT,
BPF_PROG_TYPE_TRACEPOINT,
BPF_PROG_TYPE_XDP,
+   BPF_PROG_TYPE_CHECMATE,
 };
 
 #define BPF_PSEUDO_MAP_FD  1
diff --git a/include/uapi/linux/checmate.h b/include/uapi/linux/checmate.h
new file mode 100644
index 000..18af381
--- /dev/null
+++ b/include/uapi/linux/checmate.h
@@ -0,0 +1,65 @@
+#ifndef _UAPI__LINUX_CHECMATE_H__
+#define _UAPI__LINUX_CHECMATE_H__
+
+#define CHECMATE_INSTALL_HOOK 1
+#define CHECMATE_DENY_RESET 2
+#define CHECMATE_RESET 3
+
+enum checmate_hook {
+   CHECMATE_HOOK_UNSPEC,
+   /* CONFIG_SECURITY_NET hooks */
+   CHECMATE_HOOK_UNIX_STREAM_CONNECT,
+   CHECMATE_HOOK_UNIX_MAY_SEND,
+   CHECMATE_HOOK_SOCKET_CREATE,
+   CHECMATE_HOOK_SOCKET_POST_CREATE,
+   CHECMATE_HOOK_SOCKET_BIND,
+   CHECMATE_HOOK_SOCKET_CONNECT,
+   CHECMATE_HOOK_SOCKET_LISTEN,
+   CHECMATE_HOOK_SOCKET_ACCEPT,
+   CHECMATE_HOOK_SOCKET_SENDMSG,
+   CHECMATE_HOOK_SOCKET_RECVMSG,
+   CHECMATE_HOOK_SOCKET_GETSOCKNAME,
+   CHECMATE_HOOK_SOCKET_GETPEERNAME,
+   CHECMATE_HOOK_SOCKET_GETSOCKOPT,
+   CHECMATE_HOOK_SOCKET_SETSOCKOPT,
+   CHECMATE_HOOK_SOCKET_SHUTDOWN,
+   CHECMATE_HOOK_SOCKET_SOCK_RCV_SKB,
+   CHECMATE_HOOK_SOCKET_GETPEERSEC_STREAM,
+   CHECMATE_HOOK_SOCKET_GETPEERSEC_DGRAM,
+   CHECMATE_HOOK_SK_ALLOC_SECURITY,
+   CHECMATE_HOOK_SK_FREE_SECURITY,
+   CHECMATE_HOOK_SK_CLONE_SECURITY,
+   CHECMATE_HOOK_SK_GETSECID,
+   CHECMATE_HOOK_SOCK_GRAFT,
+   CHECMATE_HOOK_INET_CONN_REQUEST,
+   CHECMATE_HOOK_INET_CSK_CLONE,
+   CHECMATE_HOOK_INET_CONN_ESTABLISHED,
+   CHECMATE_HOOK_SECMARK_RELABEL_

[RFC 1/4] bpf: move tracing helpers to shared helpers

2016-08-04 Thread Sargun Dhillon
Move bpf_probe_read and bpf_get_current_task to the shared helpers
so that Checmate can use them.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 include/linux/bpf.h  |  2 ++
 kernel/bpf/helpers.c | 34 ++
 kernel/trace/bpf_trace.c | 33 -
 3 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1113423..4e1fa57 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -314,6 +314,8 @@ extern const struct bpf_func_proto 
bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_current_task_proto;
+extern const struct bpf_func_proto bpf_probe_read_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ea3afb..c439afc 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* If kernel subsystem is allowing eBPF programs to call this function,
  * inside its own verifier_ops->get_func_proto() callback it should return
@@ -186,3 +187,36 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
.arg1_type  = ARG_PTR_TO_RAW_STACK,
.arg2_type  = ARG_CONST_STACK_SIZE,
 };
+
+static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   return (long) current;
+}
+
+const struct bpf_func_proto bpf_get_current_task_proto = {
+   .func   = bpf_get_current_task,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+};
+
+static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   void *dst = (void *) (long) r1;
+   int ret, size = (int) r2;
+   void *unsafe_ptr = (void *) (long) r3;
+
+   ret = probe_kernel_read(dst, unsafe_ptr, size);
+   if (unlikely(ret < 0))
+   memset(dst, 0, size);
+
+   return ret;
+}
+
+const struct bpf_func_proto bpf_probe_read_proto = {
+   .func   = bpf_probe_read,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_RAW_STACK,
+   .arg2_type  = ARG_CONST_STACK_SIZE,
+   .arg3_type  = ARG_ANYTHING,
+};
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b20438f..f7a107b 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -59,28 +59,6 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
-static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-   void *dst = (void *) (long) r1;
-   int ret, size = (int) r2;
-   void *unsafe_ptr = (void *) (long) r3;
-
-   ret = probe_kernel_read(dst, unsafe_ptr, size);
-   if (unlikely(ret < 0))
-   memset(dst, 0, size);
-
-   return ret;
-}
-
-static const struct bpf_func_proto bpf_probe_read_proto = {
-   .func   = bpf_probe_read,
-   .gpl_only   = true,
-   .ret_type   = RET_INTEGER,
-   .arg1_type  = ARG_PTR_TO_RAW_STACK,
-   .arg2_type  = ARG_CONST_STACK_SIZE,
-   .arg3_type  = ARG_ANYTHING,
-};
-
 static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 {
void *unsafe_ptr = (void *) (long) r1;
@@ -365,17 +343,6 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void 
*meta, u64 meta_size,
	return __bpf_perf_event_output(regs, map, flags, &raw);
 }
 
-static u64 bpf_get_current_task(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-   return (long) current;
-}
-
-static const struct bpf_func_proto bpf_get_current_task_proto = {
-   .func   = bpf_get_current_task,
-   .gpl_only   = true,
-   .ret_type   = RET_INTEGER,
-};
-
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id 
func_id)
 {
switch (func_id) {
-- 
2.7.4



[RFC 3/4] security/checmate: Add Checmate sample

2016-08-04 Thread Sargun Dhillon
The Checmate sample installs a policy barring new AF_INET connections
to port 1. We install the hook, and show an example of connect
returning EPERM, and then reset the policy.

If this is running concurrently with other policy engines, bad things
could happen.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
---
 samples/bpf/Makefile |  4 
 samples/bpf/bpf_load.c   | 11 ++---
 samples/bpf/checmate1_kern.c | 27 ++
 samples/bpf/checmate1_user.c | 54 
 4 files changed, 93 insertions(+), 3 deletions(-)
 create mode 100644 samples/bpf/checmate1_kern.c
 create mode 100644 samples/bpf/checmate1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 90ebf7d..83e1da8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -24,6 +24,7 @@ hostprogs-y += test_overhead
 hostprogs-y += test_cgrp2_array_pin
 hostprogs-y += xdp1
 hostprogs-y += xdp2
+hostprogs-y += checmate1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -49,6 +50,7 @@ test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
 xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
 # reuse xdp1 source intentionally
 xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
+checmate1-objs := bpf_load.o libbpf.o checmate1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -74,6 +76,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
 always += test_cgrp2_tc_kern.o
 always += xdp1_kern.o
 always += xdp2_kern.o
+always += checmate1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -97,6 +100,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
 HOSTLOADLIBES_xdp1 += -lelf
 HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_checmate1 += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on 
cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc 
CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 0cfda23..49e84e7b 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -51,6 +51,7 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
bool is_xdp = strncmp(event, "xdp", 3) == 0;
+   bool is_checmate = strncmp(event, "checmate", 8) == 0;
enum bpf_prog_type prog_type;
char buf[256];
int fd, efd, err, id;
@@ -69,6 +70,8 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
prog_type = BPF_PROG_TYPE_TRACEPOINT;
} else if (is_xdp) {
prog_type = BPF_PROG_TYPE_XDP;
+   } else if (is_checmate) {
+   prog_type = BPF_PROG_TYPE_CHECMATE;
} else {
printf("Unknown event '%s'\n", event);
return -1;
@@ -82,7 +85,7 @@ static int load_and_attach(const char *event, struct bpf_insn 
*prog, int size)
 
prog_fd[prog_cnt++] = fd;
 
-   if (is_xdp)
+   if (is_xdp || is_checmate)
return 0;
 
if (is_socket) {
@@ -326,7 +329,8 @@ int load_bpf_file(char *path)
memcmp(shname_prog, "kretprobe/", 10) == 0 ||
memcmp(shname_prog, "tracepoint/", 11) == 0 ||
memcmp(shname_prog, "xdp", 3) == 0 ||
-   memcmp(shname_prog, "socket", 6) == 0)
+   memcmp(shname_prog, "socket", 6) == 0 ||
+   memcmp(shname_prog, "checmate", 8) == 0)
load_and_attach(shname_prog, insns, 
data_prog->d_size);
}
}
@@ -344,7 +348,8 @@ int load_bpf_file(char *path)
memcmp(shname, "kretprobe/", 10) == 0 ||
memcmp(shname, "tracepoint/", 11) == 0 ||
memcmp(shname, "xdp", 3) == 0 ||
-   memcmp(shname, "socket", 6) == 0)
+   memcmp(shname, "socket", 6) == 0 ||
+   memcmp(shname, "checmate", 8) == 0)
load_and_attach(shname, data->d_buf, data->d_size);
}
 
diff --git a/samples/bpf/checmate1_kern.c b/samples/bpf/checmate1_kern.c
new file mode 100644
index 000..f78b66b
--- /dev/null
+++ b/samples/bpf/checmate1_kern.c
@@ -0,0 +1,27 @@
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("checmate")
+int prog(struct checmate_ctx *ctx)
+{
+   struct sockaddr address;
+   struct sockaddr_in *in_addr;
+   char fmt[] = "Denying access on port 1\n";
+
+   bpf_probe_read(&address, siz
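
/* The archived diff is truncated above. A minimal sketch of how the hook
 * body could continue, matching the sample's description (deny new AF_INET
 * connections to port 1); the ctx field names follow the checmate_ctx
 * definition in patch 2/4, and the deny convention is an assumption:
 */
	bpf_probe_read(&address, sizeof(address),
		       ctx->socket_connect_ctx.address);
	if (address.sa_family != AF_INET)
		return 0;
	in_addr = (struct sockaddr_in *)&address;
	if (be16_to_cpu(in_addr->sin_port) == 1) {
		bpf_trace_printk(fmt, sizeof(fmt));
		return -1;	/* surfaced to the caller as -EPERM */
	}
	return 0;
}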

[PATCH net-next v7 0/2] bpf: add bpf_probe_write_user helper & example

2016-07-25 Thread Sargun Dhillon
This patch series contains two patches that add support for a probe_write
helper to BPF programs. This allows them to manipulate user memory during
the course of tracing. The second patch in the series has an example that
uses it, in one the intended ways to divert execution.

Thanks to Alexei Starovoitov and Daniel Borkmann for being patient, reviewing,
and helping me get familiar with the code base. I've made changes based on
their recommendations.

This helper should be considered for experimental usage and debugging, so we
print a warning to dmesg, along with the command and pid, when someone tries
to install a proglet that uses it. A follow-up patchset will contain a
mechanism to verify the safety of the probe beyond what was done by hand.

v1->v2: restrict writing to user space, as opposed to globally
v2->v3: Fixed formatting issues
v3->v4: Rename copy_to_user -> bpf_probe_write
	Simplify checking of whether or not it's safe to write
	Add warnings to dmesg
v4->v5: Raise warning level
	Cleanup location of warning code
	Make test fail when helper is broken
v5->v6: General formatting cleanup
	Rename bpf_probe_write -> bpf_probe_write_user
v6->v7: More formatting cleanup.
	Clarifying a few comments
	Clarified log message

Sargun Dhillon (2):
  bpf: Add bpf_probe_write_user BPF helper to be called in tracers
  samples/bpf: Add test/example of using bpf_probe_write_user bpf helper

 include/uapi/linux/bpf.h | 10 
 kernel/trace/bpf_trace.c | 45 ++
 samples/bpf/Makefile |  4 ++
 samples/bpf/bpf_helpers.h|  2 +
 samples/bpf/test_probe_write_user_kern.c | 52 +
 samples/bpf/test_probe_write_user_user.c | 78 
 6 files changed, 191 insertions(+)
 create mode 100644 samples/bpf/test_probe_write_user_kern.c
 create mode 100644 samples/bpf/test_probe_write_user_user.c

-- 
2.7.4



[PATCH net-next v7 1/2] bpf: Add bpf_probe_write_user BPF helper to be called in tracers

2016-07-25 Thread Sargun Dhillon
This allows user memory to be written to during the course of a kprobe.
It shouldn't be used to implement any kind of security mechanism
because of TOC-TOU attacks, but rather to debug, divert, and
manipulate execution of semi-cooperative processes.

Although it uses probe_kernel_write, we limit the address space
the probe can write into by checking the space with access_ok.
We do this as opposed to calling copy_to_user directly, in order
to avoid sleeping. In addition we ensure the thread's current fs
/ segment is USER_DS and the thread isn't exiting nor a kernel thread.

Given this feature is meant for experiments, and it has a risk of
crashing the system and running programs, we print a warning when
a proglet that attempts to use this helper is installed,
along with the pid and process name.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Acked-by: Alexei Starovoitov <a...@kernel.org>
---
 include/uapi/linux/bpf.h  | 10 ++
 kernel/trace/bpf_trace.c  | 45 +
 samples/bpf/bpf_helpers.h |  2 ++
 3 files changed, 57 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2b7076f..da218fe 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -365,6 +365,16 @@ enum bpf_func_id {
 */
BPF_FUNC_get_current_task,
 
+   /**
+* bpf_probe_write_user(void *dst, void *src, int len)
+* safely attempt to write to a location
+* @dst: destination address in userspace
+* @src: source address on stack
+* @len: number of bytes to copy
+* Return: 0 on success or negative error
+*/
+   BPF_FUNC_probe_write_user,
+
__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a12bbd3..b20438f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,6 +81,49 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
.arg3_type  = ARG_ANYTHING,
 };
 
+static u64 bpf_probe_write_user(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+   void *unsafe_ptr = (void *) (long) r1;
+   void *src = (void *) (long) r2;
+   int size = (int) r3;
+
+   /*
+* Ensure we're in user context which is safe for the helper to
+* run. This helper has no business in a kthread.
+*
+* access_ok() should prevent writing to non-user memory, but in
+* some situations (nommu, temporary switch, etc) access_ok() does
+* not provide enough validation, hence the check on KERNEL_DS.
+*/
+
+   if (unlikely(in_interrupt() ||
+current->flags & (PF_KTHREAD | PF_EXITING)))
+   return -EPERM;
+   if (unlikely(segment_eq(get_fs(), KERNEL_DS)))
+   return -EPERM;
+   if (!access_ok(VERIFY_WRITE, unsafe_ptr, size))
+   return -EPERM;
+
+   return probe_kernel_write(unsafe_ptr, src, size);
+}
+
+static const struct bpf_func_proto bpf_probe_write_user_proto = {
+   .func   = bpf_probe_write_user,
+   .gpl_only   = true,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_ANYTHING,
+   .arg2_type  = ARG_PTR_TO_STACK,
+   .arg3_type  = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
+{
+   pr_warn_ratelimited("%s[%d] is installing a program with 
bpf_probe_write_user helper that may corrupt user memory!",
+   current->comm, task_pid_nr(current));
+
+   return &bpf_probe_write_user_proto;
+}
+
 /*
  * limited trace_printk()
  * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
@@ -362,6 +405,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum 
bpf_func_id func_id)
return _get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return _perf_event_read_proto;
+   case BPF_FUNC_probe_write_user:
+   return bpf_get_probe_write_proto();
default:
return NULL;
}
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index 84e3fd9..217c8d5 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -41,6 +41,8 @@ static int (*bpf_perf_event_output)(void *ctx, void *map, int 
index, void *data,
(void *) BPF_FUNC_perf_event_output;
 static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
(void *) BPF_FUNC_get_stackid;
+static int (*bpf_probe_write_user)(void *dst, void *src, int size) =
+   (void *) BPF_FUNC_probe_write_user;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.7.4



[PATCH net-next v7 2/2] samples/bpf: Add test/example of using bpf_probe_write_user bpf helper

2016-07-25 Thread Sargun Dhillon
This example shows using a kprobe to act as a dnat mechanism to divert
traffic for arbitrary endpoints. It rewrites the arguments to a syscall
while they're still in userspace, and before the syscall has a chance
to copy the argument into kernel space.

Although this is an example, it also acts as a test because the mapped
address is 255.255.255.255:555 -> real address, and that's not a legal
address to connect to. If the helper is broken, the example will fail
on the intermediate steps, as well as the final step to verify the
rewrite of userspace memory succeeded.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Acked-by: Alexei Starovoitov <a...@kernel.org>
---
 samples/bpf/Makefile |  4 ++
 samples/bpf/test_probe_write_user_kern.c | 52 +
 samples/bpf/test_probe_write_user_user.c | 78 
 3 files changed, 134 insertions(+)
 create mode 100644 samples/bpf/test_probe_write_user_kern.c
 create mode 100644 samples/bpf/test_probe_write_user_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index d2d2b35..90ebf7d 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
 hostprogs-y += offwaketime
@@ -37,6 +38,7 @@ tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
 tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
+test_probe_write_user-objs := bpf_load.o libbpf.o test_probe_write_user_user.o
 trace_output-objs := bpf_load.o libbpf.o trace_output_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
@@ -59,6 +61,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += test_probe_write_user_kern.o
 always += trace_output_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o
@@ -85,6 +88,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
diff --git a/samples/bpf/test_probe_write_user_kern.c 
b/samples/bpf/test_probe_write_user_kern.c
new file mode 100644
index 000..3a677c8
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dnat_map = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(struct sockaddr_in),
+   .value_size = sizeof(struct sockaddr_in),
+   .max_entries = 256,
+};
+
+/* kprobe is NOT a stable ABI:
+ * kernel functions can be removed, renamed, or completely change semantics.
+ * The number of arguments and their positions can change, etc.
+ * In such a case this bpf+kprobe example will no longer be meaningful.
+ *
+ * This example sits on a syscall, and the syscall ABI is relatively
+ * stable; of course, across platforms and over time, the ABI may change.
+ */
+SEC("kprobe/sys_connect")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   struct sockaddr_in new_addr, orig_addr = {};
+   struct sockaddr_in *mapped_addr;
+   void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx);
+   int sockaddr_len = (int)PT_REGS_PARM3(ctx);
+
+   if (sockaddr_len > sizeof(orig_addr))
+   return 0;
+
+   if (bpf_probe_read(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
+   return 0;
+
+   mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
+   if (mapped_addr != NULL) {
+   memcpy(&new_addr, mapped_addr, sizeof(new_addr));
+   bpf_probe_write_user(sockaddr_arg, &new_addr,
+sizeof(new_addr));
+   }
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
new file mode 100644
index 0000000..a44bf34
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <sys/socket.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+int main(int ac, char **argv)
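The listing cuts the user-space program off at main(). As a rough guide to
what the full test does, here is a minimal sketch, not the verbatim file: it
assumes a TCP listener is already bound to 127.0.0.1:1234 (the port and the
exact assertions are illustrative), and it leans on the samples' helpers --
load_bpf_file() and map_fd[] from bpf_load.h, and bpf_update_elem() from the
samples' libbpf wrapper.

#include <assert.h>
#include <arpa/inet.h>
#include <linux/bpf.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include "libbpf.h"
#include "bpf_load.h"

int main(int ac, char **argv)
{
        struct sockaddr_in fake_addr = {}, real_addr = {};
        int sock;

        /* Loads test_probe_write_user_kern.o, attaches the kprobe,
         * and fills in map_fd[0] for dnat_map.
         */
        assert(load_bpf_file("test_probe_write_user_kern.o") == 0);

        fake_addr.sin_family = AF_INET;
        fake_addr.sin_port = htons(555);
        inet_aton("255.255.255.255", &fake_addr.sin_addr);

        real_addr.sin_family = AF_INET;
        real_addr.sin_port = htons(1234);   /* assumed local listener */
        inet_aton("127.0.0.1", &real_addr.sin_addr);

        /* Divert connect()s aimed at the illegal fake address. */
        assert(bpf_update_elem(map_fd[0], &fake_addr, &real_addr,
                               BPF_ANY) == 0);

        sock = socket(AF_INET, SOCK_STREAM, 0);
        /* The kprobe rewrites the sockaddr in this process's memory
         * before the kernel copies it in, so this connect() should
         * reach 127.0.0.1:1234 instead of failing on 255.255.255.255.
         */
        assert(connect(sock, (struct sockaddr *)&fake_addr,
                       sizeof(fake_addr)) == 0);
        close(sock);
        printf("bpf_probe_write_user rewrite verified\n");
        return 0;
}

Run as root; if the helper or the rewrite is broken, one of the asserts
fires well before the final connect() check.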

Relax kern_version constraints on bpf kprobes?

2016-07-23 Thread Sargun Dhillon
In kernel/bpf/syscall.c we restrict the loading of bpf kprobe programs:
attr.kern_version must be exactly equal to the kernel the user is running at
the moment. This makes a lot of sense because kprobes can touch lots of
unstable bits of the kernel ABI.

Unfortunately, this makes it really difficult to ship binary bpf programs
for debugging, and most customers don't want to go through all the steps of
compiling and installing bpf programs against the specific kernel that
their vendor shipped.

This is especially problematic when the probe is touching only stable ABIs
(syscalls), or alternatively is just logging performance events. I realize
that we can rewrite this field pretty easily by reading the running kernel's
version at load time and patching it in, but it's kind of a pain.

For programs that we know are safe, is there a mechanism by which we can
bypass this check, and tell the loader that we know what we're doing
since these programs are only accessible to CAP_SYS_ADMIN?
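For reference, the load-time rewrite mentioned above looks roughly like the
sketch below; it is an illustration, not code from this thread. The idea is
to derive the running kernel's version via uname(2) and stuff it into
attr.kern_version before BPF_PROG_LOAD, so a shipped binary program passes
the check on whatever kernel it lands on. Parsing the release string this
naively ignores vendor suffixes and sublevels, which is part of why it's a
pain:

#include <linux/version.h>
#include <stdio.h>
#include <sys/utsname.h>

/* Sketch: compute the value the kprobe version check expects from the
 * running kernel, instead of baking LINUX_VERSION_CODE in at build time.
 */
static unsigned int running_kern_version(void)
{
        struct utsname un;
        unsigned int major = 0, minor = 0, patch = 0;

        uname(&un);
        /* e.g. "4.7.0-rc7" -> KERNEL_VERSION(4, 7, 0); naive on purpose */
        sscanf(un.release, "%u.%u.%u", &major, &minor, &patch);
        return KERNEL_VERSION(major, minor, patch);
}

/* ...then, when filling union bpf_attr for BPF_PROG_LOAD:
 *
 *      attr.kern_version = running_kern_version();
 */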


[PATCH net-next v6 2/2] samples/bpf: Add test/example of using bpf_probe_write_user bpf helper

2016-07-23 Thread Sargun Dhillon
This example shows using a kprobe to act as a DNAT mechanism to divert
traffic for arbitrary endpoints. It rewrites the arguments to a syscall
while they're still in userspace, before the syscall has a chance to
copy them into kernel space.

Although this is an example, it also acts as a test because the mapped
address is 255.255.255.255:555 -> real address, and that's not a legal
address to connect to. If the helper is broken, the example will fail
on the intermediate steps, as well as the final step to verify the
rewrite of userspace memory succeeded.

Signed-off-by: Sargun Dhillon <sar...@sargun.me>
Cc: Alexei Starovoitov <a...@kernel.org>
Cc: Daniel Borkmann <dan...@iogearbox.net>
Acked-by: Alexei Starovoitov <a...@kernel.org>
---
 samples/bpf/Makefile |  4 ++
 samples/bpf/test_probe_write_user_kern.c | 52 +
 samples/bpf/test_probe_write_user_user.c | 78 
 3 files changed, 134 insertions(+)
 create mode 100644 samples/bpf/test_probe_write_user_kern.c
 create mode 100644 samples/bpf/test_probe_write_user_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index d2d2b35..90ebf7d 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -14,6 +14,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
 hostprogs-y += offwaketime
@@ -37,6 +38,7 @@ tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
 tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
+test_probe_write_user-objs := bpf_load.o libbpf.o test_probe_write_user_user.o
 trace_output-objs := bpf_load.o libbpf.o trace_output_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o
 offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
@@ -59,6 +61,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += test_probe_write_user_kern.o
 always += trace_output_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o
@@ -85,6 +88,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
diff --git a/samples/bpf/test_probe_write_user_kern.c b/samples/bpf/test_probe_write_user_kern.c
new file mode 100644
index 0000000..3a677c8
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_kern.c
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Sargun Dhillon <sar...@sargun.me>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dnat_map = {
+   .type = BPF_MAP_TYPE_HASH,
+   .key_size = sizeof(struct sockaddr_in),
+   .value_size = sizeof(struct sockaddr_in),
+   .max_entries = 256,
+};
+
+/* kprobe is NOT a stable ABI:
+ * kernel functions can be removed, renamed, or completely change semantics.
+ * The number of arguments and their positions can change, etc.
+ * In such a case this bpf+kprobe example will no longer be meaningful.
+ *
+ * This example sits on a syscall, and the syscall ABI is relatively
+ * stable; of course, across platforms and over time, the ABI may change.
+ */
+SEC("kprobe/sys_connect")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   struct sockaddr_in new_addr, orig_addr = {};
+   struct sockaddr_in *mapped_addr;
+   void *sockaddr_arg = (void *)PT_REGS_PARM2(ctx);
+   int sockaddr_len = (int)PT_REGS_PARM3(ctx);
+
+   if (sockaddr_len > sizeof(orig_addr))
+   return 0;
+
+   if (bpf_probe_read(&orig_addr, sizeof(orig_addr), sockaddr_arg) != 0)
+   return 0;
+
+   mapped_addr = bpf_map_lookup_elem(&dnat_map, &orig_addr);
+   if (mapped_addr != NULL) {
+   memcpy(&new_addr, mapped_addr, sizeof(new_addr));
+   bpf_probe_write_user(sockaddr_arg, &new_addr,
+sizeof(new_addr));
+   }
+   return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/test_probe_write_user_user.c b/samples/bpf/test_probe_write_user_user.c
new file mode 100644
index 0000000..a44bf34
--- /dev/null
+++ b/samples/bpf/test_probe_write_user_user.c
@@ -0,0 +1,78 @@
+#include <stdio.h>
+#include <assert.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include <sys/socket.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+int main(int ac, char **argv)
