Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Daniel Borkmann

On 02/07/2017 01:02 AM, Alexei Starovoitov wrote:

On 2/6/17 3:39 PM, Daniel Borkmann wrote:

On 02/04/2017 04:34 AM, Alexei Starovoitov wrote:
[...]

+BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb)
+{
+struct net_device *dev = skb->dev;
+
+if (!dev)
+return 0;
+return proc_get_ns_devid_inum(_net(dev)->ns);
+}
+
+static const struct bpf_func_proto bpf_skb_netns_id_proto = {
+.func= bpf_skb_netns_id,
+.gpl_only= false,
+.ret_type= RET_INTEGER,
+.arg1_type= ARG_PTR_TO_CTX,
+};
+
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
  case BPF_FUNC_trace_printk:
  if (capable(CAP_SYS_ADMIN))
  return bpf_get_trace_printk_proto();
+case BPF_FUNC_sk_netns_id:
+return _skb_netns_id_proto;
  default:
  return NULL;
  }


Btw, I think here's an oversight that would still need to be
fixed. Above would mean that trace printk from unprivileged would
fall through and use _skb_netns_id_proto as proto now instead
of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the
BPF_FUNC_trace_printk case, not in its fall-through path. Looks
like Chenbo in his get_socket_cookie missed this, too. Other than
that BPF bits seem good to me.


Ahh, right. Good catch.
I'll add 'else return NULL;' otherwise somebody might step on it again.
Thanks Daniel!


I guess an explicit comment "/* fall-through */" would also be fine
and get noticed. Thanks!


Eric,
still waiting for your review of nsfs.c bits.





Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Alexei Starovoitov

On 2/6/17 3:39 PM, Daniel Borkmann wrote:

On 02/04/2017 04:34 AM, Alexei Starovoitov wrote:
[...]

+BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb)
+{
+struct net_device *dev = skb->dev;
+
+if (!dev)
+return 0;
+return proc_get_ns_devid_inum(_net(dev)->ns);
+}
+
+static const struct bpf_func_proto bpf_skb_netns_id_proto = {
+.func= bpf_skb_netns_id,
+.gpl_only= false,
+.ret_type= RET_INTEGER,
+.arg1_type= ARG_PTR_TO_CTX,
+};
+
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
  case BPF_FUNC_trace_printk:
  if (capable(CAP_SYS_ADMIN))
  return bpf_get_trace_printk_proto();
+case BPF_FUNC_sk_netns_id:
+return _skb_netns_id_proto;
  default:
  return NULL;
  }


Btw, I think here's an oversight that would still need to be
fixed. Above would mean that trace printk from unprivileged would
fall through and use _skb_netns_id_proto as proto now instead
of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the
BPF_FUNC_trace_printk case, not in its fall-through path. Looks
like Chenbo in his get_socket_cookie missed this, too. Other than
that BPF bits seem good to me.


Ahh, right. Good catch.
I'll add 'else return NULL;' otherwise somebody might step on it again.
Thanks Daniel!

Eric,
still waiting for your review of nsfs.c bits.



Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-06 Thread Daniel Borkmann

On 02/04/2017 04:34 AM, Alexei Starovoitov wrote:
[...]

+BPF_CALL_1(bpf_skb_netns_id, struct sk_buff *, skb)
+{
+   struct net_device *dev = skb->dev;
+
+   if (!dev)
+   return 0;
+   return proc_get_ns_devid_inum(_net(dev)->ns);
+}
+
+static const struct bpf_func_proto bpf_skb_netns_id_proto = {
+   .func   = bpf_skb_netns_id,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+};
+
  static const struct bpf_func_proto *
  sk_filter_func_proto(enum bpf_func_id func_id)
  {
@@ -2620,6 +2649,8 @@ sk_filter_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_trace_printk:
if (capable(CAP_SYS_ADMIN))
return bpf_get_trace_printk_proto();
+   case BPF_FUNC_sk_netns_id:
+   return _skb_netns_id_proto;
default:
return NULL;
}


Btw, I think here's an oversight that would still need to be
fixed. Above would mean that trace printk from unprivileged would
fall through and use _skb_netns_id_proto as proto now instead
of NULL. So BPF_FUNC_sk_netns_id needs to be placed above the
BPF_FUNC_trace_printk case, not in its fall-through path. Looks
like Chenbo in his get_socket_cookie missed this, too. Other than
that BPF bits seem good to me.


@@ -2700,6 +2731,17 @@ xdp_func_proto(enum bpf_func_id func_id)
  }


Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-05 Thread Andy Lutomirski
On Sun, Feb 5, 2017 at 11:24 AM, David Ahern  wrote:
> On 2/3/17 8:34 PM, Alexei Starovoitov wrote:
>> Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns
>> unique value that identifies netns of given socket or dev_net(skb->dev)
>> The upper 32-bits of the return value contain device id where namespace
>> filesystem resides and lower 32-bits contain inode number within that 
>> filesystem.
>> It's the same as
>>  struct stat st;
>>  stat("/proc/pid/ns/net", );
>>  return (st->st_dev << 32)  | st->st_ino;
>>
> ...
>
>> can be considered a new feature, whereas for cgroup_sock
>> programs that modify sk->bound_dev_if (like 'ip vrf' does)
>> it's a bug fix, since 'ip vrf' needs to be netns aware.
>>
>
>
> LGTM.
>
> Reviewed-by: David Ahern 
> Tested-by: David Ahern 
>
> Updated patches for vrf in network namespaces are here:
> https://github.com/dsahern/iproute2 vrf/ip-vrf
>
>
> root@kenny-jessie2:~# ip vrf exec red bash
>
> root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254
> PING 10.100.1.254 (10.100.1.254) 56(84) bytes of data.
> 64 bytes from 10.100.1.254: icmp_seq=1 ttl=64 time=0.230 ms
>
> --- 10.100.1.254 ping statistics ---
> 1 packets transmitted, 1 received, 0% packet loss, time 0ms
> rtt min/avg/max/mdev = 0.230/0.230/0.230/0.000 ms
>
> root@kenny-jessie2:red:~# unshare -n
>
> root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254
> connect: Network is unreachable
>
>
> Andy: thank you for the feedback on the 'ip vrf' use case. I believe this 
> kernel patch + my iproute2 patches address the issues mentioned so far. 
> Specifically, the transcript above shows your concern about 'unshare -n' case 
> is handled. In one of the many responses last night, you mentioned I have 'a 
> somewhat kludgey fix that gets the "ip netns" case'. If you can elaborate on 
> 'somewhat kludgey', I can fix those this week as well.

What I meant was: fixing it in iproute2 without kernel changes would
be kludgey and incomplete.  You seem to have fixed it by depending on
this kernel patch.

--Andy


Re: [PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-05 Thread David Ahern
On 2/3/17 8:34 PM, Alexei Starovoitov wrote:
> Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns
> unique value that identifies netns of given socket or dev_net(skb->dev)
> The upper 32-bits of the return value contain device id where namespace
> filesystem resides and lower 32-bits contain inode number within that 
> filesystem.
> It's the same as
>  struct stat st;
>  stat("/proc/pid/ns/net", );
>  return (st->st_dev << 32)  | st->st_ino;
> 
...

> can be considered a new feature, whereas for cgroup_sock
> programs that modify sk->bound_dev_if (like 'ip vrf' does)
> it's a bug fix, since 'ip vrf' needs to be netns aware.
> 


LGTM.

Reviewed-by: David Ahern 
Tested-by: David Ahern 

Updated patches for vrf in network namespaces are here:
https://github.com/dsahern/iproute2 vrf/ip-vrf


root@kenny-jessie2:~# ip vrf exec red bash

root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254
PING 10.100.1.254 (10.100.1.254) 56(84) bytes of data.
64 bytes from 10.100.1.254: icmp_seq=1 ttl=64 time=0.230 ms

--- 10.100.1.254 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.230/0.230/0.230/0.000 ms

root@kenny-jessie2:red:~# unshare -n

root@kenny-jessie2:red:~# ping -c1 -w1 10.100.1.254
connect: Network is unreachable


Andy: thank you for the feedback on the 'ip vrf' use case. I believe this 
kernel patch + my iproute2 patches address the issues mentioned so far. 
Specifically, the transcript above shows your concern about 'unshare -n' case 
is handled. In one of the many responses last night, you mentioned I have 'a 
somewhat kludgey fix that gets the "ip netns" case'. If you can elaborate on 
'somewhat kludgey', I can fix those this week as well.


[PATCH v3 net] bpf: add bpf_sk_netns_id() helper

2017-02-03 Thread Alexei Starovoitov
in cases where bpf programs are looking at sockets and packets
that belong to different netns, it could be useful to get an id
that uniquely identify a netns within the whole system.

Therefore introduce 'u64 bpf_sk_netns_id(sk);' helper. It returns
unique value that identifies netns of given socket or dev_net(skb->dev)
The upper 32-bits of the return value contain device id where namespace
filesystem resides and lower 32-bits contain inode number within that 
filesystem.
It's the same as
 struct stat st;
 stat("/proc/pid/ns/net", );
 return (st->st_dev << 32)  | st->st_ino;

For example to disallow raw sockets in all non-init netns
the bpf_type_cgroup_sock program can do:
if (sk->type == SOCK_RAW && bpf_sk_netns_id(sk) != 0x3f075)
  return 0;
where 0x3f075 comes from combination of st_dev and st_ino
of /proc/pid/ns/net

Note that all bpf programs types are global. The same socket filter
program can be attached to sockets in different netns,
just like cls_bpf can see ingress/egress packets of multiple
net_devices in different netns. The cgroup_bpf programs are
the most exposed to sockets and devices across netns,
but the need to identify netns applies to all.
For example, if bpf_type_cgroup_skb didn't exist the system wide
monitoring daemon could have used ld_preload mechanism and
attached the same program to see traffic from applications
across netns. Therefore make bpf_sk_netns_id() helper available
to all network related bpf program types.
For socket, cls_bpf and cgroup_skb programs this helper
can be considered a new feature, whereas for cgroup_sock
programs that modify sk->bound_dev_if (like 'ip vrf' does)
it's a bug fix, since 'ip vrf' needs to be netns aware.

Signed-off-by: Alexei Starovoitov 
---
Eric, I'v added proc_get_ns_devid_inum() to nsfs.c
right next to __ns_get_path(), so when it is time in the future
to make nsfs more namespace aware, it will be easy to adjust
both new_inode_pseudo(mnt->mnt_sb) line and proc_get_ns_devid_inum()
I thought about using ns->stashed, but it's obviously transient
inode and not usable. If later we decide to store dev_t into ns_common
it will be fine as well. We'll just change proc_get_ns_devid_inum()
without affecting user space.

v2->v3: build bot complained. s/static/static inline/. no other changes.
---
 fs/nsfs.c |  7 +++
 include/linux/proc_ns.h   |  3 ++-
 include/uapi/linux/bpf.h  | 14 +-
 net/core/filter.c | 44 ++-
 samples/bpf/bpf_helpers.h |  2 ++
 samples/bpf/sock_flags_kern.c |  2 ++
 samples/bpf/sockex1_kern.c|  2 ++
 7 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8c9fb29c6673..1a604bccef86 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -49,6 +49,13 @@ static void nsfs_evict(struct inode *inode)
ns->ops->put(ns);
 }
 
+u64 proc_get_ns_devid_inum(struct ns_common *ns)
+{
+   u64 dev = new_encode_dev(nsfs_mnt->mnt_sb->s_dev);
+
+   return (dev << 32) | ns->inum;
+}
+
 static void *__ns_get_path(struct path *path, struct ns_common *ns)
 {
struct vfsmount *mnt = nsfs_mnt;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 12cb8bd81d2d..531c16105198 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -48,7 +48,7 @@ extern int pid_ns_prepare_proc(struct pid_namespace *ns);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
 extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
-
+extern u64 proc_get_ns_devid_inum(struct ns_common *ns);
 #else /* CONFIG_PROC_FS */
 
 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
@@ -61,6 +61,7 @@ static inline int proc_alloc_inum(unsigned int *inum)
 }
 static inline void proc_free_inum(unsigned int inum) {}
 
+static inline u64 proc_get_ns_devid_inum(struct ns_common *ns) { return 0; }
 #endif /* CONFIG_PROC_FS */
 
 static inline int ns_alloc_inum(struct ns_common *ns)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eb0e87dbe9f..e5b8cf16cbaf 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -430,6 +430,17 @@ union bpf_attr {
  * @xdp_md: pointer to xdp_md
  * @delta: An positive/negative integer to be added to xdp_md.data
  * Return: 0 on success or negative on error
+ *
+ * u64 bpf_sk_netns_id(sk)
+ * Returns unique value that identifies netns of given socket or skb.
+ * The upper 32-bits of the return value contain device id where namespace
+ * filesystem resides and lower 32-bits contain inode number within
+ * that filesystem. It's the same value as:
+ *  struct stat st;
+ *  stat("/proc/pid/ns/net", );
+ *  return (st->st_dev << 32)  | st->st_ino;
+ * @sk: pointer to struct sock or struct __sk_buff
+ * Return: filesystem's device id | netns inode
  */
 #define __BPF_FUNC_MAPPER(FN)  \
FN(unspec),