Re: [bpf-next V1 PATCH 0/8] bpf/xdp: add flags argument to ndo_xdp_xmit and flag flush operation

2018-05-30 Thread Song Liu
Overall, this set looks good to me. The only suggestion I have is to add more
documentation on the expected behavior of XDP_XMIT_FLUSH in netdevice.h
(as part of 01/08).

Thanks,
Song


On Wed, May 30, 2018 at 11:00 AM, Jesper Dangaard Brouer
 wrote:
> As I mentioned in merge commit 10f678683e4 ("Merge branch 'xdp_xmit-bulking'")
> I plan to change the API for ndo_xdp_xmit once more, by adding a flags
> argument, which is done in this patchset.
>
> I know it is late in the cycle (currently at rc7), but it would be
> nice to avoid changing NDOs over several kernel releases, as it is
> annoying to vendors and distro backporters, but it is not strictly
> UAPI so it is allowed (according to Alexei).
>
> The end-goal is getting rid of the ndo_xdp_flush operation, as it will
> make it possible for drivers to implement a TXQ synchronization mechanism
> that is not necessarily derived from the CPU id (smp_processor_id).
>
> This patchset removes all callers of the ndo_xdp_flush operation, but
> it doesn't take the last step of removing it from all drivers.  This
> can be done later, or I can update the patchset on request.
>
> Micro-benchmarks only show a very small performance improvement, for
> map-redirect around ~2 ns, and for non-map redirect ~7 ns.  I've not
> benchmarked this with CONFIG_RETPOLINE, but the performance benefit
> should be more visible given we end-up removing an indirect call.
>
> ---
>
> Jesper Dangaard Brouer (8):
>   xdp: add flags argument to ndo_xdp_xmit API
>   i40e: implement flush flag for ndo_xdp_xmit
>   ixgbe: implement flush flag for ndo_xdp_xmit
>   tun: implement flush flag for ndo_xdp_xmit
>   virtio_net: implement flush flag for ndo_xdp_xmit
>   xdp: done implementing ndo_xdp_xmit flush flag for all drivers
>   bpf/xdp: non-map redirect can avoid calling ndo_xdp_flush
>   bpf/xdp: devmap can avoid calling ndo_xdp_flush
>
>
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c   |9 -
>  drivers/net/ethernet/intel/i40e/i40e_txrx.h   |3 ++-
>  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   23 +--
>  drivers/net/tun.c |   25 
> ++---
>  drivers/net/virtio_net.c  |9 -
>  include/linux/netdevice.h |7 ---
>  include/net/xdp.h |4 
>  kernel/bpf/devmap.c   |   20 +++-
>  net/core/filter.c |3 +--
>  9 files changed, 69 insertions(+), 34 deletions(-)
>
> --


Re: [bpf-next V1 PATCH 8/8] bpf/xdp: devmap can avoid calling ndo_xdp_flush

2018-05-30 Thread Song Liu
On Wed, May 30, 2018 at 11:01 AM, Jesper Dangaard Brouer
 wrote:
> The XDP_REDIRECT map devmap can avoid using ndo_xdp_flush, by instead
> instructing ndo_xdp_xmit to flush via XDP_XMIT_FLUSH flag in
> appropriate places.
>
> Notice after this patch it is possible to remove ndo_xdp_flush
> completely, as this is the last user of ndo_xdp_flush. This is left
> for later patches, to keep driver changes separate.
>
> Signed-off-by: Jesper Dangaard Brouer 
> ---
>  kernel/bpf/devmap.c |   20 +++-
>  1 file changed, 7 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 04fbd75a5274..9c846a7a8cff 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -217,7 +217,7 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
>  }
>
>  static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> -struct xdp_bulk_queue *bq)
> +  struct xdp_bulk_queue *bq, bool flush)

How about we use "int flags" instead of "bool flush" for easier extension?

Thanks,
Song

>  {
> struct net_device *dev = obj->dev;
> int sent = 0, drops = 0, err = 0;
> @@ -232,7 +232,8 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> prefetch(xdpf);
> }
>
> -   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
> +   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q,
> +flush ? XDP_XMIT_FLUSH : 0);
> if (sent < 0) {
> err = sent;
> sent = 0;
> @@ -276,7 +277,6 @@ void __dev_map_flush(struct bpf_map *map)
> for_each_set_bit(bit, bitmap, map->max_entries) {
> struct bpf_dtab_netdev *dev = 
> READ_ONCE(dtab->netdev_map[bit]);
> struct xdp_bulk_queue *bq;
> -   struct net_device *netdev;
>
> /* This is possible if the dev entry is removed by user space
>  * between xdp redirect and flush op.
> @@ -287,10 +287,7 @@ void __dev_map_flush(struct bpf_map *map)
> __clear_bit(bit, bitmap);
>
> bq = this_cpu_ptr(dev->bulkq);
> -   bq_xmit_all(dev, bq);
> -   netdev = dev->dev;
> -   if (likely(netdev->netdev_ops->ndo_xdp_flush))
> -   netdev->netdev_ops->ndo_xdp_flush(netdev);
> +   bq_xmit_all(dev, bq, true);
> }
>  }
>
> @@ -320,7 +317,7 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct 
> xdp_frame *xdpf,
> struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
>
> if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
> -   bq_xmit_all(obj, bq);
> +   bq_xmit_all(obj, bq, false);
>
> /* Ingress dev_rx will be the same for all xdp_frame's in
>  * bulk_queue, because bq stored per-CPU and must be flushed
> @@ -359,8 +356,7 @@ static void *dev_map_lookup_elem(struct bpf_map *map, 
> void *key)
>
>  static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
>  {
> -   if (dev->dev->netdev_ops->ndo_xdp_flush) {
> -   struct net_device *fl = dev->dev;
> +   if (dev->dev->netdev_ops->ndo_xdp_xmit) {
> struct xdp_bulk_queue *bq;
> unsigned long *bitmap;
>
> @@ -371,9 +367,7 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
> __clear_bit(dev->bit, bitmap);
>
> bq = per_cpu_ptr(dev->bulkq, cpu);
> -   bq_xmit_all(dev, bq);
> -
> -   fl->netdev_ops->ndo_xdp_flush(dev->dev);
> +   bq_xmit_all(dev, bq, true);
> }
> }
>  }
>


Re: [bpf-next V1 PATCH 2/8] i40e: implement flush flag for ndo_xdp_xmit

2018-05-30 Thread Song Liu
On Wed, May 30, 2018 at 11:00 AM, Jesper Dangaard Brouer
 wrote:
> Signed-off-by: Jesper Dangaard Brouer 

I guess we still need to say something in the commit message?

> ---
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c |5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
> b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index c0451d6e0790..03c1446f0465 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -3685,7 +3685,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct 
> xdp_frame **frames,
> if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
> return -ENXIO;
>
> -   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> return -EINVAL;
>
> for (i = 0; i < n; i++) {
> @@ -3699,6 +3699,9 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct 
> xdp_frame **frames,
> }
> }
>
> +   if (unlikely(flags & XDP_XMIT_FLUSH))
> +   i40e_xdp_ring_update_tail(vsi->xdp_rings[queue_index]);
> +
> return n - drops;

Do we still flush when drops > 0?

Thanks,
Song

>  }
>
>


Re: [bpf-next V1 PATCH 1/8] xdp: add flags argument to ndo_xdp_xmit API

2018-05-30 Thread Song Liu
gs)
>  {
> struct tun_struct *tun = netdev_priv(dev);
> struct tun_file *tfile;
> @@ -1294,6 +1295,9 @@ static int tun_xdp_xmit(struct net_device *dev, int n, 
> struct xdp_frame **frames
> int cnt = n;
> int i;
>
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   return -EINVAL;
> +
> rcu_read_lock();
>
> numqueues = READ_ONCE(tun->numqueues);
> @@ -1332,7 +1336,7 @@ static int tun_xdp_tx(struct net_device *dev, struct 
> xdp_buff *xdp)
> if (unlikely(!frame))
> return -EOVERFLOW;
>
> -   return tun_xdp_xmit(dev, 1, &frame);
> +   return tun_xdp_xmit(dev, 1, &frame, 0);
>  }
>
>  static void tun_xdp_flush(struct net_device *dev)
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index b2647dd5d302..4ed823625953 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -468,7 +468,7 @@ static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
>  }
>
>  static int virtnet_xdp_xmit(struct net_device *dev,
> -   int n, struct xdp_frame **frames)
> +   int n, struct xdp_frame **frames, u32 flags)
>  {
> struct virtnet_info *vi = netdev_priv(dev);
> struct receive_queue *rq = vi->rq;
> @@ -481,6 +481,9 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> int err;
> int i;
>
> +   if (unlikely(flags & ~XDP_XMIT_FLAGS_NONE))
> +   return -EINVAL;
> +
> qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
> sq = &vi->sq[qp];
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 8452f72087ef..7f17785a59d7 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1185,13 +1185,13 @@ struct dev_ifalias {
>   * This function is used to set or query state related to XDP on the
>   * netdevice and manage BPF offload. See definition of
>   * enum bpf_netdev_command for details.
> - * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame 
> **xdp);
> + * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
> + * u32 flags);
>   * This function is used to submit @n XDP packets for transmit on a
>   * netdevice. Returns number of frames successfully transmitted, frames
>   * that got dropped are freed/returned via xdp_return_frame().
>   * Returns negative number, means general error invoking ndo, meaning
>   * no frames were xmit'ed and core-caller will free all frames.
> - * TODO: Consider add flag to allow sending flush operation.
>   * void (*ndo_xdp_flush)(struct net_device *dev);
>   * This function is used to inform the driver to flush a particular
>   * xdp tx queue. Must be called on same CPU as xdp_xmit.
> @@ -1380,7 +1380,8 @@ struct net_device_ops {
> int (*ndo_bpf)(struct net_device *dev,
>struct netdev_bpf *bpf);
> int (*ndo_xdp_xmit)(struct net_device *dev, int n,
> -   struct xdp_frame **xdp);
> +   struct xdp_frame **xdp,
> +   u32 flags);
> void    (*ndo_xdp_flush)(struct net_device *dev);
>  };
>
> diff --git a/include/net/xdp.h b/include/net/xdp.h
> index 7ad779237ae8..308a4b30b484 100644
> --- a/include/net/xdp.h
> +++ b/include/net/xdp.h
> @@ -40,6 +40,11 @@ enum xdp_mem_type {
> MEM_TYPE_MAX,
>  };
>
> +/* XDP flags for ndo_xdp_xmit */
> +#define XDP_XMIT_FLAGS_NONE0U
> +#define XDP_XMIT_FLUSH (1U << 0)
> +#define XDP_XMIT_FLAGS_MASKXDP_XMIT_FLUSH
> +

I guess we need more documentation here on what XDP_XMIT_FLUSH does.

Other than this, it looks good to me.

Acked-by: Song Liu 


>  struct xdp_mem_info {
> u32 type; /* enum xdp_mem_type, but known size type */
> u32 id;
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index ae16d0c373ef..04fbd75a5274 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -232,7 +232,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
> prefetch(xdpf);
> }
>
> -   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
> +   sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, 0);
> if (sent < 0) {
> err = sent;
> sent = 0;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 81bd2e9fe8fc..6a21dbcad350 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3056,7 +3056,7 @@ static int __bpf_tx_xdp(struct net_device *dev,
> if (unlikely(!xdpf))
> return -EOVERFLOW;
>
> -   sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
> +   sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, 0);
> if (sent <= 0)
> return sent;
> dev->netdev_ops->ndo_xdp_flush(dev);
>


Re: [PATCH bpf-next 08/11] bpf: fix cbpf parser bug for octal numbers

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Range is 0-7, not 0-9, otherwise parser silently excludes it from the
> strtol() rather than throwing an error.
>
> Reported-by: Marc Boschma 
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  tools/bpf/bpf_exp.l | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/bpf/bpf_exp.l b/tools/bpf/bpf_exp.l
> index bd83149..4da8d05 100644
> --- a/tools/bpf/bpf_exp.l
> +++ b/tools/bpf/bpf_exp.l
> @@ -175,7 +175,7 @@ extern void yyerror(const char *str);
> yylval.number = strtol(yytext, NULL, 10);
> return number;
> }
> -([0][0-9]+){
> +([0][0-7]+){
> yylval.number = strtol(yytext + 1, NULL, 8);
> return number;
> }
> --
> 2.9.5
>


Re: [PATCH bpf-next 07/11] bpf: make sure to clear unused fields in tunnel/xfrm state fetch

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Since the remaining bits are not filled in struct bpf_tunnel_key
> resp. struct bpf_xfrm_state and originate from uninitialized stack
> space, we should make sure to clear them before handing control
> back to the program.
>
> Also add a padding element to struct bpf_xfrm_state for future use
> similar as we have in struct bpf_tunnel_key and clear it as well.
>
>   struct bpf_xfrm_state {
>   __u32  reqid;/* 0 4 */
>   __u32  spi;  /* 4 4 */
>   __u16  family;   /* 8 2 */
>
>   /* XXX 2 bytes hole, try to pack */
>
>   union {
>   __u32  remote_ipv4;  /*   4 */
>   __u32  remote_ipv6[4];   /*  16 */
>   };   /*1216 */
>
>   /* size: 28, cachelines: 1, members: 4 */
>   /* sum members: 26, holes: 1, sum holes: 2 */
>   /* last cacheline: 28 bytes */
>   };
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  include/uapi/linux/bpf.h | 3 ++-
>  net/core/filter.c| 6 ++
>  2 files changed, 8 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e2853aa..7108711 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2214,7 +2214,7 @@ struct bpf_tunnel_key {
> };
> __u8 tunnel_tos;
> __u8 tunnel_ttl;
> -   __u16 tunnel_ext;
> +   __u16 tunnel_ext;   /* Padding, future use. */
> __u32 tunnel_label;
>  };
>
> @@ -2225,6 +2225,7 @@ struct bpf_xfrm_state {
> __u32 reqid;
> __u32 spi;  /* Stored in network byte order */
> __u16 family;
> +   __u16 ext;  /* Padding, future use. */
> union {
> __u32 remote_ipv4;  /* Stored in network byte order */
> __u32 remote_ipv6[4];   /* Stored in network byte order */
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 717c740..5ceb5e6 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3445,6 +3445,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, 
> skb, struct bpf_tunnel_key
> to->tunnel_id = be64_to_cpu(info->key.tun_id);
> to->tunnel_tos = info->key.tos;
> to->tunnel_ttl = info->key.ttl;
> +   to->tunnel_ext = 0;
>
> if (flags & BPF_F_TUNINFO_IPV6) {
> memcpy(to->remote_ipv6, >key.u.ipv6.src,
> @@ -3452,6 +3453,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, 
> skb, struct bpf_tunnel_key
> to->tunnel_label = be32_to_cpu(info->key.label);
> } else {
> to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
> +   memset(>remote_ipv6[1], 0, sizeof(__u32) * 3);
> +   to->tunnel_label = 0;
> }
>
> if (unlikely(size != sizeof(struct bpf_tunnel_key)))
> @@ -4047,11 +4050,14 @@ BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, 
> skb, u32, index,
> to->reqid = x->props.reqid;
> to->spi = x->id.spi;
> to->family = x->props.family;
> +   to->ext = 0;
> +
> if (to->family == AF_INET6) {
> memcpy(to->remote_ipv6, x->props.saddr.a6,
>sizeof(to->remote_ipv6));
> } else {
> to->remote_ipv4 = x->props.saddr.a4;
> +   memset(>remote_ipv6[1], 0, sizeof(__u32) * 3);
> }
>
> return 0;
> --
> 2.9.5
>


Re: [PATCH bpf-next 05/11] bpf: avoid retpoline for lookup/update/delete calls on maps

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> While some of the BPF map lookup helpers provide a ->map_gen_lookup()
> callback for inlining the map lookup altogether it is not available
> for every map, so the remaining ones have to call bpf_map_lookup_elem()
> helper which does a dispatch to map->ops->map_lookup_elem(). In
> times of retpolines, this will control and trap speculative execution
> rather than letting it do its work for the indirect call and will
> therefore cause a slowdown. Likewise, bpf_map_update_elem() and
> bpf_map_delete_elem() do not have an inlined version and need to call
> into their map->ops->map_update_elem() resp. map->ops->map_delete_elem()
> handlers.
>
> Before:
>
>   # bpftool p d x i 1
> 0: (bf) r2 = r10
> 1: (07) r2 += -8
> 2: (7a) *(u64 *)(r2 +0) = 0
> 3: (18) r1 = map[id:1]
> 5: (85) call __htab_map_lookup_elem#232656
> 6: (15) if r0 == 0x0 goto pc+4
> 7: (71) r1 = *(u8 *)(r0 +35)
> 8: (55) if r1 != 0x0 goto pc+1
> 9: (72) *(u8 *)(r0 +35) = 1
>10: (07) r0 += 56
>11: (15) if r0 == 0x0 goto pc+4
>12: (bf) r2 = r0
>13: (18) r1 = map[id:1]
>15: (85) call bpf_map_delete_elem#215008  <-- indirect call via
>16: (95) exit helper
>
> After:
>
>   # bpftool p d x i 1
> 0: (bf) r2 = r10
> 1: (07) r2 += -8
> 2: (7a) *(u64 *)(r2 +0) = 0
> 3: (18) r1 = map[id:1]
> 5: (85) call __htab_map_lookup_elem#233328
> 6: (15) if r0 == 0x0 goto pc+4
> 7: (71) r1 = *(u8 *)(r0 +35)
> 8: (55) if r1 != 0x0 goto pc+1
> 9: (72) *(u8 *)(r0 +35) = 1
>10: (07) r0 += 56
>11: (15) if r0 == 0x0 goto pc+4
>12: (bf) r2 = r0
>13: (18) r1 = map[id:1]
>15: (85) call htab_lru_map_delete_elem#238240  <-- direct call
>16: (95) exit
>
> In all three lookup/update/delete cases however we can use the actual
> address of the map callback directly if we find that there's only a
> single path with a map pointer leading to the helper call, meaning
> when the map pointer has not been poisoned from verifier side.
> Example code can be seen above for the delete case.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  include/linux/filter.h |  3 +++
>  kernel/bpf/hashtab.c   | 12 ++---
>  kernel/bpf/verifier.c  | 67 
> +-
>  3 files changed, 62 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index b443f70..d407ede 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -301,6 +301,9 @@ struct xdp_buff;
>
>  /* Function call */
>
> +#define BPF_CAST_CALL(x)   \
> +   ((u64 (*)(u64, u64, u64, u64, u64))(x))
> +
>  #define BPF_EMIT_CALL(FUNC)\
> ((struct bpf_insn) {\
> .code  = BPF_JMP | BPF_CALL,\
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index b76828f..3ca2198 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -503,7 +503,9 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, 
> struct bpf_insn *insn_buf)
> struct bpf_insn *insn = insn_buf;
> const int ret = BPF_REG_0;
>
> -   *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, 
> u64))__htab_map_lookup_elem);
> +   BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +(void *(*)(struct bpf_map *map, void *key))NULL));
> +   *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
> *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
> *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
> offsetof(struct htab_elem, key) +
> @@ -530,7 +532,9 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
> const int ret = BPF_REG_0;
> const int ref_reg = BPF_REG_1;
>
> -   *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, 
> u64))__htab_map_lookup_elem);
> +   BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
> +(void *(*)(struct bpf_map *map, void *key))NULL));
> +   *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
> *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
> *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
>   offsetof(struct htab_elem, lru_node) +
> @@ -1369,7 +1373,9 @@ static u32 htab_of_map_gen_lookup(struct bpf_map *map,
> struct bpf_insn *insn = insn_buf;
> const int ret = BPF

Re: [PATCH bpf-next 09/11] bpf: fix context access in tracing progs on 32 bit archs

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Wang reported that all the testcases for BPF_PROG_TYPE_PERF_EVENT
> program type in test_verifier report the following errors on x86_32:
>
>   172/p unpriv: spill/fill of different pointers ldx FAIL
>   Unexpected error message!
>   0: (bf) r6 = r10
>   1: (07) r6 += -8
>   2: (15) if r1 == 0x0 goto pc+3
>   R1=ctx(id=0,off=0,imm=0) R6=fp-8,call_-1 R10=fp0,call_-1
>   3: (bf) r2 = r10
>   4: (07) r2 += -76
>   5: (7b) *(u64 *)(r6 +0) = r2
>   6: (55) if r1 != 0x0 goto pc+1
>   R1=ctx(id=0,off=0,imm=0) R2=fp-76,call_-1 R6=fp-8,call_-1 R10=fp0,call_-1 
> fp-8=fp
>   7: (7b) *(u64 *)(r6 +0) = r1
>   8: (79) r1 = *(u64 *)(r6 +0)
>   9: (79) r1 = *(u64 *)(r1 +68)
>   invalid bpf_context access off=68 size=8
>
>   378/p check bpf_perf_event_data->sample_period byte load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (71) r0 = *(u8 *)(r1 +68)
>   invalid bpf_context access off=68 size=1
>
>   379/p check bpf_perf_event_data->sample_period half load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (69) r0 = *(u16 *)(r1 +68)
>   invalid bpf_context access off=68 size=2
>
>   380/p check bpf_perf_event_data->sample_period word load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (61) r0 = *(u32 *)(r1 +68)
>   invalid bpf_context access off=68 size=4
>
>   381/p check bpf_perf_event_data->sample_period dword load permitted FAIL
>   Failed to load prog 'Permission denied'!
>   0: (b7) r0 = 0
>   1: (79) r0 = *(u64 *)(r1 +68)
>   invalid bpf_context access off=68 size=8
>
> Reason is that struct pt_regs on x86_32 doesn't fully align to 8 byte
> boundary due to its size of 68 bytes.
>
> Therefore, bpf_ctx_narrow_access_ok() will then bail out saying that
> off & (size_default - 1) which is 68 & 7 doesn't cleanly align in the
> case of sample_period access from struct bpf_perf_event_data, hence
> verifier wrongly thinks we might be doing an unaligned access here.
> Therefore adjust this down to machine size and check the offset for
> narrow access on that basis.
>
> We also need to fix pe_prog_is_valid_access(), since we hit the check
> for off % size != 0 (e.g. 68 % 8 -> 4) in the first and last test.
>
> Reported-by: Wang YanQing 
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 
> ---
>  include/linux/filter.h   | 30 --
>  kernel/trace/bpf_trace.c | 10 --
>  2 files changed, 32 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d407ede..89903d2 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -639,16 +639,34 @@ static inline bool bpf_prog_was_classic(const struct 
> bpf_prog *prog)
> return prog->type == BPF_PROG_TYPE_UNSPEC;
>  }
>
> -static inline bool
> -bpf_ctx_narrow_access_ok(u32 off, u32 size, const u32 size_default)
> +static inline u32 bpf_ctx_off_adjust_machine(u32 size)
> +{
> +   const u32 size_machine = sizeof(unsigned long);
> +
> +   if (size > size_machine && size % size_machine == 0)
> +   size = size_machine;

Not sure whether I understand this correctly. I guess we only need:
if (size % size_machine == 0)
   size = size_machine;

Or, is this function equivalent to
if (size == 8 && size_machine == 4)
 size = 4;

If this is the case, maybe we can make bpf_ctx_narrow_align_ok()
simpler?

Thanks,
Song

> +
> +   return size;
> +}
> +
> +static inline bool bpf_ctx_narrow_align_ok(u32 off, u32 size_access,
> +  u32 size_default)
>  {
> -   bool off_ok;
> +   size_default = bpf_ctx_off_adjust_machine(size_default);
> +   size_access  = bpf_ctx_off_adjust_machine(size_access);
> +
>  #ifdef __LITTLE_ENDIAN
> -   off_ok = (off & (size_default - 1)) == 0;
> +   return (off & (size_default - 1)) == 0;
>  #else
> -   off_ok = (off & (size_default - 1)) + size == size_default;
> +   return (off & (size_default - 1)) + size_access == size_default;
>  #endif
> -   return off_ok && size <= size_default && (size & (size - 1)) == 0;
> +}
> +
> +static inline bool
> +bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
> +{
> +   return bpf_ctx_narrow_align_ok(off, size, size_default) &&
> +  size <= size_default && (size & (size - 1)) == 0;
>  }
>
>  #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0]))
&g

Re: [PATCH bpf-next 04/11] bpf: show prog and map id in fdinfo

2018-05-30 Thread Song Liu
On Tue, May 29, 2018 at 12:55 PM, Daniel Borkmann  wrote:
> On 05/29/2018 07:27 PM, Jesper Dangaard Brouer wrote:
>> On Mon, 28 May 2018 02:43:37 +0200
>> Daniel Borkmann  wrote:
>>
>>> Its trivial and straight forward to expose it for scripts that can
>>> then use it along with bpftool in order to inspect an individual
>>> application's used maps and progs. Right now we dump some basic
>>> information in the fdinfo file but with the help of the map/prog
>>> id full introspection becomes possible now.
>>>
>>> Signed-off-by: Daniel Borkmann 
>>> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

>>
>> AFAICR iproute uses this proc fdinfo, for pinned maps.  Have you tested
>> if this change is handled gracefully by tc ?
>
> Yep, it works just fine, I also tested it before submission.


Re: [PATCH bpf-next 10/11] bpf: sync bpf uapi header with tools

2018-05-30 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Pull in recent changes from include/uapi/linux/bpf.h.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  tools/include/uapi/linux/bpf.h | 20 ++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
>
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 9b8c6e3..7108711 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -2004,6 +2004,20 @@ union bpf_attr {
>   * direct packet access.
>   * Return
>   * 0 on success, or a negative error in case of failure.
> + *
> + * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb)
> + * Description
> + * Return the cgroup v2 id of the socket associated with the 
> *skb*.
> + * This is roughly similar to the **bpf_get_cgroup_classid**\ ()
> + * helper for cgroup v1 by providing a tag resp. identifier that
> + * can be matched on or used for map lookups e.g. to implement
> + * policy. The cgroup v2 id of a given path in the hierarchy is
> + * exposed in user space through the f_handle API in order to get
> + * to the same 64-bit id.
> + *
> + * This helper can be used on TC egress path, but not on ingress.
> + * Return
> + * The id is returned or 0 in case the id could not be retrieved.
>   */
>  #define __BPF_FUNC_MAPPER(FN)  \
> FN(unspec), \
> @@ -2082,7 +2096,8 @@ union bpf_attr {
> FN(lwt_push_encap), \
> FN(lwt_seg6_store_bytes),   \
> FN(lwt_seg6_adjust_srh),\
> -   FN(lwt_seg6_action),
> +   FN(lwt_seg6_action),\
> +   FN(skb_cgroup_id),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> @@ -2199,7 +2214,7 @@ struct bpf_tunnel_key {
> };
> __u8 tunnel_tos;
> __u8 tunnel_ttl;
> -   __u16 tunnel_ext;
> +   __u16 tunnel_ext;   /* Padding, future use. */
> __u32 tunnel_label;
>  };
>
> @@ -2210,6 +2225,7 @@ struct bpf_xfrm_state {
> __u32 reqid;
> __u32 spi;  /* Stored in network byte order */
> __u16 family;
> +   __u16 ext;  /* Padding, future use. */
> union {
> __u32 remote_ipv4;  /* Stored in network byte order */
> __u32 remote_ipv6[4];   /* Stored in network byte order */
> --
> 2.9.5
>


Re: [PATCH bpf-next 11/11] bpf, doc: add missing patchwork url and libbpf to maintainers

2018-05-29 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Add missing bits under tools/lib/bpf/ and also Q: entry in order to
> make it easier for people to retrieve current patch queue.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 


> ---
>  MAINTAINERS | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f492431..2fd51db 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2722,6 +2722,7 @@ L:netdev@vger.kernel.org
>  L: linux-ker...@vger.kernel.org
>  T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git
>  T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
> +Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
>  S: Supported
>  F: arch/x86/net/bpf_jit*
>  F: Documentation/networking/filter.txt
> @@ -2740,6 +2741,7 @@ F:net/sched/act_bpf.c
>  F: net/sched/cls_bpf.c
>  F: samples/bpf/
>  F: tools/bpf/
> +F: tools/lib/bpf/
>  F: tools/testing/selftests/bpf/
>
>  BROADCOM B44 10/100 ETHERNET DRIVER
> --
> 2.9.5
>


Re: [PATCH bpf-next] bpftool: Support sendmsg{4,6} attach types

2018-05-29 Thread Song Liu
On Tue, May 29, 2018 at 2:20 PM, Jakub Kicinski  wrote:
> On Tue, 29 May 2018 13:29:31 -0700, Andrey Ignatov wrote:
>> Add support for recently added BPF_CGROUP_UDP4_SENDMSG and
>> BPF_CGROUP_UDP6_SENDMSG attach types to bpftool, update documentation
>> and bash completion.
>>
>> Signed-off-by: Andrey Ignatov 
>
> Reviewed-by: Jakub Kicinski 
>
>> I'm not sure about "since 4.18" in Documentation part. I can follow-up when
>> the next kernel version is known.
>
> IMHO it's fine, we can follow up if Linus decides to call it something
> else :)
>
> Thanks!

Acked-by: Song Liu 


Re: [PATCH bpf-next] bpf: clean up eBPF helpers documentation

2018-05-29 Thread Song Liu
On Tue, May 29, 2018 at 4:27 AM, Quentin Monnet
 wrote:
> These are minor edits for the eBPF helpers documentation in
> include/uapi/linux/bpf.h.
>
> The main fix consists in removing "BPF_FIB_LOOKUP_", because it ends
> with a non-escaped underscore that gets interpreted by rst2man and
> produces the following message in the resulting manual page:
>
> DOCUTILS SYSTEM MESSAGES
>System Message: ERROR/3 (/tmp/bpf-helpers.rst:, line 1514)
>   Unknown target name: "bpf_fib_lookup".
>
> Other edits consist in:
>
> - Improving formatting for flag values for "bpf_fib_lookup()" helper.
> - Emphasising a parameter name in description of the return value for
>   "bpf_get_stack()" helper.
> - Removing unnecessary blank lines between "Description" and "Return"
>   sections for the few helpers that would use it, for consistency.
>
> Signed-off-by: Quentin Monnet 
> ---
>  include/uapi/linux/bpf.h | 21 ++---
>  1 file changed, 10 insertions(+), 11 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index cc68787f2d97..3f556b35ac8d 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1010,7 +1010,6 @@ union bpf_attr {
>   * ::
>   *
>   * # sysctl kernel.perf_event_max_stack=
> - *
>   * Return
>   * The positive or null stack id on success, or a negative error
>   * in case of failure.
> @@ -1821,10 +1820,9 @@ union bpf_attr {
>   * ::
>   *
>   * # sysctl kernel.perf_event_max_stack=
> - *
>   * Return
> - * a non-negative value equal to or less than size on success, or
> - * a negative error in case of failure.
> + * A non-negative value equal to or less than *size* on success,
> + * or a negative error in case of failure.
>   *
>   * int skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void 
> *to, u32 len, u32 start_header)
>   * Description
> @@ -1845,7 +1843,6 @@ union bpf_attr {
>   * in socket filters where *skb*\ **->data** does not always 
> point
>   * to the start of the mac header and where "direct packet 
> access"
>   * is not available.
> - *
>   * Return
>   * 0 on success, or a negative error in case of failure.
>   *
> @@ -1861,16 +1858,18 @@ union bpf_attr {
>   * rt_metric is set to metric from route.
>   *
>   * *plen* argument is the size of the passed in struct.
> - * *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
> + * *flags* argument can be a combination of one or more of the
> + * following values:
>   *
> - * **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
> - * full lookup using FIB rules
> - * **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
> - * perspective (default is ingress)
> + * **BPF_FIB_LOOKUP_DIRECT**
> + * Do a direct table lookup vs full lookup using FIB
> + * rules.
> + * **BPF_FIB_LOOKUP_OUTPUT**
> + * Perform lookup from an egress perspective (default is
> + * ingress).
>   *
>   * *ctx* is either **struct xdp_md** for XDP programs or
>   * **struct sk_buff** tc cls_act programs.
> - *
>   * Return
>   * Egress device index on success, 0 if packet needs to continue
>   * up the stack for further processing or a negative error in 
> case
> --
> 2.14.1
>

Please also apply the same changes to tools/include/uapi/linux/bpf.h.

Other than this, it looks good to me.

Acked-by: Song Liu 

Thanks,
Song


Re: [PATCH bpf-next 03/11] bpf: fixup error message from gpl helpers on license mismatch

2018-05-29 Thread Song Liu
On Tue, May 29, 2018 at 10:16 AM, Jesper Dangaard Brouer
 wrote:
> On Mon, 28 May 2018 02:43:36 +0200
> Daniel Borkmann  wrote:
>
>> Stating 'proprietary program' in the error is just silly since it
>> can also be a different open source license than that which is just
>> not compatible.
>>
>> Reference: https://twitter.com/majek04/status/998531268039102465
>> Signed-off-by: Daniel Borkmann 
>> Acked-by: Alexei Starovoitov 
>
> Acked-by: Jesper Dangaard Brouer 
>
> Thank you for cleaning up this confusion :-)
>

Acked-by: Song Liu 

>> ---
>>  kernel/bpf/verifier.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 1fd9667b..4f4786e 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -2462,7 +2462,7 @@ static int check_helper_call(struct bpf_verifier_env 
>> *env, int func_id, int insn
>>
>>   /* eBPF programs must be GPL compatible to use GPL-ed functions */
>>   if (!env->prog->gpl_compatible && fn->gpl_only) {
>> - verbose(env, "cannot call GPL only function from proprietary 
>> program\n");
>> + verbose(env, "cannot call GPL-restricted function from non-GPL 
>> compatible program\n");
>>   return -EINVAL;
>>   }
>>
>
>
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH bpf-next 02/11] bpf: add also cbpf long jump test cases with heavy expansion

2018-05-29 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> We have one triggering on eBPF but lets also add a cBPF example to
> make sure we keep tracking them. Also add another cBPF test running
> max number of MSH ops.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 


> ---
>  lib/test_bpf.c | 63 
> ++
>  1 file changed, 63 insertions(+)
>
> diff --git a/lib/test_bpf.c b/lib/test_bpf.c
> index 317f231..60aedc8 100644
> --- a/lib/test_bpf.c
> +++ b/lib/test_bpf.c
> @@ -356,6 +356,52 @@ static int bpf_fill_maxinsns11(struct bpf_test *self)
> return __bpf_fill_ja(self, BPF_MAXINSNS, 68);
>  }
>
> +static int bpf_fill_maxinsns12(struct bpf_test *self)
> +{
> +   unsigned int len = BPF_MAXINSNS;
> +   struct sock_filter *insn;
> +   int i = 0;
> +
> +   insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
> +   if (!insn)
> +   return -ENOMEM;
> +
> +   insn[0] = __BPF_JUMP(BPF_JMP | BPF_JA, len - 2, 0, 0);
> +
> +   for (i = 1; i < len - 1; i++)
> +   insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
> +
> +   insn[len - 1] = __BPF_STMT(BPF_RET | BPF_K, 0xabababab);
> +
> +   self->u.ptr.insns = insn;
> +   self->u.ptr.len = len;
> +
> +   return 0;
> +}
> +
> +static int bpf_fill_maxinsns13(struct bpf_test *self)
> +{
> +   unsigned int len = BPF_MAXINSNS;
> +   struct sock_filter *insn;
> +   int i = 0;
> +
> +   insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL);
> +   if (!insn)
> +   return -ENOMEM;
> +
> +   for (i = 0; i < len - 3; i++)
> +   insn[i] = __BPF_STMT(BPF_LDX | BPF_B | BPF_MSH, 0);
> +
> +   insn[len - 3] = __BPF_STMT(BPF_LD | BPF_IMM, 0xabababab);
> +   insn[len - 2] = __BPF_STMT(BPF_ALU | BPF_XOR | BPF_X, 0);
> +   insn[len - 1] = __BPF_STMT(BPF_RET | BPF_A, 0);
> +
> +   self->u.ptr.insns = insn;
> +   self->u.ptr.len = len;
> +
> +   return 0;
> +}
> +
>  static int bpf_fill_ja(struct bpf_test *self)
>  {
> /* Hits exactly 11 passes on x86_64 JIT. */
> @@ -5290,6 +5336,23 @@ static struct bpf_test tests[] = {
> .expected_errcode = -ENOTSUPP,
> },
> {
> +   "BPF_MAXINSNS: jump over MSH",
> +   { },
> +   CLASSIC | FLAG_EXPECTED_FAIL,
> +   { 0xfa, 0xfb, 0xfc, 0xfd, },
> +   { { 4, 0xabababab } },
> +   .fill_helper = bpf_fill_maxinsns12,
> +   .expected_errcode = -EINVAL,
> +   },
> +   {
> +   "BPF_MAXINSNS: exec all MSH",
> +   { },
> +   CLASSIC,
> +   { 0xfa, 0xfb, 0xfc, 0xfd, },
> +   { { 4, 0xababab83 } },
> +   .fill_helper = bpf_fill_maxinsns13,
> +   },
> +   {
> "BPF_MAXINSNS: ld_abs+get_processor_id",
> { },
> CLASSIC,
> --
> 2.9.5
>


Re: [PATCH bpf-next 01/11] bpf: test case for map pointer poison with calls/branches

2018-05-29 Thread Song Liu
On Sun, May 27, 2018 at 5:43 PM, Daniel Borkmann  wrote:
> Add several test cases where the same or different map pointers
> originate from different paths in the program and execute a map
> lookup or tail call at a common location.
>
> Signed-off-by: Daniel Borkmann 
> Acked-by: Alexei Starovoitov 

Acked-by: Song Liu 

> ---
>  include/linux/filter.h  |  10 ++
>  tools/include/linux/filter.h|  10 ++
>  tools/testing/selftests/bpf/test_verifier.c | 185 
> 
>  3 files changed, 178 insertions(+), 27 deletions(-)
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index d358d18..b443f70 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -289,6 +289,16 @@ struct xdp_buff;
> .off   = OFF,   \
> .imm   = 0 })
>
> +/* Relative call */
> +
> +#define BPF_CALL_REL(TGT)  \
> +   ((struct bpf_insn) {\
> +   .code  = BPF_JMP | BPF_CALL,\
> +   .dst_reg = 0,   \
> +   .src_reg = BPF_PSEUDO_CALL, \
> +   .off   = 0, \
> +   .imm   = TGT })
> +
>  /* Function call */
>
>  #define BPF_EMIT_CALL(FUNC)\
> diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h
> index c5e512d..af55acf 100644
> --- a/tools/include/linux/filter.h
> +++ b/tools/include/linux/filter.h
> @@ -263,6 +263,16 @@
>  #define BPF_LD_MAP_FD(DST, MAP_FD) \
> BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)
>
> +/* Relative call */
> +
> +#define BPF_CALL_REL(TGT)  \
> +   ((struct bpf_insn) {\
> +   .code  = BPF_JMP | BPF_CALL,\
> +   .dst_reg = 0,   \
> +   .src_reg = BPF_PSEUDO_CALL, \
> +   .off   = 0, \
> +   .imm   = TGT })
> +
>  /* Program exit */
>
>  #define BPF_EXIT_INSN()\
> diff --git a/tools/testing/selftests/bpf/test_verifier.c 
> b/tools/testing/selftests/bpf/test_verifier.c
> index 4b4f015..7cb1d74 100644
> --- a/tools/testing/selftests/bpf/test_verifier.c
> +++ b/tools/testing/selftests/bpf/test_verifier.c
> @@ -50,7 +50,7 @@
>
>  #define MAX_INSNS  BPF_MAXINSNS
>  #define MAX_FIXUPS 8
> -#define MAX_NR_MAPS4
> +#define MAX_NR_MAPS7
>  #define POINTER_VALUE  0xcafe4all
>  #define TEST_DATA_LEN  64
>
> @@ -66,7 +66,9 @@ struct bpf_test {
> int fixup_map1[MAX_FIXUPS];
> int fixup_map2[MAX_FIXUPS];
> int fixup_map3[MAX_FIXUPS];
> -   int fixup_prog[MAX_FIXUPS];
> +   int fixup_map4[MAX_FIXUPS];
> +   int fixup_prog1[MAX_FIXUPS];
> +   int fixup_prog2[MAX_FIXUPS];
> int fixup_map_in_map[MAX_FIXUPS];
> const char *errstr;
> const char *errstr_unpriv;
> @@ -2769,7 +2771,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 0),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .errstr_unpriv = "R3 leaks addr into helper",
> .result_unpriv = REJECT,
> .result = ACCEPT,
> @@ -2856,7 +2858,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 1),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 42,
> },
> @@ -2870,7 +2872,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 1),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 41,
> },
> @@ -2884,7 +2886,7 @@ static struct bpf_test tests[] = {
> BPF_MOV64_IMM(BPF_REG_0, 1),
> BPF_EXIT_INSN(),
> },
> -   .fixup_prog = { 1 },
> +   .fixup_prog1 = { 1 },
> .result = ACCEPT,
> .retval = 1,
> },

[PATCH v2 net-next] net: remove bypassed check in sch_direct_xmit()

2018-05-29 Thread Song Liu
Checking netif_xmit_frozen_or_stopped() at the end of sch_direct_xmit()
is being bypassed. This is because "ret" from sch_direct_xmit() will be
either NETDEV_TX_OK or NETDEV_TX_BUSY, and only ret == NETDEV_TX_OK == 0
will reach the condition:

if (ret && netif_xmit_frozen_or_stopped(txq))
return false;

This patch cleans up the code by removing the whole condition.

For more discussion about this, please refer to
   https://marc.info/?t=15272719578

Signed-off-by: Song Liu 
Cc: John Fastabend 
Cc: Alexei Starovoitov 
Cc: David S. Miller 
---
 net/sched/sch_generic.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 760ab1b..69078c8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
return false;
}
 
-   if (ret && netif_xmit_frozen_or_stopped(txq))
-   return false;
-
return true;
 }
 
-- 
2.9.5



Re: [PATCH net-next] net: remove bypassed check in sch_direct_xmit()

2018-05-29 Thread Song Liu



> On May 29, 2018, at 1:58 AM, Sergei Shtylyov 
>  wrote:
> 
> Hello!
> 
> On 5/29/2018 12:36 AM, Song Liu wrote:
> 
>> Check sch_direct_xmit() at the end of sch_direct_xmit() will be bypassed.
> 
>   "Checking netif_xmit_frozen_or_stopped()", perhaps? Else it doesn't make 
> much sense...

Thanks Sergei!

Sending v2 with fix. 

Song

> 
>> This is because "ret" from sch_direct_xmit() will be either NETDEV_TX_OK
>> or NETDEV_TX_BUSY, and only ret == NETDEV_TX_OK == 0 will reach the
>> condition:
>> if (ret && netif_xmit_frozen_or_stopped(txq))
>> return false;
>> This patch cleans up the code by removing  the whole condition.
>> For more discussion about this, please refer to
>>https://marc.info/?t=15272719578
>> Signed-off-by: Song Liu 
>> Cc: John Fastabend 
>> Cc: Alexei Starovoitov 
>> Cc: David S. Miller 
> [...]
> 
> MBR, Sergei




Re: [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-29 Thread Song Liu



> On May 29, 2018, at 7:02 AM, David Miller  wrote:
> 
> From: Song Liu 
> Date: Fri, 25 May 2018 11:11:44 -0700
> 
>> Summary:
>> 
>> At the end of sch_direct_xmit(), we are in the else path of
>> !dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
>> condition will always fail and netif_xmit_frozen_or_stopped() is not
>> checked at all.
>> 
>>if (ret && netif_xmit_frozen_or_stopped(txq))
>> return false;
>> 
>> In this patch, this condition is fixed as:
>> 
>>if (netif_xmit_frozen_or_stopped(txq))
>> return false;
>> 
>> and further simplifies the code as:
>> 
>>return !netif_xmit_frozen_or_stopped(txq);
>> 
>> Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in 
>> xmit path")
>> Cc: John Fastabend 
>> Cc: David S. Miller 
>> Signed-off-by: Song Liu 
> 
> I expect a new version of this patch which removes the test entirely.

The new version of it is here: http://patchwork.ozlabs.org/patch/921708/

Thanks,
Song



[PATCH net-next] net: remove bypassed check in sch_direct_xmit()

2018-05-28 Thread Song Liu
Check sch_direct_xmit() at the end of sch_direct_xmit() will be bypassed.
This is because "ret" from sch_direct_xmit() will be either NETDEV_TX_OK
or NETDEV_TX_BUSY, and only ret == NETDEV_TX_OK == 0 will reach the
condition:

if (ret && netif_xmit_frozen_or_stopped(txq))
return false;

This patch cleans up the code by removing  the whole condition.

For more discussion about this, please refer to
   https://marc.info/?t=15272719578

Signed-off-by: Song Liu 
Cc: John Fastabend 
Cc: Alexei Starovoitov 
Cc: David S. Miller 
---
 net/sched/sch_generic.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 760ab1b..69078c8 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,9 +346,6 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
return false;
}
 
-   if (ret && netif_xmit_frozen_or_stopped(txq))
-   return false;
-
return true;
 }
 
-- 
2.9.5



Re: [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-28 Thread Song Liu



> On May 26, 2018, at 12:43 PM, John Fastabend  wrote:
> 
> On 05/25/2018 12:46 PM, Song Liu wrote:
>> On Fri, May 25, 2018 at 11:11 AM, Song Liu  wrote:
>>> Summary:
>>> 
>>> At the end of sch_direct_xmit(), we are in the else path of
>>> !dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
>>> condition will always fail and netif_xmit_frozen_or_stopped() is not
>>> checked at all.
>>> 
>>>if (ret && netif_xmit_frozen_or_stopped(txq))
>>> return false;
>>> 
>>> In this patch, this condition is fixed as:
>>> 
>>>if (netif_xmit_frozen_or_stopped(txq))
>>> return false;
>>> 
>>> and further simplifies the code as:
>>> 
>>>return !netif_xmit_frozen_or_stopped(txq);
>>> 
>>> Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in 
>>> xmit path")
>>> Cc: John Fastabend 
>>> Cc: David S. Miller 
>>> Signed-off-by: Song Liu 
>>> ---
>>> net/sched/sch_generic.c | 5 +
>>> 1 file changed, 1 insertion(+), 4 deletions(-)
>>> 
>>> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
>>> index 39c144b..8261d48 100644
>>> --- a/net/sched/sch_generic.c
>>> +++ b/net/sched/sch_generic.c
>>> @@ -346,10 +346,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc 
>>> *q,
>>>return false;
>>>}
>>> 
>>> -   if (ret && netif_xmit_frozen_or_stopped(txq))
>>> -   return false;
>>> -
>>> -   return true;
>>> +   return !netif_xmit_frozen_or_stopped(txq);
>>> }
>>> 
>>> /*
>>> --
>>> 2.9.5
>>> 
>> 
>> Alexei and I discussed about this offline. We would like to share our
>> discussion here to
>> clarify the motivation.
>> 
>> Before 29b86cdac00a, ret in condition "if (ret &&
>> netif_xmit_frozen_or_stopped()" is not
>> the value from dev_hard_start_xmit(), because ret is overwritten by
>> either qdisc_qlen()
>> or dev_requeue_skb(). Therefore, 29b86cdac00a changed the behavior of
>> this condition.
>> 
>> For ret from dev_hard_start_xmit(), I dig into the function and found
>> it is from return value
>> of ndo_start_xmit(). Per netdevice.h, ndo_start_xmit() should only
>> return NETDEV_TX_OK
>> or NETDEV_TX_BUSY. I survey many drivers, and they all follow the rule. The 
>> only
>> exception is vlan.
>> 
>> Given ret could only be NETDEV_TX_OK or NETDEV_TX_BUSY (ignore vlan for now),
>> if it fails condition "if (!dev_xmit_complete(ret))", ret must be
>> NETDEV_TX_OK == 0. So
>> netif_xmit_frozen_or_stopped() will always be bypassed.
>> 
>> It is probably OK to ignore netif_xmit_frozen_or_stopped(), and return true 
>> from
>> sch_direct_xmit(), as I didn't see that break any functionality. But
>> it is more like "correct
>> by accident" to me. This is the motivation of my original patch.
>> 
>> Alexei pointed out that, the following condition is more like original logic:
>> 
>>  if (qdisc_qlen(q) && netif_xmit_frozen_or_stopped(txq))
>>return false;
>> 
>> However, I think John would like to remove qdisc_qlen() from the tx
>> path. I didn't see
> 
> Yep qdisc_qlen() is not very friendly for lockless users. At
> some point we will get around to writing a distributed rate
> limiter qdisc and it will be nice to not have to work-around
> qdisc_qlen().
> 
>> any issue without the extra qdisc_qlen() check, so the patch is
>> probably good AS-IS.
>> 
>> Please share your comments and feedback on this.
>> 
> 
> Thanks for the detailed analysis. The above patch looks OK
> to me. Actually I'm debating if we should just drop the check.
> But, there looks to be a case where drivers return NETDEV_TX_OK
> and then stop the queue because it is nearly overrun. By putting
> the check there we stop early instead of doing some extra work
> before realizing the driver ring is full.
> 
> Still this overrun case should be rare so removing the check
> should be OK. Plus as you note its not been running anyways. My
> current recommendation is just remove the check altogether.
> 
> Thanks,
> John 

Thanks John! I will resend a clean up patch to net-next. 

Song



Re: [bpf-next PATCH] bpf: sockhash fix race with bpf_tcp_close and map delete

2018-05-26 Thread Song Liu
On Fri, May 25, 2018 at 10:37 AM, John Fastabend
<john.fastab...@gmail.com> wrote:
> syzbot reported two related splats, a use after free and null
> pointer dereference, when a TCP socket is closed while the map is
> also being removed.
>
> The psock keeps a reference to all map slots that have a reference
> to the sock so that when the sock is closed we can clean up any
> outstanding sock{map|hash} entries. This avoids pinning a sock
> forever if the map owner fails to do proper cleanup. However, the
> result is we have two paths that can free an entry in the map. Even
> the comment in the sock{map|hash} tear down function, sock_hash_free()
> notes this:
>
>  At this point no update, lookup or delete operations can happen.
>  However, be aware we can still get a socket state event updates,
>  and data ready callbacks that reference the psock from sk_user_data.
>
> Both removal paths omitted taking the hash bucket lock resulting
> in the case where we have two references that are in the process
> of being free'd.
>
> Reported-by: syzbot+a761b81c211794fa1...@syzkaller.appspotmail.com
> Signed-off-by: John Fastabend <john.fastab...@gmail.com>

Acked-by: Song Liu <songliubrav...@fb.com>

> ---
>  kernel/bpf/sockmap.c |   33 +
>  1 file changed, 21 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
> index 52a91d8..b508141f 100644
> --- a/kernel/bpf/sockmap.c
> +++ b/kernel/bpf/sockmap.c
> @@ -225,6 +225,16 @@ static void free_htab_elem(struct bpf_htab *htab, struct 
> htab_elem *l)
> kfree_rcu(l, rcu);
>  }
>
> +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> +{
> +   return &htab->buckets[hash & (htab->n_buckets - 1)];
> +}
> +
> +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 
> hash)
> +{
> +   return &__select_bucket(htab, hash)->head;
> +}
> +
>  static void bpf_tcp_close(struct sock *sk, long timeout)
>  {
> void (*close_fun)(struct sock *sk, long timeout);
> @@ -268,9 +278,15 @@ static void bpf_tcp_close(struct sock *sk, long timeout)
> smap_release_sock(psock, sk);
> }
> } else {
> +   u32 hash = e->hash_link->hash;
> +   struct bucket *b;
> +
> +   b = __select_bucket(e->htab, hash);
> +   raw_spin_lock_bh(&b->lock);
> hlist_del_rcu(>hash_link->hash_node);
> smap_release_sock(psock, e->hash_link->sk);
> free_htab_elem(e->htab, e->hash_link);
> +   raw_spin_unlock_bh(&b->lock);
> }
> }
> write_unlock_bh(&sk->sk_callback_lock);
> @@ -2043,16 +2059,6 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr 
> *attr)
> return ERR_PTR(err);
>  }
>
> -static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
> -{
> -   return &htab->buckets[hash & (htab->n_buckets - 1)];
> -}
> -
> -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 
> hash)
> -{
> -   return &__select_bucket(htab, hash)->head;
> -}
> -
>  static void sock_hash_free(struct bpf_map *map)
>  {
> struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> @@ -2069,10 +2075,12 @@ static void sock_hash_free(struct bpf_map *map)
>  */
> rcu_read_lock();
> for (i = 0; i < htab->n_buckets; i++) {
> -   struct hlist_head *head = select_bucket(htab, i);
> +   struct bucket *b = __select_bucket(htab, i);
> +   struct hlist_head *head = &b->head;
> struct hlist_node *n;
> struct htab_elem *l;
>
> +   raw_spin_lock_bh(&b->lock);
> hlist_for_each_entry_safe(l, n, head, hash_node) {
> struct sock *sock = l->sk;
> struct smap_psock *psock;
> @@ -2090,8 +2098,9 @@ static void sock_hash_free(struct bpf_map *map)
> smap_release_sock(psock, sock);
> }
> write_unlock_bh(&sock->sk_callback_lock);
> -   kfree(l);
> +   free_htab_elem(htab, l);
> }
> +   raw_spin_unlock_bh(&b->lock);
> }
> rcu_read_unlock();
> bpf_map_area_free(htab->buckets);
>


Re: [PATCH v2 net-next] tcp: use data length instead of skb->len in tcp_probe

2018-05-25 Thread Song Liu


> On May 25, 2018, at 3:14 AM, Yafang Shao <laoar.s...@gmail.com> wrote:
> 
> skb->len is meaningless to user.
> data length could be more helpful, with which we can easily filter out
> the packet without payload.
> 
> Signed-off-by: Yafang Shao <laoar.s...@gmail.com>

Acked-by: Song Liu <songliubrav...@fb.com>


> ---
> include/trace/events/tcp.h | 8 
> 1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index c1a5284..703abb6 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -236,7 +236,7 @@
>   __field(__u16, sport)
>   __field(__u16, dport)
>   __field(__u32, mark)
> - __field(__u16, length)
> + __field(__u16, data_len)
>   __field(__u32, snd_nxt)
>   __field(__u32, snd_una)
>   __field(__u32, snd_cwnd)
> @@ -261,7 +261,7 @@
>   __entry->dport = ntohs(inet->inet_dport);
>   __entry->mark = skb->mark;
> 
> - __entry->length = skb->len;
> + __entry->data_len = skb->len - tcp_hdrlen(skb);
>   __entry->snd_nxt = tp->snd_nxt;
>   __entry->snd_una = tp->snd_una;
>   __entry->snd_cwnd = tp->snd_cwnd;
> @@ -272,9 +272,9 @@
>   __entry->sock_cookie = sock_gen_cookie(sk);
>   ),
> 
> - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x 
> snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u 
> sock_cookie=%llx",
> + TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x 
> snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u 
> sock_cookie=%llx",
> __entry->saddr, __entry->daddr, __entry->mark,
> -   __entry->length, __entry->snd_nxt, __entry->snd_una,
> +   __entry->data_len, __entry->snd_nxt, __entry->snd_una,
> __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
> __entry->srtt, __entry->rcv_wnd, __entry->sock_cookie)
> );
> -- 
> 1.8.3.1
> 



Re: [PATCH bpf-next] libbpf: Install btf.h with libbpf

2018-05-25 Thread Song Liu
On Fri, May 25, 2018 at 10:33 AM, Martin KaFai Lau <ka...@fb.com> wrote:
> On Fri, May 25, 2018 at 10:23:13AM -0700, Andrey Ignatov wrote:
>> install_headers target should contain all headers that are part of
>> libbpf. Add missing btf.h
>>
>> Signed-off-by: Andrey Ignatov <r...@fb.com>
> Acked-by: Martin KaFai Lau <ka...@fb.com>

Acked-by: Song Liu <songliubrav...@fb.com>


Re: [PATCH, net-next 2/2] bpf: avoid -Wmaybe-uninitialized warning

2018-05-25 Thread Song Liu

> On May 25, 2018, at 2:33 PM, Arnd Bergmann <a...@arndb.de> wrote:
> 
> The stack_map_get_build_id_offset() function is too long for gcc to track
> whether 'work' may or may not be initialized at the end of it, leading
> to a false-positive warning:
> 
> kernel/bpf/stackmap.c: In function 'stack_map_get_build_id_offset':
> kernel/bpf/stackmap.c:334:13: error: 'work' may be used uninitialized in this 
> function [-Werror=maybe-uninitialized]
> 
> This removes the 'in_nmi_ctx' flag and uses the state of that variable
> itself to see if it got initialized.
> 
> Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context")
> Signed-off-by: Arnd Bergmann <a...@arndb.de>
> ---
> kernel/bpf/stackmap.c | 7 +++
> 1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index b59ace0f0f09..b675a3f3d141 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -285,11 +285,10 @@ static void stack_map_get_build_id_offset(struct 
> bpf_stack_build_id *id_offs,
> {
>   int i;
>   struct vm_area_struct *vma;
> - bool in_nmi_ctx = in_nmi();
>   bool irq_work_busy = false;
> - struct stack_map_irq_work *work;
> + struct stack_map_irq_work *work = NULL;
> 
> - if (in_nmi_ctx) {
> + if (in_nmi()) {
>   work = this_cpu_ptr(&up_read_work);
>   if (work->irq_work.flags & IRQ_WORK_BUSY)
>   /* cannot queue more up_read, fallback */
> @@ -328,7 +327,7 @@ static void stack_map_get_build_id_offset(struct 
> bpf_stack_build_id *id_offs,
>   id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
>   }
> 
> - if (!in_nmi_ctx) {
> + if (!work) {
>   up_read(&current->mm->mmap_sem);
>   } else {
>   work->sem = &current->mm->mmap_sem;
> -- 
> 2.9.0
> 

Acked-by: Song Liu <songliubrav...@fb.com>

Re: [PATCH, net-next 1/2] bpf: btf: avoid -Wreturn-type warning

2018-05-25 Thread Song Liu


> On May 25, 2018, at 2:33 PM, Arnd Bergmann <a...@arndb.de> wrote:
> 
> gcc warns about a noreturn function possibly returning in
> some configurations:
> 
> kernel/bpf/btf.c: In function 'env_type_is_resolve_sink':
> kernel/bpf/btf.c:729:1: error: control reaches end of non-void function 
> [-Werror=return-type]
> 
> Using BUG() instead of BUG_ON() avoids that warning and otherwise
> does the exact same thing.
> 
> Fixes: eb3f595dab40 ("bpf: btf: Validate type reference")
> Signed-off-by: Arnd Bergmann <a...@arndb.de>
> ---
> kernel/bpf/btf.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
> index 9cbeabb5aca3..2822a0cf4f48 100644
> --- a/kernel/bpf/btf.c
> +++ b/kernel/bpf/btf.c
> @@ -749,7 +749,7 @@ static bool env_type_is_resolve_sink(const struct 
> btf_verifier_env *env,
>   !btf_type_is_array(next_type) &&
>   !btf_type_is_struct(next_type);
>   default:
> - BUG_ON(1);
> + BUG();
>   }
> }
> 
> -- 
> 2.9.0
> 

Acked-by: Song Liu <songliubrav...@fb.com>




Re: [PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-25 Thread Song Liu
On Fri, May 25, 2018 at 11:11 AM, Song Liu <songliubrav...@fb.com> wrote:
> Summary:
>
> At the end of sch_direct_xmit(), we are in the else path of
> !dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
> condition will always fail and netif_xmit_frozen_or_stopped() is not
> checked at all.
>
> if (ret && netif_xmit_frozen_or_stopped(txq))
>  return false;
>
> In this patch, this condition is fixed as:
>
> if (netif_xmit_frozen_or_stopped(txq))
>  return false;
>
> and further simplifies the code as:
>
> return !netif_xmit_frozen_or_stopped(txq);
>
> Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in 
> xmit path")
> Cc: John Fastabend <john.fastab...@gmail.com>
> Cc: David S. Miller <da...@davemloft.net>
> Signed-off-by: Song Liu <songliubrav...@fb.com>
> ---
>  net/sched/sch_generic.c | 5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index 39c144b..8261d48 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -346,10 +346,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc 
> *q,
> return false;
> }
>
> -   if (ret && netif_xmit_frozen_or_stopped(txq))
> -   return false;
> -
> -   return true;
> +   return !netif_xmit_frozen_or_stopped(txq);
>  }
>
>  /*
> --
> 2.9.5
>

Alexei and I discussed about this offline. We would like to share our
discussion here to
clarify the motivation.

Before 29b86cdac00a, ret in condition "if (ret &&
netif_xmit_frozen_or_stopped()" is not
the value from dev_hard_start_xmit(), because ret is overwritten by
either qdisc_qlen()
or dev_requeue_skb(). Therefore, 29b86cdac00a changed the behavior of
this condition.

For ret from dev_hard_start_xmit(), I dig into the function and found
it is from return value
of ndo_start_xmit(). Per netdevice.h, ndo_start_xmit() should only
return NETDEV_TX_OK
or NETDEV_TX_BUSY. I survey many drivers, and they all follow the rule. The only
exception is vlan.

Given ret could only be NETDEV_TX_OK or NETDEV_TX_BUSY (ignore vlan for now),
if it fails condition "if (!dev_xmit_complete(ret))", ret must be
NETDEV_TX_OK == 0. So
netif_xmit_frozen_or_stopped() will always be bypassed.

It is probably OK to ignore netif_xmit_frozen_or_stopped(), and return true from
sch_direct_xmit(), as I didn't see that break any functionality. But
it is more like "correct
by accident" to me. This is the motivation of my original patch.

Alexei pointed out that, the following condition is more like original logic:

  if (qdisc_qlen(q) && netif_xmit_frozen_or_stopped(txq))
return false;

However, I think John would like to remove qdisc_qlen() from the tx
path. I didn't see
any issue without the extra qdisc_qlen() check, so the patch is
probably good AS-IS.

Please share your comments and feedback on this.

Thanks,
Song


[PATCH net] net: sched: check netif_xmit_frozen_or_stopped() in sch_direct_xmit()

2018-05-25 Thread Song Liu
Summary:

At the end of sch_direct_xmit(), we are in the else path of
!dev_xmit_complete(ret), which means ret == NETDEV_TX_OK. The following
condition will always fail and netif_xmit_frozen_or_stopped() is not
checked at all.

if (ret && netif_xmit_frozen_or_stopped(txq))
 return false;

In this patch, this condition is fixed as:

if (netif_xmit_frozen_or_stopped(txq))
 return false;

and further simplifies the code as:

return !netif_xmit_frozen_or_stopped(txq);

Fixes: 29b86cdac00a ("net: sched: remove remaining uses for qdisc_qlen in xmit 
path")
Cc: John Fastabend <john.fastab...@gmail.com>
Cc: David S. Miller <da...@davemloft.net>
Signed-off-by: Song Liu <songliubrav...@fb.com>
---
 net/sched/sch_generic.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 39c144b..8261d48 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -346,10 +346,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
return false;
}
 
-   if (ret && netif_xmit_frozen_or_stopped(txq))
-   return false;
-
-   return true;
+   return !netif_xmit_frozen_or_stopped(txq);
 }
 
 /*
-- 
2.9.5



Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino

2018-05-25 Thread Y Song
On Fri, May 25, 2018 at 8:21 AM, Alban Crequy <al...@kinvolk.io> wrote:
> On Wed, May 23, 2018 at 4:34 AM Y Song <ys114...@gmail.com> wrote:
>
>> I did a quick prototyping and the above interface seems working fine.
>
> Thanks! I gave your kernel patch & userspace program a try and it works for
> me on cgroup-v2.
>
> Also, I found out how to get my containers to use both cgroup-v1 and
> cgroup-v2 (by enabling systemd's hybrid cgroup mode and docker's
> '--exec-opt native.cgroupdriver=systemd' option). So I should be able to
> use the BPF helper function without having to add support for all the
> cgroup-v1 hierarchies.

Great. Will submit a formal patch soon.

>
>> The kernel change:
>> ===
>
>> [yhs@localhost bpf-next]$ git diff
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 97446bbe2ca5..669b7383fddb 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -1976,7 +1976,8 @@ union bpf_attr {
>>  FN(fib_lookup), \
>>  FN(sock_hash_update),   \
>>  FN(msg_redirect_hash),  \
>> -   FN(sk_redirect_hash),
>> +   FN(sk_redirect_hash),   \
>> +   FN(get_current_cgroup_id),
>
>>   /* integer value in 'imm' field of BPF_CALL instruction selects which
> helper
>>* function eBPF program intends to call
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index ce2cbbff27e4..e11e3298f911 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -493,6 +493,21 @@ static const struct bpf_func_proto
>> bpf_current_task_under_cgroup_proto = {
>>  .arg2_type  = ARG_ANYTHING,
>>   };
>
>> +BPF_CALL_0(bpf_get_current_cgroup_id)
>> +{
>> +   struct cgroup *cgrp = task_dfl_cgroup(current);
>> +   if (!cgrp)
>> +   return -EINVAL;
>> +
>> +   return cgrp->kn->id.id;
>> +}
>> +
>> +static const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
>> +   .func   = bpf_get_current_cgroup_id,
>> +   .gpl_only   = false,
>> +   .ret_type   = RET_INTEGER,
>> +};
>> +
>>   BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
>> const void *, unsafe_ptr)
>>   {
>> @@ -563,6 +578,8 @@ tracing_func_proto(enum bpf_func_id func_id, const
>> struct bpf_prog *prog)
>>  return &bpf_get_prandom_u32_proto;
>>  case BPF_FUNC_probe_read_str:
>>  return &bpf_probe_read_str_proto;
>> +   case BPF_FUNC_get_current_cgroup_id:
>> +   return &bpf_get_current_cgroup_id_proto;
>>  default:
>>  return NULL;
>>  }
>
>> The following program can be used to print out a cgroup id given a cgroup
> path.
>> [yhs@localhost cg]$ cat get_cgroup_id.c
>> #define _GNU_SOURCE
>> #include 
>> #include 
>> #include 
>> #include 
>> #include 
>
>> int main(int argc, char **argv)
>> {
>>  int dirfd, err, flags, mount_id, fhsize;
>>  struct file_handle *fhp;
>>  char *pathname;
>
>>  if (argc != 2) {
>>  printf("usage: %s <cgroup_path>\n", argv[0]);
>>  return 1;
>>  }
>
>>  pathname = argv[1];
>>  dirfd = AT_FDCWD;
>>  flags = 0;
>
>>  fhsize = sizeof(*fhp);
>>  fhp = malloc(fhsize);
>>  if (!fhp)
>>  return 1;
>
>>  err = name_to_handle_at(dirfd, pathname, fhp, &mount_id, flags);
>>  if (err >= 0) {
>>  printf("error\n");
>>  return 1;
>>  }
>
>>  fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
>>  fhp = realloc(fhp, fhsize);
>>  if (!fhp)
>>  return 1;
>
>>  err = name_to_handle_at(dirfd, pathname, fhp, &mount_id, flags);
>>  if (err < 0)
>>  perror("name_to_handle_at");
>>  else {
>>  int i;
>
>>  printf("dir = %s, mount_id = %d\n", pathname, mount_id);
>>  printf("handle_bytes = %d, handle_type = %d\n", fhp->handle_bytes,
>>  fhp->handle_type);
>>  if (fhp->handle_bytes != 8)
>>  return 1;
>
>>  printf("cgroup_id = 0x%llx\n", *(unsigned long long
> *)fhp->f_handle);
>>  }
>
>>  return 0;
>> }
>> [yhs@localhost cg]$
>
>>

Re: [PATCH bpf-next] selftests/bpf: missing headers test_lwt_seg6local

2018-05-25 Thread Y Song
On Fri, May 25, 2018 at 9:16 AM, Y Song <ys114...@gmail.com> wrote:
> On Fri, May 25, 2018 at 4:20 AM, Mathieu Xhonneux <m.xhonn...@gmail.com> 
> wrote:
>> Previous patch "selftests/bpf: test for seg6local End.BPF action" lacks
>> some UAPI headers in tools/.
>>
>> clang -I. -I./include/uapi -I../../../include/uapi -idirafter
>> /usr/local/include -idirafter
>> /data/users/yhs/work/llvm/build/install/lib/clang/7.0.0/include
>> -idirafter /usr/include -Wno-compare-distinct-pointer-types \
>>  -O2 -target bpf -emit-llvm -c test_lwt_seg6local.c -o - |  \
>> llc -march=bpf -mcpu=generic  -filetype=obj -o
>> [...]/net-next/tools/testing/selftests/bpf/test_lwt_seg6local.o
>> test_lwt_seg6local.c:4:10: fatal error: 'linux/seg6_local.h' file not found
>>  ^~~~
>> 1 error generated.
>> make: Leaving directory
>> `/data/users/yhs/work/net-next/tools/testing/selftests/bpf'
>>
>> Reported-by: Y Song <ys114...@gmail.com>
>> Signed-off-by: Mathieu Xhonneux <m.xhonn...@gmail.com>
>> ---
>>  .../selftests/bpf/include/uapi/linux/seg6.h| 55 +++
>>  .../selftests/bpf/include/uapi/linux/seg6_local.h  | 80 
>> ++
>>  2 files changed, 135 insertions(+)
>>  create mode 100644 tools/testing/selftests/bpf/include/uapi/linux/seg6.h
>>  create mode 100644 
>> tools/testing/selftests/bpf/include/uapi/linux/seg6_local.h
>
> Thanks for fixing the issue.
>
> Acked-by: Y Song <ys114...@gmail.com>

Although it fixed the issue, the file is placed in
tools/testing/selftests/bpf/include/uapi/linux
directory. Considering the file is really coming from
linux/include/uapi/linux directory, should it
be placed in tools/include/uapi/linux directory instead?


Re: [PATCH bpf-next] selftests/bpf: missing headers test_lwt_seg6local

2018-05-25 Thread Song Liu
On Fri, May 25, 2018 at 4:20 AM, Mathieu Xhonneux <m.xhonn...@gmail.com> wrote:
> Previous patch "selftests/bpf: test for seg6local End.BPF action" lacks
> some UAPI headers in tools/.
>
> clang -I. -I./include/uapi -I../../../include/uapi -idirafter
> /usr/local/include -idirafter
> /data/users/yhs/work/llvm/build/install/lib/clang/7.0.0/include
> -idirafter /usr/include -Wno-compare-distinct-pointer-types \
>  -O2 -target bpf -emit-llvm -c test_lwt_seg6local.c -o - |  \
> llc -march=bpf -mcpu=generic  -filetype=obj -o
> [...]/net-next/tools/testing/selftests/bpf/test_lwt_seg6local.o
> test_lwt_seg6local.c:4:10: fatal error: 'linux/seg6_local.h' file not found
>  ^~~~
> 1 error generated.
> make: Leaving directory
> `/data/users/yhs/work/net-next/tools/testing/selftests/bpf'
>
> Reported-by: Y Song <ys114...@gmail.com>
> Signed-off-by: Mathieu Xhonneux <m.xhonn...@gmail.com>
> ---
>  .../selftests/bpf/include/uapi/linux/seg6.h| 55 +++
>  .../selftests/bpf/include/uapi/linux/seg6_local.h  | 80 
> ++
>  2 files changed, 135 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/include/uapi/linux/seg6.h
>  create mode 100644 
> tools/testing/selftests/bpf/include/uapi/linux/seg6_local.h
>
> diff --git a/tools/testing/selftests/bpf/include/uapi/linux/seg6.h 
> b/tools/testing/selftests/bpf/include/uapi/linux/seg6.h
> new file mode 100644
> index ..286e8d6a8e98
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/include/uapi/linux/seg6.h
> @@ -0,0 +1,55 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +/*
> + *  SR-IPv6 implementation
> + *
> + *  Author:
> + *  David Lebrun <david.leb...@uclouvain.be>
> + *
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License
> + *  as published by the Free Software Foundation; either version
> + *  2 of the License, or (at your option) any later version.
> + */
> +
> +#ifndef _UAPI_LINUX_SEG6_H
> +#define _UAPI_LINUX_SEG6_H
> +
> +#include <linux/types.h>
> +#include <linux/in6.h> /* For struct in6_addr. */
> +
> +/*
> + * SRH
> + */
> +struct ipv6_sr_hdr {
> +   __u8nexthdr;
> +   __u8hdrlen;
> +   __u8type;
> +   __u8segments_left;
> +   __u8first_segment; /* Represents the last_entry field of SRH */
> +   __u8flags;
> +   __u16   tag;
> +
> +   struct in6_addr segments[0];
> +};
> +
> +#define SR6_FLAG1_PROTECTED(1 << 6)
> +#define SR6_FLAG1_OAM  (1 << 5)
> +#define SR6_FLAG1_ALERT(1 << 4)
> +#define SR6_FLAG1_HMAC (1 << 3)
> +
> +#define SR6_TLV_INGRESS1
> +#define SR6_TLV_EGRESS 2
> +#define SR6_TLV_OPAQUE 3
> +#define SR6_TLV_PADDING4
> +#define SR6_TLV_HMAC   5
> +
> +#define sr_has_hmac(srh) ((srh)->flags & SR6_FLAG1_HMAC)
> +
> +struct sr6_tlv {
> +   __u8 type;
> +   __u8 len;
> +   __u8 data[0];
> +};
> +
> +#endif
> diff --git a/tools/testing/selftests/bpf/include/uapi/linux/seg6_local.h 
> b/tools/testing/selftests/bpf/include/uapi/linux/seg6_local.h
> new file mode 100644
> index ..edc138bdc56d
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/include/uapi/linux/seg6_local.h
> @@ -0,0 +1,80 @@
> +/*
> + *  SR-IPv6 implementation
> + *
> + *  Author:
> + *  David Lebrun <david.leb...@uclouvain.be>
> + *
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License
> + *  as published by the Free Software Foundation; either version
> + *  2 of the License, or (at your option) any later version.
> + */
> +
> +#ifndef _UAPI_LINUX_SEG6_LOCAL_H
> +#define _UAPI_LINUX_SEG6_LOCAL_H
> +
> +#include <linux/seg6.h>
> +
> +enum {
> +   SEG6_LOCAL_UNSPEC,
> +   SEG6_LOCAL_ACTION,
> +   SEG6_LOCAL_SRH,
> +   SEG6_LOCAL_TABLE,
> +   SEG6_LOCAL_NH4,
> +   SEG6_LOCAL_NH6,
> +   SEG6_LOCAL_IIF,
> +   SEG6_LOCAL_OIF,
> +   SEG6_LOCAL_BPF,
> +   __SEG6_LOCAL_MAX,
> +};
> +#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1)
> +
> +enum {
> +   SEG6_LOCAL_ACTION_UNSPEC= 0,
> +   /* node segment */
> +   SEG6_LOCAL_ACTION_END   = 1,
> +   /* adjacency segment (IPv6 cross-connect) */
> +   SEG6_LOCAL_ACTION_END_X = 2,
> +   /* lookup of next seg NH in table */
> +   SEG6_LOCAL_ACTIO

Re: [PATCH bpf-next] selftests/bpf: missing headers test_lwt_seg6local

2018-05-25 Thread Y Song
On Fri, May 25, 2018 at 4:20 AM, Mathieu Xhonneux <m.xhonn...@gmail.com> wrote:
> Previous patch "selftests/bpf: test for seg6local End.BPF action" lacks
> some UAPI headers in tools/.
>
> clang -I. -I./include/uapi -I../../../include/uapi -idirafter
> /usr/local/include -idirafter
> /data/users/yhs/work/llvm/build/install/lib/clang/7.0.0/include
> -idirafter /usr/include -Wno-compare-distinct-pointer-types \
>  -O2 -target bpf -emit-llvm -c test_lwt_seg6local.c -o - |  \
> llc -march=bpf -mcpu=generic  -filetype=obj -o
> [...]/net-next/tools/testing/selftests/bpf/test_lwt_seg6local.o
> test_lwt_seg6local.c:4:10: fatal error: 'linux/seg6_local.h' file not found
>  ^~~~
> 1 error generated.
> make: Leaving directory
> `/data/users/yhs/work/net-next/tools/testing/selftests/bpf'
>
> Reported-by: Y Song <ys114...@gmail.com>
> Signed-off-by: Mathieu Xhonneux <m.xhonn...@gmail.com>
> ---
>  .../selftests/bpf/include/uapi/linux/seg6.h| 55 +++
>  .../selftests/bpf/include/uapi/linux/seg6_local.h  | 80 
> ++
>  2 files changed, 135 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/include/uapi/linux/seg6.h
>  create mode 100644 
> tools/testing/selftests/bpf/include/uapi/linux/seg6_local.h

Thanks for fixing the issue.

Acked-by: Y Song <ys114...@gmail.com>


Re: [PATCH bpf-next v7 6/6] selftests/bpf: test for seg6local End.BPF action

2018-05-24 Thread Y Song
When compiling latest bpf-next, I hit the following compilation error:

clang -I. -I./include/uapi -I../../../include/uapi -idirafter
/usr/local/include -idirafter
/data/users/yhs/work/llvm/build/install/lib/clang/7.0.0/include
-idirafter /usr/include -Wno-compare-distinct-pointer-types \
 -O2 -target bpf -emit-llvm -c test_lwt_seg6local.c -o - |  \
llc -march=bpf -mcpu=generic  -filetype=obj -o
/data/users/yhs/work/net-next/tools/testing/selftests/bpf/test_lwt_seg6local.o
test_lwt_seg6local.c:4:10: fatal error: 'linux/seg6_local.h' file not found
#include <linux/seg6_local.h>
 ^~~~
1 error generated.
make: Leaving directory
`/data/users/yhs/work/net-next/tools/testing/selftests/bpf'

Should the seg6_local.h be copied to tools/ directory?

On Sun, May 20, 2018 at 6:58 AM, Mathieu Xhonneux  wrote:
> Add a new test for the seg6local End.BPF action. The following helpers
> are also tested:
>
> - bpf_lwt_push_encap within the LWT BPF IN hook
> - bpf_lwt_seg6_action
> - bpf_lwt_seg6_adjust_srh
> - bpf_lwt_seg6_store_bytes
>
> A chain of End.BPF actions is built. The SRH is injected through a LWT
> BPF IN hook before entering this chain. Each End.BPF action validates
> the previous one, otherwise the packet is dropped. The test succeeds
> if the last node in the chain receives the packet and the UDP datagram
> contained can be retrieved from userspace.
>
> Signed-off-by: Mathieu Xhonneux 
> ---
>  tools/include/uapi/linux/bpf.h|  97 -
>  tools/testing/selftests/bpf/Makefile  |   6 +-
>  tools/testing/selftests/bpf/bpf_helpers.h |  12 +
>  tools/testing/selftests/bpf/test_lwt_seg6local.c  | 437 
> ++
>  tools/testing/selftests/bpf/test_lwt_seg6local.sh | 140 +++
>  5 files changed, 689 insertions(+), 3 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/test_lwt_seg6local.c
>  create mode 100755 tools/testing/selftests/bpf/test_lwt_seg6local.sh
>
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 97446bbe2ca5..b217a33d80a4 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -141,6 +141,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_SK_MSG,
> BPF_PROG_TYPE_RAW_TRACEPOINT,
> BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
> +   BPF_PROG_TYPE_LWT_SEG6LOCAL,
>  };
>
>  enum bpf_attach_type {
> @@ -1902,6 +1903,90 @@ union bpf_attr {
>   * egress otherwise). This is the only flag supported for now.
>   * Return
>   * **SK_PASS** on success, or **SK_DROP** on error.
> + *
> + * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
> + * Description
> + * Encapsulate the packet associated to *skb* within a Layer 3
> + * protocol header. This header is provided in the buffer at
> + * address *hdr*, with *len* its size in bytes. *type* indicates
> + * the protocol of the header and can be one of:
> + *
> + * **BPF_LWT_ENCAP_SEG6**
> + * IPv6 encapsulation with Segment Routing Header
> + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH,
> + * the IPv6 header is computed by the kernel.
> + * **BPF_LWT_ENCAP_SEG6_INLINE**
> + * Only works if *skb* contains an IPv6 packet. Insert a
> + * Segment Routing Header (**struct ipv6_sr_hdr**) inside
> + * the IPv6 header.
> + *
> + * A call to this helper is susceptible to change the underlaying
> + * packet buffer. Therefore, at load time, all checks on pointers
> + * previously done by the verifier are invalidated and must be
> + * performed again, if the helper is used in combination with
> + * direct packet access.
> + * Return
> + * 0 on success, or a negative error in case of failure.
> + *
> + * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void 
> *from, u32 len)
> + * Description
> + * Store *len* bytes from address *from* into the packet
> + * associated to *skb*, at *offset*. Only the flags, tag and TLVs
> + * inside the outermost IPv6 Segment Routing Header can be
> + * modified through this helper.
> + *
> + * A call to this helper is susceptible to change the underlaying
> + * packet buffer. Therefore, at load time, all checks on pointers
> + * previously done by the verifier are invalidated and must be
> + * performed again, if the helper is used in combination with
> + * direct packet access.
> + * Return
> + * 0 on success, or a negative error in case of failure.
> + *
> + * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta)
> + * Description
> + *   

[PATCH bpf-next v5 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs

2018-05-24 Thread Yonghong Song
The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 158 +++
 1 file changed, 158 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..0ef6820 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,162 @@ static void test_get_stack_raw_tp(void)
bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+   const char *file = "./test_get_stack_rawtp.o";
+   __u64 probe_offset, probe_addr;
+   __u32 len, prog_id, fd_type;
+   struct bpf_object *obj;
+   int efd, err, prog_fd;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+   if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+   return;
+
+   efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+   if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+
+   /* query (getpid(), efd) */
+   len = sizeof(buf);
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+ errno))
+   goto close_prog;
+
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ strcmp(buf, "sys_enter") == 0;
+   if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+ fd_type, buf))
+   goto close_prog;
+
+   /* test zero len */
+   len = 0;
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == strlen("sys_enter");
+   if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+   goto close_prog;
+
+   /* test empty buffer */
+   len = sizeof(buf);
+   err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == strlen("sys_enter");
+   if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+   goto close_prog;
+
+   /* test smaller buffer */
+   len = 3;
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 3)",
+ "err %d errno %d\n", err, errno))
+   goto close_prog;
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == strlen("sys_enter") &&
+ strcmp(buf, "sy") == 0;
+   if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+   goto close_prog;
+
+   goto close_prog_noerr;
+close_prog:
+   error_cnt++;
+close_prog_noerr:
+   bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+  const char *tp_name)
+{
+   const char *file = "./test_tracepoint.o";
+   int err, bytes, efd, prog_fd, pmu_fd;
+   struct perf_event_attr attr = {};
+   __u64 probe_offset, probe_addr;
+   __u32 len, prog_id, fd_type;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+   if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+   goto close_prog;
+
+   snprintf(buf, sizeof(buf),
+"/sys/kernel/debug/tracing/events/%s/id", probe_name);
+   efd = open(buf, O_RDONLY, 0);
+   if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+   bytes = read(efd, buf, sizeof(buf));
+   close(efd);
+   if 

[PATCH bpf-next v5 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY

2018-05-24 Thread Yonghong Song
This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++
 3 files changed, 405 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES += $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4  += -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+   return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000..8381d79
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+   int __ret = !!(condition);  \
+   if (__ret) {\
+   printf("FAIL: %s:\n", __func__);\
+   perror(""); \
+   return -1;  \
+   }   \
+})
+
+#define CHECK_AND_RET(condition) ({\
+   int __ret = !!(condition);  \
+   if (__ret)  \
+   return -1;  \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+   return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   errno = 0;
+   ret = (int)strtol(buf, NULL, 10);
+   CHECK_PERROR_RET(errno);
+   return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+   CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+   e

[PATCH bpf-next v5 7/7] tools/bpftool: add perf subcommand

2018-05-24 Thread Yonghong Song
The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe: trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe: trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  
[{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0},
 \
   
{"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
 \
   
{"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"},
 \
   
{"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
  loaded_at 2018-05-15T04:46:37-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
  loaded_at 2018-05-15T04:48:32-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
  loaded_at 2018-05-15T04:48:48-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
  loaded_at 2018-05-15T04:49:52-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0T  0:03 python ./trace.py __x64_sys_write
  21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1S+ 0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicin...@netronome.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 246 +++
 6 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst 
b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 000..e3eb0ea
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+
+bpftool-perf
+
+---
+tool for inspection of perf related bpf prog attachments
+---
+
+:Manual section: 8
+
+SYNOPSIS
+
+
+   **bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+   *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+   *COMMANDS* :=
+   { **show** | **list** | **help** }
+
+PERF COMMANDS
+=
+
+|  **bpftool** **perf { show | list }**
+|  **bpftool** **perf help**
+
+DESCRIPTION
+===
+   **bpftool perf { show | list }**
+ List all raw_tracepoint, tracepoint, kprobe attachment in the 
system.
+
+ Output will start with process id and file descriptor in that 
process,
+ followed by bpf program id, attachment information, and 
attachment point.
+ The attachment point for raw_tracepoint/tracepoint is the 
trace probe name.
+ The attachment point for k[ret]probe is either symbol name 
and offset,
+ or a kernel virtual address.
+ The attachment point for u[ret]probe is the file name and the 
file offset.
+
+   **bpftool perf help**
+ Print short help message.
+
+OPTIONS
+===
+

[PATCH bpf-next v5 3/7] tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in libbpf

2018-05-24 Thread Yonghong Song
Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_task_fd_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Acked-by: Martin KaFai Lau <ka...@fb.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/include/uapi/linux/bpf.h | 26 ++
 tools/lib/bpf/bpf.c| 23 +++
 tools/lib/bpf/bpf.h|  3 +++
 3 files changed, 52 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e95fec9..9b8c6e3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -380,6 +381,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   __u32   pid;/* input: pid */
+   __u32   fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input/output: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prod_id */
+   __u32   fd_type;/* output: BPF_FD_TYPE_* */
+   __u64   probe_offset;   /* output: probe_offset */
+   __u64   probe_addr; /* output: probe_addr */
+   } task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2557,4 +2574,13 @@ struct bpf_fib_lookup {
__u8dmac[6]; /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+   BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
+   BPF_FD_TYPE_TRACEPOINT, /* tp name */
+   BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
+   BPF_FD_TYPE_KRETPROBE,  /* (symbol + offset) or addr */
+   BPF_FD_TYPE_UPROBE, /* filename + offset */
+   BPF_FD_TYPE_URETPROBE,  /* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 442b4cd..9ddc89d 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,26 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, 
__u32 log_buf_size,
 
return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+ __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+ __u64 *probe_addr)
+{
+   union bpf_attr attr = {};
+   int err;
+
+   attr.task_fd_query.pid = pid;
+   attr.task_fd_query.fd = fd;
+   attr.task_fd_query.flags = flags;
+   attr.task_fd_query.buf = ptr_to_u64(buf);
+   attr.task_fd_query.buf_len = *buf_len;
+
+   err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+   *buf_len = attr.task_fd_query.buf_len;
+   *prog_id = attr.task_fd_query.prog_id;
+   *fd_type = attr.task_fd_query.fd_type;
+   *probe_offset = attr.task_fd_query.probe_offset;
+   *probe_addr = attr.task_fd_query.probe_addr;
+
+   return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index d12344f..0639a30 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type 
type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+ __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+ __u64 *probe_addr);
 #endif
-- 
2.9.5



[PATCH bpf-next v5 4/7] tools/bpf: add ksym_get_addr() in trace_helpers

2018-05-24 Thread Yonghong Song
Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Acked-by: Martin KaFai Lau <ka...@fb.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c 
b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
	return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+   int i;
+
+   for (i = 0; i < sym_cnt; i++) {
+   if (strcmp(syms[i].name, name) == 0)
+   return syms[i].addr;
+   }
+
+   return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h 
b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5



[PATCH bpf-next v5 0/7] bpf: implement BPF_TASK_FD_QUERY

2018-05-24 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Changelogs:
  v4 -> v5:
 . return strlen(buf) instead of strlen(buf) + 1 
   in the attr.buf_len. As long as user provides
   non-empty buffer, it will be filed with empty
   string, truncated string, or full string
   based on the buffer size and the length of
   to-be-copied string.
  v3 -> v4:
 . made attr buf_len input/output. The length of
   actual buffter is written to buf_len so user space knows
   what is actually needed. If user provides a buffer
   with length >= 1 but less than required, do partial
   copy and return -ENOSPC.
 . code simplification with put_user.
 . changed query result attach_info to fd_type.
 . add tests at selftests/bpf to test zero len, null buf and
   insufficient buf.
  v2 -> v3:
 . made perf_get_event() return perf_event pointer const.
   this was to ensure that event fields are not meddled.
 . detect whether newly BPF_TASK_FD_QUERY is supported or
   not in "bpftool perf" and warn users if it is not.
  v1 -> v2:
 . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
   to BPF_TASK_FD_QUERY.
 . fixed various "bpftool perf" issues and added documentation
   and auto-completion.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
file
  bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in
libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
  tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h   |   5 +
 include/linux/trace_events.h |  17 +
 include/uapi/linux/bpf.h |  26 ++
 kernel/bpf/syscall.c | 131 
 kernel/events/core.c |   8 +
 kernel/trace/bpf_trace.c |  48 +++
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 ++
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 246 +++
 tools/include/uapi/linux/bpf.h   |  26 ++
 tools/lib/bpf/bpf.c  |  23 ++
 tools/lib/bpf/bpf.h  |   3 +
 tools/testing/selftests/bpf/test_progs.c | 158 ++
 tools/testing/selftests/bpf/trace_helpers.c  |  12 +
 tools/testing/selftests/bpf/trace_helpers.h  |   1 +
 23 files changed, 1257 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/task_fd_query_kern.c

[PATCH bpf-next v5 1/7] perf/core: add perf_get_event() to return perf_event given a struct file

2018-05-24 Thread Yonghong Song
A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/perf_event.h | 5 +
 kernel/events/core.c   | 8 
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..eec302b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct 
task_struct *child)   { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
 static inline struct file *perf_event_get(unsigned int fd) { return 
ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+   return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
 {
return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..6eeab86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+   if (file->f_op != &perf_fops)
+   return ERR_PTR(-EINVAL);
+
+   return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
if (!event)
-- 
2.9.5



[PATCH bpf-next v5 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-24 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Acked-by: Martin KaFai Lau <ka...@fb.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/trace_events.h |  17 ++
 include/uapi/linux/bpf.h |  26 +
 kernel/bpf/syscall.c | 131 +++
 kernel/trace/bpf_trace.c |  48 
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 
 6 files changed, 273 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..d34144a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+   u32 *fd_type, const char **buf,
+   u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
 {
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
 {
return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+ u32 *prog_id, u32 *fd_type,
+ const char **buf, u64 *probe_offset,
+ u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+  u32 *fd_type, const char **symbol,
+  u64 *probe_offset, u64 *probe_addr,
+  bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+  u32 *fd_type, const char **filename,
+  u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e95fec9..9b8c6e3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -380,6 +381,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   __u32   pid;/* input: pid */
+   __u32   fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input/output: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+   

Re: [PATCH net-next] tcp: use data length instead of skb->len in tcp_probe

2018-05-24 Thread Song Liu


> On May 24, 2018, at 5:48 AM, Yafang Shao <laoar.s...@gmail.com> wrote:
> 
> skb->len is meaningless to user.
> data length could be more helpful, with which we can easily filter out
> the packet without payload.
> 
> Signed-off-by: Yafang Shao <laoar.s...@gmail.com>
> ---
> include/trace/events/tcp.h | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
> index c1a5284..259b991 100644
> --- a/include/trace/events/tcp.h
> +++ b/include/trace/events/tcp.h
> @@ -261,7 +261,7 @@
>   __entry->dport = ntohs(inet->inet_dport);
>   __entry->mark = skb->mark;
> 
> - __entry->length = skb->len;
> + __entry->length = skb->len - tcp_hdrlen(skb);

We should also rename __entry->length to __entry->data_len, so that whoever
using this field will notice the change. 

Thanks,
Song


>   __entry->snd_nxt = tp->snd_nxt;
>   __entry->snd_una = tp->snd_una;
>   __entry->snd_cwnd = tp->snd_cwnd;
> @@ -272,7 +272,7 @@
>   __entry->sock_cookie = sock_gen_cookie(sk);
>   ),
> 
> - TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x 
> snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u 
> sock_cookie=%llx",
> + TP_printk("src=%pISpc dest=%pISpc mark=%#x data_len=%d snd_nxt=%#x 
> snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u rcv_wnd=%u 
> sock_cookie=%llx",
> __entry->saddr, __entry->daddr, __entry->mark,
> __entry->length, __entry->snd_nxt, __entry->snd_una,
> __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
> -- 
> 1.8.3.1
> 



Re: [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-24 Thread Yonghong Song



On 5/23/18 10:07 PM, Martin KaFai Lau wrote:

On Wed, May 23, 2018 at 05:18:42PM -0700, Yonghong Song wrote:

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
  include/linux/trace_events.h |  17 +++
  include/uapi/linux/bpf.h |  26 ++
  kernel/bpf/syscall.c | 115 +++
  kernel/trace/bpf_trace.c |  48 ++
  kernel/trace/trace_kprobe.c  |  29 +++
  kernel/trace/trace_uprobe.c  |  22 +
  6 files changed, 257 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..d34144a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog 
*prog);
  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+   u32 *fd_type, const char **buf,
+   u64 *probe_offset, u64 *probe_addr);
  #else
  static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
  {
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
  {
return NULL;
  }
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+ u32 *prog_id, u32 *fd_type,
+ const char **buf, u64 *probe_offset,
+ u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
  #endif
  
  enum {

@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
  #ifdef CONFIG_KPROBE_EVENTS
  extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
  extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+  u32 *fd_type, const char **symbol,
+  u64 *probe_offset, u64 *probe_addr,
+  bool perf_type_tracepoint);
  #endif
  #ifdef CONFIG_UPROBE_EVENTS
  extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
  extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+  u32 *fd_type, const char **filename,
+  u64 *probe_offset, bool perf_type_tracepoint);
  #endif
  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
  };
  
  enum bpf_map_type {

@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   __u32   pid;/* input: pid */
+   __u32   fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input/output: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+

[PATCH bpf-next v4 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY

2018-05-23 Thread Yonghong Song
This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++
 3 files changed, 405 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES += $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4  += -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+   return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000..8381d79
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+   int __ret = !!(condition);  \
+   if (__ret) {\
+   printf("FAIL: %s:\n", __func__);\
+   perror(""); \
+   return -1;  \
+   }   \
+})
+
+#define CHECK_AND_RET(condition) ({\
+   int __ret = !!(condition);  \
+   if (__ret)  \
+   return -1;  \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+   return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   errno = 0;
+   ret = (int)strtol(buf, NULL, 10);
+   CHECK_PERROR_RET(errno);
+   return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+   CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+   e

[PATCH bpf-next v4 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs

2018-05-23 Thread Yonghong Song
The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 158 +++
 1 file changed, 158 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..c9fd351 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,162 @@ static void test_get_stack_raw_tp(void)
bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+   const char *file = "./test_get_stack_rawtp.o";
+   __u64 probe_offset, probe_addr;
+   __u32 len, prog_id, fd_type;
+   struct bpf_object *obj;
+   int efd, err, prog_fd;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+   if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+   return;
+
+   efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+   if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+
+   /* query (getpid(), efd) */
+   len = sizeof(buf);
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+ errno))
+   goto close_prog;
+
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ strcmp(buf, "sys_enter") == 0;
+   if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+ fd_type, buf))
+   goto close_prog;
+
+   /* test zero len */
+   len = 0;
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == (strlen("sys_enter") + 1);
+   if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+   goto close_prog;
+
+   /* test empty buffer */
+   len = sizeof(buf);
+   err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == (strlen("sys_enter") + 1);
+   if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+   goto close_prog;
+
+   /* test smaller buffer */
+   len = 2;
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+   &fd_type, &probe_offset, &probe_addr);
+   if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 2)",
+ "err %d errno %d\n", err, errno))
+   goto close_prog;
+   err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == (strlen("sys_enter") + 1) &&
+ strncmp(buf, "sys_enter", 2) == 0;
+   if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+   goto close_prog;
+
+   goto close_prog_noerr;
+close_prog:
+   error_cnt++;
+close_prog_noerr:
+   bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+  const char *tp_name)
+{
+   const char *file = "./test_tracepoint.o";
+   int err, bytes, efd, prog_fd, pmu_fd;
+   struct perf_event_attr attr = {};
+   __u64 probe_offset, probe_addr;
+   __u32 len, prog_id, fd_type;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+   if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+   goto close_prog;
+
+   snprintf(buf, sizeof(buf),
+"/sys/kernel/debug/tracing/events/%s/id", probe_name);
+   efd = open(buf, O_RDONLY, 0);
+   if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+   bytes = read(efd, buf, sizeof(buf)

[PATCH bpf-next v4 7/7] tools/bpftool: add perf subcommand

2018-05-23 Thread Yonghong Song
The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe: trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe: trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  
[{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0},
 \
   
{"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
 \
   
{"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"},
 \
   
{"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
  loaded_at 2018-05-15T04:46:37-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
  loaded_at 2018-05-15T04:48:32-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
  loaded_at 2018-05-15T04:48:48-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
  loaded_at 2018-05-15T04:49:52-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0T  0:03 python ./trace.py __x64_sys_write
  21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1S+ 0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicin...@netronome.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 246 +++
 6 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst 
b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 000..e3eb0ea
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+
+bpftool-perf
+
+---
+tool for inspection of perf related bpf prog attachments
+---
+
+:Manual section: 8
+
+SYNOPSIS
+
+
+   **bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+   *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+   *COMMANDS* :=
+   { **show** | **list** | **help** }
+
+PERF COMMANDS
+=
+
+|  **bpftool** **perf { show | list }**
+|  **bpftool** **perf help**
+
+DESCRIPTION
+===
+   **bpftool perf { show | list }**
+ List all raw_tracepoint, tracepoint, kprobe attachment in the 
system.
+
+ Output will start with process id and file descriptor in that 
process,
+ followed by bpf program id, attachment information, and 
attachment point.
+ The attachment point for raw_tracepoint/tracepoint is the 
trace probe name.
+ The attachment point for k[ret]probe is either symbol name 
and offset,
+ or a kernel virtual address.
+ The attachment point for u[ret]probe is the file name and the 
file offset.
+
+   **bpftool perf help**
+ Print short help message.
+
+OPTIONS
+===
+

[PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-23 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/trace_events.h |  17 +++
 include/uapi/linux/bpf.h |  26 ++
 kernel/bpf/syscall.c | 115 +++
 kernel/trace/bpf_trace.c |  48 ++
 kernel/trace/trace_kprobe.c  |  29 +++
 kernel/trace/trace_uprobe.c  |  22 +
 6 files changed, 257 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..d34144a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+   u32 *fd_type, const char **buf,
+   u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
 {
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
 {
return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+ u32 *prog_id, u32 *fd_type,
+ const char **buf, u64 *probe_offset,
+ u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+  u32 *fd_type, const char **symbol,
+  u64 *probe_offset, u64 *probe_addr,
+  bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+  u32 *fd_type, const char **filename,
+  u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   __u32   pid;/* input: pid */
+   __u32   fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input/output: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*

[PATCH bpf-next v4 1/7] perf/core: add perf_get_event() to return perf_event given a struct file

2018-05-23 Thread Yonghong Song
A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/perf_event.h | 5 +
 kernel/events/core.c   | 8 
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..eec302b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct 
task_struct *child)   { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
 static inline struct file *perf_event_get(unsigned int fd) { return 
ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+   return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
 {
return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..6eeab86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+   if (file->f_op != &perf_fops)
+   return ERR_PTR(-EINVAL);
+
+   return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
if (!event)
-- 
2.9.5



[PATCH bpf-next v4 3/7] tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in libbpf

2018-05-23 Thread Yonghong Song
Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_task_fd_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/include/uapi/linux/bpf.h | 26 ++
 tools/lib/bpf/bpf.c| 23 +++
 tools/lib/bpf/bpf.h|  3 +++
 3 files changed, 52 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   __u32   pid;/* input: pid */
+   __u32   fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input/output: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prod_id */
+   __u32   fd_type;/* output: BPF_FD_TYPE_* */
+   __u64   probe_offset;   /* output: probe_offset */
+   __u64   probe_addr; /* output: probe_addr */
+   } task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
__u8dmac[6]; /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+   BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
+   BPF_FD_TYPE_TRACEPOINT, /* tp name */
+   BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
+   BPF_FD_TYPE_KRETPROBE,  /* (symbol + offset) or addr */
+   BPF_FD_TYPE_UPROBE, /* filename + offset */
+   BPF_FD_TYPE_URETPROBE,  /* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 442b4cd..9ddc89d 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,26 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, 
__u32 log_buf_size,
 
return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+ __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+ __u64 *probe_addr)
+{
+   union bpf_attr attr = {};
+   int err;
+
+   attr.task_fd_query.pid = pid;
+   attr.task_fd_query.fd = fd;
+   attr.task_fd_query.flags = flags;
+   attr.task_fd_query.buf = ptr_to_u64(buf);
+   attr.task_fd_query.buf_len = *buf_len;
+
+   err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+   *buf_len = attr.task_fd_query.buf_len;
+   *prog_id = attr.task_fd_query.prog_id;
+   *fd_type = attr.task_fd_query.fd_type;
+   *probe_offset = attr.task_fd_query.probe_offset;
+   *probe_addr = attr.task_fd_query.probe_addr;
+
+   return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index d12344f..0639a30 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type 
type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+ __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+ __u64 *probe_addr);
 #endif
-- 
2.9.5



[PATCH bpf-next v4 4/7] tools/bpf: add ksym_get_addr() in trace_helpers

2018-05-23 Thread Yonghong Song
Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Acked-by: Martin KaFai Lau <ka...@fb.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c 
b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+   int i;
+
+   for (i = 0; i < sym_cnt; i++) {
+   if (strcmp(syms[i].name, name) == 0)
+   return syms[i].addr;
+   }
+
+   return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h 
b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5



[PATCH bpf-next v4 0/7] bpf: implement BPF_TASK_FD_QUERY

2018-05-23 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Changelogs:
  v3 -> v4:
 . made attr buf_len input/output. The length of
   actual buffter is written to buf_len so user space knows
   what is actually needed. If user provides a buffer
   with length >= 1 but less than required, do partial
   copy and return -ENOSPC.
 . code simplification with put_user.
 . changed query result attach_info to fd_type.
 . add tests at selftests/bpf to test zero len, null buf and
   insufficient buf.
  v2 -> v3:
 . made perf_get_event() return perf_event pointer const.
   this was to ensure that event fields are not meddled.
 . detect whether newly BPF_TASK_FD_QUERY is supported or
   not in "bpftool perf" and warn users if it is not.
  v1 -> v2:
 . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
   to BPF_TASK_FD_QUERY.
 . fixed various "bpftool perf" issues and added documentation
   and auto-completion.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
file
  bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in
libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
  tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h   |   5 +
 include/linux/trace_events.h |  17 +
 include/uapi/linux/bpf.h |  26 ++
 kernel/bpf/syscall.c | 115 +++
 kernel/events/core.c |   8 +
 kernel/trace/bpf_trace.c |  48 +++
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 ++
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 246 +++
 tools/include/uapi/linux/bpf.h   |  26 ++
 tools/lib/bpf/bpf.c  |  23 ++
 tools/lib/bpf/bpf.h  |   3 +
 tools/testing/selftests/bpf/test_progs.c | 158 ++
 tools/testing/selftests/bpf/trace_helpers.c  |  12 +
 tools/testing/selftests/bpf/trace_helpers.h  |   1 +
 23 files changed, 1241 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

-- 
2.9.5



Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-23 Thread Yonghong Song



On 5/23/18 2:04 PM, Alexei Starovoitov wrote:

On Wed, May 23, 2018 at 10:13:22AM -0700, Martin KaFai Lau wrote:

+   __u32   prog_id;/* output: prod_id */
+   __u32   attach_info;/* output: BPF_ATTACH_* */
+   __u64   probe_offset;   /* output: probe_offset */
+   __u64   probe_addr; /* output: probe_addr */
+   } task_fd_query;
  } __attribute__((aligned(8)));
  
  /* The description below is an attempt at providing documentation to eBPF

@@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
__u8dmac[6]; /* ETH_ALEN */
  };
  
+/* used by <task, fd> based query */

+enum {

Nit. Instead of a comment, is it better to give this
enum a descriptive name?


+   BPF_ATTACH_RAW_TRACEPOINT,  /* tp name */
+   BPF_ATTACH_TRACEPOINT,  /* tp name */
+   BPF_ATTACH_KPROBE,  /* (symbol + offset) or addr */
+   BPF_ATTACH_KRETPROBE,   /* (symbol + offset) or addr */
+   BPF_ATTACH_UPROBE,  /* filename + offset */
+   BPF_ATTACH_URETPROBE,   /* filename + offset */
+};


One more nit here.
Can we come up with better names for the above?
'attach' is a verb. I cannot help but read above as it's an action
for the kernel to attach to something and not the type of event
where the program was attached to.
Since we pass task+fd into that BPF_TASK_FD_QUERY command how
about returning BPF_FD_TYPE_KPROBE, BPF_FD_TYPE_TRACEPOINT, ... ?


Okay will use BPF_FD_TYPE_*... which is indeed better than
BPF_ATTACH_*.


Re: [PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-23 Thread Yonghong Song



On 5/23/18 10:13 AM, Martin KaFai Lau wrote:

On Tue, May 22, 2018 at 09:30:46AM -0700, Yonghong Song wrote:

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

LGTM, some comments inline.



Signed-off-by: Yonghong Song <y...@fb.com>
---
  include/linux/trace_events.h |  16 ++
  include/uapi/linux/bpf.h |  27 ++
  kernel/bpf/syscall.c | 124 +++
  kernel/trace/bpf_trace.c |  48 +
  kernel/trace/trace_kprobe.c  |  29 ++
  kernel/trace/trace_uprobe.c  |  22 
  6 files changed, 266 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..eab806d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
  int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
  int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog 
*prog);
  struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+   u32 *attach_info, const char **buf,
+   u64 *probe_offset, u64 *probe_addr);

The first arg is 'const struct perf_event *event' while...


  #else
  static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
  {
@@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
  {
return NULL;
  }
+static inline int bpf_get_perf_event_info(const struct file *file, u32 
*prog_id,

this one has 'const struct file *file'?


Thanks for catching this. Will correct this in the next revision.




+ u32 *attach_info, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
  #endif
  
  enum {

@@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
  #ifdef CONFIG_KPROBE_EVENTS
  extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
  extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+  u32 *attach_info, const char **symbol,
+  u64 *probe_offset, u64 *probe_addr,
+  bool perf_type_tracepoint);
  #endif
  #ifdef CONFIG_UPROBE_EVENTS
  extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
  extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+  u32 *attach_info, const char **filename,
+  u64 *probe_offset, bool perf_type_tracepoint);
  #endif
  extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bb..a602150 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
  };
  
  enum bpf_map_type {

@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */

Should fd and pid be always positive?
The current fd (like map_fd) in bpf_attr is using __u32.


Will change both pid and fd to __u32. In kernel fd 

Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino

2018-05-22 Thread Y Song
On Tue, May 22, 2018 at 8:35 PM, Alexei Starovoitov
<alexei.starovoi...@gmail.com> wrote:
> On Tue, May 22, 2018 at 08:33:24PM -0700, Y Song wrote:
>> +   struct cgroup *cgrp = task_dfl_cgroup(current);
>> +   if (!cgrp)
>> +   return -EINVAL;
>
> why this check is needed?

No reason :-) Originally I am concerned whether it is possible cgrp
could be NULL.
By looking at the code, it SEEMS to me that it could not be NULL, but I am not
100% sure (as I am not a cgroup expert). Since you are asking,
probably means it cannot be NULL, so will remove it in formal upstream patch.


Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino

2018-05-22 Thread Y Song
I did a quick prototyping and the above interface seems working fine.

The kernel change:
===

[yhs@localhost bpf-next]$ git diff
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bbe2ca5..669b7383fddb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1976,7 +1976,8 @@ union bpf_attr {
FN(fib_lookup), \
FN(sock_hash_update),   \
FN(msg_redirect_hash),  \
-   FN(sk_redirect_hash),
+   FN(sk_redirect_hash),   \
+   FN(get_current_cgroup_id),

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ce2cbbff27e4..e11e3298f911 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -493,6 +493,21 @@ static const struct bpf_func_proto
bpf_current_task_under_cgroup_proto = {
.arg2_type  = ARG_ANYTHING,
 };

+BPF_CALL_0(bpf_get_current_cgroup_id)
+{
+   struct cgroup *cgrp = task_dfl_cgroup(current);
+   if (!cgrp)
+   return -EINVAL;
+
+   return cgrp->kn->id.id;
+}
+
+static const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
+   .func   = bpf_get_current_cgroup_id,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+};
+
 BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
   const void *, unsafe_ptr)
 {
@@ -563,6 +578,8 @@ tracing_func_proto(enum bpf_func_id func_id, const
struct bpf_prog *prog)
return &bpf_get_prandom_u32_proto;
case BPF_FUNC_probe_read_str:
return &bpf_probe_read_str_proto;
+   case BPF_FUNC_get_current_cgroup_id:
+   return &bpf_get_current_cgroup_id_proto;
default:
return NULL;
}

The following program can be used to print out a cgroup id given a cgroup path.
[yhs@localhost cg]$ cat get_cgroup_id.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
int dirfd, err, flags, mount_id, fhsize;
struct file_handle *fhp;
char *pathname;

if (argc != 2) {
printf("usage: %s <cgroup_path>\n", argv[0]);
return 1;
}

pathname = argv[1];
dirfd = AT_FDCWD;
flags = 0;

fhsize = sizeof(*fhp);
fhp = malloc(fhsize);
if (!fhp)
return 1;

err = name_to_handle_at(dirfd, pathname, fhp, &mount_id, flags);
if (err >= 0) {
printf("error\n");
return 1;
}

fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
fhp = realloc(fhp, fhsize);
if (!fhp)
return 1;

err = name_to_handle_at(dirfd, pathname, fhp, &mount_id, flags);
if (err < 0)
perror("name_to_handle_at");
else {
int i;

printf("dir = %s, mount_id = %d\n", pathname, mount_id);
printf("handle_bytes = %d, handle_type = %d\n", fhp->handle_bytes,
fhp->handle_type);
if (fhp->handle_bytes != 8)
return 1;

printf("cgroup_id = 0x%llx\n", *(unsigned long long *)fhp->f_handle);
}

return 0;
}
[yhs@localhost cg]$

Given a cgroup path, the user can get cgroup_id and use it in their bpf
program for filtering purpose.

I run a simple program t.c
   int main() { while(1) sleep(1); return 0; }
in the cgroup v2 directory /home/yhs/tmp/yhs
   none on /home/yhs/tmp type cgroup2 (rw,relatime,seclabel)

$ ./get_cgroup_id /home/yhs/tmp/yhs
dir = /home/yhs/tmp/yhs, mount_id = 124
handle_bytes = 8, handle_type = 1
cgroup_id = 0x106b2

// the below command to get cgroup_id from the kernel for the
// process compiled with t.c and ran under /home/yhs/tmp/yhs:
$ sudo ./trace.py -p 4067 '__x64_sys_nanosleep "cgid = %llx", $cgid'
PID TID COMMFUNC -
40674067a.out   __x64_sys_nanosleep cgid = 106b2
40674067a.out   __x64_sys_nanosleep cgid = 106b2
40674067a.out   __x64_sys_nanosleep cgid = 106b2
^C[yhs@localhost tools]$

The kernel and user space cgid matches. Will provide a
formal patch later.




On Mon, May 21, 2018 at 5:24 PM, Y Song <ys114...@gmail.com> wrote:
> On Mon, May 21, 2018 at 9:26 AM, Alexei Starovoitov
> <alexei.starovoi...@gmail.com> wrote:
>> On Sun, May 13, 2018 at 07:33:18PM +0200, Alban Crequy wrote:
>>>
>>> +BPF_CALL_2(bpf_get_current_cgroup_ino, u32, hierarchy, u64, flags)
>>> +{
>>> + // TODO: pick the correct hierarchy instead of the mem controller
>>> + struct cgroup *cgrp = task_cgroup(current, memory_cgrp_id);
>>> +
>>> + if (unlikely(!cgrp))
>>> + return -EINVAL;
>>> + if (unlikely(hierarchy))
>>> + return -EINVAL;
>>> + if (u

[PATCH bpf-next v3 7/7] tools/bpftool: add perf subcommand

2018-05-22 Thread Yonghong Song
The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe: trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe: trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  
[{"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0},
 \
   
{"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
 \
   
{"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
 \
   
{"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
  loaded_at 2018-05-15T04:46:37-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
  loaded_at 2018-05-15T04:48:32-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
  loaded_at 2018-05-15T04:48:48-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
  loaded_at 2018-05-15T04:49:52-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0T  0:03 python ./trace.py __x64_sys_write
  21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1S+ 0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicin...@netronome.com>
Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 244 +++
 6 files changed, 341 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst 
b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 000..3e65375
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+
+bpftool-perf
+
+---
+tool for inspection of perf related bpf prog attachments
+---
+
+:Manual section: 8
+
+SYNOPSIS
+
+
+   **bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+   *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+   *COMMANDS* :=
+   { **show** | **list** | **help** }
+
+PERF COMMANDS
+=============
+
+|  **bpftool** **perf { show | list }**
+|  **bpftool** **perf help**
+
+DESCRIPTION
+===========
+   **bpftool perf { show | list }**
+ List all raw_tracepoint, tracepoint, kprobe attachment in the 
system.
+
+ Output will start with process id and file descriptor in that 
process,
+ followed by bpf program id, attachment information, and 
attachment point.
+ The attachment point for raw_tracepoint/tracepoint is the 
trace probe name.
+ The attachment point for k[ret]probe is either symbol name 
and offset,
+ or a kernel virtual address.
+ The attachment point for u[ret]probe is the file name and the 
file offset.
+
+   **bpftool perf help**
+ Print short help message.
+
+OPTIONS

[PATCH bpf-next v3 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs

2018-05-22 Thread Yonghong Song
The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 133 +++
 1 file changed, 133 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..f7ede03 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,137 @@ static void test_get_stack_raw_tp(void)
bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+   const char *file = "./test_get_stack_rawtp.o";
+   struct perf_event_attr attr = {};
+   __u64 probe_offset, probe_addr;
+   int efd, err, prog_fd, pmu_fd;
+   __u32 prog_id, attach_info;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, , _fd);
+   if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+   return;
+
+   efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+   if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.type = PERF_TYPE_SOFTWARE;
+   attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+   pmu_fd = syscall(__NR_perf_event_open, , getpid(), -1, -1, 0);
+   if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+ errno))
+   goto close_prog;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+   if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+
+   /* query (getpid(), efd) */
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, 256, _id,
+   _info, _offset, _addr);
+   if (CHECK(err < 0, "bpf_trace_event_query", "err %d errno %d\n", err,
+ errno))
+   goto close_prog;
+
+   err = (attach_info == BPF_ATTACH_RAW_TRACEPOINT) &&
+ (strcmp(buf, "sys_enter") == 0);
+   if (CHECK(!err, "check_results", "attach_info %d tp_name %s\n",
+ attach_info, buf))
+   goto close_prog;
+
+   goto close_prog_noerr;
+close_prog:
+   error_cnt++;
+close_prog_noerr:
+   bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+  const char *tp_name)
+{
+   const char *file = "./test_tracepoint.o";
+   int err, bytes, efd, prog_fd, pmu_fd;
+   struct perf_event_attr attr = {};
+   __u64 probe_offset, probe_addr;
+   __u32 prog_id, attach_info;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, , _fd);
+   if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+   goto close_prog;
+
+   snprintf(buf, sizeof(buf),
+"/sys/kernel/debug/tracing/events/%s/id", probe_name);
+   efd = open(buf, O_RDONLY, 0);
+   if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+   bytes = read(efd, buf, sizeof(buf));
+   close(efd);
+   if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+ "bytes %d errno %d\n", bytes, errno))
+   goto close_prog;
+
+   attr.config = strtol(buf, NULL, 0);
+   attr.type = PERF_TYPE_TRACEPOINT;
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.sample_period = 1;
+   attr.wakeup_events = 1;
+   pmu_fd = syscall(__NR_perf_event_open, , -1 /* pid */,
+0 /* cpu 0 */, -1 /* group id */,
+0 /* flags */);
+   if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+   goto close_pmu;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+   if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+ errno))
+   goto close_pmu;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+   if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+ errno))
+   goto close_pmu;
+
+   /* query (getpid(), pmu_fd) */
+   err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, 256, _id,
+

[PATCH bpf-next v3 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY

2018-05-22 Thread Yonghong Song
This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 379 +++
 3 files changed, 402 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES += $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4  += -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+   return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000..792ef24
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+   int __ret = !!(condition);  \
+   if (__ret) {\
+   printf("FAIL: %s:\n", __func__);\
+   perror(""); \
+   return -1;  \
+   }   \
+})
+
+#define CHECK_AND_RET(condition) ({\
+   int __ret = !!(condition);  \
+   if (__ret)  \
+   return -1;  \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+   return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   errno = 0;
+   ret = (int)strtol(buf, NULL, 10);
+   CHECK_PERROR_RET(errno);
+   return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+   CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+   

[PATCH bpf-next v3 0/7] bpf: implement BPF_TASK_FD_QUERY

2018-05-22 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approach. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Changelogs:
  v2 -> v3:
 . made perf_get_event() return perf_event pointer const.
   this was to ensure that event fields are not meddled.
 . detect whether newly BPF_TASK_FD_QUERY is supported or
   not in "bpftool perf" and warn users if it is not.
  v1 -> v2:
 . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
   to BPF_TASK_FD_QUERY.
 . fixed various "bpftool perf" issues and added documentation
   and auto-completion.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
file
  bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in
libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
  tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h   |   5 +
 include/linux/trace_events.h |  16 +
 include/uapi/linux/bpf.h |  27 ++
 kernel/bpf/syscall.c | 124 
 kernel/events/core.c |   8 +
 kernel/trace/bpf_trace.c |  48 +++
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 ++
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 379 +++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 244 +++
 tools/include/uapi/linux/bpf.h   |  27 ++
 tools/lib/bpf/bpf.c  |  24 ++
 tools/lib/bpf/bpf.h  |   3 +
 tools/testing/selftests/bpf/test_progs.c | 133 
 tools/testing/selftests/bpf/trace_helpers.c  |  12 +
 tools/testing/selftests/bpf/trace_helpers.h  |   1 +
 23 files changed, 1222 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

-- 
2.9.5



[PATCH bpf-next v3 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-22 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approach. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/trace_events.h |  16 ++
 include/uapi/linux/bpf.h |  27 ++
 kernel/bpf/syscall.c | 124 +++
 kernel/trace/bpf_trace.c |  48 +
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 
 6 files changed, 266 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..eab806d 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+   u32 *attach_info, const char **buf,
+   u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
 {
@@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
 {
return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct file *file, u32 
*prog_id,
+ u32 *attach_info, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +569,17 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+  u32 *attach_info, const char **symbol,
+  u64 *probe_offset, u64 *probe_addr,
+  bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+  u32 *attach_info, const char **filename,
+  u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97446bb..a602150 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   pro

[PATCH bpf-next v3 1/7] perf/core: add perf_get_event() to return perf_event given a struct file

2018-05-22 Thread Yonghong Song
A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/perf_event.h | 5 +
 kernel/events/core.c   | 8 
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..eec302b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct 
task_struct *child)   { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
 static inline struct file *perf_event_get(unsigned int fd) { return 
ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+   return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
 {
return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..6eeab86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+   if (file->f_op != &perf_fops)
+   return ERR_PTR(-EINVAL);
+
+   return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
if (!event)
-- 
2.9.5



[PATCH bpf-next v3 3/7] tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in libbpf

2018-05-22 Thread Yonghong Song
Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_trace_event_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/include/uapi/linux/bpf.h | 27 +++
 tools/lib/bpf/bpf.c| 24 
 tools/lib/bpf/bpf.h|  3 +++
 3 files changed, 54 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97446bb..a602150 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prod_id */
+   __u32   attach_info;/* output: BPF_ATTACH_* */
+   __u64   probe_offset;   /* output: probe_offset */
+   __u64   probe_addr; /* output: probe_addr */
+   } task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,14 @@ struct bpf_fib_lookup {
__u8dmac[6]; /* ETH_ALEN */
 };
 
+/* used by <task, fd> based query */
+enum {
+   BPF_ATTACH_RAW_TRACEPOINT,  /* tp name */
+   BPF_ATTACH_TRACEPOINT,  /* tp name */
+   BPF_ATTACH_KPROBE,  /* (symbol + offset) or addr */
+   BPF_ATTACH_KRETPROBE,   /* (symbol + offset) or addr */
+   BPF_ATTACH_UPROBE,  /* filename + offset */
+   BPF_ATTACH_URETPROBE,   /* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6a8a000..da3f336 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,27 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, 
__u32 log_buf_size,
 
return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 buf_len,
+ __u32 *prog_id, __u32 *attach_info,
+ __u64 *probe_offset, __u64 *probe_addr)
+{
+   union bpf_attr attr = {};
+   int err;
+
+   attr.task_fd_query.pid = pid;
+   attr.task_fd_query.fd = fd;
+   attr.task_fd_query.flags = flags;
+   attr.task_fd_query.buf = ptr_to_u64(buf);
+   attr.task_fd_query.buf_len = buf_len;
+
+   err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+   if (!err) {
+   *prog_id = attr.task_fd_query.prog_id;
+   *attach_info = attr.task_fd_query.attach_info;
+   *probe_offset = attr.task_fd_query.probe_offset;
+   *probe_addr = attr.task_fd_query.probe_addr;
+   }
+
+   return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 15bff77..9adfde6 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type 
type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 buf_len,
+ __u32 *prog_id, __u32 *prog_info,
+ __u64 *probe_offset, __u64 *probe_addr);
 #endif
-- 
2.9.5



[PATCH bpf-next v3 4/7] tools/bpf: add ksym_get_addr() in trace_helpers

2018-05-22 Thread Yonghong Song
Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c 
b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
	return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+   int i;
+
+   for (i = 0; i < sym_cnt; i++) {
+   if (strcmp(syms[i].name, name) == 0)
+   return syms[i].addr;
+   }
+
+   return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h 
b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5



Re: [PATCH bpf-next 3/7] bpf: btf: Check array->index_type

2018-05-21 Thread Yonghong Song



On 5/18/18 5:16 PM, Martin KaFai Lau wrote:

Instead of ignoring the array->index_type field.  Enforce that
it must be an unsigned BTF_KIND_INT.

Signed-off-by: Martin KaFai Lau 
---
  kernel/bpf/btf.c | 83 
  1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 536e5981ad8c..b4e48dae2240 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -444,6 +444,28 @@ static const struct btf_type *btf_type_by_id(const struct 
btf *btf, u32 type_id)
return btf->types[type_id];
  }
  
+/*

+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+   u16 nr_bits, nr_bytes;
+   u32 int_data;
+
+   int_data = btf_type_int(t);
+   nr_bits = BTF_INT_BITS(int_data);
+   nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+   if (BITS_PER_BYTE_MASKED(nr_bits) ||
+   BTF_INT_OFFSET(int_data) ||
+   (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+   return false;
+   }
+
+   return true;
+}
+
  __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
  const char *fmt, ...)
  {
@@ -1309,14 +1331,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
  
-	/* We are a little forgiving on array->index_type since

-* the kernel is not using it.
-*/
-   /* Array elem cannot be in type void,
-* so !array->type is not allowed.
+   /* Array elem type and index type cannot be in type void,
+* so !array->type and !array->index_type are not allowed.
 */
if (!array->type || BTF_TYPE_PARENT(array->type)) {
-   btf_verifier_log_type(env, t, "Invalid type_id");
+   btf_verifier_log_type(env, t, "Invalid elem");
+   return -EINVAL;
+   }
+
+   if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+   btf_verifier_log_type(env, t, "Invalid index");
return -EINVAL;
}
  
@@ -1329,11 +1353,35 @@ static int btf_array_resolve(struct btf_verifier_env *env,

 const struct resolve_vertex *v)
  {
const struct btf_array *array = btf_type_array(v->t);
-   const struct btf_type *elem_type;
-   u32 elem_type_id = array->type;
+   const struct btf_type *elem_type, *index_type;
+   u32 elem_type_id, index_type_id;
struct btf *btf = env->btf;
u32 elem_size;
  
+	/* Check array->index_type */

+   index_type_id = array->index_type;
+   index_type = btf_type_by_id(btf, index_type_id);
+   if (btf_type_is_void_or_null(index_type)) {
+   btf_verifier_log_type(env, v->t, "Invalid index");
+   return -EINVAL;
+   }
+
+   if (!env_type_is_resolve_sink(env, index_type) &&
+   !env_type_is_resolved(env, index_type_id))
+   return env_stack_push(env, index_type, index_type_id);
+
+   index_type = btf_type_id_size(btf, &index_type_id, NULL);
+   if (!index_type || !btf_type_is_int(index_type) ||
+   /* bit field int is not allowed */
+   !btf_type_int_is_regular(index_type) ||
+   /* unsigned only */
+   BTF_INT_ENCODING(btf_type_int(index_type))) {


Could you explain why you only support array index type to be
unsigned? A lot of test cases  in Patch #7 are amended with unsigned 
types. In C, signed integers can surely be index, e.g., a[-1].



+   btf_verifier_log_type(env, v->t, "Invalid index");
+   return -EINVAL;
+   }
+
+   /* Check array->type */
+   elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
if (btf_type_is_void_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
@@ -1351,22 +1399,9 @@ static int btf_array_resolve(struct btf_verifier_env 
*env,
return -EINVAL;
}
  
-	if (btf_type_is_int(elem_type)) {

-   int int_type_data = btf_type_int(elem_type);
-   u16 nr_bits = BTF_INT_BITS(int_type_data);
-   u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-
-   /* Put more restriction on array of int.  The int cannot
-* be a bit field and it must be either u8/u16/u32/u64.
-*/
-   if (BITS_PER_BYTE_MASKED(nr_bits) ||
-   BTF_INT_OFFSET(int_type_data) ||
-   (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
-   btf_verifier_log_type(env, v->t,
- "Invalid array of int");
-   return -EINVAL;
-   

Re: [PATCH] [RFC] bpf: tracing: new helper bpf_get_current_cgroup_ino

2018-05-21 Thread Y Song
On Mon, May 21, 2018 at 9:26 AM, Alexei Starovoitov
 wrote:
> On Sun, May 13, 2018 at 07:33:18PM +0200, Alban Crequy wrote:
>>
>> +BPF_CALL_2(bpf_get_current_cgroup_ino, u32, hierarchy, u64, flags)
>> +{
>> + // TODO: pick the correct hierarchy instead of the mem controller
>> + struct cgroup *cgrp = task_cgroup(current, memory_cgrp_id);
>> +
>> + if (unlikely(!cgrp))
>> + return -EINVAL;
>> + if (unlikely(hierarchy))
>> + return -EINVAL;
>> + if (unlikely(flags))
>> + return -EINVAL;
>> +
>> + return cgrp->kn->id.ino;
>
> ino only is not enough to identify cgroup. It needs generation number too.
> I don't quite see how hierarchy and flags can be used in the future.
> Also why limit it to memcg?
>
> How about something like this instead:
>
> BPF_CALL_2(bpf_get_current_cgroup_id)
> {
> struct cgroup *cgrp = task_dfl_cgroup(current);
>
> return cgrp->kn->id.id;
> }
> The user space can use fhandle api to get the same 64-bit id.

I think this should work. This will also be useful to bcc as user
space can encode desired id
in the bpf program and compared that id to the current cgroup id, so we can have
cgroup level tracing (esp. stat collection) support. To cope with
cgroup hierarchy, user can use
cgroup-array based approach or explicitly compare against multiple cgroup id's.


Re: [PATCH bpf-next 4/7] bpf: btf: Remove unused bits from uapi/linux/btf.h

2018-05-21 Thread Yonghong Song



On 5/18/18 5:16 PM, Martin KaFai Lau wrote:

This patch does the following:
1. Limit BTF_MAX_TYPES and BTF_MAX_NAME_OFFSET to 64k.  We can
raise it later.

2. Remove the BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID.  They are
currently encoded at the highest bit of a u32.
It is because the current use case does not require supporting
parent type (i.e type_id referring to a type in another BTF file).
It also does not support referring to a string in ELF.

The BTF_TYPE_PARENT and BTF_STR_TBL_ELF_ID checks are replaced
by BTF_TYPE_ID_CHECK and BTF_STR_OFFSET_CHECK which are
defined in btf.c instead of uapi/linux/btf.h.

3. Limit the BTF_INFO_KIND from 5 bits to 4 bits which is enough.
There is unused bits headroom if we ever needed it later.

4. The root bit in BTF_INFO is also removed because it is not
used in the current use case.

The above can be added back later because the verifier
ensures the unused bits are zeros.

Signed-off-by: Martin KaFai Lau <ka...@fb.com>


Acked-by: Yonghong Song <y...@fb.com>


Re: [PATCH bpf-next 5/7] bpf: btf: Rename btf_key_id and btf_value_id in bpf_map_info

2018-05-21 Thread Yonghong Song



On 5/18/18 5:16 PM, Martin KaFai Lau wrote:

In "struct bpf_map_info", the name "btf_id", "btf_key_id" and "btf_value_id"
could cause confusion because the "id" of "btf_id" means the BPF obj id
given to the BTF object while
"btf_key_id" and "btf_value_id" means the BTF type id within
that BTF object.

To make it clear, btf_key_id and btf_value_id are
renamed to btf_key_type_id and btf_value_type_id.

Suggested-by: Daniel Borkmann <dan...@iogearbox.net>
Signed-off-by: Martin KaFai Lau <ka...@fb.com>

Acked-by: Yonghong Song <y...@fb.com>


Re: [PATCH bpf-next 3/7] bpf: btf: Check array->index_type

2018-05-21 Thread Yonghong Song



On 5/18/18 5:16 PM, Martin KaFai Lau wrote:

Instead of ignoring the array->index_type field.  Enforce that
it must be an unsigned BTF_KIND_INT.

Signed-off-by: Martin KaFai Lau 
---
  kernel/bpf/btf.c | 83 
  1 file changed, 59 insertions(+), 24 deletions(-)

diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 536e5981ad8c..b4e48dae2240 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -444,6 +444,28 @@ static const struct btf_type *btf_type_by_id(const struct 
btf *btf, u32 type_id)
return btf->types[type_id];
  }
  
+/*

+ * Regular int is not a bit field and it must be either
+ * u8/u16/u32/u64.
+ */
+static bool btf_type_int_is_regular(const struct btf_type *t)
+{
+   u16 nr_bits, nr_bytes;
+   u32 int_data;
+
+   int_data = btf_type_int(t);
+   nr_bits = BTF_INT_BITS(int_data);
+   nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+   if (BITS_PER_BYTE_MASKED(nr_bits) ||
+   BTF_INT_OFFSET(int_data) ||
+   (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+   return false;
+   }
+
+   return true;
+}
+
  __printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
  const char *fmt, ...)
  {
@@ -1309,14 +1331,16 @@ static s32 btf_array_check_meta(struct btf_verifier_env 
*env,
return -EINVAL;
}
  
-	/* We are a little forgiving on array->index_type since

-* the kernel is not using it.
-*/
-   /* Array elem cannot be in type void,
-* so !array->type is not allowed.
+   /* Array elem type and index type cannot be in type void,
+* so !array->type and !array->index_type are not allowed.
 */
if (!array->type || BTF_TYPE_PARENT(array->type)) {
-   btf_verifier_log_type(env, t, "Invalid type_id");
+   btf_verifier_log_type(env, t, "Invalid elem");
+   return -EINVAL;
+   }
+
+   if (!array->index_type || BTF_TYPE_PARENT(array->index_type)) {
+   btf_verifier_log_type(env, t, "Invalid index");
return -EINVAL;
}
  
@@ -1329,11 +1353,35 @@ static int btf_array_resolve(struct btf_verifier_env *env,

 const struct resolve_vertex *v)
  {
const struct btf_array *array = btf_type_array(v->t);
-   const struct btf_type *elem_type;
-   u32 elem_type_id = array->type;
+   const struct btf_type *elem_type, *index_type;
+   u32 elem_type_id, index_type_id;
struct btf *btf = env->btf;
u32 elem_size;
  
+	/* Check array->index_type */

+   index_type_id = array->index_type;
+   index_type = btf_type_by_id(btf, index_type_id);
+   if (btf_type_is_void_or_null(index_type)) {
+   btf_verifier_log_type(env, v->t, "Invalid index");
+   return -EINVAL;
+   }
+
+   if (!env_type_is_resolve_sink(env, index_type) &&
+   !env_type_is_resolved(env, index_type_id))
+   return env_stack_push(env, index_type, index_type_id);
+
+   index_type = btf_type_id_size(btf, &index_type_id, NULL);
+   if (!index_type || !btf_type_is_int(index_type) ||
+   /* bit field int is not allowed */
+   !btf_type_int_is_regular(index_type) ||
+   /* unsigned only */
+   BTF_INT_ENCODING(btf_type_int(index_type))) {
+   btf_verifier_log_type(env, v->t, "Invalid index");
+   return -EINVAL;
+   }


Currently, in uapi/linux/btf.h, we have
/* Attributes stored in the BTF_INT_ENCODING */
#define BTF_INT_SIGNED  0x1
#define BTF_INT_CHAR0x2
#define BTF_INT_BOOL0x4
#define BTF_INT_VARARGS 0x8

The BPF_INT_ENCODING value 0 stands for UNSIGNED.
Do we want to explicitly document this in uapi/linux/bpf.h?


+
+   /* Check array->type */
+   elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
if (btf_type_is_void_or_null(elem_type)) {
btf_verifier_log_type(env, v->t,
@@ -1351,22 +1399,9 @@ static int btf_array_resolve(struct btf_verifier_env 
*env,
return -EINVAL;
}
  
-	if (btf_type_is_int(elem_type)) {

-   int int_type_data = btf_type_int(elem_type);
-   u16 nr_bits = BTF_INT_BITS(int_type_data);
-   u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
-
-   /* Put more restriction on array of int.  The int cannot
-* be a bit field and it must be either u8/u16/u32/u64.
-*/
-   if (BITS_PER_BYTE_MASKED(nr_bits) ||
-   BTF_INT_OFFSET(int_type_data) ||
-   (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
-nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
-   btf_verifier_log_type(env, v->t,
-   

Re: [PATCH bpf-next 2/7] bpf: btf: Change how section is supported in btf_header

2018-05-21 Thread Yonghong Song



On 5/18/18 5:16 PM, Martin KaFai Lau wrote:

There are currently unused section descriptions in the btf_header.  Those
sections are here to support future BTF use cases.  For example, the
func section (func_off) is to support function signature (e.g. the BPF
prog function signature).

Instead of spelling out all potential sections up-front in the btf_header,
this patch makes changes to btf_header such that extending it (e.g. adding
a section) is possible later.  The unused ones can be removed for now and
they can be added back later.

This patch:
1. adds a hdr_len to the btf_header.  It will allow adding
sections (and other info like parent_label and parent_name)
later.  The check is similar to the existing bpf_attr.
If a user passes in a longer hdr_len, the kernel
ensures the extra tailing bytes are 0.

2. allows the section order in the BTF object to be
different from its sec_off order in btf_header.

3. each sec_off is followed by a sec_len.  It must not have gap or
overlapping among sections.

The string section is ensured to be at the end due to the 4 bytes
alignment requirement of the type section.

The above changes will allow enough flexibility to
add new sections (and other info) to the btf_header later.

This patch also removes an unnecessary !err check
at the end of btf_parse().

Signed-off-by: Martin KaFai Lau 
---
  include/uapi/linux/btf.h |   8 +-
  kernel/bpf/btf.c | 207 +++
  2 files changed, 158 insertions(+), 57 deletions(-)

diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
index bcb56ee47014..4fa479741a02 100644
--- a/include/uapi/linux/btf.h
+++ b/include/uapi/linux/btf.h
@@ -12,15 +12,11 @@ struct btf_header {
__u16   magic;
__u8version;
__u8flags;
-
-   __u32   parent_label;
-   __u32   parent_name;
+   __u32   hdr_len;
  
  	/* All offsets are in bytes relative to the end of this header */

-   __u32   label_off;  /* offset of label section  */
-   __u32   object_off; /* offset of data object section*/
-   __u32   func_off;   /* offset of function section   */
__u32   type_off;   /* offset of type section   */
+   __u32   type_len;   /* length of type section   */
__u32   str_off;/* offset of string section */
__u32   str_len;/* length of string section */
  };
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ded10ab47b8a..536e5981ad8c 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -12,6 +12,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  
@@ -184,15 +185,13 @@ static DEFINE_IDR(btf_idr);

  static DEFINE_SPINLOCK(btf_idr_lock);
  
  struct btf {

-   union {
-   struct btf_header *hdr;
-   void *data;
-   };
+   void *data;
struct btf_type **types;
u32 *resolved_ids;
u32 *resolved_sizes;
const char *strings;
void *nohdr_data;
+   struct btf_header hdr;
u32 nr_types;
u32 types_size;
u32 data_size;
@@ -227,6 +226,12 @@ enum resolve_mode {
  };
  
  #define MAX_RESOLVE_DEPTH 32

+#define NR_SECS 2


Not sure whether it is necessary to define NR_SECS 2 here or not.
See below.


+
+struct btf_sec_info {
+   u32 off;
+   u32 len;
+};
  
  struct btf_verifier_env {

struct btf *btf;
@@ -418,14 +423,14 @@ static const struct btf_kind_operations 
*btf_type_ops(const struct btf_type *t)
  static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
  {
return !BTF_STR_TBL_ELF_ID(offset) &&
-   BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+   BTF_STR_OFFSET(offset) < btf->hdr.str_len;
  }
  
  static const char *btf_name_by_offset(const struct btf *btf, u32 offset)

  {
if (!BTF_STR_OFFSET(offset))
return "(anon)";
-   else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+   else if (BTF_STR_OFFSET(offset) < btf->hdr.str_len)
return &btf->strings[BTF_STR_OFFSET(offset)];
else
return "(invalid-name-offset)";
@@ -536,7 +541,8 @@ static void btf_verifier_log_member(struct btf_verifier_env 
*env,
__btf_verifier_log(log, "\n");
  }
  
-static void btf_verifier_log_hdr(struct btf_verifier_env *env)

+static void btf_verifier_log_hdr(struct btf_verifier_env *env,
+u32 btf_data_size)
  {
struct bpf_verifier_log *log = &env->log;
const struct btf *btf = env->btf;
@@ -545,19 +551,16 @@ static void btf_verifier_log_hdr(struct btf_verifier_env 
*env)
if (!bpf_verifier_log_needed(log))
return;
  
-	hdr = btf->hdr;

+   hdr = &btf->hdr;
__btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
__btf_verifier_log(log, "version: %u\n", hdr->version);
__btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
-   

Re: [PATCH bpf-next 1/7] bpf: Expose check_uarg_tail_zero()

2018-05-21 Thread Yonghong Song



On 5/18/18 5:16 PM, Martin KaFai Lau wrote:

This patch exposes check_uarg_tail_zero() which will
be reused by a later BTF patch.  Its name is changed to
bpf_check_uarg_tail_zero().

Signed-off-by: Martin KaFai Lau <ka...@fb.com>


Acked-by: Yonghong Song <y...@fb.com>


Re: [PATCH bpf-next v2 7/7] tools/bpftool: add perf subcommand

2018-05-18 Thread Y Song
On Fri, May 18, 2018 at 1:51 PM, Jakub Kicinski
<jakub.kicin...@netronome.com> wrote:
> On Thu, 17 May 2018 22:03:10 -0700, Yonghong Song wrote:
>> The new command "bpftool perf [show | list]" will traverse
>> all processes under /proc, and if any fd is associated
>> with a perf event, it will print out related perf event
>> information. Documentation is also added.
>
> Thanks for the changes, it looks good with some minor nits which can be
> addressed as follow up if there is no other need to respin.  Please
> consider it:
>
> Reviewed-by: Jakub Kicinski <jakub.kicin...@netronome.com>

Most likely will need respin. Will make suggested changes then.

>
>> Below is an example to show the results using bcc commands.
>> Running the following 4 bcc commands:
>>   kprobe: trace.py '__x64_sys_nanosleep'
>>   kretprobe:  trace.py 'r::__x64_sys_nanosleep'
>>   tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
>>   uprobe: trace.py 'p:/home/yhs/a.out:main'
>>
>> The bpftool command line and result:
>>
>>   $ bpftool perf
>>   pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
>>   pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
>>   pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
>>   pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
>>
>>   $ bpftool -j perf
>>   
>> {"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0},
>>  \
>>   
>> {"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
>>  \
>>   
>> {"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
>>  \
>>   
>> {"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}
>
> nit: this is now an array

Sorry, this is probably updated in middle of work. Will make the change in
the next revision.

>
>>   $ bpftool prog
>>   5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
>> loaded_at 2018-05-15T04:46:37-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 4
>>   7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
>> loaded_at 2018-05-15T04:48:32-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 7
>>   8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
>> loaded_at 2018-05-15T04:48:48-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 8
>>   9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
>> loaded_at 2018-05-15T04:49:52-0700  uid 0
>> xlated 200B  not jited  memlock 4096B  map_ids 9
>>
>>   $ ps ax | grep "python ./trace.py"
>>   21711 pts/0T  0:03 python ./trace.py __x64_sys_write
>>   21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
>>   21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
>>   21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
>>   22374 pts/1S+ 0:00 grep --color=auto python ./trace.py
>>
>> Signed-off-by: Yonghong Song <y...@fb.com>
>
>> diff --git a/tools/bpf/bpftool/bash-completion/bpftool 
>> b/tools/bpf/bpftool/bash-completion/bpftool
>> index b301c9b..3680ad4 100644
>> --- a/tools/bpf/bpftool/bash-completion/bpftool
>> +++ b/tools/bpf/bpftool/bash-completion/bpftool
>> @@ -448,6 +448,15 @@ _bpftool()
>>  ;;
>>  esac
>>  ;;
>> +cgroup)
>
> s/cgroup/perf/ :)

A mistake in my side to consolidate different version of code.
I did have "perf" in one of my versions and tested it properly.

>
>> +case $command in
>> +*)
>> +[[ $prev == $object ]] && \
>> +COMPREPLY=( $( compgen -W 'help \
>> +show list' -- "$cur" ) )
>> +;;
>> +esac
>> +;;
>>  esac
>>  } &&
>>  complete -F _bpftool bpftool
>
>> +static int show_proc(const char *fpath, const struct stat *sb,
>> +  int tflag, struct FTW *ftwbuf)
>

Re: [PATCH v4 3/3] bpf: add selftest for lirc_mode2 type program

2018-05-18 Thread Y Song
On Fri, May 18, 2018 at 1:17 PM, Y Song <ys114...@gmail.com> wrote:
> On Fri, May 18, 2018 at 7:07 AM, Sean Young <s...@mess.org> wrote:
>> This is simple test over rc-loopback.
>>
>> Signed-off-by: Sean Young <s...@mess.org>
>
> Acked-by: Yonghong Song <y...@fb.com>

Just one minor thing. You need to add "test_lirc_mode2_user"
in tools/testing/selftests/bpf/.gitignore
so it will not show up when you do "git status".

If the patch needs respin, you can add this in the new revision.
Otherwise, I think a followup patch to fix this should be fine.

>
>> ---
>>  tools/bpf/bpftool/prog.c  |   1 +
>>  tools/include/uapi/linux/bpf.h|  53 -
>>  tools/include/uapi/linux/lirc.h   | 217 ++
>>  tools/lib/bpf/libbpf.c|   1 +
>>  tools/testing/selftests/bpf/Makefile  |   8 +-
>>  tools/testing/selftests/bpf/bpf_helpers.h |   6 +
>>  .../testing/selftests/bpf/test_lirc_mode2.sh  |  28 +++
>>  .../selftests/bpf/test_lirc_mode2_kern.c  |  23 ++
>>  .../selftests/bpf/test_lirc_mode2_user.c  | 154 +
>>  9 files changed, 487 insertions(+), 4 deletions(-)
>>  create mode 100644 tools/include/uapi/linux/lirc.h
>>  create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
>>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
>>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c
>>
>> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
>> index 9bdfdf2d3fbe..07f1ace39a46 100644
>> --- a/tools/bpf/bpftool/prog.c
>> +++ b/tools/bpf/bpftool/prog.c
>> @@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
>> [BPF_PROG_TYPE_SK_MSG]  = "sk_msg",
>> [BPF_PROG_TYPE_RAW_TRACEPOINT]  = "raw_tracepoint",
>> [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
>> +   [BPF_PROG_TYPE_LIRC_MODE2]  = "lirc_mode2",
>>  };
>>
>>  static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
>> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
>> index d94d333a8225..8227832b713e 100644
>> --- a/tools/include/uapi/linux/bpf.h
>> +++ b/tools/include/uapi/linux/bpf.h
>> @@ -141,6 +141,7 @@ enum bpf_prog_type {
>> BPF_PROG_TYPE_SK_MSG,
>> BPF_PROG_TYPE_RAW_TRACEPOINT,
>> BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
>> +   BPF_PROG_TYPE_LIRC_MODE2,
>>  };
>>
>>  enum bpf_attach_type {
>> @@ -158,6 +159,7 @@ enum bpf_attach_type {
>> BPF_CGROUP_INET6_CONNECT,
>> BPF_CGROUP_INET4_POST_BIND,
>> BPF_CGROUP_INET6_POST_BIND,
>> +   BPF_LIRC_MODE2,
>> __MAX_BPF_ATTACH_TYPE
>>  };
>>
>> @@ -1902,6 +1904,53 @@ union bpf_attr {
>>   * egress otherwise). This is the only flag supported for now.
>>   * Return
>>   * **SK_PASS** on success, or **SK_DROP** on error.
>> + *
>> + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
>> + * Description
>> + * This helper is used in programs implementing IR decoding, to
>> + * report a successfully decoded key press with *scancode*,
>> + * *toggle* value in the given *protocol*. The scancode will be
>> + * translated to a keycode using the rc keymap, and reported as
>> + * an input key down event. After a period a key up event is
>> + * generated. This period can be extended by calling either
>> + * **bpf_rc_keydown** () with the same values, or calling
>> + * **bpf_rc_repeat** ().
>> + *
>> + * Some protocols include a toggle bit, in case the button
>> + * was released and pressed again between consecutive scancodes
>> + *
>> + * The *ctx* should point to the lirc sample as passed into
>> + * the program.
>> + *
>> + * The *protocol* is the decoded protocol number (see
>> + * **enum rc_proto** for some predefined values).
>> + *
>> + * This helper is only available if the kernel was compiled with
>> + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
>> + * "**y**".
>> + *
>> + * Return
>> + * 0
>> + *
>> + * int bpf_rc_repeat(void *ctx)
>> + * Description
>> + * This helper is used in programs implem

Re: [PATCH v4 3/3] bpf: add selftest for lirc_mode2 type program

2018-05-18 Thread Y Song
On Fri, May 18, 2018 at 7:07 AM, Sean Young <s...@mess.org> wrote:
> This is simple test over rc-loopback.
>
> Signed-off-by: Sean Young <s...@mess.org>

Acked-by: Yonghong Song <y...@fb.com>

> ---
>  tools/bpf/bpftool/prog.c  |   1 +
>  tools/include/uapi/linux/bpf.h|  53 -
>  tools/include/uapi/linux/lirc.h   | 217 ++
>  tools/lib/bpf/libbpf.c|   1 +
>  tools/testing/selftests/bpf/Makefile  |   8 +-
>  tools/testing/selftests/bpf/bpf_helpers.h |   6 +
>  .../testing/selftests/bpf/test_lirc_mode2.sh  |  28 +++
>  .../selftests/bpf/test_lirc_mode2_kern.c  |  23 ++
>  .../selftests/bpf/test_lirc_mode2_user.c  | 154 +
>  9 files changed, 487 insertions(+), 4 deletions(-)
>  create mode 100644 tools/include/uapi/linux/lirc.h
>  create mode 100755 tools/testing/selftests/bpf/test_lirc_mode2.sh
>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_kern.c
>  create mode 100644 tools/testing/selftests/bpf/test_lirc_mode2_user.c
>
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 9bdfdf2d3fbe..07f1ace39a46 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
> [BPF_PROG_TYPE_SK_MSG]  = "sk_msg",
> [BPF_PROG_TYPE_RAW_TRACEPOINT]  = "raw_tracepoint",
> [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
> +   [BPF_PROG_TYPE_LIRC_MODE2]  = "lirc_mode2",
>  };
>
>  static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index d94d333a8225..8227832b713e 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -141,6 +141,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_SK_MSG,
> BPF_PROG_TYPE_RAW_TRACEPOINT,
> BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
> +   BPF_PROG_TYPE_LIRC_MODE2,
>  };
>
>  enum bpf_attach_type {
> @@ -158,6 +159,7 @@ enum bpf_attach_type {
> BPF_CGROUP_INET6_CONNECT,
> BPF_CGROUP_INET4_POST_BIND,
> BPF_CGROUP_INET6_POST_BIND,
> +   BPF_LIRC_MODE2,
> __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1902,6 +1904,53 @@ union bpf_attr {
>   * egress otherwise). This is the only flag supported for now.
>   * Return
>   * **SK_PASS** on success, or **SK_DROP** on error.
> + *
> + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle)
> + * Description
> + * This helper is used in programs implementing IR decoding, to
> + * report a successfully decoded key press with *scancode*,
> + * *toggle* value in the given *protocol*. The scancode will be
> + * translated to a keycode using the rc keymap, and reported as
> + * an input key down event. After a period a key up event is
> + * generated. This period can be extended by calling either
> + * **bpf_rc_keydown** () with the same values, or calling
> + * **bpf_rc_repeat** ().
> + *
> + * Some protocols include a toggle bit, in case the button
> + * was released and pressed again between consecutive scancodes
> + *
> + * The *ctx* should point to the lirc sample as passed into
> + * the program.
> + *
> + * The *protocol* is the decoded protocol number (see
> + * **enum rc_proto** for some predefined values).
> + *
> + * This helper is only available if the kernel was compiled with
> + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
> + * "**y**".
> + *
> + * Return
> + * 0
> + *
> + * int bpf_rc_repeat(void *ctx)
> + * Description
> + * This helper is used in programs implementing IR decoding, to
> + * report a successfully decoded repeat key message. This delays
> + * the generation of a key up event for previously generated
> + * key down event.
> + *
> + * Some IR protocols like NEC have a special IR message for
> + * repeating last button, for when a button is held down.
> + *
> + * The *ctx* should point to the lirc sample as passed into
> + * the program.
> + *
> + * This helper is only available if the kernel was compiled with
> + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to
> + * "**y**"

Re: [PATCH v4 2/3] media: rc: introduce BPF_PROG_LIRC_MODE2

2018-05-18 Thread Y Song
On Fri, May 18, 2018 at 7:07 AM, Sean Young <s...@mess.org> wrote:
> Add support for BPF_PROG_LIRC_MODE2. This type of BPF program can call
> rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
> that the last key should be repeated.
>
> The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
> the target_fd must be the /dev/lircN device.
>
> Signed-off-by: Sean Young <s...@mess.org>

Acked-by: Yonghong Song <y...@fb.com>

> ---
>  drivers/media/rc/Kconfig|  13 ++
>  drivers/media/rc/Makefile   |   1 +
>  drivers/media/rc/bpf-lirc.c | 308 
>  drivers/media/rc/lirc_dev.c |  30 
>  drivers/media/rc/rc-core-priv.h |  22 +++
>  drivers/media/rc/rc-ir-raw.c|  12 +-
>  include/linux/bpf_rcdev.h   |  30 
>  include/linux/bpf_types.h   |   3 +
>  include/uapi/linux/bpf.h|  53 +-
>  kernel/bpf/syscall.c|   7 +
>  10 files changed, 476 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/media/rc/bpf-lirc.c
>  create mode 100644 include/linux/bpf_rcdev.h
>
> diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
> index eb2c3b6eca7f..d5b35a6ba899 100644
> --- a/drivers/media/rc/Kconfig
> +++ b/drivers/media/rc/Kconfig
> @@ -25,6 +25,19 @@ config LIRC
>passes raw IR to and from userspace, which is needed for
>IR transmitting (aka "blasting") and for the lirc daemon.
>
> +config BPF_LIRC_MODE2
> +   bool "Support for eBPF programs attached to lirc devices"
> +   depends on BPF_SYSCALL
> +   depends on RC_CORE=y
> +   depends on LIRC
> +   help
> +  Allow attaching eBPF programs to a lirc device using the bpf(2)
> +  syscall command BPF_PROG_ATTACH. This is supported for raw IR
> +  receivers.
> +
> +  These eBPF programs can be used to decode IR into scancodes, for
> +  IR protocols not supported by the kernel decoders.
> +
>  menuconfig RC_DECODERS
> bool "Remote controller decoders"
> depends on RC_CORE
> diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
> index 2e1c87066f6c..e0340d043fe8 100644
> --- a/drivers/media/rc/Makefile
> +++ b/drivers/media/rc/Makefile
> @@ -5,6 +5,7 @@ obj-y += keymaps/
>  obj-$(CONFIG_RC_CORE) += rc-core.o
>  rc-core-y := rc-main.o rc-ir-raw.o
>  rc-core-$(CONFIG_LIRC) += lirc_dev.o
> +rc-core-$(CONFIG_BPF_LIRC_MODE2) += bpf-lirc.o
>  obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
>  obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
>  obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
> diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c
> new file mode 100644
> index ..c9673df2d9cd
> --- /dev/null
> +++ b/drivers/media/rc/bpf-lirc.c
> @@ -0,0 +1,308 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// bpf-lirc.c - handles bpf
> +//
> +// Copyright (C) 2018 Sean Young <s...@mess.org>
> +
> +#include 
> +#include 
> +#include 
> +#include "rc-core-priv.h"
> +
> +/*
> + * BPF interface for raw IR
> + */
> +const struct bpf_prog_ops lirc_mode2_prog_ops = {
> +};
> +
> +BPF_CALL_1(bpf_rc_repeat, u32*, sample)
> +{
> +   struct ir_raw_event_ctrl *ctrl;
> +
> +   ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
> +
> +   rc_repeat(ctrl->dev);
> +
> +   return 0;
> +}
> +
> +static const struct bpf_func_proto rc_repeat_proto = {
> +   .func  = bpf_rc_repeat,
> +   .gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
> +   .ret_type  = RET_INTEGER,
> +   .arg1_type = ARG_PTR_TO_CTX,
> +};
> +
> +/*
> + * Currently rc-core does not support 64-bit scancodes, but there are many
> + * known protocols with more than 32 bits. So, define the interface as u64
> + * as a future-proof.
> + */
> +BPF_CALL_4(bpf_rc_keydown, u32*, sample, u32, protocol, u64, scancode,
> +  u32, toggle)
> +{
> +   struct ir_raw_event_ctrl *ctrl;
> +
> +   ctrl = container_of(sample, struct ir_raw_event_ctrl, bpf_sample);
> +
> +   rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
> +
> +   return 0;
> +}
> +
> +static const struct bpf_func_proto rc_keydown_proto = {
> +   .func  = bpf_rc_keydown,
> +   .gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
> +   .ret_type  = RET_INTEGER,
> +   .arg1_type = ARG_PTR_TO_CTX,
> +   .arg2_type = ARG_ANYTHING,
> +   .arg3_type = ARG_ANYTHING,
> +   .arg4_type = ARG_ANYTHING,
> +};
> +
> +static const struct bpf_func_p

Re: [PATCH v4 1/3] bpf: bpf_prog_array_copy() should return -ENOENT if exclude_prog not found

2018-05-18 Thread Y Song
On Fri, May 18, 2018 at 7:07 AM, Sean Young <s...@mess.org> wrote:
> This makes is it possible for bpf prog detach to return -ENOENT.
>
> Signed-off-by: Sean Young <s...@mess.org>

Acked-by: Yonghong Song <y...@fb.com>


Re: [PATCH bpf-next v2 1/7] perf/core: add perf_get_event() to return perf_event given a struct file

2018-05-18 Thread Yonghong Song



On 5/18/18 12:18 AM, Peter Zijlstra wrote:

On Thu, May 17, 2018 at 10:32:53PM -0700, Yonghong Song wrote:

A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.


Can't you do a narrower interface? Like return the prog. I'm not too
keen on random !perf code frobbing around inside the event.


Hi, Peter,

My initial implementation (not upstreamed) actually have the whole
function bpf_get_perf_event_info() in the events/core.c. In that
case, the "struct file *" pointer is passed. This way, the event pointer
does not need to go to kernel/bpf/syscall.c or kernel/trace/bpf_trace.c.

I dropped this mechanism since it added more codes in the events/core.c
file, and I felt that such query code might clutter events/core.c.
The function bpf_get_perf_event_info() is now placed in 
kernel/trace/bpf_trace.c.


Just getting bpf prog pointer is not enough as it does not provide
enough attachment information. Getting such information requires
poking into event/tp_event etc.

Currently we have this extern function exposed by events/core.c:
   extern struct perf_event *perf_get_event(struct file *file);
We could make the result value "const" like
   extern const struct perf_event *perf_get_event(struct file *file);
This will make it clear that we do not change "event" fields, and
merely poking at it.

Please let me know your preference.

Thanks!
Yonghong


[PATCH bpf-next v2 1/7] perf/core: add perf_get_event() to return perf_event given a struct file

2018-05-17 Thread Yonghong Song
A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/perf_event.h | 5 +
 kernel/events/core.c   | 8 
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..b5c1ad3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct 
task_struct *child)   { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
 static inline struct file *perf_event_get(unsigned int fd) { return 
ERR_PTR(-EINVAL); }
+static inline struct perf_event *perf_get_event(struct file *file)
+{
+   return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
 {
return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..1e3cddb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
 }
 
+struct perf_event *perf_get_event(struct file *file)
+{
+   if (file->f_op != &perf_fops)
+   return ERR_PTR(-EINVAL);
+
+   return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
if (!event)
-- 
2.9.5



Re: [PATCH v3 1/2] media: rc: introduce BPF_PROG_RAWIR_EVENT

2018-05-17 Thread Y Song
On Thu, May 17, 2018 at 2:45 PM, Sean Young <s...@mess.org> wrote:
> Hi,
>
> Again thanks for a thoughtful review. This will definitely will improve
> the code.
>
> On Thu, May 17, 2018 at 10:02:52AM -0700, Y Song wrote:
>> On Wed, May 16, 2018 at 2:04 PM, Sean Young <s...@mess.org> wrote:
>> > Add support for BPF_PROG_RAWIR_EVENT. This type of BPF program can call
>> > rc_keydown() to reported decoded IR scancodes, or rc_repeat() to report
>> > that the last key should be repeated.
>> >
>> > The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
>> > the target_fd must be the /dev/lircN device.
>> >
>> > Signed-off-by: Sean Young <s...@mess.org>
>> > ---
>> >  drivers/media/rc/Kconfig   |  13 ++
>> >  drivers/media/rc/Makefile  |   1 +
>> >  drivers/media/rc/bpf-rawir-event.c | 363 +
>> >  drivers/media/rc/lirc_dev.c|  24 ++
>> >  drivers/media/rc/rc-core-priv.h|  24 ++
>> >  drivers/media/rc/rc-ir-raw.c   |  14 +-
>> >  include/linux/bpf_rcdev.h  |  30 +++
>> >  include/linux/bpf_types.h  |   3 +
>> >  include/uapi/linux/bpf.h   |  55 -
>> >  kernel/bpf/syscall.c   |   7 +
>> >  10 files changed, 531 insertions(+), 3 deletions(-)
>> >  create mode 100644 drivers/media/rc/bpf-rawir-event.c
>> >  create mode 100644 include/linux/bpf_rcdev.h
>> >
>> > diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
>> > index eb2c3b6eca7f..2172d65b0213 100644
>> > --- a/drivers/media/rc/Kconfig
>> > +++ b/drivers/media/rc/Kconfig
>> > @@ -25,6 +25,19 @@ config LIRC
>> >passes raw IR to and from userspace, which is needed for
>> >IR transmitting (aka "blasting") and for the lirc daemon.
>> >
>> > +config BPF_RAWIR_EVENT
>> > +   bool "Support for eBPF programs attached to lirc devices"
>> > +   depends on BPF_SYSCALL
>> > +   depends on RC_CORE=y
>> > +   depends on LIRC
>> > +   help
>> > +  Allow attaching eBPF programs to a lirc device using the bpf(2)
>> > +  syscall command BPF_PROG_ATTACH. This is supported for raw IR
>> > +  receivers.
>> > +
>> > +  These eBPF programs can be used to decode IR into scancodes, for
>> > +  IR protocols not supported by the kernel decoders.
>> > +
>> >  menuconfig RC_DECODERS
>> > bool "Remote controller decoders"
>> > depends on RC_CORE
>> > diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
>> > index 2e1c87066f6c..74907823bef8 100644
>> > --- a/drivers/media/rc/Makefile
>> > +++ b/drivers/media/rc/Makefile
>> > @@ -5,6 +5,7 @@ obj-y += keymaps/
>> >  obj-$(CONFIG_RC_CORE) += rc-core.o
>> >  rc-core-y := rc-main.o rc-ir-raw.o
>> >  rc-core-$(CONFIG_LIRC) += lirc_dev.o
>> > +rc-core-$(CONFIG_BPF_RAWIR_EVENT) += bpf-rawir-event.o
>> >  obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
>> >  obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
>> >  obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
>> > diff --git a/drivers/media/rc/bpf-rawir-event.c 
>> > b/drivers/media/rc/bpf-rawir-event.c
>> > new file mode 100644
>> > index ..7cb48b8d87b5
>> > --- /dev/null
>> > +++ b/drivers/media/rc/bpf-rawir-event.c
>> > @@ -0,0 +1,363 @@
>> > +// SPDX-License-Identifier: GPL-2.0
>> > +// bpf-rawir-event.c - handles bpf
>> > +//
>> > +// Copyright (C) 2018 Sean Young <s...@mess.org>
>> > +
>> > +#include 
>> > +#include 
>> > +#include 
>> > +#include "rc-core-priv.h"
>> > +
>> > +/*
>> > + * BPF interface for raw IR
>> > + */
>> > +const struct bpf_prog_ops rawir_event_prog_ops = {
>> > +};
>> > +
>> > +BPF_CALL_1(bpf_rc_repeat, struct bpf_rawir_event*, event)
>> > +{
>> > +   struct ir_raw_event_ctrl *ctrl;
>> > +
>> > +   ctrl = container_of(event, struct ir_raw_event_ctrl, 
>> > bpf_rawir_event);
>> > +
>> > +   rc_repeat(ctrl->dev);
>> > +
>> > +   return 0;
>> > +}
>> > +
>> > +static const struct bpf_func_proto rc_repeat_proto = {
>> > +   .func  = bpf_rc_repeat,
>> 

[PATCH bpf-next v2 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY

2018-05-17 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/trace_events.h |  15 ++
 include/uapi/linux/bpf.h |  27 ++
 kernel/bpf/syscall.c | 124 +++
 kernel/trace/bpf_trace.c |  48 +
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 
 6 files changed, 265 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..bd08e11 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(struct perf_event *event, u32 *prog_id,
+   u32 *attach_info, const char **buf,
+   u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
 {
@@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
 {
return NULL;
 }
+static inline int bpf_get_perf_event_info(struct file *file, u32 *prog_id,
+ u32 *attach_info, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +569,16 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(struct perf_event *event, u32 *attach_info,
+  const char **symbol, u64 *probe_offset,
+  u64 *probe_addr, bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(struct perf_event *event, u32 *attach_info,
+  const char **filename, u64 *probe_offset,
+  bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333..6a22ad4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prod_id */
+   __u32

[PATCH bpf-next v2 4/7] tools/bpf: add ksym_get_addr() in trace_helpers

2018-05-17 Thread Yonghong Song
Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c 
b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
	return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+   int i;
+
+   for (i = 0; i < sym_cnt; i++) {
+   if (strcmp(syms[i].name, name) == 0)
+   return syms[i].addr;
+   }
+
+   return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h 
b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5



[PATCH bpf-next v2 0/7] bpf: implement BPF_TASK_FD_QUERY

2018-05-17 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Changelogs:
  v1 -> v2:
 . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
   to BPF_TASK_FD_QUERY.
 . fixed various "bpftool perf" issues and added documentation
   and auto-completion.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
file
  bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in
libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
  tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h   |   5 +
 include/linux/trace_events.h |  15 +
 include/uapi/linux/bpf.h |  27 ++
 kernel/bpf/syscall.c | 124 
 kernel/events/core.c |   8 +
 kernel/trace/bpf_trace.c |  48 +++
 kernel/trace/trace_kprobe.c  |  29 ++
 kernel/trace/trace_uprobe.c  |  22 ++
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 379 +++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 200 
 tools/include/uapi/linux/bpf.h   |  27 ++
 tools/lib/bpf/bpf.c  |  24 ++
 tools/lib/bpf/bpf.h  |   3 +
 tools/testing/selftests/bpf/test_progs.c | 133 
 tools/testing/selftests/bpf/trace_helpers.c  |  12 +
 tools/testing/selftests/bpf/trace_helpers.h  |   1 +
 23 files changed, 1177 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

-- 
2.9.5



[PATCH bpf-next v2 7/7] tools/bpftool: add perf subcommand

2018-05-17 Thread Yonghong Song
The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe: trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe: trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  
{"pid":21711,"fd":5,"prog_id":5,"attach_info":"kprobe","func":"__x64_sys_write","offset":0},
 \
  
{"pid":21765,"fd":5,"prog_id":7,"attach_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
 \
  
{"pid":21767,"fd":5,"prog_id":8,"attach_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
 \
  
{"pid":21800,"fd":5,"prog_id":9,"attach_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
  loaded_at 2018-05-15T04:46:37-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
  loaded_at 2018-05-15T04:48:32-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
  loaded_at 2018-05-15T04:48:48-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
  loaded_at 2018-05-15T04:49:52-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0T  0:03 python ./trace.py __x64_sys_write
  21765 pts/0S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1S+ 0:00 grep --color=auto python ./trace.py

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +
 tools/bpf/bpftool/Documentation/bpftool.rst  |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool|   9 +
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 200 +++
 6 files changed, 297 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst 
b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 000..3e65375
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+
+bpftool-perf
+
+---
+tool for inspection of perf related bpf prog attachments
+---
+
+:Manual section: 8
+
+SYNOPSIS
+
+
+   **bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+   *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+   *COMMANDS* :=
+   { **show** | **list** | **help** }
+
+PERF COMMANDS
+=
+
+|  **bpftool** **perf { show | list }**
+|  **bpftool** **perf help**
+
+DESCRIPTION
+===
+   **bpftool perf { show | list }**
+ List all raw_tracepoint, tracepoint, kprobe attachment in the 
system.
+
+ Output will start with process id and file descriptor in that 
process,
+ followed by bpf program id, attachment information, and 
attachment point.
+ The attachment point for raw_tracepoint/tracepoint is the 
trace probe name.
+ The attachment point for k[ret]probe is either symbol name 
and offset,
+ or a kernel virtual address.
+ The attachment point for u[ret]probe is the file name and the 
file offset.
+
+   **bpftool perf help**
+ Print short help message.
+
+OPTIONS
+===
+   -h, --help
+ Print short generic h

[PATCH bpf-next v2 3/7] tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in libbpf

2018-05-17 Thread Yonghong Song
Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_trace_event_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/include/uapi/linux/bpf.h | 27 +++
 tools/lib/bpf/bpf.c| 24 
 tools/lib/bpf/bpf.h|  3 +++
 3 files changed, 54 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d94d333..6a22ad4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prod_id */
+   __u32   attach_info;/* output: BPF_ATTACH_* */
+   __u64   probe_offset;   /* output: probe_offset */
+   __u64   probe_addr; /* output: probe_addr */
+   } task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2450,4 +2467,14 @@ struct bpf_fib_lookup {
__u8dmac[6]; /* ETH_ALEN */
 };
 
+/* used by <task, fd> based query */
+enum {
+   BPF_ATTACH_RAW_TRACEPOINT,  /* tp name */
+   BPF_ATTACH_TRACEPOINT,  /* tp name */
+   BPF_ATTACH_KPROBE,  /* (symbol + offset) or addr */
+   BPF_ATTACH_KRETPROBE,   /* (symbol + offset) or addr */
+   BPF_ATTACH_UPROBE,  /* filename + offset */
+   BPF_ATTACH_URETPROBE,   /* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 6a8a000..da3f336 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,27 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, 
__u32 log_buf_size,
 
return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 buf_len,
+ __u32 *prog_id, __u32 *attach_info,
+ __u64 *probe_offset, __u64 *probe_addr)
+{
+   union bpf_attr attr = {};
+   int err;
+
+   attr.task_fd_query.pid = pid;
+   attr.task_fd_query.fd = fd;
+   attr.task_fd_query.flags = flags;
+   attr.task_fd_query.buf = ptr_to_u64(buf);
+   attr.task_fd_query.buf_len = buf_len;
+
+   err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+   if (!err) {
+   *prog_id = attr.task_fd_query.prog_id;
+   *attach_info = attr.task_fd_query.attach_info;
+   *probe_offset = attr.task_fd_query.probe_offset;
+   *probe_addr = attr.task_fd_query.probe_addr;
+   }
+
+   return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 15bff77..9adfde6 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type 
type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 buf_len,
+ __u32 *prog_id, __u32 *prog_info,
+ __u64 *probe_offset, __u64 *probe_addr);
 #endif
-- 
2.9.5



[PATCH bpf-next v2 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY

2018-05-17 Thread Yonghong Song
This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 samples/bpf/Makefile |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 379 +++
 3 files changed, 402 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES += $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4  += -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+   return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 000..792ef24
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+   int __ret = !!(condition);  \
+   if (__ret) {\
+   printf("FAIL: %s:\n", __func__);\
+   perror(""); \
+   return -1;  \
+   }   \
+})
+
+#define CHECK_AND_RET(condition) ({\
+   int __ret = !!(condition);  \
+   if (__ret)  \
+   return -1;  \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+   return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   errno = 0;
+   ret = (int)strtol(buf, NULL, 10);
+   CHECK_PERROR_RET(errno);
+   return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+   CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+   

[PATCH bpf-next v2 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs

2018-05-17 Thread Yonghong Song
The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queried as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 133 +++
 1 file changed, 133 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..f7ede03 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,137 @@ static void test_get_stack_raw_tp(void)
bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+   const char *file = "./test_get_stack_rawtp.o";
+   struct perf_event_attr attr = {};
+   __u64 probe_offset, probe_addr;
+   int efd, err, prog_fd, pmu_fd;
+   __u32 prog_id, attach_info;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+   if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+   return;
+
+   efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+   if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.type = PERF_TYPE_SOFTWARE;
+   attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+   pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+   if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+ errno))
+   goto close_prog;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+   if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+
+   /* query (getpid(), efd */
+   err = bpf_task_fd_query(getpid(), efd, 0, buf, 256, &prog_id,
+   &attach_info, &probe_offset, &probe_addr);
+   if (CHECK(err < 0, "bpf_trace_event_query", "err %d errno %d\n", err,
+ errno))
+   goto close_prog;
+
+   err = (attach_info == BPF_ATTACH_RAW_TRACEPOINT) &&
+ (strcmp(buf, "sys_enter") == 0);
+   if (CHECK(!err, "check_results", "attach_info %d tp_name %s\n",
+ attach_info, buf))
+   goto close_prog;
+
+   goto close_prog_noerr;
+close_prog:
+   error_cnt++;
+close_prog_noerr:
+   bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+  const char *tp_name)
+{
+   const char *file = "./test_tracepoint.o";
+   int err, bytes, efd, prog_fd, pmu_fd;
+   struct perf_event_attr attr = {};
+   __u64 probe_offset, probe_addr;
+   __u32 prog_id, attach_info;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+   if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+   goto close_prog;
+
+   snprintf(buf, sizeof(buf),
+"/sys/kernel/debug/tracing/events/%s/id", probe_name);
+   efd = open(buf, O_RDONLY, 0);
+   if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+   bytes = read(efd, buf, sizeof(buf));
+   close(efd);
+   if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+ "bytes %d errno %d\n", bytes, errno))
+   goto close_prog;
+
+   attr.config = strtol(buf, NULL, 0);
+   attr.type = PERF_TYPE_TRACEPOINT;
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.sample_period = 1;
+   attr.wakeup_events = 1;
+   pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+0 /* cpu 0 */, -1 /* group id */,
+0 /* flags */);
+   if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+   goto close_pmu;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+   if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+ errno))
+   goto close_pmu;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+   if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+ errno))
+   goto close_pmu;
+
+   /* query (getpid(), pmu_fd */
+   err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, 256, _id,
+

Re: [PATCH bpf-next 2/7] bpf: introduce bpf subcommand BPF_PERF_EVENT_QUERY

2018-05-17 Thread Yonghong Song



On 5/17/18 8:32 AM, Daniel Borkmann wrote:

On 05/16/2018 11:59 PM, Yonghong Song wrote:

On 5/16/18 4:27 AM, Peter Zijlstra wrote:

On Tue, May 15, 2018 at 04:45:16PM -0700, Yonghong Song wrote:

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_PERF_EVENT_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_PERF_EVENT_QUERY will return
     . prog_id
     . tracepoint name, or
     . k[ret]probe funcname + offset or kernel addr, or
     . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
   include/linux/trace_events.h |  15 ++
   include/uapi/linux/bpf.h |  25 ++
   kernel/bpf/syscall.c | 113 
+++
   kernel/trace/bpf_trace.c |  53 
   kernel/trace/trace_kprobe.c  |  29 +++
   kernel/trace/trace_uprobe.c  |  22 +
   6 files changed, 257 insertions(+)


Why is the command called *_PERF_EVENT_* ? Are there not a lot of !perf
places to attach BPF proglets?


Just gave a complete picture, the below are major places to attach
BPF programs:
    . perf based (through perf ioctl)
    . raw tracepoint based (through bpf interface)

    . netlink interface for tc, xdp, tunneling
    . setsockopt for socket filters
    . cgroup based (bpf attachment subcommand)
  mostly networking and io devices
    . some other networking socket related (sk_skb stream/parser/verdict,
  sk_msg verdict) through bpf attachment subcommand.

Currently, for cgroup based attachment, we have BPF_PROG_QUERY with input 
cgroup file descriptor. For other networking based queries, we
may need to enumerate tc filters, networking devices, open sockets, etc.
to get the attachment information.

So to have one BPF_QUERY command line may be too complex to
cover all cases.

But you are right that BPF_PERF_EVENT_QUERY name is too narrow since
it should be used for other (pid, fd) based queries as well (e.g., socket, or 
other potential uses in the future).

How about the subcommand name BPF_TASK_FD_QUERY and make bpf_attr.task_fd_query 
extensible?


I like the introspection output it provides in 7/7, it's really great!
So the query interface would only ever be tied to BPF progs whose attach
life time is tied to the life time of the application and as soon as all
refs on the fd are released it's unloaded from the system. BPF_TASK_FD_QUERY
seems okay to me, or something like BPF_ATTACH_QUERY. Even if the name is
slightly more generic, it might be more fitting with other cmds like
BPF_PROG_QUERY we have where we tell an attach point to retrieve all progs
from it (though only tied to cgroups right now, it may not be in future).


I think BPF_TASK_FD_QUERY is okay. Using BPF_ATTACH_QUERY indeed seems
a little bit broader to me as other query subcommands are possible to
query attachments with different input.

BPF_PROG_QUERY is also trying to query attachment. Currently, given a 
cgroup fd, it will query prog array attached. Sean has the patch to 
attach bpf programs to a RC device, and given a device fd, it will

query prog array attached to that device.



For all the others that are not strictly tied to the task but global, bpftool
would then need to be extended to query the various other interfaces like
netlink for retrieval which is on todo for some point in future as well. So
this set nicely complements this introspection aspect.


Totally agree.
Thanks!



Thanks,
Daniel



Re: [PATCH v3 2/2] bpf: add selftest for rawir_event type program

2018-05-17 Thread Y Song
On Wed, May 16, 2018 at 2:04 PM, Sean Young  wrote:
> This is simple test over rc-loopback.
>
> Signed-off-by: Sean Young 
> ---
>  tools/bpf/bpftool/prog.c  |   1 +
>  tools/include/uapi/linux/bpf.h|  57 +++-
>  tools/lib/bpf/libbpf.c|   1 +
>  tools/testing/selftests/bpf/Makefile  |   8 +-
>  tools/testing/selftests/bpf/bpf_helpers.h |   6 +
>  tools/testing/selftests/bpf/test_rawir.sh |  37 +
>  .../selftests/bpf/test_rawir_event_kern.c |  26 
>  .../selftests/bpf/test_rawir_event_user.c | 130 ++
>  8 files changed, 261 insertions(+), 5 deletions(-)
>  create mode 100755 tools/testing/selftests/bpf/test_rawir.sh
>  create mode 100644 tools/testing/selftests/bpf/test_rawir_event_kern.c
>  create mode 100644 tools/testing/selftests/bpf/test_rawir_event_user.c

Could you copy include/uapi/linux/lirc.h file to tools directory as well.
Otherwise, I will get the following compilation error:

gcc -Wall -O2 -I../../../include/uapi -I../../../lib
-I../../../lib/bpf -I../../../../include/generated  -I../../../include
   test_rawir_event_user.c
/home/yhs/work/bpf-next/tools/testing/selftests/bpf/libbpf.a -lcap
-lelf -lrt -lpthread -o
/home/yhs/work/bpf-next/tools/testing/selftests/bpf/test_rawir_event_user
test_rawir_event_user.c: In function ‘main’:
test_rawir_event_user.c:60:15: error: ‘LIRC_MODE_SCANCODE’ undeclared
(first use in this function); did you mean ‘LIRC_MODE_LIRCCODE’?
mode = LIRC_MODE_SCANCODE;
   ^~
   LIRC_MODE_LIRCCODE
test_rawir_event_user.c:60:15: note: each undeclared identifier is
reported only once for each function it appears in
test_rawir_event_user.c:93:29: error: storage size of ‘lsc’ isn’t known
struct lirc_scancode lsc;
 ^~~
test_rawir_event_user.c:93:29: warning: unused variable ‘lsc’
[-Wunused-variable]

>
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 9bdfdf2d3fbe..8889a4ee8577 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -71,6 +71,7 @@ static const char * const prog_type_name[] = {
> [BPF_PROG_TYPE_SK_MSG]  = "sk_msg",
> [BPF_PROG_TYPE_RAW_TRACEPOINT]  = "raw_tracepoint",
> [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
> +   [BPF_PROG_TYPE_RAWIR_EVENT] = "rawir_event",
>  };
>
>  static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index 1205d86a7a29..243e141e8a5b 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -141,6 +141,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_SK_MSG,
> BPF_PROG_TYPE_RAW_TRACEPOINT,
> BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
> +   BPF_PROG_TYPE_RAWIR_EVENT,
>  };
>
>  enum bpf_attach_type {
> @@ -158,6 +159,7 @@ enum bpf_attach_type {
> BPF_CGROUP_INET6_CONNECT,
> BPF_CGROUP_INET4_POST_BIND,
> BPF_CGROUP_INET6_POST_BIND,
> +   BPF_RAWIR_EVENT,
> __MAX_BPF_ATTACH_TYPE
>  };
>
> @@ -1829,7 +1831,6 @@ union bpf_attr {
>   * Return
>   * 0 on success, or a negative error in case of failure.
>   *
> - *
>   * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, 
> u32 flags)
>   * Description
>   * Do FIB lookup in kernel tables using parameters in *params*.
> @@ -1856,6 +1857,7 @@ union bpf_attr {
>   * Egress device index on success, 0 if packet needs to continue
>   * up the stack for further processing or a negative error in 
> case
>   * of failure.
> + *
>   * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map 
> *map, void *key, u64 flags)
>   * Description
>   * Add an entry to, or update a sockhash *map* referencing 
> sockets.
> @@ -1902,6 +1904,35 @@ union bpf_attr {
>   * egress otherwise). This is the only flag supported for now.
>   * Return
>   * **SK_PASS** on success, or **SK_DROP** on error.
> + *
> + * int bpf_rc_keydown(void *ctx, u32 protocol, u32 scancode, u32 toggle)
> + * Description
> + * Report decoded scancode with toggle value. For use in
> + * BPF_PROG_TYPE_RAWIR_EVENT, to report a successfully
> + * decoded scancode. This will generate a keydown event,
> + * and a keyup event once the scancode is no longer repeated.
> + *
> + * *ctx* pointer to bpf_rawir_event, *protocol* is decoded
> + * protocol (see RC_PROTO_* enum).
> + *
> + * Some protocols include a toggle bit, in case the button
> + * was released and pressed again between consecutive scancodes,
> + * copy this bit into *toggle* if it exists, else set to 0.
> + *
> + * Return
> + *  

Re: [PATCH v3 1/2] media: rc: introduce BPF_PROG_RAWIR_EVENT

2018-05-17 Thread Y Song
On Wed, May 16, 2018 at 2:04 PM, Sean Young  wrote:
> Add support for BPF_PROG_RAWIR_EVENT. This type of BPF program can call
> rc_keydown() to report decoded IR scancodes, or rc_repeat() to report
> that the last key should be repeated.
>
> The bpf program can be attached to using the bpf(BPF_PROG_ATTACH) syscall;
> the target_fd must be the /dev/lircN device.
>
> Signed-off-by: Sean Young 
> ---
>  drivers/media/rc/Kconfig   |  13 ++
>  drivers/media/rc/Makefile  |   1 +
>  drivers/media/rc/bpf-rawir-event.c | 363 +
>  drivers/media/rc/lirc_dev.c|  24 ++
>  drivers/media/rc/rc-core-priv.h|  24 ++
>  drivers/media/rc/rc-ir-raw.c   |  14 +-
>  include/linux/bpf_rcdev.h  |  30 +++
>  include/linux/bpf_types.h  |   3 +
>  include/uapi/linux/bpf.h   |  55 -
>  kernel/bpf/syscall.c   |   7 +
>  10 files changed, 531 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/media/rc/bpf-rawir-event.c
>  create mode 100644 include/linux/bpf_rcdev.h
>
> diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig
> index eb2c3b6eca7f..2172d65b0213 100644
> --- a/drivers/media/rc/Kconfig
> +++ b/drivers/media/rc/Kconfig
> @@ -25,6 +25,19 @@ config LIRC
>passes raw IR to and from userspace, which is needed for
>IR transmitting (aka "blasting") and for the lirc daemon.
>
> +config BPF_RAWIR_EVENT
> +   bool "Support for eBPF programs attached to lirc devices"
> +   depends on BPF_SYSCALL
> +   depends on RC_CORE=y
> +   depends on LIRC
> +   help
> +  Allow attaching eBPF programs to a lirc device using the bpf(2)
> +  syscall command BPF_PROG_ATTACH. This is supported for raw IR
> +  receivers.
> +
> +  These eBPF programs can be used to decode IR into scancodes, for
> +  IR protocols not supported by the kernel decoders.
> +
>  menuconfig RC_DECODERS
> bool "Remote controller decoders"
> depends on RC_CORE
> diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile
> index 2e1c87066f6c..74907823bef8 100644
> --- a/drivers/media/rc/Makefile
> +++ b/drivers/media/rc/Makefile
> @@ -5,6 +5,7 @@ obj-y += keymaps/
>  obj-$(CONFIG_RC_CORE) += rc-core.o
>  rc-core-y := rc-main.o rc-ir-raw.o
>  rc-core-$(CONFIG_LIRC) += lirc_dev.o
> +rc-core-$(CONFIG_BPF_RAWIR_EVENT) += bpf-rawir-event.o
>  obj-$(CONFIG_IR_NEC_DECODER) += ir-nec-decoder.o
>  obj-$(CONFIG_IR_RC5_DECODER) += ir-rc5-decoder.o
>  obj-$(CONFIG_IR_RC6_DECODER) += ir-rc6-decoder.o
> diff --git a/drivers/media/rc/bpf-rawir-event.c 
> b/drivers/media/rc/bpf-rawir-event.c
> new file mode 100644
> index ..7cb48b8d87b5
> --- /dev/null
> +++ b/drivers/media/rc/bpf-rawir-event.c
> @@ -0,0 +1,363 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// bpf-rawir-event.c - handles bpf
> +//
> +// Copyright (C) 2018 Sean Young 
> +
> +#include 
> +#include 
> +#include 
> +#include "rc-core-priv.h"
> +
> +/*
> + * BPF interface for raw IR
> + */
> +const struct bpf_prog_ops rawir_event_prog_ops = {
> +};
> +
> +BPF_CALL_1(bpf_rc_repeat, struct bpf_rawir_event*, event)
> +{
> +   struct ir_raw_event_ctrl *ctrl;
> +
> +   ctrl = container_of(event, struct ir_raw_event_ctrl, bpf_rawir_event);
> +
> +   rc_repeat(ctrl->dev);
> +
> +   return 0;
> +}
> +
> +static const struct bpf_func_proto rc_repeat_proto = {
> +   .func  = bpf_rc_repeat,
> +   .gpl_only  = true, /* rc_repeat is EXPORT_SYMBOL_GPL */
> +   .ret_type  = RET_INTEGER,
> +   .arg1_type = ARG_PTR_TO_CTX,
> +};
> +
> +BPF_CALL_4(bpf_rc_keydown, struct bpf_rawir_event*, event, u32, protocol,
> +  u32, scancode, u32, toggle)
> +{
> +   struct ir_raw_event_ctrl *ctrl;
> +
> +   ctrl = container_of(event, struct ir_raw_event_ctrl, bpf_rawir_event);
> +
> +   rc_keydown(ctrl->dev, protocol, scancode, toggle != 0);
> +
> +   return 0;
> +}
> +
> +static const struct bpf_func_proto rc_keydown_proto = {
> +   .func  = bpf_rc_keydown,
> +   .gpl_only  = true, /* rc_keydown is EXPORT_SYMBOL_GPL */
> +   .ret_type  = RET_INTEGER,
> +   .arg1_type = ARG_PTR_TO_CTX,
> +   .arg2_type = ARG_ANYTHING,
> +   .arg3_type = ARG_ANYTHING,
> +   .arg4_type = ARG_ANYTHING,
> +};
> +
> +static const struct bpf_func_proto *
> +rawir_event_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> +{
> +   switch (func_id) {
> +   case BPF_FUNC_rc_repeat:
> +   return &rc_repeat_proto;
> +   case BPF_FUNC_rc_keydown:
> +   return &rc_keydown_proto;
> +   case BPF_FUNC_map_lookup_elem:
> +   return &bpf_map_lookup_elem_proto;
> +   case BPF_FUNC_map_update_elem:
> +   return &bpf_map_update_elem_proto;
> +   case BPF_FUNC_map_delete_elem:
> +   return &bpf_map_delete_elem_proto;
> +   case BPF_FUNC_ktime_get_ns:
> 

Re: [PATCH bpf-next 2/7] bpf: introduce bpf subcommand BPF_PERF_EVENT_QUERY

2018-05-16 Thread Yonghong Song



On 5/16/18 4:27 AM, Peter Zijlstra wrote:

On Tue, May 15, 2018 at 04:45:16PM -0700, Yonghong Song wrote:

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_PERF_EVENT_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_PERF_EVENT_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
  include/linux/trace_events.h |  15 ++
  include/uapi/linux/bpf.h |  25 ++
  kernel/bpf/syscall.c | 113 +++
  kernel/trace/bpf_trace.c |  53 
  kernel/trace/trace_kprobe.c  |  29 +++
  kernel/trace/trace_uprobe.c  |  22 +
  6 files changed, 257 insertions(+)


Why is the command called *_PERF_EVENT_* ? Are there not a lot of !perf
places to attach BPF proglets?


Just gave a complete picture, the below are major places to attach
BPF programs:
   . perf based (through perf ioctl)
   . raw tracepoint based (through bpf interface)

   . netlink interface for tc, xdp, tunneling
   . setsockopt for socket filters
   . cgroup based (bpf attachment subcommand)
 mostly networking and io devices
   . some other networking socket related (sk_skb stream/parser/verdict,
 sk_msg verdict) through bpf attachment subcommand.

Currently, for cgroup based attachment, we have BPF_PROG_QUERY with 
input cgroup file descriptor. For other networking based queries, we

may need to enumerate tc filters, networking devices, open sockets, etc.
to get the attachment information.

So to have one BPF_QUERY command line may be too complex to
cover all cases.

But you are right that BPF_PERF_EVENT_QUERY name is too narrow since
it should be used for other (pid, fd) based queries as well (e.g., 
socket, or other potential uses in the future).


How about the subcommand name BPF_TASK_FD_QUERY and make 
bpf_attr.task_fd_query extensible?


Thanks!


[PATCH bpf-next] bpf: fix sock hashmap kmalloc warning

2018-05-16 Thread Yonghong Song
syzbot reported a kernel warning below:
  WARNING: CPU: 0 PID: 4499 at mm/slab_common.c:996 kmalloc_slab+0x56/0x70 
mm/slab_common.c:996
  Kernel panic - not syncing: panic_on_warn set ...

  CPU: 0 PID: 4499 Comm: syz-executor050 Not tainted 4.17.0-rc3+ #9
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
Google 01/01/2011
  Call Trace:
   __dump_stack lib/dump_stack.c:77 [inline]
   dump_stack+0x1b9/0x294 lib/dump_stack.c:113
   panic+0x22f/0x4de kernel/panic.c:184
   __warn.cold.8+0x163/0x1b3 kernel/panic.c:536
   report_bug+0x252/0x2d0 lib/bug.c:186
   fixup_bug arch/x86/kernel/traps.c:178 [inline]
   do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
   do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
   invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
  RIP: 0010:kmalloc_slab+0x56/0x70 mm/slab_common.c:996
  RSP: 0018:8801d907fc58 EFLAGS: 00010246
  RAX:  RBX: 8801aeecb280 RCX: 8185ebd7
  RDX:  RSI:  RDI: ffe1
  RBP: 8801d907fc58 R08: 8801adb5e1c0 R09: ed0035a84700
  R10: ed0035a84700 R11: 8801ad423803 R12: 8801aeecb280
  R13: fff4 R14: 8801ad891a00 R15: 014200c0
   __do_kmalloc mm/slab.c:3713 [inline]
   __kmalloc+0x25/0x760 mm/slab.c:3727
   kmalloc include/linux/slab.h:517 [inline]
   map_get_next_key+0x24a/0x640 kernel/bpf/syscall.c:858
   __do_sys_bpf kernel/bpf/syscall.c:2131 [inline]
   __se_sys_bpf kernel/bpf/syscall.c:2096 [inline]
   __x64_sys_bpf+0x354/0x4f0 kernel/bpf/syscall.c:2096
   do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

The test case is against sock hashmap with a key size 0xffe1.
Such a large key size will cause the below code in function
sock_hash_alloc() overflowing and produces a smaller elem_size,
hence map creation will be successful.
htab->elem_size = sizeof(struct htab_elem) +
  round_up(htab->map.key_size, 8);

Later, when map_get_next_key is called and kernel tries
to allocate the key unsuccessfully, it will issue
the above warning.

Similar to hashtab, ensure the key size is at most
MAX_BPF_STACK for a successful map creation.

Fixes: 81110384441a ("bpf: sockmap, add hash map support")
Reported-by: syzbot+e4566d29080e7f346...@syzkaller.appspotmail.com
Signed-off-by: Yonghong Song <y...@fb.com>
---
 kernel/bpf/sockmap.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 56879c9fd3a4..79f5e899 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -1990,6 +1990,12 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr 
*attr)
attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
 
+   if (attr->key_size > MAX_BPF_STACK)
+   /* eBPF programs initialize keys on stack, so they cannot be
+* larger than max stack size
+*/
+   return ERR_PTR(-E2BIG);
+
err = bpf_tcp_ulp_register();
if (err && err != -EEXIST)
return ERR_PTR(err);
-- 
2.14.3



Re: [PATCH bpf-next] samples/bpf: Decrement ttl in fib forwarding example

2018-05-16 Thread Y Song
On Tue, May 15, 2018 at 4:20 PM, David Ahern <dsah...@gmail.com> wrote:
> Only consider forwarding packets if ttl in received packet is > 1 and
> decrement ttl before handing off to bpf_redirect_map.
>
> Signed-off-by: David Ahern <dsah...@gmail.com>

I did not test this patch, but it looks good to me with visual inspection.
Acked-by: Yonghong Song <y...@fb.com>


Re: [PATCH bpf-next 7/7] tools/bpftool: add perf subcommand

2018-05-15 Thread Yonghong Song



On 5/15/18 9:41 PM, Jakub Kicinski wrote:

On Tue, 15 May 2018 16:45:21 -0700, Yonghong Song wrote:

The new command "bpftool perf [show]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
   kprobe: trace.py '__x64_sys_nanosleep'
   kretprobe:  trace.py 'r::__x64_sys_nanosleep'
   tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
   uprobe: trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

   $ bpftool perf
   21711: prog_id 5 kprobe func __x64_sys_write offset 0
   21765: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0
   21767: prog_id 8 tracepoint sys_enter_nanosleep
   21800: prog_id 9 uprobe filename /home/yhs/a.out offset 1159

   $ bpftool -j perf
   
{"pid":21711,"prog_id":5,"prog_info":"kprobe","func":"__x64_sys_write","offset":0},
 \
   
{"pid":21765,"prog_id":7,"prog_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
 \
   
{"pid":21767,"prog_id":8,"prog_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
 \
   
{"pid":21800,"prog_id":9,"prog_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}


You need to wrap the objects inside an array, so

if (json_output)
jsonw_start_array(json_wtr);
nftw();
if (json_output)
jsonw_end_array(json_wtr);

otherwise output will not be a valid JSON.  To validate JSON try:

$ bpftool -j perf | python -m json.tool


Thanks for detailed review! All of your comments make sense.
I will address them in next revision after getting some feedback
for other patches.




   $ bpftool prog
   5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
  loaded_at 2018-05-15T04:46:37-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 4
   7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
  loaded_at 2018-05-15T04:48:32-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 7
   8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
  loaded_at 2018-05-15T04:48:48-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 8
   9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
  loaded_at 2018-05-15T04:49:52-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 9

   $ ps ax | grep "python ./trace.py"
   21711 pts/0T  0:03 python ./trace.py __x64_sys_write
   21765 pts/0S+     0:00 python ./trace.py r::__x64_sys_nanosleep
   21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
   21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
   22374 pts/1S+ 0:00 grep --color=auto python ./trace.py

Signed-off-by: Yonghong Song <y...@fb.com>
---
  tools/bpf/bpftool/main.c |   3 +-
  tools/bpf/bpftool/main.h |   1 +
  tools/bpf/bpftool/perf.c | 188 +++


Would you be able to also extend the Documentation/ and bash
completions?


  3 files changed, 191 insertions(+), 1 deletion(-)
  create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d..eea7f14 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
"   %s batch file FILE\n"
"   %s version\n"
"\n"
-   "   OBJECT := { prog | map | cgroup }\n"
+   "   OBJECT := { prog | map | cgroup | perf }\n"
"   " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
{ "prog", do_prog },
{ "map",  do_map },
{ "cgroup",   do_cgroup },
+   { "perf", do_perf },
{ "version",  do_version },
{ 0 }
  };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd9..63fdb31 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
  int do_map(int argc, char **arg);
  int do_event_pipe(int argc, char **argv);
  int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
  
  int prog_parse_fd(int *argc, char ***argv);

  int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 
*info_len);
diff --git a/tools/bpf/bpftool/perf.c b/too

[PATCH bpf-next 5/7] samples/bpf: add a samples/bpf test for BPF_PERF_EVENT_QUERY

2018-05-15 Thread Yonghong Song
This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 samples/bpf/Makefile|   4 +
 samples/bpf/perf_event_query_kern.c |  19 ++
 samples/bpf/perf_event_query_user.c | 376 
 3 files changed, 399 insertions(+)
 create mode 100644 samples/bpf/perf_event_query_kern.c
 create mode 100644 samples/bpf/perf_event_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..c23e8fe 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += perf_event_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+perf_event_query-objs := bpf_load.o perf_event_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += perf_event_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_perf_event_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES += $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4  += -lrt
diff --git a/samples/bpf/perf_event_query_kern.c 
b/samples/bpf/perf_event_query_kern.c
new file mode 100644
index 000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/perf_event_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+   return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+   return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/perf_event_query_user.c 
b/samples/bpf/perf_event_query_user.c
new file mode 100644
index 000..bf46578
--- /dev/null
+++ b/samples/bpf/perf_event_query_user.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+   int __ret = !!(condition);  \
+   if (__ret) {\
+   printf("FAIL: %s:\n", __func__);\
+   perror(""); \
+   return -1;  \
+   }   \
+})
+
+#define CHECK_AND_RET(condition) ({\
+   int __ret = !!(condition);  \
+   if (__ret)  \
+   return -1;  \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+   return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   errno = 0;
+   ret = (int)strtol(buf, NULL, 10);
+   CHECK_PERROR_RET(errno);
+   return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+   char buf[256];
+   int fd, ret;
+
+   ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+   fd = open(buf, O_RDONLY);
+   CHECK_PERROR_RET(fd < 0);
+
+   ret = read(fd, buf, sizeof(buf));
+   close(fd);
+   CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+   CHECK_PERROR_RET(strlen(buf) < strlen("config:")

[PATCH bpf-next 0/7] bpf: implement BPF_PERF_EVENT_QUERY for perf event query

2018-05-15 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_PERF_EVENT_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_PERF_EVENT_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_PERF_EVENT_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_trace_event_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
file
  bpf: introduce bpf subcommand BPF_PERF_EVENT_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in
libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_PERF_EVENT_QUERY
  tools/bpf: add two BPF_PERF_EVENT_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h  |   5 +
 include/linux/trace_events.h|  15 ++
 include/uapi/linux/bpf.h|  25 ++
 kernel/bpf/syscall.c| 113 +
 kernel/events/core.c|   8 +
 kernel/trace/bpf_trace.c|  53 
 kernel/trace/trace_kprobe.c |  29 +++
 kernel/trace/trace_uprobe.c |  22 ++
 samples/bpf/Makefile|   4 +
 samples/bpf/perf_event_query_kern.c |  19 ++
 samples/bpf/perf_event_query_user.c | 376 
 tools/bpf/bpftool/main.c|   3 +-
 tools/bpf/bpftool/main.h|   1 +
 tools/bpf/bpftool/perf.c| 188 ++
 tools/include/uapi/linux/bpf.h  |  25 ++
 tools/lib/bpf/bpf.c |  23 ++
 tools/lib/bpf/bpf.h |   3 +
 tools/testing/selftests/bpf/test_progs.c| 133 ++
 tools/testing/selftests/bpf/trace_helpers.c |  12 +
 tools/testing/selftests/bpf/trace_helpers.h |   1 +
 20 files changed, 1057 insertions(+), 1 deletion(-)
 create mode 100644 samples/bpf/perf_event_query_kern.c
 create mode 100644 samples/bpf/perf_event_query_user.c
 create mode 100644 tools/bpf/bpftool/perf.c

-- 
2.9.5



[PATCH bpf-next 3/7] tools/bpf: sync kernel header bpf.h and add bpf_trace_event_query in libbpf

2018-05-15 Thread Yonghong Song
Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_trace_event_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/include/uapi/linux/bpf.h | 25 +
 tools/lib/bpf/bpf.c| 23 +++
 tools/lib/bpf/bpf.h|  3 +++
 3 files changed, 51 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 1205d86..a209f01 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_PERF_EVENT_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prod_id */
+   __u32   prog_info;  /* output: BPF_PERF_INFO_* */
+   __u64   probe_offset;   /* output: probe_offset */
+   __u64   probe_addr; /* output: probe_addr */
+   } perf_event_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2450,4 +2467,12 @@ struct bpf_fib_lookup {
__u8dmac[6]; /* ETH_ALEN */
 };
 
+enum {
+   BPF_PERF_INFO_TP_NAME,  /* tp name */
+   BPF_PERF_INFO_KPROBE,   /* (symbol + offset) or addr */
+   BPF_PERF_INFO_KRETPROBE,/* (symbol + offset) or addr */
+   BPF_PERF_INFO_UPROBE,   /* filename + offset */
+   BPF_PERF_INFO_URETPROBE,/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index a3a8fb2..e0152aa 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -641,3 +641,26 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, 
__u32 log_buf_size,
 
return fd;
 }
+
+int bpf_trace_event_query(int pid, int fd, char *buf, __u32 buf_len,
+ __u32 *prog_id, __u32 *prog_info,
+ __u64 *probe_offset, __u64 *probe_addr)
+{
+   union bpf_attr attr = {};
+   int err;
+
+   attr.perf_event_query.pid = pid;
+   attr.perf_event_query.fd = fd;
+   attr.perf_event_query.buf = ptr_to_u64(buf);
+   attr.perf_event_query.buf_len = buf_len;
+
+   err = sys_bpf(BPF_PERF_EVENT_QUERY, &attr, sizeof(attr));
+   if (!err) {
+   *prog_id = attr.perf_event_query.prog_id;
+   *prog_info = attr.perf_event_query.prog_info;
+   *probe_offset = attr.perf_event_query.probe_offset;
+   *probe_addr = attr.perf_event_query.probe_addr;
+   }
+
+   return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index fb3a146..53d05fc 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -105,4 +105,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type 
type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 bool do_log);
+int bpf_trace_event_query(int pid, int fd, char *buf, __u32 buf_len,
+ __u32 *prog_id, __u32 *prog_info,
+ __u64 *probe_offset, __u64 *probe_addr);
 #endif
-- 
2.9.5



[PATCH bpf-next 2/7] bpf: introduce bpf subcommand BPF_PERF_EVENT_QUERY

2018-05-15 Thread Yonghong Song
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approach. First, a bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_PERF_EVENT_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_PERF_EVENT_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/trace_events.h |  15 ++
 include/uapi/linux/bpf.h |  25 ++
 kernel/bpf/syscall.c | 113 +++
 kernel/trace/bpf_trace.c |  53 
 kernel/trace/trace_kprobe.c  |  29 +++
 kernel/trace/trace_uprobe.c  |  22 +
 6 files changed, 257 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..ec1f604 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, 
void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(struct file *file, u32 *prog_id, u32 *prog_info,
+   const char **buf, u64 *probe_offset,
+   u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void 
*ctx)
 {
@@ -504,6 +507,12 @@ static inline struct bpf_raw_event_map 
*bpf_find_raw_tracepoint(const char *name
 {
return NULL;
 }
+static inline int bpf_get_perf_event_info(struct file *file, u32 *prog_id,
+ u32 *prog_info, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+   return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +569,16 @@ extern void perf_trace_del(struct perf_event *event, int 
flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(struct perf_event *event, u32 *prog_info,
+  const char **symbol, u64 *probe_offset,
+  u64 *probe_addr, bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(struct perf_event *event, u32 *prog_info,
+  const char **filename, u64 *probe_offset,
+  bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d94d333..b78eca1 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+   BPF_PERF_EVENT_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32   btf_log_size;
__u32   btf_log_level;
};
+
+   struct {
+   int pid;/* input: pid */
+   int fd; /* input: fd */
+   __u32   flags;  /* input: flags */
+   __u32   buf_len;/* input: buf len */
+   __aligned_u64   buf;/* input/output:
+*   tp_name for tracepoint
+*   symbol for kprobe
+*   filename for uprobe
+*/
+   __u32   prog_id;/* output: prog_id */
+   __u

[PATCH bpf-next 6/7] tools/bpf: add two BPF_PERF_EVENT_QUERY tests in test_progs

2018-05-15 Thread Yonghong Song
The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queried as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 133 +++
 1 file changed, 133 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c 
b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..138d1e9 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,137 @@ static void test_get_stack_raw_tp(void)
bpf_object__close(obj);
 }
 
+static void test_query_trace_event_rawtp(void)
+{
+   const char *file = "./test_get_stack_rawtp.o";
+   struct perf_event_attr attr = {};
+   int efd, err, prog_fd, pmu_fd;
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+   __u32 prog_id, prog_info;
+   __u64 probe_offset, probe_addr;
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, , _fd);
+   if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+   return;
+
+   efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+   if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.type = PERF_TYPE_SOFTWARE;
+   attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+   pmu_fd = syscall(__NR_perf_event_open, , getpid(), -1, -1, 0);
+   if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+ errno))
+   goto close_prog;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+   if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+ err, errno))
+   goto close_prog;
+
+   /* query (getpid(), efd) */
+   err = bpf_trace_event_query(getpid(), efd, buf, 256, _id,
+   _info, _offset, _addr);
+   if (CHECK(err < 0, "bpf_trace_event_query", "err %d errno %d\n", err,
+ errno))
+   goto close_prog;
+
+   err = (prog_info == BPF_PERF_INFO_TP_NAME) &&
+ (strcmp(buf, "sys_enter") == 0);
+   if (CHECK(!err, "check_results", "prog_info %d tp_name %s\n",
+ prog_info, buf))
+   goto close_prog;
+
+   goto close_prog_noerr;
+close_prog:
+   error_cnt++;
+close_prog_noerr:
+   bpf_object__close(obj);
+}
+
+static void test_query_trace_event_tp_core(const char *probe_name,
+  const char *tp_name)
+{
+   const char *file = "./test_tracepoint.o";
+   int err, bytes, efd, prog_fd, pmu_fd;
+   struct perf_event_attr attr = {};
+   struct bpf_object *obj;
+   __u32 duration = 0;
+   char buf[256];
+   __u32 prog_id, prog_info;
+   __u64 probe_offset, probe_addr;
+
+   err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, , _fd);
+   if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+   goto close_prog;
+
+   snprintf(buf, sizeof(buf),
+"/sys/kernel/debug/tracing/events/%s/id", probe_name);
+   efd = open(buf, O_RDONLY, 0);
+   if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+   goto close_prog;
+   bytes = read(efd, buf, sizeof(buf));
+   close(efd);
+   if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+ "bytes %d errno %d\n", bytes, errno))
+   goto close_prog;
+
+   attr.config = strtol(buf, NULL, 0);
+   attr.type = PERF_TYPE_TRACEPOINT;
+   attr.sample_type = PERF_SAMPLE_RAW;
+   attr.sample_period = 1;
+   attr.wakeup_events = 1;
+   pmu_fd = syscall(__NR_perf_event_open, , -1 /* pid */,
+0 /* cpu 0 */, -1 /* group id */,
+0 /* flags */);
+   if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+   goto close_pmu;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+   if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+ errno))
+   goto close_pmu;
+
+   err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+   if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+ errno))
+   goto close_pmu;
+
+   /* query (getpid(), pmu_fd) */
+   err = bpf_trace_event_query(getpid(), pmu_fd, buf, 256, _id,
+

[PATCH bpf-next 7/7] tools/bpftool: add perf subcommand

2018-05-15 Thread Yonghong Song
The new command "bpftool perf [show]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe: trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe: trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  21711: prog_id 5 kprobe func __x64_sys_write offset 0
  21765: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0
  21767: prog_id 8 tracepoint sys_enter_nanosleep
  21800: prog_id 9 uprobe filename /home/yhs/a.out offset 1159

  $ bpftool -j perf
  
{"pid":21711,"prog_id":5,"prog_info":"kprobe","func":"__x64_sys_write","offset":0},
 \
  
{"pid":21765,"prog_id":7,"prog_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0},
 \
  
{"pid":21767,"prog_id":8,"prog_info":"tracepoint","tracepoint":"sys_enter_nanosleep"},
 \
  
{"pid":21800,"prog_id":9,"prog_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
  loaded_at 2018-05-15T04:46:37-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
  loaded_at 2018-05-15T04:48:32-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
  loaded_at 2018-05-15T04:48:48-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
  loaded_at 2018-05-15T04:49:52-0700  uid 0
  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0T  0:03 python ./trace.py __x64_sys_write
  21765 pts/0S+     0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1S+ 0:00 grep --color=auto python ./trace.py

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/bpf/bpftool/main.c |   3 +-
 tools/bpf/bpftool/main.h |   1 +
 tools/bpf/bpftool/perf.c | 188 +++
 3 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d..eea7f14 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
"   %s batch file FILE\n"
"   %s version\n"
"\n"
-   "   OBJECT := { prog | map | cgroup }\n"
+   "   OBJECT := { prog | map | cgroup | perf }\n"
"   " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
{ "prog",   do_prog },
{ "map",do_map },
{ "cgroup", do_cgroup },
+   { "perf",   do_perf },
{ "version",do_version },
{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd9..63fdb31 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 
*info_len);
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
new file mode 100644
index 000..6d676e4
--- /dev/null
+++ b/tools/bpf/bpftool/perf.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+// Author: Yonghong Song <y...@fb.com>
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "main.h"
+
+static void print_perf_json(int pid, __u32 prog_id, __u32 prog_info,
+   char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+   jsonw_start_object(json_wtr);
+   jsonw_int_field(json_wtr, "pid", pid);
+   jsonw_uint_field(json_wtr, &

[PATCH bpf-next 1/7] perf/core: add perf_get_event() to return perf_event given a struct file

2018-05-15 Thread Yonghong Song
A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 include/linux/perf_event.h | 5 +
 kernel/events/core.c   | 8 
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..b5c1ad3 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct 
task_struct *child)   { }
 static inline void perf_event_free_task(struct task_struct *task)  { }
 static inline void perf_event_delayed_put(struct task_struct *task){ }
 static inline struct file *perf_event_get(unsigned int fd) { return 
ERR_PTR(-EINVAL); }
+static inline struct perf_event *perf_get_event(struct file *file)
+{
+   return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event 
*event)
 {
return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..1e3cddb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
 }
 
+struct perf_event *perf_get_event(struct file *file)
+{
+   if (file->f_op != _fops)
+   return ERR_PTR(-EINVAL);
+
+   return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
if (!event)
-- 
2.9.5



[PATCH bpf-next 4/7] tools/bpf: add ksym_get_addr() in trace_helpers

2018-05-15 Thread Yonghong Song
Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Signed-off-by: Yonghong Song <y...@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c 
b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
return [0];
 }
 
+long ksym_get_addr(const char *name)
+{
+   int i;
+
+   for (i = 0; i < sym_cnt; i++) {
+   if (strcmp(syms[i].name, name) == 0)
+   return syms[i].addr;
+   }
+
+   return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h 
b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5



<    1   2   3   4   5   6   7   8   9   10   >