Re: [PATCH bpf] bpf: preallocate a perf_sample_data per event fd

2019-06-08 Thread Steven Rostedt
On Fri, 31 May 2019 05:26:30 +
Song Liu  wrote:

> We can also do something like
> 
>	ee = kzalloc(sizeof(struct bpf_event_entry) + sizeof(struct perf_sample_data), GFP_ATOMIC);
>	ee->sd = (void *)ee + sizeof(struct bpf_event_entry);

Or perhaps:

ee->sd = (struct perf_sample_data *)(ee + 1);

-- Steve
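
Taken together, the two suggestions amount to a single allocation whose sample data sits directly behind the entry. A minimal sketch, modeled on the existing bpf_event_entry_gen() in kernel/bpf/arraymap.c; the GFP_ATOMIC flag and the exact layout are assumptions here, not code posted in this thread:

	/* Sketch only: one allocation, sample data placed right after the entry. */
	static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
							   struct file *map_file)
	{
		struct bpf_event_entry *ee;

		ee = kzalloc(sizeof(*ee) + sizeof(struct perf_sample_data), GFP_ATOMIC);
		if (ee) {
			ee->event = perf_file->private_data;
			ee->perf_file = perf_file;
			ee->map_file = map_file;
			/* ee + 1 points just past the entry, i.e. at the trailing sample data. */
			ee->sd = (struct perf_sample_data *)(ee + 1);
		}

		return ee;
	}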


Re: [PATCH bpf] bpf: preallocate a perf_sample_data per event fd

2019-05-30 Thread Song Liu



> On May 30, 2019, at 5:01 PM, Matt Mullins  wrote:
> 
> On Thu, 2019-05-30 at 23:28 +, Song Liu wrote:
>>> On May 30, 2019, at 3:55 PM, Matt Mullins  wrote:
>>> 
>>> It is possible that a BPF program can be called while another BPF
>>> program is executing bpf_perf_event_output.  This has been observed with
>>> I/O completion occurring as a result of an interrupt:
>>> 
>>> bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
>>> ? trace_call_bpf+0x82/0x100
>>> ? sch_direct_xmit+0xe2/0x230
>>> ? blk_mq_end_request+0x1/0x100
>>> ? blk_mq_end_request+0x5/0x100
>>> ? kprobe_perf_func+0x19b/0x240
>>> ? __qdisc_run+0x86/0x520
>>> ? blk_mq_end_request+0x1/0x100
>>> ? blk_mq_end_request+0x5/0x100
>>> ? kprobe_ftrace_handler+0x90/0xf0
>>> ? ftrace_ops_assist_func+0x6e/0xe0
>>> ? ip6_input_finish+0xbf/0x460
>>> ? 0xa01e80bf
>>> ? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
>>> ? blkdev_issue_zeroout+0x200/0x200
>>> ? blk_mq_end_request+0x1/0x100
>>> ? blk_mq_end_request+0x5/0x100
>>> ? flush_smp_call_function_queue+0x6c/0xe0
>>> ? smp_call_function_single_interrupt+0x32/0xc0
>>> ? call_function_single_interrupt+0xf/0x20
>>> ? call_function_single_interrupt+0xa/0x20
>>> ? swiotlb_map_page+0x140/0x140
>>> ? refcount_sub_and_test+0x1a/0x50
>>> ? tcp_wfree+0x20/0xf0
>>> ? skb_release_head_state+0x62/0xc0
>>> ? skb_release_all+0xe/0x30
>>> ? napi_consume_skb+0xb5/0x100
>>> ? mlx5e_poll_tx_cq+0x1df/0x4e0
>>> ? mlx5e_poll_tx_cq+0x38c/0x4e0
>>> ? mlx5e_napi_poll+0x58/0xc30
>>> ? mlx5e_napi_poll+0x232/0xc30
>>> ? net_rx_action+0x128/0x340
>>> ? __do_softirq+0xd4/0x2ad
>>> ? irq_exit+0xa5/0xb0
>>> ? do_IRQ+0x7d/0xc0
>>> ? common_interrupt+0xf/0xf
>>> 
>>> ? __rb_free_aux+0xf0/0xf0
>>> ? perf_output_sample+0x28/0x7b0
>>> ? perf_prepare_sample+0x54/0x4a0
>>> ? perf_event_output+0x43/0x60
>>> ? bpf_perf_event_output_raw_tp+0x15f/0x180
>>> ? blk_mq_start_request+0x1/0x120
>>> ? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
>>> ? bpf_trace_run3+0x2c/0x80
>>> ? nbd_send_cmd+0x4c2/0x690 [nbd]
>>> 
>>> This also cannot be alleviated by further splitting the per-cpu
>>> perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
>>> corruption on concurrent perf_event_output calls")), as a raw_tp could
>>> be attached to the block:block_rq_complete tracepoint and execute during
>>> another raw_tp.  Instead, keep a pre-allocated perf_sample_data
>>> structure per perf_event_array element and fail a bpf_perf_event_output
>>> if that element is concurrently being used.
>>> 
>>> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
>>> Signed-off-by: Matt Mullins 
>>> ---
>>> It felt a bit overkill, but I had to split bpf_event_entry into its own
>>> header file to break an include cycle from perf_event.h -> cgroup.h ->
>>> cgroup-defs.h -> bpf-cgroup.h -> bpf.h -> (potentially) perf_event.h.
>>> 
>>> include/linux/bpf.h       |  7 -------
>>> include/linux/bpf_event.h | 20 ++++++++++++++++++++
>>> kernel/bpf/arraymap.c     |  2 ++
>>> kernel/trace/bpf_trace.c  | 30 +++++++++++++++++-------------
>>> 4 files changed, 39 insertions(+), 20 deletions(-)
>>> create mode 100644 include/linux/bpf_event.h
>>> 
>>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>>> index 4fb3aa2dc975..13b253a36402 100644
>>> --- a/include/linux/bpf.h
>>> +++ b/include/linux/bpf.h
>>> @@ -467,13 +467,6 @@ static inline bool bpf_map_flags_access_ok(u32 access_flags)
>>>(BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
>>> }
>>> 
>> 
>> I think we can avoid the include cycle as:
>> 
>> +struct perf_sample_data;
>> struct bpf_event_entry {
>> 	struct perf_event *event;
>> 	struct file *perf_file;
>> 	struct file *map_file;
>> 	struct rcu_head rcu;
>> +	struct perf_sample_data *sd;
>> };
> 
> Yeah, that totally works.  I was mostly doing this so we had only one
> kmalloc allocation, but I'm not too worried about having an extra
> object in kmalloc-64 if it simplifies the code a lot.

We can also do something like

	ee = kzalloc(sizeof(struct bpf_event_entry) + sizeof(struct perf_sample_data), GFP_ATOMIC);
	ee->sd = (void *)ee + sizeof(struct bpf_event_entry);

Thanks,
Song

> 
>> 
>>> -struct bpf_event_entry {
>>> -   struct perf_event *event;
>>> -   struct file *perf_file;
>>> -   struct file *map_file;
>>> -   struct rcu_head rcu;
>>> -};
>>> -
>>> bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
>>> int bpf_prog_calc_tag(struct bpf_prog *fp);
>>> 
>>> diff --git a/include/linux/bpf_event.h b/include/linux/bpf_event.h
>>> new file mode 100644
>>> index ..9f415990f921
>>> --- /dev/null
>>> +++ b/include/linux/bpf_event.h
>>> @@ -0,0 +1,20 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +
>>> +#ifndef _LINUX_BPF_EVENT_H
>>> +#define _LINUX_BPF_EVENT_H
>>> 

Re: [PATCH bpf] bpf: preallocate a perf_sample_data per event fd

2019-05-30 Thread Matt Mullins
On Thu, 2019-05-30 at 23:28 +, Song Liu wrote:
> > On May 30, 2019, at 3:55 PM, Matt Mullins  wrote:
> > 
> > It is possible that a BPF program can be called while another BPF
> > program is executing bpf_perf_event_output.  This has been observed with
> > I/O completion occurring as a result of an interrupt:
> > 
> > bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
> > ? trace_call_bpf+0x82/0x100
> > ? sch_direct_xmit+0xe2/0x230
> > ? blk_mq_end_request+0x1/0x100
> > ? blk_mq_end_request+0x5/0x100
> > ? kprobe_perf_func+0x19b/0x240
> > ? __qdisc_run+0x86/0x520
> > ? blk_mq_end_request+0x1/0x100
> > ? blk_mq_end_request+0x5/0x100
> > ? kprobe_ftrace_handler+0x90/0xf0
> > ? ftrace_ops_assist_func+0x6e/0xe0
> > ? ip6_input_finish+0xbf/0x460
> > ? 0xa01e80bf
> > ? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
> > ? blkdev_issue_zeroout+0x200/0x200
> > ? blk_mq_end_request+0x1/0x100
> > ? blk_mq_end_request+0x5/0x100
> > ? flush_smp_call_function_queue+0x6c/0xe0
> > ? smp_call_function_single_interrupt+0x32/0xc0
> > ? call_function_single_interrupt+0xf/0x20
> > ? call_function_single_interrupt+0xa/0x20
> > ? swiotlb_map_page+0x140/0x140
> > ? refcount_sub_and_test+0x1a/0x50
> > ? tcp_wfree+0x20/0xf0
> > ? skb_release_head_state+0x62/0xc0
> > ? skb_release_all+0xe/0x30
> > ? napi_consume_skb+0xb5/0x100
> > ? mlx5e_poll_tx_cq+0x1df/0x4e0
> > ? mlx5e_poll_tx_cq+0x38c/0x4e0
> > ? mlx5e_napi_poll+0x58/0xc30
> > ? mlx5e_napi_poll+0x232/0xc30
> > ? net_rx_action+0x128/0x340
> > ? __do_softirq+0xd4/0x2ad
> > ? irq_exit+0xa5/0xb0
> > ? do_IRQ+0x7d/0xc0
> > ? common_interrupt+0xf/0xf
> > 
> > ? __rb_free_aux+0xf0/0xf0
> > ? perf_output_sample+0x28/0x7b0
> > ? perf_prepare_sample+0x54/0x4a0
> > ? perf_event_output+0x43/0x60
> > ? bpf_perf_event_output_raw_tp+0x15f/0x180
> > ? blk_mq_start_request+0x1/0x120
> > ? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
> > ? bpf_trace_run3+0x2c/0x80
> > ? nbd_send_cmd+0x4c2/0x690 [nbd]
> > 
> > This also cannot be alleviated by further splitting the per-cpu
> > perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
> > corruption on concurrent perf_event_output calls")), as a raw_tp could
> > be attached to the block:block_rq_complete tracepoint and execute during
> > another raw_tp.  Instead, keep a pre-allocated perf_sample_data
> > structure per perf_event_array element and fail a bpf_perf_event_output
> > if that element is concurrently being used.
> > 
> > Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> > Signed-off-by: Matt Mullins 
> > ---
> > It felt a bit overkill, but I had to split bpf_event_entry into its own
> > header file to break an include cycle from perf_event.h -> cgroup.h ->
> > cgroup-defs.h -> bpf-cgroup.h -> bpf.h -> (potentially) perf_event.h.
> > 
> > include/linux/bpf.h       |  7 -------
> > include/linux/bpf_event.h | 20 ++++++++++++++++++++
> > kernel/bpf/arraymap.c     |  2 ++
> > kernel/trace/bpf_trace.c  | 30 +++++++++++++++++-------------
> > 4 files changed, 39 insertions(+), 20 deletions(-)
> > create mode 100644 include/linux/bpf_event.h
> > 
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index 4fb3aa2dc975..13b253a36402 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -467,13 +467,6 @@ static inline bool bpf_map_flags_access_ok(u32 access_flags)
> >(BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
> > }
> > 
> 
> I think we can avoid the include cycle as:
> 
> +struct perf_sample_data;
> struct bpf_event_entry {
> 	struct perf_event *event;
> 	struct file *perf_file;
> 	struct file *map_file;
> 	struct rcu_head rcu;
> +	struct perf_sample_data *sd;
> };

Yeah, that totally works.  I was mostly doing this so we had only one
kmalloc allocation, but I'm not too worried about having an extra
object in kmalloc-64 if it simplifies the code a lot.
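
For comparison, the pointer-member variant would look roughly like the sketch below. The forward declaration is what keeps perf_event.h out of bpf.h; the second kzalloc and its error handling (and the matching kfree on the free path, not shown) are assumptions about the wiring, not code from this thread:

	/* In include/linux/bpf.h: a forward declaration is enough for a pointer member. */
	struct perf_sample_data;

	struct bpf_event_entry {
		struct perf_event *event;
		struct file *perf_file;
		struct file *map_file;
		struct rcu_head rcu;
		struct perf_sample_data *sd;	/* separately allocated */
	};

	/* In kernel/bpf/arraymap.c, where struct perf_sample_data is a complete type. */
	static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
							   struct file *map_file)
	{
		struct bpf_event_entry *ee;

		ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
		if (!ee)
			return NULL;

		ee->sd = kzalloc(sizeof(*ee->sd), GFP_ATOMIC);
		if (!ee->sd) {
			kfree(ee);
			return NULL;
		}

		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;

		return ee;
	}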

> 
> > -struct bpf_event_entry {
> > -   struct perf_event *event;
> > -   struct file *perf_file;
> > -   struct file *map_file;
> > -   struct rcu_head rcu;
> > -};
> > -
> > bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
> > int bpf_prog_calc_tag(struct bpf_prog *fp);
> > 
> > diff --git a/include/linux/bpf_event.h b/include/linux/bpf_event.h
> > new file mode 100644
> > index ..9f415990f921
> > --- /dev/null
> > +++ b/include/linux/bpf_event.h
> > @@ -0,0 +1,20 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef _LINUX_BPF_EVENT_H
> > +#define _LINUX_BPF_EVENT_H
> > +
> > +#include 
> > +#include 
> > +
> > +struct file;
> > +
> > +struct bpf_event_entry {
> > +   struct perf_event *event;
> > +   struct file *perf_file;
> > +   struct file *map_file;
> > +   struct rcu_head rcu;
> > +   struct perf_sample_data sd;
> > +   

Re: [PATCH bpf] bpf: preallocate a perf_sample_data per event fd

2019-05-30 Thread Song Liu



> On May 30, 2019, at 3:55 PM, Matt Mullins  wrote:
> 
> It is possible that a BPF program can be called while another BPF
> program is executing bpf_perf_event_output.  This has been observed with
> I/O completion occurring as a result of an interrupt:
> 
>   bpf_prog_247fd1341cddaea4_trace_req_end+0x8d7/0x1000
>   ? trace_call_bpf+0x82/0x100
>   ? sch_direct_xmit+0xe2/0x230
>   ? blk_mq_end_request+0x1/0x100
>   ? blk_mq_end_request+0x5/0x100
>   ? kprobe_perf_func+0x19b/0x240
>   ? __qdisc_run+0x86/0x520
>   ? blk_mq_end_request+0x1/0x100
>   ? blk_mq_end_request+0x5/0x100
>   ? kprobe_ftrace_handler+0x90/0xf0
>   ? ftrace_ops_assist_func+0x6e/0xe0
>   ? ip6_input_finish+0xbf/0x460
>   ? 0xa01e80bf
>   ? nbd_dbg_flags_show+0xc0/0xc0 [nbd]
>   ? blkdev_issue_zeroout+0x200/0x200
>   ? blk_mq_end_request+0x1/0x100
>   ? blk_mq_end_request+0x5/0x100
>   ? flush_smp_call_function_queue+0x6c/0xe0
>   ? smp_call_function_single_interrupt+0x32/0xc0
>   ? call_function_single_interrupt+0xf/0x20
>   ? call_function_single_interrupt+0xa/0x20
>   ? swiotlb_map_page+0x140/0x140
>   ? refcount_sub_and_test+0x1a/0x50
>   ? tcp_wfree+0x20/0xf0
>   ? skb_release_head_state+0x62/0xc0
>   ? skb_release_all+0xe/0x30
>   ? napi_consume_skb+0xb5/0x100
>   ? mlx5e_poll_tx_cq+0x1df/0x4e0
>   ? mlx5e_poll_tx_cq+0x38c/0x4e0
>   ? mlx5e_napi_poll+0x58/0xc30
>   ? mlx5e_napi_poll+0x232/0xc30
>   ? net_rx_action+0x128/0x340
>   ? __do_softirq+0xd4/0x2ad
>   ? irq_exit+0xa5/0xb0
>   ? do_IRQ+0x7d/0xc0
>   ? common_interrupt+0xf/0xf
>   
>   ? __rb_free_aux+0xf0/0xf0
>   ? perf_output_sample+0x28/0x7b0
>   ? perf_prepare_sample+0x54/0x4a0
>   ? perf_event_output+0x43/0x60
>   ? bpf_perf_event_output_raw_tp+0x15f/0x180
>   ? blk_mq_start_request+0x1/0x120
>   ? bpf_prog_411a64a706fc6044_should_trace+0xad4/0x1000
>   ? bpf_trace_run3+0x2c/0x80
>   ? nbd_send_cmd+0x4c2/0x690 [nbd]
> 
> This also cannot be alleviated by further splitting the per-cpu
> perf_sample_data structs (as in commit 283ca526a9bd ("bpf: fix
> corruption on concurrent perf_event_output calls")), as a raw_tp could
> be attached to the block:block_rq_complete tracepoint and execute during
> another raw_tp.  Instead, keep a pre-allocated perf_sample_data
> structure per perf_event_array element and fail a bpf_perf_event_output
> if that element is concurrently being used.
> 
> Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
> Signed-off-by: Matt Mullins 
> ---
> It felt a bit overkill, but I had to split bpf_event_entry into its own
> header file to break an include cycle from perf_event.h -> cgroup.h ->
> cgroup-defs.h -> bpf-cgroup.h -> bpf.h -> (potentially) perf_event.h.
> 
> include/linux/bpf.h       |  7 -------
> include/linux/bpf_event.h | 20 ++++++++++++++++++++
> kernel/bpf/arraymap.c     |  2 ++
> kernel/trace/bpf_trace.c  | 30 +++++++++++++++++-------------
> 4 files changed, 39 insertions(+), 20 deletions(-)
> create mode 100644 include/linux/bpf_event.h
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 4fb3aa2dc975..13b253a36402 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -467,13 +467,6 @@ static inline bool bpf_map_flags_access_ok(u32 access_flags)
>  (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG);
> }
> 

I think we can avoid the include cycle as:

+struct perf_sample_data;
struct bpf_event_entry {
	struct perf_event *event;
	struct file *perf_file;
	struct file *map_file;
	struct rcu_head rcu;
+	struct perf_sample_data *sd;
};

> -struct bpf_event_entry {
> - struct perf_event *event;
> - struct file *perf_file;
> - struct file *map_file;
> - struct rcu_head rcu;
> -};
> -
> bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
> int bpf_prog_calc_tag(struct bpf_prog *fp);
> 
> diff --git a/include/linux/bpf_event.h b/include/linux/bpf_event.h
> new file mode 100644
> index ..9f415990f921
> --- /dev/null
> +++ b/include/linux/bpf_event.h
> @@ -0,0 +1,20 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _LINUX_BPF_EVENT_H
> +#define _LINUX_BPF_EVENT_H
> +
> +#include 
> +#include 
> +
> +struct file;
> +
> +struct bpf_event_entry {
> + struct perf_event *event;
> + struct file *perf_file;
> + struct file *map_file;
> + struct rcu_head rcu;
> + struct perf_sample_data sd;
> + atomic_t in_use;
> +};
> +
> +#endif /* _LINUX_BPF_EVENT_H */
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 584636c9e2eb..08e5e486d563 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -11,6 +11,7 @@
>  * General Public License for more details.
>  */
> #include 
> +#include 
> #include 
> #include 
> #include 
> @@ -659,6 +660,7 @@
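
The remainder of the kernel/trace/bpf_trace.c diff is cut off in this archive. As a rough sketch only (hypothetical helper name, not the posted hunk), the guard described in the changelog boils down to claiming the entry's embedded perf_sample_data before use and failing the output if another program already holds it:

	static u64 bpf_event_output_guarded(struct bpf_event_entry *ee,
					    struct pt_regs *regs,
					    struct perf_raw_record *raw)
	{
		struct perf_sample_data *sd = &ee->sd;

		/*
		 * Claim the entry's preallocated sample data.  If another BPF
		 * program on this entry is mid-output (e.g. from an interrupt),
		 * fail with -EBUSY instead of overwriting its in-flight sample.
		 */
		if (atomic_cmpxchg(&ee->in_use, 0, 1))
			return -EBUSY;

		perf_sample_data_init(sd, 0, 0);
		sd->raw = raw;
		perf_event_output(ee->event, sd, regs);

		/* Release the entry for the next caller. */
		atomic_set(&ee->in_use, 0);
		return 0;
	}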