Re: [PATCH net-next] trace_events_filter: conditional trace event (tcp_probe full=0)

2018-02-12 Thread Md. Islam
On Mon, Feb 12, 2018 at 11:29 AM, Masami Hiramatsu  wrote:
> On Mon, 12 Feb 2018 00:08:46 -0500
> "Md. Islam"  wrote:
>
>> Recently tcp_probe kernel module has been replaced by trace_event. Old
>> tcp_probe had full=0 option where it only takes a snapshot only when
>> congestion window is changed. However I did not find such
>> functionality in trace_event.
>
> Yes, that seems broken to me. You should filter by using perf script or
> bpf. I'm not so clear about network stack, but it seems that cwnd can be
> set for each tcp connection. This means "current snd_cwnd" must be stored
> for each connection.
>
>> This is why I implemented this
>> "conditional trace_event" where a snapshot is taken only if some field
>> is changed. For instance, following filter will record a snapshot only
>> if snd_cwnd field is changed from the previous snapshot:
>>
>> cd /sys/kernel/debug/tracing
>> echo 1 > events/tcp/tcp_probe/enable
>> echo "(sport == 8554 && snd_cwnd =< 0)"  > events/tcp/tcp_probe/filter &
>> cat trace_pipe > /home/tamim/net-next/trace.data
>
> Sounds interesting, but it seems like a hack.
> Filter pred is stored for each trace event (tracepoint) so it is shared
> among several context. At least you need a lock to exclude each thread
> to update it. And I don't recommend to update filter_pred while tracing,
> since it means we have a kind of "hidden state" in the filer. And again,
> that filter_pred is shared, other tcp connection can see it.
> (sport can be same as other connection's sport from other IP)
>
> So IMHO, this kind of hack we don't implement in the kernel. Instead,
> you can use perf-script and program it by python or perl. It gives you
> much safer way to filter it out.
>
> Or, please try to make it as complete feature. For example, maybe we can
> introduce a global 'named' variable (which has correct accessor with spinlock
> or atomic update), and you can refer it with name.
> Then, it becomes safe and more generic, like below;
>
> $ cd /sys/kernel/debug/tracing
> $ touch vars/cwnd
> $ echo '(sport == 12345 && snd_cwnd != $cwnd)' > events/tcp/tcp_probe/filter
> $ echo 'update_var:cwnd:snd_cwnd if traced' > events/tcp/tcp_probe/trigger
>
> Thank you,


Hi Masami

Thanks for the suggestions! It does makes sense. I created a global
atomic variable in predicate. Then I'm updating that based on the
filter. Now it can be done as below:

cd /sys/kernel/debug/tracing &&
echo 1 > events/tcp/tcp_probe/enable &&
echo "(sport == 8554 && snd_cwnd != \$cwnd)"  > events/tcp/tcp_probe/filter

The filter identify the global variable by the presence of $. However
needed to pass '\$' to catch $ in kernel. The patch is :



diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d032..15a83c7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1402,6 +1402,7 @@ struct regex {
 struct filter_pred {
 filter_pred_fn_t fn;
 u64 val;
+atomic64_t  global_var;
 struct regexregex;
 unsigned short*ops;
 struct ftrace_event_field *field;
@@ -1427,6 +1428,16 @@ static inline bool is_function_field(struct
ftrace_event_field *field)
 return field->filter_type == FILTER_TRACE_FN;
 }

+/// The string prerdicate is a global variable if it starts with $.
+/// \param field predicate
+/// \return
+static inline bool is_global_variable(char *field)
+{
+if(strlen(field) == 0)
+return false;
+return field[0] == '$';
+}
+
 extern enum regex_type
 filter_parse_regex(char *buff, int len, char **search, int *not);
 extern void print_event_filter(struct trace_event_file *file,
diff --git a/kernel/trace/trace_events_filter.c
b/kernel/trace/trace_events_filter.c
index 61e7f06..8bf43353 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -202,6 +202,15 @@ static int filter_pred_##size(struct filter_pred
*pred, void *event)\
 return match;\
 }

+#define DEFINE_GLOBAL_PRED(size)\
+static int filter_global_pred_##size(struct filter_pred *pred, void
*event)\
+{\
+u##size *addr = (u##size *)(event + pred->offset);\
+u##size old_global_var =
(u##size)atomic64_xchg(&pred->global_var, (u64)*addr); \
+return (old_global_var == *addr) ^ pred->not;\
+}
+
+
 DEFINE_COMPARISON_PRED(s64);
 DEFINE_COMPARISON_PRED(u64);
 DEFINE_COMPARISON_PRED(s32);
@@ -216,6 +225,11 @@ DEFINE_EQUALITY_PRED(32);
 DEFINE_EQUALITY_PRED(16);
 DEFINE_EQUALITY_PRED(8);

+DEFINE_GLOBAL_PRED(64);
+DEFINE_GLOBAL_PRED(32);
+DEFINE_GLOBAL_PRED(16);
+DEFINE_GLOBAL_PRED(8);
+
 /* Filter predicate for fixed sized arrays of characters */
 static int filter_pred_string(struct filter_pred *pred, void *event)
 {
@@ -988,6 +1002,29 @@ static bool is_legal_op(struct
ftrace_event_field *field, enum filter_op_ids op)
 return true;
 }

+static filter_pred_fn_t select_global_fn(enum filter_op_ids op,
+

Re: [PATCH net-next] trace_events_filter: conditional trace event (tcp_probe full=0)

2018-02-12 Thread David Miller
From: Masami Hiramatsu 
Date: Tue, 13 Feb 2018 01:29:03 +0900

> On Mon, 12 Feb 2018 00:08:46 -0500
> "Md. Islam"  wrote:
> 
>> Recently tcp_probe kernel module has been replaced by trace_event. Old
>> tcp_probe had full=0 option where it only takes a snapshot only when
>> congestion window is changed. However I did not find such
>> functionality in trace_event.
> 
> Yes, that seems broken to me. You should filter by using perf script or
> bpf. I'm not so clear about network stack, but it seems that cwnd can be
> set for each tcp connection. This means "current snd_cwnd" must be stored
> for each connection.

'bpf' is definitely the way to do custom stateful things like this.


Re: [PATCH net-next] trace_events_filter: conditional trace event (tcp_probe full=0)

2018-02-12 Thread Masami Hiramatsu
On Mon, 12 Feb 2018 00:08:46 -0500
"Md. Islam"  wrote:

> Recently tcp_probe kernel module has been replaced by trace_event. Old
> tcp_probe had full=0 option where it only takes a snapshot only when
> congestion window is changed. However I did not find such
> functionality in trace_event.

Yes, that seems broken to me. You should filter by using perf script or
bpf. I'm not so clear about network stack, but it seems that cwnd can be
set for each tcp connection. This means "current snd_cwnd" must be stored
for each connection.

> This is why I implemented this
> "conditional trace_event" where a snapshot is taken only if some field
> is changed. For instance, following filter will record a snapshot only
> if snd_cwnd field is changed from the previous snapshot:
> 
> cd /sys/kernel/debug/tracing
> echo 1 > events/tcp/tcp_probe/enable
> echo "(sport == 8554 && snd_cwnd =< 0)"  > events/tcp/tcp_probe/filter &
> cat trace_pipe > /home/tamim/net-next/trace.data

Sounds interesting, but it seems like a hack.
Filter pred is stored for each trace event (tracepoint) so it is shared
among several context. At least you need a lock to exclude each thread
to update it. And I don't recommend to update filter_pred while tracing,
since it means we have a kind of "hidden state" in the filer. And again,
that filter_pred is shared, other tcp connection can see it.
(sport can be same as other connection's sport from other IP)

So IMHO, this kind of hack we don't implement in the kernel. Instead,
you can use perf-script and program it by python or perl. It gives you
much safer way to filter it out.

Or, please try to make it as complete feature. For example, maybe we can
introduce a global 'named' variable (which has correct accessor with spinlock
or atomic update), and you can refer it with name.
Then, it becomes safe and more generic, like below;

$ cd /sys/kernel/debug/tracing
$ touch vars/cwnd
$ echo '(sport == 12345 && snd_cwnd != $cwnd)' > events/tcp/tcp_probe/filter
$ echo 'update_var:cwnd:snd_cwnd if traced' > events/tcp/tcp_probe/trigger

Thank you,

> 
> This will record a snapshot where source port is 8554 and snd_cwnd is
> not same as the snd_cwnd in previous snapshot. For lack of better
> operator, I used "=<" to represent the field on which the filter will
> be applied.
> 
> The way it works is, it updates the predicate with new value every
> time. So the next time, if the field is same as in the predicate, the
> snapshot is ignored. Initially it checks if  the field is 0 or not.
> 
> diff --git a/kernel/trace/trace_events_filter.c
> b/kernel/trace/trace_events_filter.c
> index 61e7f06..8d733fa 100644
> --- a/kernel/trace/trace_events_filter.c
> +++ b/kernel/trace/trace_events_filter.c
> @@ -39,6 +39,7 @@ enum filter_op_ids
>  OP_AND,
>  OP_GLOB,
>  OP_NE,
> +OP_NE_PREV,
>  OP_EQ,
>  OP_LT,
>  OP_LE,
> @@ -62,6 +63,7 @@ static struct filter_op filter_ops[] = {
>  { OP_AND,"&&",2 },
>  { OP_GLOB,"~",4 },
>  { OP_NE,"!=",4 },
> +{ OP_NE_PREV,   "=<",4 },
>  { OP_EQ,"==",4 },
>  { OP_LT,"<",5 },
>  { OP_LE,"<=",5 },
> @@ -202,6 +204,21 @@ static int filter_pred_##size(struct filter_pred
> *pred, void *event)\
>  return match;\
>  }
> 
> +#define DEFINE_NE_PREV_PRED(size)\
> +static int filter_ne_prev_pred_##size(struct filter_pred *pred, void
> *event)\
> +{\
> +u##size *addr = (u##size *)(event + pred->offset);\
> +u##size val = (u##size)pred->val;\
> +int match;  \
> +\
> +match = (val == *addr) ^ pred->not;\
> +\
> +if(match == 1)  \
> +pred->val = *addr;  \
> +return match;\
> +}
> +
> +
>  DEFINE_COMPARISON_PRED(s64);
>  DEFINE_COMPARISON_PRED(u64);
>  DEFINE_COMPARISON_PRED(s32);
> @@ -216,6 +233,11 @@ DEFINE_EQUALITY_PRED(32);
>  DEFINE_EQUALITY_PRED(16);
>  DEFINE_EQUALITY_PRED(8);
> 
> +DEFINE_NE_PREV_PRED(64);
> +DEFINE_NE_PREV_PRED(32);
> +DEFINE_NE_PREV_PRED(16);
> +DEFINE_NE_PREV_PRED(8);
> +
>  /* Filter predicate for fixed sized arrays of characters */
>  static int filter_pred_string(struct filter_pred *pred, void *event)
>  {
> @@ -980,7 +1002,7 @@ int filter_assign_type(const char *type)
>  static bool is_legal_op(struct ftrace_event_field *field, enum
> filter_op_ids op)
>  {
>  if (is_string_field(field) &&
> -(op != OP_EQ && op != OP_NE && op != OP_GLOB))
> +(op != OP_EQ && op != OP_NE && op != OP_GLOB && op != OP_NE_PREV))
>  return false;
>  if (!is_string_field(field) && op == OP_