Re: [PATCH v8 3/6] rcu/trace: Add tracing for how segcb list changes

2020-11-04 Thread Paul E. McKenney
On Tue, Nov 03, 2020 at 08:47:21AM -0500, Joel Fernandes wrote:

[ . . . ]

> ---8<---
> 
> From: "Joel Fernandes (Google)" 
> Subject: [PATCH] rcu/trace: Add tracing for how segcb list changes
> 
> Track how the segcb list changes before/after acceleration, during
> queuing and during dequeuing.
> 
> This has proved useful to discover an optimization to avoid unwanted GP
> requests when there are no callbacks accelerated. The overhead is minimal as
> each segment's length is now stored in the respective segment.
> 
> Signed-off-by: Joel Fernandes (Google) 
> ---
>  include/trace/events/rcu.h | 25 +
>  kernel/rcu/rcu_segcblist.c | 34 ++
>  kernel/rcu/rcu_segcblist.h |  5 +
>  kernel/rcu/tree.c  |  9 +
>  4 files changed, 73 insertions(+)
> 
> diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> index 155b5cb43cfd..5f8f2ee1a936 100644
> --- a/include/trace/events/rcu.h
> +++ b/include/trace/events/rcu.h
> @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> __entry->qlen)
>  );
>  
> +TRACE_EVENT_RCU(rcu_segcb_stats,
> +
> + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
> +
> + TP_ARGS(ctx, cb_count, gp_seq),
> +
> + TP_STRUCT__entry(
> + __field(const char *, ctx)
> + __array(int, cb_count, RCU_CBLIST_NSEGS)
> + __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
> + ),
> +
> + TP_fast_assign(
> + __entry->ctx = ctx;
> + memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * 
> sizeof(int));
> + memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * 
> sizeof(unsigned long));
> + ),
> +
> + TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, 
> NEXT=%d) "
> +   "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, 
> NEXT=%lu)", __entry->ctx,
> +   __entry->cb_count[0], __entry->cb_count[1], 
> __entry->cb_count[2], __entry->cb_count[3],
> +   __entry->gp_seq[0], __entry->gp_seq[1], 
> __entry->gp_seq[2], __entry->gp_seq[3])
> +
> +);
> +
>  /*
>   * Tracepoint for the registration of a single RCU callback of the special
>   * kvfree() form.  The first argument is the RCU type, the second argument
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index 357c19bbcb00..2a03949d0b82 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -14,6 +14,7 @@
>  #include 
>  
>  #include "rcu_segcblist.h"
> +#include "rcu.h"
>  
>  /* Initialize simple callback list. */
>  void rcu_cblist_init(struct rcu_cblist *rclp)
> @@ -328,6 +329,39 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist 
> *rsclp,
>   rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
>  }
>  
> +/*
> + * Return how many CBs each segment along with their gp_seq values.
> + *
> + * This function is O(N) where N is the number of segments. Only used from
> + * tracing code which is usually disabled in production.
> + */
> +#ifdef CONFIG_RCU_TRACE
> +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> +  int cbcount[RCU_CBLIST_NSEGS],

Given that negative numbers are not possible here, shouldn't
the above be unsigned int?

> +  unsigned long gpseq[RCU_CBLIST_NSEGS])
> +{
> + int i;
> +
> + for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> + gpseq[i] = rsclp->gp_seq[i];
> + }

OK, I will bite...

Why can't this function be inlined into the TP_fast_assign portion of the
rcu_segcb_stats trace event?  Or is the usual do-while(0) loop that one
might use to define "i" somehow illegal where TP_fast_assign() is used?
That would also allow the gp_seq array to be memcpy()ed directly, without
the intervening loop.

Or you could take the hit of the extra storage and directly memcpy()
both arrays.

> +}
> +
> +void __trace_rcu_segcb_stats(struct rcu_segcblist *rsclp, const char 
> *context)
> +{
> + int cbs[RCU_CBLIST_NSEGS];
> + unsigned long gps[RCU_CBLIST_NSEGS];
> +
> + if (!trace_rcu_segcb_stats_enabled())
> + return;
> +
> + rcu_segcblist_countseq(rsclp, cbs, gps);
> +
> + trace_rcu_segcb_stats(context, cbs, gps);
> +}
> +#endif
> +
>  /*
>   * Extract only those callbacks still pending (not yet ready to be
>   * invoked) from the specified rcu_segcblist structure and place them in
> diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
> index cd35c9faaf51..7750734fa116 100644
> --- a/kernel/rcu/rcu_segcblist.h
> +++ b/kernel/rcu/rcu_segcblist.h
> @@ -103,3 +103,8 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, 
> unsigned long seq);
>  bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long 
> 

Re: [PATCH v8 3/6] rcu/trace: Add tracing for how segcb list changes

2020-11-03 Thread Joel Fernandes
On Mon, Oct 26, 2020 at 12:59:13PM +0100, Frederic Weisbecker wrote:
> On Wed, Oct 21, 2020 at 03:08:10PM -0400, Joel Fernandes (Google) wrote:
> > Track how the segcb list changes before/after acceleration, during
> > queuing and during dequeuing.
> > 
> > This has proved useful to discover an optimization to avoid unwanted GP
> > requests when there are no callbacks accelerated. The overhead is minimal as
> > each segment's length is now stored in the respective segment.
> > 
> > Signed-off-by: Joel Fernandes (Google) 
> > ---
> >  include/trace/events/rcu.h | 25 +
> >  kernel/rcu/rcu_segcblist.c | 31 +++
> >  kernel/rcu/rcu_segcblist.h |  5 +
> >  kernel/rcu/tree.c  |  9 +
> >  4 files changed, 70 insertions(+)
> > 
> > diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> > index 155b5cb43cfd..9f2237d9b0c8 100644
> > --- a/include/trace/events/rcu.h
> > +++ b/include/trace/events/rcu.h
> > @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> >   __entry->qlen)
> >  );
> >  
> > +TRACE_EVENT_RCU(rcu_segcb,
> 
> You might extend the segcblist tracing in the future to trace 
> queue/dequeue/merge whatever...
> Which would give trace_rcu_segcb_queue, trace_rcu_segcb_dequeue, etc...
> 
> So I suggest you rename this one to something like rcu_segcb_stats for 
> precision.

Ok, I changed it to that and also changed the internal wrapper to
__rcu_segcb_stats. Full patch below.

> > +
> > +   TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
> > +
> > +   TP_ARGS(ctx, cb_count, gp_seq),
> > +
> > +   TP_STRUCT__entry(
> > +   __field(const char *, ctx)
> > +   __array(int, cb_count, RCU_CBLIST_NSEGS)
> > +   __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
> > +   ),
> > +
> > +   TP_fast_assign(
> > +   __entry->ctx = ctx;
> > +   memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * 
> > sizeof(int));
> > +   memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * 
> > sizeof(unsigned long));
> > +   ),
> > +
> > +   TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, 
> > NEXT=%d) "
> > + "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, 
> > NEXT=%lu)", __entry->ctx,
> > + __entry->cb_count[0], __entry->cb_count[1], 
> > __entry->cb_count[2], __entry->cb_count[3],
> > + __entry->gp_seq[0], __entry->gp_seq[1], 
> > __entry->gp_seq[2], __entry->gp_seq[3])
> > +
> > +);
> > +
> >  /*
> >   * Tracepoint for the registration of a single RCU callback of the special
> >   * kvfree() form.  The first argument is the RCU type, the second argument
> > diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> > index 357c19bbcb00..b0aaa51e0ee6 100644
> > --- a/kernel/rcu/rcu_segcblist.c
> > +++ b/kernel/rcu/rcu_segcblist.c
> > @@ -14,6 +14,7 @@
> >  #include 
> >  
> >  #include "rcu_segcblist.h"
> > +#include "rcu.h"
> >  
> >  /* Initialize simple callback list. */
> >  void rcu_cblist_init(struct rcu_cblist *rclp)
> > @@ -328,6 +329,36 @@ void rcu_segcblist_extract_done_cbs(struct 
> > rcu_segcblist *rsclp,
> > rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
> >  }
> >  
> > +/*
> > + * Return how many CBs each segment along with their gp_seq values.
> > + *
> > + * This function is O(N) where N is the number of segments. Only used from
> > + * tracing code which is usually disabled in production.
> > + */
> > +#ifdef CONFIG_RCU_TRACE
> > +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> > +int cbcount[RCU_CBLIST_NSEGS],
> > +unsigned long gpseq[RCU_CBLIST_NSEGS])
> > +{
> > +   int i;
> > +
> > +   for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> > +   cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> > +   gpseq[i] = rsclp->gp_seq[i];
> > +   }
> > +}
> 
> So that is called all the time even if the trace event isn't enabled. The
> goal of trace events are also to avoid the overhead of tracing when its off.

The overhead is only if RCU_TRACE is enabled but I added the following to
keep that low. Is it Ok now? Full patch below.

> This should be moved inside the trace event definition. We can even avoid the
> loop altogether.

No you cannot move it into the definition. If you see the trace event
definition, it is not aware of the rcu segcblist struct internals. So we've
to retrieve the lengths and provide them to it. segcblist is internal to
kernel/rcu/.

I'll add your Reviewed-by tag to this patch but let me know if you object to
that. thanks!

 - Joel

---8<---

From: "Joel Fernandes (Google)" 
Subject: [PATCH] rcu/trace: Add tracing for how segcb list changes

Track how the segcb list changes before/after acceleration, during
queuing and during dequeuing.

This has proved useful to 

Re: [PATCH v8 3/6] rcu/trace: Add tracing for how segcb list changes

2020-10-26 Thread Frederic Weisbecker
On Wed, Oct 21, 2020 at 03:08:10PM -0400, Joel Fernandes (Google) wrote:
> Track how the segcb list changes before/after acceleration, during
> queuing and during dequeuing.
> 
> This has proved useful to discover an optimization to avoid unwanted GP
> requests when there are no callbacks accelerated. The overhead is minimal as
> each segment's length is now stored in the respective segment.
> 
> Signed-off-by: Joel Fernandes (Google) 
> ---
>  include/trace/events/rcu.h | 25 +
>  kernel/rcu/rcu_segcblist.c | 31 +++
>  kernel/rcu/rcu_segcblist.h |  5 +
>  kernel/rcu/tree.c  |  9 +
>  4 files changed, 70 insertions(+)
> 
> diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
> index 155b5cb43cfd..9f2237d9b0c8 100644
> --- a/include/trace/events/rcu.h
> +++ b/include/trace/events/rcu.h
> @@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
> __entry->qlen)
>  );
>  
> +TRACE_EVENT_RCU(rcu_segcb,

You might extend the segcblist tracing in the future to trace 
queue/dequeue/merge whatever...
Which would give trace_rcu_segcb_queue, trace_rcu_segcb_dequeue, etc...

So I suggest you rename this one to something like rcu_segcb_stats for 
precision.


> +
> + TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
> +
> + TP_ARGS(ctx, cb_count, gp_seq),
> +
> + TP_STRUCT__entry(
> + __field(const char *, ctx)
> + __array(int, cb_count, RCU_CBLIST_NSEGS)
> + __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
> + ),
> +
> + TP_fast_assign(
> + __entry->ctx = ctx;
> + memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * 
> sizeof(int));
> + memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * 
> sizeof(unsigned long));
> + ),
> +
> + TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, 
> NEXT=%d) "
> +   "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, 
> NEXT=%lu)", __entry->ctx,
> +   __entry->cb_count[0], __entry->cb_count[1], 
> __entry->cb_count[2], __entry->cb_count[3],
> +   __entry->gp_seq[0], __entry->gp_seq[1], 
> __entry->gp_seq[2], __entry->gp_seq[3])
> +
> +);
> +
>  /*
>   * Tracepoint for the registration of a single RCU callback of the special
>   * kvfree() form.  The first argument is the RCU type, the second argument
> diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
> index 357c19bbcb00..b0aaa51e0ee6 100644
> --- a/kernel/rcu/rcu_segcblist.c
> +++ b/kernel/rcu/rcu_segcblist.c
> @@ -14,6 +14,7 @@
>  #include 
>  
>  #include "rcu_segcblist.h"
> +#include "rcu.h"
>  
>  /* Initialize simple callback list. */
>  void rcu_cblist_init(struct rcu_cblist *rclp)
> @@ -328,6 +329,36 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist 
> *rsclp,
>   rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
>  }
>  
> +/*
> + * Return how many CBs each segment along with their gp_seq values.
> + *
> + * This function is O(N) where N is the number of segments. Only used from
> + * tracing code which is usually disabled in production.
> + */
> +#ifdef CONFIG_RCU_TRACE
> +static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
> +  int cbcount[RCU_CBLIST_NSEGS],
> +  unsigned long gpseq[RCU_CBLIST_NSEGS])
> +{
> + int i;
> +
> + for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
> + cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
> + gpseq[i] = rsclp->gp_seq[i];
> + }
> +}

So that is called all the time even if the trace event isn't enabled. The
goal of trace events are also to avoid the overhead of tracing when its off.

This should be moved inside the trace event definition. We can even avoid the
loop altogether.

Thanks.


[PATCH v8 3/6] rcu/trace: Add tracing for how segcb list changes

2020-10-21 Thread Joel Fernandes (Google)
Track how the segcb list changes before/after acceleration, during
queuing and during dequeuing.

This has proved useful to discover an optimization to avoid unwanted GP
requests when there are no callbacks accelerated. The overhead is minimal as
each segment's length is now stored in the respective segment.

Signed-off-by: Joel Fernandes (Google) 
---
 include/trace/events/rcu.h | 25 +
 kernel/rcu/rcu_segcblist.c | 31 +++
 kernel/rcu/rcu_segcblist.h |  5 +
 kernel/rcu/tree.c  |  9 +
 4 files changed, 70 insertions(+)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 155b5cb43cfd..9f2237d9b0c8 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -505,6 +505,31 @@ TRACE_EVENT_RCU(rcu_callback,
  __entry->qlen)
 );
 
+TRACE_EVENT_RCU(rcu_segcb,
+
+   TP_PROTO(const char *ctx, int *cb_count, unsigned long *gp_seq),
+
+   TP_ARGS(ctx, cb_count, gp_seq),
+
+   TP_STRUCT__entry(
+   __field(const char *, ctx)
+   __array(int, cb_count, RCU_CBLIST_NSEGS)
+   __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS)
+   ),
+
+   TP_fast_assign(
+   __entry->ctx = ctx;
+   memcpy(__entry->cb_count, cb_count, RCU_CBLIST_NSEGS * 
sizeof(int));
+   memcpy(__entry->gp_seq, gp_seq, RCU_CBLIST_NSEGS * 
sizeof(unsigned long));
+   ),
+
+   TP_printk("%s cb_count: (DONE=%d, WAIT=%d, NEXT_READY=%d, 
NEXT=%d) "
+ "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, 
NEXT=%lu)", __entry->ctx,
+ __entry->cb_count[0], __entry->cb_count[1], 
__entry->cb_count[2], __entry->cb_count[3],
+ __entry->gp_seq[0], __entry->gp_seq[1], 
__entry->gp_seq[2], __entry->gp_seq[3])
+
+);
+
 /*
  * Tracepoint for the registration of a single RCU callback of the special
  * kvfree() form.  The first argument is the RCU type, the second argument
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 357c19bbcb00..b0aaa51e0ee6 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -14,6 +14,7 @@
 #include 
 
 #include "rcu_segcblist.h"
+#include "rcu.h"
 
 /* Initialize simple callback list. */
 void rcu_cblist_init(struct rcu_cblist *rclp)
@@ -328,6 +329,36 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist 
*rsclp,
rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0);
 }
 
+/*
+ * Return how many CBs each segment along with their gp_seq values.
+ *
+ * This function is O(N) where N is the number of segments. Only used from
+ * tracing code which is usually disabled in production.
+ */
+#ifdef CONFIG_RCU_TRACE
+static void rcu_segcblist_countseq(struct rcu_segcblist *rsclp,
+int cbcount[RCU_CBLIST_NSEGS],
+unsigned long gpseq[RCU_CBLIST_NSEGS])
+{
+   int i;
+
+   for (i = 0; i < RCU_CBLIST_NSEGS; i++) {
+   cbcount[i] = rcu_segcblist_get_seglen(rsclp, i);
+   gpseq[i] = rsclp->gp_seq[i];
+   }
+}
+
+void trace_rcu_segcb_list(struct rcu_segcblist *rsclp, const char *context)
+{
+   int cbs[RCU_CBLIST_NSEGS];
+   unsigned long gps[RCU_CBLIST_NSEGS];
+
+   rcu_segcblist_countseq(rsclp, cbs, gps);
+
+   trace_rcu_segcb(context, cbs, gps);
+}
+#endif
+
 /*
  * Extract only those callbacks still pending (not yet ready to be
  * invoked) from the specified rcu_segcblist structure and place them in
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index cd35c9faaf51..c2e274ae0912 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -103,3 +103,8 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, 
unsigned long seq);
 bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
 void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
 struct rcu_segcblist *src_rsclp);
+#ifdef CONFIG_RCU_TRACE
+void trace_rcu_segcb_list(struct rcu_segcblist *rsclp, const char *context);
+#else
+#define trace_rcu_segcb_list(...)
+#endif
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 24c00020ab83..346a05506935 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1497,6 +1497,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, 
struct rcu_data *rdp)
if (!rcu_segcblist_pend_cbs(>cblist))
return false;
 
+   trace_rcu_segcb_list(>cblist, TPS("SegCbPreAcc"));
+
/*
 * Callbacks are often registered with incomplete grace-period
 * information.  Something about the fact that getting exact
@@ -1517,6 +1519,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, 
struct rcu_data *rdp)
else
trace_rcu_grace_period(rcu_state.name, gp_seq_req,