Re: [PATCH] sched: Add trace for task wake up latency and leave running time

Peter Zijlstra Fri, 21 Aug 2020 11:51:00 -0700


> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 93ecd930efd3..edb622c40a90 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1324,6 +1324,13 @@ struct task_struct {
>       /* CPU-specific state of this task: */
>       struct thread_struct            thread;
>  
> +     /* Task wake up time stamp */
> +     u64                             ts_wakeup;
> +     /* Previous task switch out time stamp */
> +     u64                             pre_ts_end;
> +     /* Next task switch in time stamp */
> +     u64                             next_ts_start;
> +     bool                            wakeup_state;
>       /*
>        * WARNING: on x86, 'thread_struct' contains a variable-sized
>        * structure.  It *MUST* be at the end of 'task_struct'.


^^^ did you read that comment?

> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index fec25b9cfbaf..e99c6d573a42 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -183,6 +183,72 @@ TRACE_EVENT(sched_switch,
>               __entry->next_comm, __entry->next_pid, __entry->next_prio)
>  );
>  
> +DECLARE_EVENT_CLASS(sched_latency_template,
> +
> +     TP_PROTO(bool preempt,
> +              struct task_struct *prev,
> +              struct task_struct *next,
> +              u64 time),
> +
> +     TP_ARGS(preempt, prev, next, time),
> +
> +     TP_STRUCT__entry(
> +             __array(        char,   prev_comm,      TASK_COMM_LEN   )
> +             __field(        pid_t,  prev_pid                        )
> +             __field(        int,    prev_prio                       )
> +             __field(        long,   prev_state                      )
> +             __array(        char,   next_comm,      TASK_COMM_LEN   )
> +             __field(        pid_t,  next_pid                        )
> +             __field(        int,    next_prio                       )
> +             __field(        u64,    time                            )
> +     ),
> +
> +     TP_fast_assign(
> +             memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> +             __entry->prev_pid       = prev->pid;
> +             __entry->prev_prio      = prev->prio;
> +             __entry->prev_state     = __trace_sched_switch_state(preempt, 
> prev);
> +             memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> +             __entry->next_pid       = next->pid;
> +             __entry->next_prio      = next->prio;
> +             __entry->time           = time;
> +             /* XXX SCHED_DEADLINE */
> +     ),
> +
> +     TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> 
> next_comm=%s next_pid=%d next_prio=%d passed time=%llu (ns)",
> +             __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> +
> +             (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> +               __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), 
> "|",
> +                             { TASK_INTERRUPTIBLE, "S" },
> +                             { TASK_UNINTERRUPTIBLE, "D" },
> +                             { __TASK_STOPPED, "T" },
> +                             { __TASK_TRACED, "t" },
> +                             { EXIT_DEAD, "X" },
> +                             { EXIT_ZOMBIE, "Z" },
> +                             { TASK_PARKED, "P" },
> +                             { TASK_DEAD, "I" }) :
> +               "R",
> +
> +             __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> +             __entry->next_comm, __entry->next_pid, __entry->next_prio,
> +             __entry->time)
> +);

NAK, that tracepoint is already broken, we don't want to proliferate the
broken.

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 8471a0f7eb32..b5a1928dc948 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2464,6 +2464,8 @@ static void ttwu_do_wakeup(struct rq *rq, struct 
> task_struct *p, int wake_flags,
>  {
>       check_preempt_curr(rq, p, wake_flags);
>       p->state = TASK_RUNNING;
> +     p->ts_wakeup = local_clock();
> +     p->wakeup_state = true;
>       trace_sched_wakeup(p);
>  
>  #ifdef CONFIG_SMP

NAK, userless overhead.

> @@ -2846,6 +2848,8 @@ try_to_wake_up(struct task_struct *p, unsigned int 
> state, int wake_flags)
>               success = 1;
>               trace_sched_waking(p);
>               p->state = TASK_RUNNING;
> +             p->ts_wakeup = local_clock();
> +             p->wakeup_state = true;
>               trace_sched_wakeup(p);
>               goto out;
>       }

idem


> @@ -3355,6 +3362,8 @@ void wake_up_new_task(struct task_struct *p)
>       post_init_entity_util_avg(p);
>  
>       activate_task(rq, p, ENQUEUE_NOCLOCK);
> +     p->ts_wakeup = local_clock();
> +     p->wakeup_state = true;
>       trace_sched_wakeup_new(p);
>       check_preempt_curr(rq, p, WF_FORK);
>  #ifdef CONFIG_SMP

idem

> @@ -4521,8 +4530,19 @@ static void __sched notrace __schedule(bool preempt)
>  
>               psi_sched_switch(prev, next, !task_on_rq_queued(prev));
>  
> +             next->next_ts_start = prev->pre_ts_end = local_clock();
> +
>               trace_sched_switch(preempt, prev, next);
>  
> +             if (next->ts_wakeup && next->wakeup_state) {
> +                     trace_sched_wakeup_latency(preempt, prev, next,
> +                             next->next_ts_start - next->ts_wakeup);
> +                     next->wakeup_state = false;
> +             }
> +
> +             trace_sched_leave_running_time(preempt, prev,
> +                     next, next->next_ts_start - next->pre_ts_end);
> +
>               /* Also unlocks the rq: */
>               rq = context_switch(rq, prev, next, &rf);
>       } else {

This really isn't ever going to happen.

Re: [PATCH] sched: Add trace for task wake up latency and leave running time

Reply via email to