Hi,
While tracing the scheduling events of a particular task, I noticed that
sometimes, bpf_ktime_get_ns() returns a previous time stamp. Whenever the time
stamps become out of order, the subsequent events follow the earlier
timestamp. For example, something like:
Old TSP New TSP
23456 23458
23458 23461
....
....
23509 23467
23467 23470
23470.......
...... 23527
23527 23446
23446 .........
I have used a return probe with sys_clone(0) to store the thread Ids of a
particular task, and used this information in the sched_switch tracepoint to
trace the switching events of my task. The task is run on an 8-core machine.
What could be reason for this behaviour? I have included the code, in case I
have done something wrong???
######################################################################
from bcc import BPF
# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/types.h>
struct values_t{
u32 threadId;
char comm[TASK_COMM_LEN];
};
BPF_HASH(threadList, u32, u32); //Stores threadIds of participating threads
BPF_HASH(tsp, u32, u64, 1); //Stores timestamp of previous event
/*struct sched_switch_args {
// from /sys/kernel/debug/tracing/events/random/urandom_read/format
u64 __unused__;
char prev_comm[16];
pid_t prev_pid;
int prev_prio;
long prev_state;
char next_comm[16];
pid_t next_pid;
int next_prio;
};*/
//Return Probe for sys_clone() - stores the thread Ids of a task with command,
"prod"
int trace_sys_clone(struct pt_regs *ctx) {
struct values_t value = {};
u32 one = 1, *val, *val1;
bpf_get_current_comm(&value.comm, sizeof(value.comm));
if(value.comm[0]=='p' && value.comm[1] == 'r' && value.comm[2]=='o' &&
value.comm[3]=='d') {
//Store the parent thread ID
value.threadId = bpf_get_current_pid_tgid();
val = threadList.lookup_or_init(&value.threadId, &one);
//Stores other thread IDs
struct values_t value1={};
value1.threadId = PT_REGS_RC(ctx);
val1 = threadList.lookup_or_init(&value1.threadId, &one);
}
return 0;
}
//Tracepoint probe for the Sched_Switch tracepoint - stores the previous event
timestamp in a map
TRACEPOINT_PROBE(sched, sched_switch){
u32 *prev, *next; //Pointers to entries in threadList map
u32 next_pid, prev_pid, key=0;
//Copy data to BPF stack
bpf_probe_read(&next_pid, sizeof(next_pid), &args->next_pid);
bpf_probe_read(&prev_pid, sizeof(prev_pid), &args->prev_pid);
//Look up thread ids in the list created by sys_clone()
next = threadList.lookup(&next_pid);
prev = threadList.lookup(&prev_pid);
//If task-threads are involved in Context Switch
if( prev || next){
u64 newTS = bpf_ktime_get_ns();
u64 *oldTS = tsp.lookup_or_init(&key, &newTS); //Retrieve previous
timestamp
if(oldTS==0)
return 0;
if(newTS > *oldTS)
bpf_trace_printk("Correct: Old: %llu New: %llu\\n", *oldTS,
newTS);
else
bpf_trace_printk("Wrong: Old: %llu New: %llu\\n", *oldTS,
newTS);
tsp.update(&key, &newTS); //Update timestamp
}
return 0;
}
"""
# load BPF program
b = BPF(text=bpf_text)
b.attach_kretprobe(event="sys_clone", fn_name="trace_sys_clone")
# format output
while 1:
try:
(task, pid, cpu, flags, ts, msg) = b.trace_fields()
print("%s" % (msg))
except ValueError:
continue
################################################################################
The tool cpudist.py also mentions about a case when timestamps may be out of
order.
Many Thanks ,
Reena.
_______________________________________________
iovisor-dev mailing list
[email protected]
https://lists.iovisor.org/mailman/listinfo/iovisor-dev