Hi,

While tracing the scheduling events of a particular task, I noticed that 
sometimes,  bpf_ktime_get_ns() returns a previous time stamp. Whenever the time 
stamps become out of order, the subsequent events follow the  earlier 
timestamp. For example, something like:


Old TSP            New TSP


23456               23458

23458               23461

....

....

23509               23467

23467               23470

23470.......

......                    23527

23527                23446

23446                .........


I have used a return probe with sys_clone(0) to store the thread Ids of a 
particular task, and used this information in the sched_switch tracepoint to 
trace the switching events of my task. The task is run on an 8-core machine.


What could be reason for this behaviour? I have included the code, in case I 
have done something wrong???


######################################################################


from bcc import BPF

# define BPF program
bpf_text = """

#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/types.h>

struct values_t{
        u32 threadId;
        char comm[TASK_COMM_LEN];
};

BPF_HASH(threadList, u32, u32);     //Stores threadIds of participating threads
BPF_HASH(tsp, u32, u64, 1);         //Stores timestamp of previous event

/*struct sched_switch_args {
    // from /sys/kernel/debug/tracing/events/random/urandom_read/format
    u64 __unused__;
    char prev_comm[16];
    pid_t prev_pid;
    int prev_prio;
    long prev_state;
    char next_comm[16];
    pid_t next_pid;
    int next_prio;
};*/

//Return Probe for sys_clone() - stores the thread Ids of a task with command, 
"prod"
int trace_sys_clone(struct pt_regs *ctx) {

        struct values_t value = {};
        u32 one = 1, *val, *val1;

        bpf_get_current_comm(&value.comm, sizeof(value.comm));

        if(value.comm[0]=='p' && value.comm[1] == 'r' && value.comm[2]=='o' && 
value.comm[3]=='d') {

                //Store the parent thread ID
                value.threadId = bpf_get_current_pid_tgid();
                val = threadList.lookup_or_init(&value.threadId, &one);

                //Stores other thread IDs
                struct values_t value1={};
                value1.threadId = PT_REGS_RC(ctx);
                val1 = threadList.lookup_or_init(&value1.threadId, &one);
        }
        return 0;
}


//Tracepoint probe for the Sched_Switch tracepoint - stores the previous event 
timestamp in a map

TRACEPOINT_PROBE(sched, sched_switch){

    u32 *prev, *next;  //Pointers to entries in threadList map
    u32 next_pid, prev_pid, key=0;

    //Copy data to BPF stack
    bpf_probe_read(&next_pid, sizeof(next_pid), &args->next_pid);
    bpf_probe_read(&prev_pid, sizeof(prev_pid), &args->prev_pid);

    //Look up thread ids in the list created by sys_clone()
    next = threadList.lookup(&next_pid);
    prev = threadList.lookup(&prev_pid);

    //If task-threads are involved in Context Switch
    if( prev || next){

        u64 newTS = bpf_ktime_get_ns();
        u64 *oldTS = tsp.lookup_or_init(&key, &newTS); //Retrieve previous 
timestamp

        if(oldTS==0)
                return 0;

        if(newTS > *oldTS)
                bpf_trace_printk("Correct: Old: %llu New: %llu\\n", *oldTS, 
newTS);
        else
                bpf_trace_printk("Wrong: Old: %llu New: %llu\\n", *oldTS, 
newTS);

        tsp.update(&key, &newTS); //Update timestamp
     }
    return 0;
}
"""

# load BPF program
b = BPF(text=bpf_text)
b.attach_kretprobe(event="sys_clone", fn_name="trace_sys_clone")

# format output
while 1:
   try:
      (task, pid, cpu, flags, ts, msg) = b.trace_fields()
      print("%s" % (msg))

   except ValueError:
        continue

 
################################################################################


The tool cpudist.py also mentions about a case when timestamps may be out of 
order.


Many Thanks ,

Reena.



_______________________________________________
iovisor-dev mailing list
[email protected]
https://lists.iovisor.org/mailman/listinfo/iovisor-dev

Reply via email to