This patch adds lock elapsed time trace facility for f2fs rwsemphore.

If total elapsed time of critical region covered by lock exceeds a
threshold, it will print tracepoint to dump information of lock related
context, including:
- thread information
- CPU/IO priority
- lock information
- elapsed time
 - total time
 - running time (depend on CONFIG_64BIT)
 - runnable time (depend on CONFIG_SCHED_INFO and CONFIG_SCHEDSTATS)
 - io sleep time (depend on CONFIG_TASK_DELAY_ACCT and
                  /proc/sys/kernel/task_delayacct)
 - other time    (by default other time will account nonio sleep time,
                  but, if above kconfig is not defined, other time will
                  include runnable time and/or io sleep time as wll)

output:
    f2fs_lock_elapsed_time: dev = (254,52), comm: sh, pid: 13855, prio: 120, 
ioprio_class: 2, ioprio_data: 4, lock_name: cp_rwsem, lock_type: rlock, total: 
1000, running: 993, runnable: 2, io_sleep: 0, other: 5

Signed-off-by: Chao Yu <[email protected]>
---
 fs/f2fs/checkpoint.c        | 106 ++++++++++++++++++++++++++++++++++++
 fs/f2fs/f2fs.h              |  51 +++++++++++++++--
 include/trace/events/f2fs.h |  68 +++++++++++++++++++++++
 3 files changed, 221 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 300664269eb6..bc6058a3122b 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -14,6 +14,9 @@
 #include <linux/pagevec.h>
 #include <linux/swap.h>
 #include <linux/kthread.h>
+#include <linux/delayacct.h>
+#include <linux/ioprio.h>
+#include <linux/math64.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -21,6 +24,109 @@
 #include "iostat.h"
 #include <trace/events/f2fs.h>
 
+static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
+{
+       ts->total_time = ktime_get();
+#ifdef CONFIG_64BIT
+       ts->running_time = current->se.sum_exec_runtime;
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+       ts->runnable_time = current->sched_info.run_delay;
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+       if (current->delays)
+               ts->io_sleep_time = current->delays->blkio_delay;
+#endif
+}
+
+static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
+                                               struct f2fs_lock_context *lc)
+{
+       lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
+       if (!lc->lock_trace)
+               return;
+
+       get_lock_elapsed_time(&lc->ts);
+}
+
+static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
+                               struct f2fs_lock_context *lc, bool is_write)
+{
+       struct f2fs_time_stat tts;
+       unsigned long long total_time;
+       unsigned long long running_time = 0;
+       unsigned long long runnable_time = 0;
+       unsigned long long io_sleep_time = 0;
+       unsigned long long other_time = 0;
+       unsigned npm = NSEC_PER_MSEC;
+
+       if (!lc->lock_trace)
+               return;
+
+       get_lock_elapsed_time(&tts);
+
+       total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
+       if (total_time <= MAX_LOCK_ELAPSED_TIME)
+               return;
+
+#ifdef CONFIG_64BIT
+       running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+       runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+       io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
+#endif
+       if (total_time > running_time + io_sleep_time + runnable_time)
+               other_time = total_time - running_time -
+                               io_sleep_time - runnable_time;
+
+       trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
+                       get_current_ioprio(), total_time, running_time,
+                       runnable_time, io_sleep_time, other_time);
+}
+
+void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+       f2fs_down_read(sem);
+       trace_lock_elapsed_time_start(sem, lc);
+}
+
+int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct 
f2fs_lock_context *lc)
+{
+       if (!f2fs_down_read_trylock(sem))
+               return 0;
+       trace_lock_elapsed_time_start(sem, lc);
+       return 1;
+}
+
+void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+       f2fs_up_read(sem);
+       trace_lock_elapsed_time_end(sem, lc, false);
+}
+
+void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
*lc)
+{
+       f2fs_down_write(sem);
+       trace_lock_elapsed_time_start(sem, lc);
+}
+
+int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct 
f2fs_lock_context *lc)
+{
+       if (!f2fs_down_write_trylock(sem))
+               return 0;
+       trace_lock_elapsed_time_start(sem, lc);
+       return 1;
+}
+
+void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+       f2fs_up_write(sem);
+       trace_lock_elapsed_time_end(sem, lc, true);
+}
+
 #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
 
 static struct kmem_cache *ino_entry_slab;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c3e1611fce18..7e22315dbedd 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -173,6 +173,10 @@ enum device_allocation_policy {
        ALLOCATE_FORWARD_FROM_HINT,
 };
 
+enum f2fs_lock_name {
+       LOCK_NAME_NONE,
+};
+
 /*
  * An implementation of an rwsem that is explicitly unfair to readers. This
  * prevents priority inversion when a low-priority reader acquires the read 
lock
@@ -181,6 +185,8 @@ enum device_allocation_policy {
  */
 
 struct f2fs_rwsem {
+       struct f2fs_sb_info *sbi;
+       enum f2fs_lock_name name;
         struct rw_semaphore internal_rwsem;
 #ifdef CONFIG_F2FS_UNFAIR_RWSEM
         wait_queue_head_t read_waiters;
@@ -1409,6 +1415,24 @@ struct f2fs_gc_control {
        unsigned int nr_free_secs;      /* # of free sections to do GC */
 };
 
+struct f2fs_time_stat {
+       unsigned long long total_time;          /* total wall clock time */
+#ifdef CONFIG_64BIT
+       unsigned long long running_time;        /* running time */
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+       unsigned long long runnable_time;       /* runnable(including 
preempted) time */
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+       unsigned long long io_sleep_time;       /* IO sleep time */
+#endif
+};
+
+struct f2fs_lock_context {
+       struct f2fs_time_stat ts;
+       bool lock_trace;
+};
+
 /*
  * For s_flag in struct f2fs_sb_info
  * Modification on enum should be synchronized with s_flag array
@@ -1525,6 +1549,9 @@ enum f2fs_lookup_mode {
        LOOKUP_AUTO,
 };
 
+/* a threshold of maximum elapsed time in critical region to print tracepoint 
*/
+#define MAX_LOCK_ELAPSED_TIME          500
+
 static inline int f2fs_test_bit(unsigned int nr, char *addr);
 static inline void f2fs_set_bit(unsigned int nr, char *addr);
 static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -2263,16 +2290,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info 
*sbi, unsigned int f)
        spin_unlock_irqrestore(&sbi->cp_lock, flags);
 }
 
-#define init_f2fs_rwsem(sem)                                   \
+#define init_f2fs_rwsem(sem)   __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE)
+#define init_f2fs_rwsem_trace  __init_f2fs_rwsem
+
+#define __init_f2fs_rwsem(sem, sbi, name)                      \
 do {                                                           \
        static struct lock_class_key __key;                     \
                                                                \
-       __init_f2fs_rwsem((sem), #sem, &__key);                 \
+       do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name);     \
 } while (0)
 
-static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
-               const char *sem_name, struct lock_class_key *key)
+static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem,
+               const char *sem_name, struct lock_class_key *key,
+               struct f2fs_sb_info *sbi, enum f2fs_lock_name name)
 {
+       sem->sbi = sbi;
+       sem->name = name;
        __init_rwsem(&sem->internal_rwsem, sem_name, key);
 #ifdef CONFIG_F2FS_UNFAIR_RWSEM
        init_waitqueue_head(&sem->read_waiters);
@@ -2341,6 +2374,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
 #endif
 }
 
+void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
*lc);
+int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem,
+                                               struct f2fs_lock_context *lc);
+void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+void f2fs_down_write_trace(struct f2fs_rwsem *sem,
+                                               struct f2fs_lock_context *lc);
+int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem,
+                                               struct f2fs_lock_context *lc);
+void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+
 static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
 {
        unsigned long flags;
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 635dcabcf1c7..9a852a16df9c 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -184,6 +184,10 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
                { CP_PHASE_FINISH_BLOCK_OPS,    "finish block_ops" },           
        \
                { CP_PHASE_FINISH_CHECKPOINT,   "finish checkpoint" })
 
+#define show_lock_name(lock)                                           \
+       __print_symbolic(lock,                                          \
+               { LOCK_NAME_NONE,               "none" })
+
 struct f2fs_sb_info;
 struct f2fs_io_info;
 struct extent_info;
@@ -2452,6 +2456,70 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end,
        TP_ARGS(inode, offset, bytes)
 );
 
+TRACE_EVENT(f2fs_lock_elapsed_time,
+
+       TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
+               bool is_write, struct task_struct *p, int ioprio,
+               unsigned long long total_time,
+               unsigned long long running_time,
+               unsigned long long runnable_time,
+               unsigned long long io_sleep_time,
+               unsigned long long other_time),
+
+       TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time,
+               runnable_time, io_sleep_time, other_time),
+
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __array(char, comm, TASK_COMM_LEN)
+               __field(pid_t, pid)
+               __field(int, prio)
+               __field(int, ioprio_class)
+               __field(int, ioprio_data)
+               __field(unsigned int, lock_name)
+               __field(bool, is_write)
+               __field(unsigned long long, total_time)
+               __field(unsigned long long, running_time)
+               __field(unsigned long long, runnable_time)
+               __field(unsigned long long, io_sleep_time)
+               __field(unsigned long long, other_time)
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = sbi->sb->s_dev;
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->prio           = p->prio;
+               __entry->ioprio_class   = IOPRIO_PRIO_CLASS(ioprio);
+               __entry->ioprio_data    = IOPRIO_PRIO_DATA(ioprio);
+               __entry->lock_name      = lock_name;
+               __entry->is_write       = is_write;
+               __entry->total_time     = total_time;
+               __entry->running_time   = running_time;
+               __entry->runnable_time  = runnable_time;
+               __entry->io_sleep_time  = io_sleep_time;
+               __entry->other_time     = other_time;
+       ),
+
+       TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, "
+               "ioprio_class: %d, ioprio_data: %d, lock_name: %s, "
+               "lock_type: %s, total: %llu, running: %llu, "
+               "runnable: %llu, io_sleep: %llu, other: %llu",
+               show_dev(__entry->dev),
+               __entry->comm,
+               __entry->pid,
+               __entry->prio,
+               __entry->ioprio_class,
+               __entry->ioprio_data,
+               show_lock_name(__entry->lock_name),
+               __entry->is_write ? "wlock" : "rlock",
+               __entry->total_time,
+               __entry->running_time,
+               __entry->runnable_time,
+               __entry->io_sleep_time,
+               __entry->other_time)
+);
+
 #endif /* _TRACE_F2FS_H */
 
  /* This part must be outside protection */
-- 
2.49.0



_______________________________________________
Linux-f2fs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

Reply via email to