This patch adds lock elapsed time trace facility for f2fs rwsemphore.
If total elapsed time of critical region covered by lock exceeds a
threshold, it will print tracepoint to dump information of lock related
context, including:
- thread information
- CPU/IO priority
- lock information
- elapsed time
- total time
- running time (depend on CONFIG_64BIT)
- runnable time (depend on CONFIG_SCHED_INFO and CONFIG_SCHEDSTATS)
- io sleep time (depend on CONFIG_TASK_DELAY_ACCT and
/proc/sys/kernel/task_delayacct)
- other time (by default other time will account nonio sleep time,
but, if above kconfig is not defined, other time will
include runnable time and/or io sleep time as wll)
output:
f2fs_lock_elapsed_time: dev = (254,52), comm: sh, pid: 13855, prio: 120,
ioprio_class: 2, ioprio_data: 4, lock_name: cp_rwsem, lock_type: rlock, total:
1000, running: 993, runnable: 2, io_sleep: 0, other: 5
Signed-off-by: Chao Yu <[email protected]>
---
fs/f2fs/checkpoint.c | 106 ++++++++++++++++++++++++++++++++++++
fs/f2fs/f2fs.h | 51 +++++++++++++++--
include/trace/events/f2fs.h | 68 +++++++++++++++++++++++
3 files changed, 221 insertions(+), 4 deletions(-)
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 300664269eb6..bc6058a3122b 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -14,6 +14,9 @@
#include <linux/pagevec.h>
#include <linux/swap.h>
#include <linux/kthread.h>
+#include <linux/delayacct.h>
+#include <linux/ioprio.h>
+#include <linux/math64.h>
#include "f2fs.h"
#include "node.h"
@@ -21,6 +24,109 @@
#include "iostat.h"
#include <trace/events/f2fs.h>
+static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
+{
+ ts->total_time = ktime_get();
+#ifdef CONFIG_64BIT
+ ts->running_time = current->se.sum_exec_runtime;
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+ ts->runnable_time = current->sched_info.run_delay;
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ if (current->delays)
+ ts->io_sleep_time = current->delays->blkio_delay;
+#endif
+}
+
+static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc)
+{
+ lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
+ if (!lc->lock_trace)
+ return;
+
+ get_lock_elapsed_time(&lc->ts);
+}
+
+static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc, bool is_write)
+{
+ struct f2fs_time_stat tts;
+ unsigned long long total_time;
+ unsigned long long running_time = 0;
+ unsigned long long runnable_time = 0;
+ unsigned long long io_sleep_time = 0;
+ unsigned long long other_time = 0;
+ unsigned npm = NSEC_PER_MSEC;
+
+ if (!lc->lock_trace)
+ return;
+
+ get_lock_elapsed_time(&tts);
+
+ total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
+ if (total_time <= MAX_LOCK_ELAPSED_TIME)
+ return;
+
+#ifdef CONFIG_64BIT
+ running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+ runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
+#endif
+ if (total_time > running_time + io_sleep_time + runnable_time)
+ other_time = total_time - running_time -
+ io_sleep_time - runnable_time;
+
+ trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
+ get_current_ioprio(), total_time, running_time,
+ runnable_time, io_sleep_time, other_time);
+}
+
+void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ f2fs_down_read(sem);
+ trace_lock_elapsed_time_start(sem, lc);
+}
+
+int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct
f2fs_lock_context *lc)
+{
+ if (!f2fs_down_read_trylock(sem))
+ return 0;
+ trace_lock_elapsed_time_start(sem, lc);
+ return 1;
+}
+
+void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ f2fs_up_read(sem);
+ trace_lock_elapsed_time_end(sem, lc, false);
+}
+
+void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context
*lc)
+{
+ f2fs_down_write(sem);
+ trace_lock_elapsed_time_start(sem, lc);
+}
+
+int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct
f2fs_lock_context *lc)
+{
+ if (!f2fs_down_write_trylock(sem))
+ return 0;
+ trace_lock_elapsed_time_start(sem, lc);
+ return 1;
+}
+
+void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
+{
+ f2fs_up_write(sem);
+ trace_lock_elapsed_time_end(sem, lc, true);
+}
+
#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
static struct kmem_cache *ino_entry_slab;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c3e1611fce18..7e22315dbedd 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -173,6 +173,10 @@ enum device_allocation_policy {
ALLOCATE_FORWARD_FROM_HINT,
};
+enum f2fs_lock_name {
+ LOCK_NAME_NONE,
+};
+
/*
* An implementation of an rwsem that is explicitly unfair to readers. This
* prevents priority inversion when a low-priority reader acquires the read
lock
@@ -181,6 +185,8 @@ enum device_allocation_policy {
*/
struct f2fs_rwsem {
+ struct f2fs_sb_info *sbi;
+ enum f2fs_lock_name name;
struct rw_semaphore internal_rwsem;
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
wait_queue_head_t read_waiters;
@@ -1409,6 +1415,24 @@ struct f2fs_gc_control {
unsigned int nr_free_secs; /* # of free sections to do GC */
};
+struct f2fs_time_stat {
+ unsigned long long total_time; /* total wall clock time */
+#ifdef CONFIG_64BIT
+ unsigned long long running_time; /* running time */
+#endif
+#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
+ unsigned long long runnable_time; /* runnable(including
preempted) time */
+#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+ unsigned long long io_sleep_time; /* IO sleep time */
+#endif
+};
+
+struct f2fs_lock_context {
+ struct f2fs_time_stat ts;
+ bool lock_trace;
+};
+
/*
* For s_flag in struct f2fs_sb_info
* Modification on enum should be synchronized with s_flag array
@@ -1525,6 +1549,9 @@ enum f2fs_lookup_mode {
LOOKUP_AUTO,
};
+/* a threshold of maximum elapsed time in critical region to print tracepoint
*/
+#define MAX_LOCK_ELAPSED_TIME 500
+
static inline int f2fs_test_bit(unsigned int nr, char *addr);
static inline void f2fs_set_bit(unsigned int nr, char *addr);
static inline void f2fs_clear_bit(unsigned int nr, char *addr);
@@ -2263,16 +2290,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info
*sbi, unsigned int f)
spin_unlock_irqrestore(&sbi->cp_lock, flags);
}
-#define init_f2fs_rwsem(sem) \
+#define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE)
+#define init_f2fs_rwsem_trace __init_f2fs_rwsem
+
+#define __init_f2fs_rwsem(sem, sbi, name) \
do { \
static struct lock_class_key __key; \
\
- __init_f2fs_rwsem((sem), #sem, &__key); \
+ do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name); \
} while (0)
-static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
- const char *sem_name, struct lock_class_key *key)
+static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem,
+ const char *sem_name, struct lock_class_key *key,
+ struct f2fs_sb_info *sbi, enum f2fs_lock_name name)
{
+ sem->sbi = sbi;
+ sem->name = name;
__init_rwsem(&sem->internal_rwsem, sem_name, key);
#ifdef CONFIG_F2FS_UNFAIR_RWSEM
init_waitqueue_head(&sem->read_waiters);
@@ -2341,6 +2374,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
#endif
}
+void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context
*lc);
+int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc);
+void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+void f2fs_down_write_trace(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc);
+int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem,
+ struct f2fs_lock_context *lc);
+void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
+
static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
{
unsigned long flags;
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 635dcabcf1c7..9a852a16df9c 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -184,6 +184,10 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
{ CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" },
\
{ CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" })
+#define show_lock_name(lock) \
+ __print_symbolic(lock, \
+ { LOCK_NAME_NONE, "none" })
+
struct f2fs_sb_info;
struct f2fs_io_info;
struct extent_info;
@@ -2452,6 +2456,70 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end,
TP_ARGS(inode, offset, bytes)
);
+TRACE_EVENT(f2fs_lock_elapsed_time,
+
+ TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
+ bool is_write, struct task_struct *p, int ioprio,
+ unsigned long long total_time,
+ unsigned long long running_time,
+ unsigned long long runnable_time,
+ unsigned long long io_sleep_time,
+ unsigned long long other_time),
+
+ TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time,
+ runnable_time, io_sleep_time, other_time),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __array(char, comm, TASK_COMM_LEN)
+ __field(pid_t, pid)
+ __field(int, prio)
+ __field(int, ioprio_class)
+ __field(int, ioprio_data)
+ __field(unsigned int, lock_name)
+ __field(bool, is_write)
+ __field(unsigned long long, total_time)
+ __field(unsigned long long, running_time)
+ __field(unsigned long long, runnable_time)
+ __field(unsigned long long, io_sleep_time)
+ __field(unsigned long long, other_time)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = sbi->sb->s_dev;
+ memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+ __entry->pid = p->pid;
+ __entry->prio = p->prio;
+ __entry->ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
+ __entry->ioprio_data = IOPRIO_PRIO_DATA(ioprio);
+ __entry->lock_name = lock_name;
+ __entry->is_write = is_write;
+ __entry->total_time = total_time;
+ __entry->running_time = running_time;
+ __entry->runnable_time = runnable_time;
+ __entry->io_sleep_time = io_sleep_time;
+ __entry->other_time = other_time;
+ ),
+
+ TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, "
+ "ioprio_class: %d, ioprio_data: %d, lock_name: %s, "
+ "lock_type: %s, total: %llu, running: %llu, "
+ "runnable: %llu, io_sleep: %llu, other: %llu",
+ show_dev(__entry->dev),
+ __entry->comm,
+ __entry->pid,
+ __entry->prio,
+ __entry->ioprio_class,
+ __entry->ioprio_data,
+ show_lock_name(__entry->lock_name),
+ __entry->is_write ? "wlock" : "rlock",
+ __entry->total_time,
+ __entry->running_time,
+ __entry->runnable_time,
+ __entry->io_sleep_time,
+ __entry->other_time)
+);
+
#endif /* _TRACE_F2FS_H */
/* This part must be outside protection */
--
2.49.0
_______________________________________________
Linux-f2fs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel