Stuck when disabling checkpoint:

[  403.438233][  T739]  __switch_to+0x164/0x324
[  403.442532][  T739]  __schedule+0xbd0/0x1d28
[  403.446822][  T739]  schedule+0x4c/0x118
[  403.450758][  T739]  schedule_preempt_disabled+0x24/0x44
[  403.456083][  T739]  rwsem_down_write_slowpath+0x438/0x988
[  403.461588][  T739]  down_write+0x10c/0x16c
[  403.465785][  T739]  f2fs_down_write_trace+0x1c/0x70
[  403.470797][  T739]  f2fs_disable_checkpoint+0x114/0x2b8
[  403.476126][  T739]  f2fs_fill_super+0x22c8/0x2728
[  403.480924][  T739]  get_tree_bdev_flags+0x144/0x1dc
[  403.485894][  T739]  get_tree_bdev+0x14/0x24
[  403.490176][  T739]  f2fs_get_tree+0x18/0x28
[  403.494448][  T739]  vfs_get_tree+0x48/0x10c
[  403.498724][  T739]  path_mount+0x62c/0xb74

On 01/04, Chao Yu wrote:
> This patch adds lock elapsed time trace facility for f2fs rwsemphore.
> 
> If total elapsed time of critical region covered by lock exceeds a
> threshold, it will print tracepoint to dump information of lock related
> context, including:
> - thread information
> - CPU/IO priority
> - lock information
> - elapsed time
>  - total time
>  - running time (depend on CONFIG_64BIT)
>  - runnable time (depend on CONFIG_SCHED_INFO and CONFIG_SCHEDSTATS)
>  - io sleep time (depend on CONFIG_TASK_DELAY_ACCT and
>                 /proc/sys/kernel/task_delayacct)
>  - other time    (by default other time will account nonio sleep time,
>                   but, if above kconfig is not defined, other time will
>                   include runnable time and/or io sleep time as wll)
> 
> output:
>     f2fs_lock_elapsed_time: dev = (254,52), comm: sh, pid: 13855, prio: 120, 
> ioprio_class: 2, ioprio_data: 4, lock_name: cp_rwsem, lock_type: rlock, 
> total: 1000, running: 993, runnable: 2, io_sleep: 0, other: 5
> 
> Signed-off-by: Chao Yu <[email protected]>
> ---
>  fs/f2fs/checkpoint.c        | 106 ++++++++++++++++++++++++++++++++++++
>  fs/f2fs/f2fs.h              |  51 +++++++++++++++--
>  include/trace/events/f2fs.h |  68 +++++++++++++++++++++++
>  3 files changed, 221 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 300664269eb6..bc6058a3122b 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -14,6 +14,9 @@
>  #include <linux/pagevec.h>
>  #include <linux/swap.h>
>  #include <linux/kthread.h>
> +#include <linux/delayacct.h>
> +#include <linux/ioprio.h>
> +#include <linux/math64.h>
>  
>  #include "f2fs.h"
>  #include "node.h"
> @@ -21,6 +24,109 @@
>  #include "iostat.h"
>  #include <trace/events/f2fs.h>
>  
> +static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
> +{
> +     ts->total_time = ktime_get();
> +#ifdef CONFIG_64BIT
> +     ts->running_time = current->se.sum_exec_runtime;
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> +     ts->runnable_time = current->sched_info.run_delay;
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> +     if (current->delays)
> +             ts->io_sleep_time = current->delays->blkio_delay;
> +#endif
> +}
> +
> +static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
> +                                             struct f2fs_lock_context *lc)
> +{
> +     lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
> +     if (!lc->lock_trace)
> +             return;
> +
> +     get_lock_elapsed_time(&lc->ts);
> +}
> +
> +static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
> +                             struct f2fs_lock_context *lc, bool is_write)
> +{
> +     struct f2fs_time_stat tts;
> +     unsigned long long total_time;
> +     unsigned long long running_time = 0;
> +     unsigned long long runnable_time = 0;
> +     unsigned long long io_sleep_time = 0;
> +     unsigned long long other_time = 0;
> +     unsigned npm = NSEC_PER_MSEC;
> +
> +     if (!lc->lock_trace)
> +             return;
> +
> +     get_lock_elapsed_time(&tts);
> +
> +     total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
> +     if (total_time <= MAX_LOCK_ELAPSED_TIME)
> +             return;
> +
> +#ifdef CONFIG_64BIT
> +     running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> +     runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> +     io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
> +#endif
> +     if (total_time > running_time + io_sleep_time + runnable_time)
> +             other_time = total_time - running_time -
> +                             io_sleep_time - runnable_time;
> +
> +     trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
> +                     get_current_ioprio(), total_time, running_time,
> +                     runnable_time, io_sleep_time, other_time);
> +}
> +
> +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
> *lc)
> +{
> +     f2fs_down_read(sem);
> +     trace_lock_elapsed_time_start(sem, lc);
> +}
> +
> +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct 
> f2fs_lock_context *lc)
> +{
> +     if (!f2fs_down_read_trylock(sem))
> +             return 0;
> +     trace_lock_elapsed_time_start(sem, lc);
> +     return 1;
> +}
> +
> +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> +     f2fs_up_read(sem);
> +     trace_lock_elapsed_time_end(sem, lc, false);
> +}
> +
> +void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
> *lc)
> +{
> +     f2fs_down_write(sem);
> +     trace_lock_elapsed_time_start(sem, lc);
> +}
> +
> +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct 
> f2fs_lock_context *lc)
> +{
> +     if (!f2fs_down_write_trylock(sem))
> +             return 0;
> +     trace_lock_elapsed_time_start(sem, lc);
> +     return 1;
> +}
> +
> +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
> *lc)
> +{
> +     f2fs_up_write(sem);
> +     trace_lock_elapsed_time_end(sem, lc, true);
> +}
> +
>  #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
>  
>  static struct kmem_cache *ino_entry_slab;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index c3e1611fce18..7e22315dbedd 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -173,6 +173,10 @@ enum device_allocation_policy {
>       ALLOCATE_FORWARD_FROM_HINT,
>  };
>  
> +enum f2fs_lock_name {
> +     LOCK_NAME_NONE,
> +};
> +
>  /*
>   * An implementation of an rwsem that is explicitly unfair to readers. This
>   * prevents priority inversion when a low-priority reader acquires the read 
> lock
> @@ -181,6 +185,8 @@ enum device_allocation_policy {
>   */
>  
>  struct f2fs_rwsem {
> +     struct f2fs_sb_info *sbi;
> +     enum f2fs_lock_name name;
>          struct rw_semaphore internal_rwsem;
>  #ifdef CONFIG_F2FS_UNFAIR_RWSEM
>          wait_queue_head_t read_waiters;
> @@ -1409,6 +1415,24 @@ struct f2fs_gc_control {
>       unsigned int nr_free_secs;      /* # of free sections to do GC */
>  };
>  
> +struct f2fs_time_stat {
> +     unsigned long long total_time;          /* total wall clock time */
> +#ifdef CONFIG_64BIT
> +     unsigned long long running_time;        /* running time */
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> +     unsigned long long runnable_time;       /* runnable(including 
> preempted) time */
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> +     unsigned long long io_sleep_time;       /* IO sleep time */
> +#endif
> +};
> +
> +struct f2fs_lock_context {
> +     struct f2fs_time_stat ts;
> +     bool lock_trace;
> +};
> +
>  /*
>   * For s_flag in struct f2fs_sb_info
>   * Modification on enum should be synchronized with s_flag array
> @@ -1525,6 +1549,9 @@ enum f2fs_lookup_mode {
>       LOOKUP_AUTO,
>  };
>  
> +/* a threshold of maximum elapsed time in critical region to print 
> tracepoint */
> +#define MAX_LOCK_ELAPSED_TIME                500
> +
>  static inline int f2fs_test_bit(unsigned int nr, char *addr);
>  static inline void f2fs_set_bit(unsigned int nr, char *addr);
>  static inline void f2fs_clear_bit(unsigned int nr, char *addr);
> @@ -2263,16 +2290,22 @@ static inline void clear_ckpt_flags(struct 
> f2fs_sb_info *sbi, unsigned int f)
>       spin_unlock_irqrestore(&sbi->cp_lock, flags);
>  }
>  
> -#define init_f2fs_rwsem(sem)                                 \
> +#define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE)
> +#define init_f2fs_rwsem_trace        __init_f2fs_rwsem
> +
> +#define __init_f2fs_rwsem(sem, sbi, name)                    \
>  do {                                                         \
>       static struct lock_class_key __key;                     \
>                                                               \
> -     __init_f2fs_rwsem((sem), #sem, &__key);                 \
> +     do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name);     \
>  } while (0)
>  
> -static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
> -             const char *sem_name, struct lock_class_key *key)
> +static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem,
> +             const char *sem_name, struct lock_class_key *key,
> +             struct f2fs_sb_info *sbi, enum f2fs_lock_name name)
>  {
> +     sem->sbi = sbi;
> +     sem->name = name;
>       __init_rwsem(&sem->internal_rwsem, sem_name, key);
>  #ifdef CONFIG_F2FS_UNFAIR_RWSEM
>       init_waitqueue_head(&sem->read_waiters);
> @@ -2341,6 +2374,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem 
> *sem)
>  #endif
>  }
>  
> +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
> *lc);
> +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem,
> +                                             struct f2fs_lock_context *lc);
> +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
> *lc);
> +void f2fs_down_write_trace(struct f2fs_rwsem *sem,
> +                                             struct f2fs_lock_context *lc);
> +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem,
> +                                             struct f2fs_lock_context *lc);
> +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context 
> *lc);
> +
>  static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
>  {
>       unsigned long flags;
> diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
> index 635dcabcf1c7..9a852a16df9c 100644
> --- a/include/trace/events/f2fs.h
> +++ b/include/trace/events/f2fs.h
> @@ -184,6 +184,10 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
>               { CP_PHASE_FINISH_BLOCK_OPS,    "finish block_ops" },           
>         \
>               { CP_PHASE_FINISH_CHECKPOINT,   "finish checkpoint" })
>  
> +#define show_lock_name(lock)                                         \
> +     __print_symbolic(lock,                                          \
> +             { LOCK_NAME_NONE,               "none" })
> +
>  struct f2fs_sb_info;
>  struct f2fs_io_info;
>  struct extent_info;
> @@ -2452,6 +2456,70 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end,
>       TP_ARGS(inode, offset, bytes)
>  );
>  
> +TRACE_EVENT(f2fs_lock_elapsed_time,
> +
> +     TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
> +             bool is_write, struct task_struct *p, int ioprio,
> +             unsigned long long total_time,
> +             unsigned long long running_time,
> +             unsigned long long runnable_time,
> +             unsigned long long io_sleep_time,
> +             unsigned long long other_time),
> +
> +     TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time,
> +             runnable_time, io_sleep_time, other_time),
> +
> +     TP_STRUCT__entry(
> +             __field(dev_t, dev)
> +             __array(char, comm, TASK_COMM_LEN)
> +             __field(pid_t, pid)
> +             __field(int, prio)
> +             __field(int, ioprio_class)
> +             __field(int, ioprio_data)
> +             __field(unsigned int, lock_name)
> +             __field(bool, is_write)
> +             __field(unsigned long long, total_time)
> +             __field(unsigned long long, running_time)
> +             __field(unsigned long long, runnable_time)
> +             __field(unsigned long long, io_sleep_time)
> +             __field(unsigned long long, other_time)
> +     ),
> +
> +     TP_fast_assign(
> +             __entry->dev            = sbi->sb->s_dev;
> +             memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
> +             __entry->pid            = p->pid;
> +             __entry->prio           = p->prio;
> +             __entry->ioprio_class   = IOPRIO_PRIO_CLASS(ioprio);
> +             __entry->ioprio_data    = IOPRIO_PRIO_DATA(ioprio);
> +             __entry->lock_name      = lock_name;
> +             __entry->is_write       = is_write;
> +             __entry->total_time     = total_time;
> +             __entry->running_time   = running_time;
> +             __entry->runnable_time  = runnable_time;
> +             __entry->io_sleep_time  = io_sleep_time;
> +             __entry->other_time     = other_time;
> +     ),
> +
> +     TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, "
> +             "ioprio_class: %d, ioprio_data: %d, lock_name: %s, "
> +             "lock_type: %s, total: %llu, running: %llu, "
> +             "runnable: %llu, io_sleep: %llu, other: %llu",
> +             show_dev(__entry->dev),
> +             __entry->comm,
> +             __entry->pid,
> +             __entry->prio,
> +             __entry->ioprio_class,
> +             __entry->ioprio_data,
> +             show_lock_name(__entry->lock_name),
> +             __entry->is_write ? "wlock" : "rlock",
> +             __entry->total_time,
> +             __entry->running_time,
> +             __entry->runnable_time,
> +             __entry->io_sleep_time,
> +             __entry->other_time)
> +);
> +
>  #endif /* _TRACE_F2FS_H */
>  
>   /* This part must be outside protection */
> -- 
> 2.49.0


_______________________________________________
Linux-f2fs-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

Reply via email to