Re: How to debug outliers in `wb_wait_for_completion()` in `ksys_sync()`?
Dear Linux folks,


On 08/28/18 07:27, Paul Menzel wrote:

> Using `sleepgraph.py` [1][2] to profile suspend to RAM (STR) times
> shows that `ksys_sync` takes a noticeable amount of time.
>
> 13 ms on a TUXEDO Book BU1406 with the NVMe device *SAMSUNG
> MZVKW512HMJP-0*, which is quite good, and over 60 ms on an ASRock
> E350M1 with a SanDisk SSD.

It’s 109 ms on the ASRock E350M1, which makes up one third of the total
time to reach ACPI S3.

> Adding `devicefilter: ksys_sync` to `config/suspend-callgraph.cfg`,
> and running `sudo ./sleepgraph.py -config
> config/suspend-callgraph.cfg`, the attached HTML output shows that
> `iterate_supers` takes 6 to 7 ms twice.
>
> • `iterate_supers` (6.316 ms @ 388.944557)
> • `iterate_supers` (0.201 ms @ 388.950873)
> • `iterate_supers` (7.421 ms @ 388.951074)
>
> Normally, `sync_inodes_one_sb` only takes microseconds, but in both
> of the slow cases one invocation takes several milliseconds.
>
> • `sync_inodes_one_sb` (0.001 ms @ 388.944660)
> • `sync_inodes_one_sb` (5.978 ms @ 388.944665)
> • `sync_inodes_one_sb` (0.001 ms @ 388.950645)
>
> Please find an excerpt of the ftrace call graph below, and note the
> time increase between 388.944751 and 388.950636.
>
>> 388.944700 |  0)  sleepgr-11355  |             |  wb_wait_for_completion() {
>> 388.944701 |  0)  sleepgr-11355  |             |    _cond_resched() {
>> 388.944701 |  0)  sleepgr-11355  |  0.064 us   |      rcu_all_qs();
>> 388.944702 |  0)  sleepgr-11355  |  0.664 us   |    } /* _cond_resched */
>> 388.944702 |  0)  sleepgr-11355  |  0.067 us   |    init_wait_entry();
>> 388.944703 |  0)  sleepgr-11355  |             |    prepare_to_wait_event() {
>> 388.944703 |  0)  sleepgr-11355  |  0.080 us   |      _raw_spin_lock_irqsave();
>> 388.944704 |  0)  sleepgr-11355  |  0.073 us   |      _raw_spin_unlock_irqrestore();
>> 388.944704 |  0)  sleepgr-11355  |  1.388 us   |    } /* prepare_to_wait_event */
>> 388.944705 |  0)  sleepgr-11355  |             |    schedule() {
>> 388.944705 |  0)  sleepgr-11355  |  0.085 us   |      rcu_note_context_switch();
>> 388.944706 |  0)  sleepgr-11355  |  0.064 us   |      _raw_spin_lock();
>> 388.944707 |  0)  sleepgr-11355  |  0.093 us   |      update_rq_clock();
>> 388.944708 |  0)  sleepgr-11355  |             |      deactivate_task() {
>> 388.944708 |  0)  sleepgr-11355  |             |        dequeue_task_fair() {
>> 388.944708 |  0)  sleepgr-11355  |             |          dequeue_entity() {
>> 388.944709 |  0)  sleepgr-11355  |             |            update_curr() {
>> 388.944709 |  0)  sleepgr-11355  |  0.095 us   |              update_min_vruntime();
>> 388.944710 |  0)  sleepgr-11355  |  0.126 us   |              cpuacct_charge();
>> 388.944710 |  0)  sleepgr-11355  |             |              __cgroup_account_cputime() {
>> 388.944711 |  0)  sleepgr-11355  |  0.055 us   |                cgroup_rstat_updated();
>> 388.944711 |  0)  sleepgr-11355  |  0.675 us   |              } /* __cgroup_account_cputime */
>> 388.944712 |  0)  sleepgr-11355  |  2.779 us   |            } /* update_curr */
>> 388.944712 |  0)  sleepgr-11355  |  0.121 us   |            __update_load_avg_se();
>> 388.944713 |  0)  sleepgr-11355  |  0.118 us   |            __update_load_avg_cfs_rq();
>> 388.944713 |  0)  sleepgr-11355  |  0.056 us   |            clear_buddies();
>> 388.944714 |  0)  sleepgr-11355  |  0.066 us   |            account_entity_dequeue();
>> 388.944714 |  0)  sleepgr-11355  |  0.050 us   |            update_cfs_group();
>> 388.944715 |  0)  sleepgr-11355  |  6.127 us   |          } /* dequeue_entity */
>> 388.944715 |  0)  sleepgr-11355  |             |          dequeue_entity() {
>> 388.944715 |  0)  sleepgr-11355  |             |            update_curr() {
>> 388.944716 |  0)  sleepgr-11355  |  0.097 us   |              __calc_delta();
>> 388.944716 |  0)  sleepgr-11355  |  0.071 us   |              update_min_vruntime();
>> 388.944717 |  0)  sleepgr-11355  |  1.473 us   |            } /* update_curr */
>> 388.944717 |  0)  sleepgr-11355  |  0.142 us   |            __update_load_avg_se();
>> 388.944718 |  0)  sleepgr-11355  |  0.100 us   |            __update_load_avg_cfs_rq();
>> 388.944719 |  0)  sleepgr-11355  |  0.069 us   |            clear_buddies();
>> 388.944719 |  0)  sleepgr-11355  |  0.098 us   |            account_entity_dequeue();
>> 388.944720 |
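For mapping the timeline entries back to code: the three `iterate_supers`
calls in the timeline line up with the fan-out in `ksys_sync()`. Below is
a simplified sketch of the 4.18-era fs/sync.c, trimmed to the calls that
matter here; please check the actual tree before relying on details.

/* Simplified sketch of fs/sync.c (around v4.18), trimmed. */
void ksys_sync(void)
{
        int nowait = 0, wait = 1;

        /* Kick the per-bdi flusher workers first. */
        wakeup_flusher_threads(WB_REASON_SYNC);
        /* 1st iterate_supers: write out and wait on dirty inodes. */
        iterate_supers(sync_inodes_one_sb, NULL);
        /* 2nd and 3rd iterate_supers: ->sync_fs, first without waiting,
         * then waiting. */
        iterate_supers(sync_fs_one_sb, &nowait);
        iterate_supers(sync_fs_one_sb, &wait);
        /* Flush and wait on the block devices themselves. */
        iterate_bdevs(fdatawrite_one_bdev, NULL);
        iterate_bdevs(fdatawait_one_bdev, NULL);
        if (unlikely(laptop_mode))
                laptop_sync_completion();
}

The first pass is the one that ends up in `wb_wait_for_completion()`, so
that is where the outliers should show up.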
How to debug outliers in `wb_wait_for_completion()` in `ksys_sync()`?
Dear Linux folks,


Using `sleepgraph.py` [1][2] to profile suspend to RAM (STR) times shows
that `ksys_sync` takes a noticeable amount of time.

13 ms on a TUXEDO Book BU1406 with the NVMe device *SAMSUNG
MZVKW512HMJP-0*, which is quite good, and over 60 ms on an ASRock E350M1
with a SanDisk SSD.

Adding `devicefilter: ksys_sync` to `config/suspend-callgraph.cfg`, and
running `sudo ./sleepgraph.py -config config/suspend-callgraph.cfg`, the
attached HTML output shows that `iterate_supers` takes 6 to 7 ms twice.

• `iterate_supers` (6.316 ms @ 388.944557)
• `iterate_supers` (0.201 ms @ 388.950873)
• `iterate_supers` (7.421 ms @ 388.951074)

Normally, `sync_inodes_one_sb` only takes microseconds, but in both of
the slow cases one invocation takes several milliseconds.

• `sync_inodes_one_sb` (0.001 ms @ 388.944660)
• `sync_inodes_one_sb` (5.978 ms @ 388.944665)
• `sync_inodes_one_sb` (0.001 ms @ 388.950645)

Please find an excerpt of the ftrace call graph below, and note the time
increase between 388.944751 and 388.950636.

388.944700 |  0)  sleepgr-11355  |             |  wb_wait_for_completion() {
388.944701 |  0)  sleepgr-11355  |             |    _cond_resched() {
388.944701 |  0)  sleepgr-11355  |  0.064 us   |      rcu_all_qs();
388.944702 |  0)  sleepgr-11355  |  0.664 us   |    } /* _cond_resched */
388.944702 |  0)  sleepgr-11355  |  0.067 us   |    init_wait_entry();
388.944703 |  0)  sleepgr-11355  |             |    prepare_to_wait_event() {
388.944703 |  0)  sleepgr-11355  |  0.080 us   |      _raw_spin_lock_irqsave();
388.944704 |  0)  sleepgr-11355  |  0.073 us   |      _raw_spin_unlock_irqrestore();
388.944704 |  0)  sleepgr-11355  |  1.388 us   |    } /* prepare_to_wait_event */
388.944705 |  0)  sleepgr-11355  |             |    schedule() {
388.944705 |  0)  sleepgr-11355  |  0.085 us   |      rcu_note_context_switch();
388.944706 |  0)  sleepgr-11355  |  0.064 us   |      _raw_spin_lock();
388.944707 |  0)  sleepgr-11355  |  0.093 us   |      update_rq_clock();
388.944708 |  0)  sleepgr-11355  |             |      deactivate_task() {
388.944708 |  0)  sleepgr-11355  |             |        dequeue_task_fair() {
388.944708 |  0)  sleepgr-11355  |             |          dequeue_entity() {
388.944709 |  0)  sleepgr-11355  |             |            update_curr() {
388.944709 |  0)  sleepgr-11355  |  0.095 us   |              update_min_vruntime();
388.944710 |  0)  sleepgr-11355  |  0.126 us   |              cpuacct_charge();
388.944710 |  0)  sleepgr-11355  |             |              __cgroup_account_cputime() {
388.944711 |  0)  sleepgr-11355  |  0.055 us   |                cgroup_rstat_updated();
388.944711 |  0)  sleepgr-11355  |  0.675 us   |              } /* __cgroup_account_cputime */
388.944712 |  0)  sleepgr-11355  |  2.779 us   |            } /* update_curr */
388.944712 |  0)  sleepgr-11355  |  0.121 us   |            __update_load_avg_se();
388.944713 |  0)  sleepgr-11355  |  0.118 us   |            __update_load_avg_cfs_rq();
388.944713 |  0)  sleepgr-11355  |  0.056 us   |            clear_buddies();
388.944714 |  0)  sleepgr-11355  |  0.066 us   |            account_entity_dequeue();
388.944714 |  0)  sleepgr-11355  |  0.050 us   |            update_cfs_group();
388.944715 |  0)  sleepgr-11355  |  6.127 us   |          } /* dequeue_entity */
388.944715 |  0)  sleepgr-11355  |             |          dequeue_entity() {
388.944715 |  0)  sleepgr-11355  |             |            update_curr() {
388.944716 |  0)  sleepgr-11355  |  0.097 us   |              __calc_delta();
388.944716 |  0)  sleepgr-11355  |  0.071 us   |              update_min_vruntime();
388.944717 |  0)  sleepgr-11355  |  1.473 us   |            } /* update_curr */
388.944717 |  0)  sleepgr-11355  |  0.142 us   |            __update_load_avg_se();
388.944718 |  0)  sleepgr-11355  |  0.100 us   |            __update_load_avg_cfs_rq();
388.944719 |  0)  sleepgr-11355  |  0.069 us   |            clear_buddies();
388.944719 |  0)  sleepgr-11355  |  0.098 us   |            account_entity_dequeue();
388.944720 |  0)  sleepgr-11355  |             |            update_cfs_group() {
388.944720 |  0)  sleepgr-11355  |  0.110 us   |              reweight_entity();
388.944721 |  0)  sleepgr-11355  |  0.847 us   |            } /* update_cfs_group */
388.944721 |  0)  sleepgr-11355  |  6.197 us   |          } /* dequeue_entity */
388.944722 |  0)  sleepgr-11355
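For context on the wait itself: as far as I can tell from the 4.18-era
fs/fs-writeback.c, `sync_inodes_one_sb()` ends up in `sync_inodes_sb()`,
which queues one WB_SYNC_ALL work item per writeback of the backing
device and then sleeps on the bdi's wait queue until every item has
signalled completion. The `schedule()` in the excerpt above is exactly
that sleep, so the outlier is the time the flusher worker needs to
finish writing everything back. A simplified sketch (details trimmed;
check the actual tree):

/* Simplified sketch of fs/fs-writeback.c (around v4.18). */
static void wb_wait_for_completion(struct backing_dev_info *bdi,
                                   struct wb_completion *done)
{
        atomic_dec(&done->cnt);         /* put down the initial count */
        wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
}

void sync_inodes_sb(struct super_block *sb)
{
        DEFINE_WB_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_ALL,
                .nr_pages       = LONG_MAX,
                .done           = &done,
                .reason         = WB_REASON_SYNC,
                .for_sync       = 1,
        };

        /* Fan one work item out to each writeback of the bdi ... */
        bdi_split_work_to_wbs(sb->s_bdi, &work, false);
        /* ... and sleep until all of them have decremented done.cnt. */
        wb_wait_for_completion(sb->s_bdi, &done);

        wait_sb_inodes(sb);
}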