This patch adds balance_dirty_pages_ub(), which is mostly a copy-paste of balance_dirty_pages() from PCS6. balance_dirty_pages_ub() is invoked only for containers. The original balance_dirty_pages() is still used for global writeback.
https://jira.sw.ru/browse/PSBM-33841 Signed-off-by: Andrey Ryabinin <[email protected]> --- fs/fs-writeback.c | 31 ++++++ include/linux/backing-dev.h | 2 + mm/page-writeback.c | 246 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 279 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 91c1b07..836ce88 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -195,6 +195,28 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) bdi_wakeup_thread(bdi); } +/** + * bdi_start_background_writeback_ub - start background writeback for ub + * @bdi: the backing device to write from + * @ub: task's io beancounter + * + * Description: + * This makes sure WB_SYNC_NONE background writeback happens. When + * this function returns, it is only guaranteed that for given BDI + * some IO is happening if we are over background dirty threshold. + * Caller need not hold sb s_umount semaphore. + */ +void bdi_start_background_writeback_ub(struct backing_dev_info *bdi, + struct user_beancounter *ub) +{ + /* + * We just wake up the flusher thread. It will perform background + * writeback as soon as there is no other work to do. + */ + trace_writeback_wake_background(bdi); + __bdi_start_writeback(bdi, LONG_MAX, true, WB_REASON_BACKGROUND, ub); +} + /* * Remove the inode from the writeback list it is on. */ @@ -708,6 +730,15 @@ static long writeback_sb_inodes(struct super_block *sb, * kind writeout is handled by the freer. 
*/ spin_lock(&inode->i_lock); + /* Filter ub inodes if bdi dirty limit isn't exceeded */ + if (work->ub && !wb->bdi->dirty_exceeded && + (inode->i_state & I_DIRTY) == I_DIRTY_PAGES && + ub_should_skip_writeback(work->ub, inode)) { + spin_unlock(&inode->i_lock); + redirty_tail(inode, wb); + continue; + } + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); redirty_tail(inode, wb); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b7668cf..ae0e828 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,6 +130,8 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); +void bdi_start_background_writeback_ub(struct backing_dev_info *bdi, + struct user_beancounter *ub); long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason, struct user_beancounter *ub); void bdi_writeback_workfn(struct work_struct *work); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 429c759..3c30a64 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1394,6 +1394,190 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force + * the caller to perform writeback if the system is over `vm_dirty_ratio'. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. 
+ */ +static void balance_dirty_pages_ub(struct address_space *mapping, + unsigned long write_chunk) +{ + long nr_reclaimable, bdi_nr_reclaimable; + long nr_writeback, bdi_nr_writeback; + long ub_dirty, ub_writeback; + long ub_thresh, ub_background_thresh; + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long bdi_thresh; + unsigned long pages_written = 0; + unsigned long pause = 1; + struct user_beancounter *ub = get_io_ub(); + + struct backing_dev_info *bdi = mapping->backing_dev_info; + + for (;;) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = write_chunk, + .range_cyclic = 1, + }; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (ub_dirty_limits(&ub_background_thresh, &ub_thresh, ub)) { + ub_dirty = ub_stat_get(ub, dirty_pages); + ub_writeback = ub_stat_get(ub, writeback_pages); + } else { + ub_dirty = ub_writeback = 0; + ub_thresh = ub_background_thresh = LONG_MAX / 2; + } + + nr_reclaimable = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + nr_writeback = global_page_state(NR_WRITEBACK); + + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + + /* + * Check thresholds, set dirty_exceeded flags and + * start background writeback before throttling. + */ + if (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) { + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; + if (!writeback_in_progress(bdi)) + bdi_start_background_writeback(bdi); + } else if (ub_dirty + ub_writeback > ub_thresh) { + if (!test_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags)) + set_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags); + if (!writeback_in_progress(bdi)) + bdi_start_background_writeback_ub(bdi, ub); + } else + break; + + /* + * Throttle it only when the background writeback cannot + * catch-up. This avoids (excessively) small writeouts + * when the bdi limits are ramping up. 
+ */ + if (bdi_cap_account_writeback(bdi) && + nr_reclaimable + nr_writeback < + (background_thresh + dirty_thresh) / 2 && + ub_dirty + ub_writeback < + (ub_background_thresh + ub_thresh) / 2) + break; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. + */ + if (bdi_nr_reclaimable > bdi_thresh) { + writeback_inodes_wb(&bdi->wb, wbc.nr_to_write, + WB_REASON_BACKGROUND, NULL); + pages_written += write_chunk - wbc.nr_to_write; + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + } else if (ub_dirty > ub_thresh) { + writeback_inodes_wb(&bdi->wb, wbc.nr_to_write, + WB_REASON_BACKGROUND, ub); + pages_written += write_chunk - wbc.nr_to_write; + ub_dirty = ub_stat_get(ub, dirty_pages); + ub_writeback = ub_stat_get(ub, writeback_pages); + } + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. 
+ */ + if (bdi_thresh < 2*bdi_stat_error(bdi)) { + bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); + } else if (bdi_nr_reclaimable) { + bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); + } + + /* fixup ub-stat per-cpu drift to avoid false-positive */ + if (ub_dirty + ub_writeback > ub_thresh && + ub_dirty + ub_writeback - ub_thresh < + UB_STAT_BATCH * num_possible_cpus()) { + ub_dirty = ub_stat_get_exact(ub, dirty_pages); + ub_writeback = ub_stat_get_exact(ub, writeback_pages); + } + + if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh && + ub_dirty + ub_writeback <= ub_thresh) + break; + + if (pages_written >= write_chunk) + break; /* We've done our duty */ + + __set_current_state(TASK_KILLABLE); + io_schedule_timeout(pause); + + /* + * Increase the delay for each loop, up to our previous + * default of taking a 100ms nap. + */ + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; + + if (fatal_signal_pending(current)) + break; + } + + if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && + bdi->dirty_exceeded) + bdi->dirty_exceeded = 0; + + if (ub_dirty + ub_writeback < ub_thresh && + test_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags)) + clear_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags); + + virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY, + (void*)write_chunk); + + /* + * Even if this is filtered writeback for other ub it will write + * inodes for this ub, because ub->dirty_exceeded is set. + */ + if (writeback_in_progress(bdi)) + return; + + /* + * In laptop mode, we wait until hitting the higher threshold before + * starting background writeout, and then write out all the way down + * to the lower threshold. So slow writers cause minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. 
+ */ + if ((laptop_mode && pages_written) || + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) + > background_thresh))) + bdi_start_background_writeback(bdi); + else if ((laptop_mode && pages_written) || + (!laptop_mode && ub_dirty > ub_background_thresh)) + bdi_start_background_writeback_ub(bdi, ub); +} + +/* + * balance_dirty_pages() must be called by processes which are generating dirty + * data. It looks at the number of dirty pages in the machine and will force * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2. * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. @@ -1636,6 +1820,62 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); */ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; +/* + * When balance_dirty_pages decides that the caller needs to perform some + * non-background writeback, this is how many pages it will attempt to write. + * It should be somewhat larger than dirtied pages to ensure that reasonably + * large amounts of I/O are submitted. + */ +static inline long sync_writeback_pages(unsigned long dirtied) +{ + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; +} + +/** + * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * @mapping: address_space which was dirtied + * @nr_pages_dirtied: number of pages which the caller has just dirtied + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. + * + * On really big machines, get_writeback_state is expensive, so try to avoid + * calling it too often (ratelimiting). But once we're over the dirty memory + * limit we decrease the ratelimiting by a lot, to prevent individual processes + * from overshooting the limit by (ratelimit_pages) each. 
+ */ +void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, + unsigned long nr_pages_dirtied) +{ + unsigned long ratelimit; + int *p; + + ratelimit = ratelimit_pages; + if (mapping->backing_dev_info->dirty_exceeded || + test_bit(UB_DIRTY_EXCEEDED, &get_io_ub()->ub_flags)) + ratelimit = 8; + + /* + * Check the rate limiting. Also, we do not want to throttle real-time + * tasks in balance_dirty_pages(). Period. + */ + preempt_disable(); + p = &__get_cpu_var(bdp_ratelimits); + *p += nr_pages_dirtied; + if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); + *p = 0; + preempt_enable(); + balance_dirty_pages_ub(mapping, ratelimit); + return; + } + preempt_enable(); +} + /** * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied @@ -1654,6 +1894,12 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) struct backing_dev_info *bdi = mapping->backing_dev_info; int ratelimit; int *p; + struct user_beancounter *ub = get_io_ub(); + + if (ub != get_ub0()) { + balance_dirty_pages_ratelimited_nr(mapping, 1); + return; + } if (!bdi_cap_account_dirty(bdi)) return; -- 2.4.10 _______________________________________________ Devel mailing list [email protected] https://lists.openvz.org/mailman/listinfo/devel
