page-writeback: Introduce per-CT dirty memory limit.

Andrey Ryabinin Fri, 15 Jan 2016 07:25:21 -0800

This patch adds balance_dirty_pages_ub(), which is mostly a copy-paste
of balance_dirty_pages() from PCS6. balance_dirty_pages_ub() is invoked
only for containers. The original balance_dirty_pages() is still used for
global writeback.


https://jira.sw.ru/browse/PSBM-33841

Signed-off-by: Andrey Ryabinin <[email protected]>
---
 fs/fs-writeback.c           |  31 ++++++
 include/linux/backing-dev.h |   2 +
 mm/page-writeback.c         | 246 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 279 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91c1b07..836ce88 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -195,6 +195,28 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
        bdi_wakeup_thread(bdi);
 }
 
+/**
+ * bdi_start_background_writeback_ub - start background writeback for ub
+ * @bdi: the backing device to write from
+ * @ub: task's io beancounter, handed on to __bdi_start_writeback()
+ *
+ * Description:
+ *   This makes sure WB_SYNC_NONE background writeback happens. When
+ *   this function returns, it is only guaranteed that for the given BDI
+ *   some IO is happening if we are over the background dirty threshold.
+ *   Caller need not hold sb s_umount semaphore.
+ */
+void bdi_start_background_writeback_ub(struct backing_dev_info *bdi,
+                               struct user_beancounter *ub)
+{
+       /*
+        * Emit the wake-background tracepoint, then request a LONG_MAX-page
+        * WB_REASON_BACKGROUND writeback on @bdi, passing @ub along.
+        */
+       trace_writeback_wake_background(bdi);
+       __bdi_start_writeback(bdi, LONG_MAX, true, WB_REASON_BACKGROUND, ub);
+}
+
 /*
  * Remove the inode from the writeback list it is on.
  */
@@ -708,6 +730,15 @@ static long writeback_sb_inodes(struct super_block *sb,
                 * kind writeout is handled by the freer.
                 */
                spin_lock(&inode->i_lock);
+               /* Filter ub inodes if bdi dirty limit isn't exceeded */
+               if (work->ub && !wb->bdi->dirty_exceeded &&
+                   (inode->i_state & I_DIRTY) == I_DIRTY_PAGES &&
+                       ub_should_skip_writeback(work->ub, inode)) {
+                       spin_unlock(&inode->i_lock);
+                       redirty_tail(inode, wb);
+                       continue;
+               }
+
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        redirty_tail(inode, wb);
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index b7668cf..ae0e828 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -130,6 +130,8 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
                        enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
+void bdi_start_background_writeback_ub(struct backing_dev_info *bdi,
+                               struct user_beancounter *ub);
 long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                        enum wb_reason reason, struct user_beancounter *ub);
 void bdi_writeback_workfn(struct work_struct *work);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 429c759..3c30a64 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1394,6 +1394,190 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
+ * the caller to perform writeback if the system is over `vm_dirty_ratio'.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
+ */
+static void balance_dirty_pages_ub(struct address_space *mapping,
+                               unsigned long write_chunk)
+{
+       long nr_reclaimable, bdi_nr_reclaimable;
+       long nr_writeback, bdi_nr_writeback;
+       long ub_dirty, ub_writeback;
+       long ub_thresh, ub_background_thresh;
+       unsigned long background_thresh;
+       unsigned long dirty_thresh;
+       unsigned long bdi_thresh;
+       unsigned long pages_written = 0;
+       unsigned long pause = 1;
+       struct user_beancounter *ub = get_io_ub();
+
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+       for (;;) {
+               /*
+                * Each pass re-reads the global, per-bdi and per-ub limits
+                * and counters, kicks background writeback for whichever
+                * limit is exceeded, and throttles the caller if needed.
+                */
+
+               global_dirty_limits(&background_thresh, &dirty_thresh);
+               bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+
+               if (ub_dirty_limits(&ub_background_thresh, &ub_thresh, ub)) {
+                       ub_dirty = ub_stat_get(ub, dirty_pages);
+                       ub_writeback = ub_stat_get(ub, writeback_pages);
+               } else {
+                       ub_dirty = ub_writeback = 0;
+                       ub_thresh = ub_background_thresh = LONG_MAX / 2;
+               }
+
+               nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
+                                       global_page_state(NR_UNSTABLE_NFS);
+               nr_writeback = global_page_state(NR_WRITEBACK);
+
+               bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+               bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+
+               /*
+                * Check thresholds, set dirty_exceeded flags and
+                * start background writeback before throttling.
+                */
+               if (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) {
+                       if (!bdi->dirty_exceeded)
+                               bdi->dirty_exceeded = 1;
+                       if (!writeback_in_progress(bdi))
+                               bdi_start_background_writeback(bdi);
+               } else if (ub_dirty + ub_writeback > ub_thresh) {
+                       if (!test_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags))
+                               set_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags);
+                       if (!writeback_in_progress(bdi))
+                               bdi_start_background_writeback_ub(bdi, ub);
+               } else
+                       break;
+
+               /*
+                * Throttle it only when the background writeback cannot
+                * catch-up. This avoids (excessively) small writeouts
+                * when the bdi limits are ramping up.
+                */
+               if (bdi_cap_account_writeback(bdi) &&
+                   nr_reclaimable + nr_writeback <
+                               (background_thresh + dirty_thresh) / 2 &&
+                   ub_dirty + ub_writeback <
+                               (ub_background_thresh + ub_thresh) / 2)
+                       break;
+
+               /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+                * Unstable writes are a feature of certain networked
+                * filesystems (i.e. NFS) in which data may have been
+                * written to the server's write cache, but has not yet
+                * been flushed to permanent storage.
+                * Only move pages to writeback if this bdi is over its
+                * threshold otherwise wait until the disk writes catch
+                * up.
+                */
+               if (bdi_nr_reclaimable > bdi_thresh) {
+                       /* nr_pages goes in by value; use the returned count */
+                       pages_written += writeback_inodes_wb(&bdi->wb,
+                               write_chunk, WB_REASON_BACKGROUND, NULL);
+                       global_dirty_limits(&background_thresh, &dirty_thresh);
+                       bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+               } else if (ub_dirty > ub_thresh) {
+                       /* as above, but pass the ub along */
+                       pages_written += writeback_inodes_wb(&bdi->wb,
+                               write_chunk, WB_REASON_BACKGROUND, ub);
+                       ub_dirty = ub_stat_get(ub, dirty_pages);
+                       ub_writeback = ub_stat_get(ub, writeback_pages);
+               }
+
+               /*
+                * In order to avoid the stacked BDI deadlock we need
+                * to ensure we accurately count the 'dirty' pages when
+                * the threshold is low.
+                *
+                * Otherwise it would be possible to get thresh+n pages
+                * reported dirty, even though there are thresh-m pages
+                * actually dirty; with m+n sitting in the percpu
+                * deltas.
+                */
+               if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+                       bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+                       bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+               } else if (bdi_nr_reclaimable) {
+                       bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+                       bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+               }
+
+               /* fixup ub-stat per-cpu drift to avoid false-positive */
+               if (ub_dirty + ub_writeback > ub_thresh &&
+                   ub_dirty + ub_writeback - ub_thresh <
+                                   UB_STAT_BATCH * num_possible_cpus()) {
+                       ub_dirty = ub_stat_get_exact(ub, dirty_pages);
+                       ub_writeback = ub_stat_get_exact(ub, writeback_pages);
+               }
+
+               if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh &&
+                   ub_dirty + ub_writeback <= ub_thresh)
+                       break;
+
+               if (pages_written >= write_chunk)
+                       break;          /* We've done our duty */
+
+               __set_current_state(TASK_KILLABLE);
+               io_schedule_timeout(pause);
+
+               /*
+                * Increase the delay for each loop, up to our previous
+                * default of taking a 100ms nap.
+                */
+               pause <<= 1;
+               if (pause > HZ / 10)
+                       pause = HZ / 10;
+
+               if (fatal_signal_pending(current))
+                       break;
+       }
+
+       if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+                       bdi->dirty_exceeded)
+               bdi->dirty_exceeded = 0;
+
+       if (ub_dirty + ub_writeback < ub_thresh &&
+           test_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags))
+               clear_bit(UB_DIRTY_EXCEEDED, &ub->ub_flags);
+
+       virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY,
+                              (void*)write_chunk);
+
+       /*
+        * Even if this is filtered writeback for other ub it will write
+        * inodes for this ub, because ub->dirty_exceeded is set.
+        */
+       if (writeback_in_progress(bdi))
+               return;
+
+       /*
+        * In laptop mode, background writeout is only started once this
+        * call has itself written pages, so slow writers cause minimal
+        * disk activity.  In normal mode it is started as soon as the
+        * global or the per-ub background threshold is exceeded; the
+        * global check wins and starts unfiltered writeback.  Laptop
+        * mode is fully handled by the first branch.
+        */
+       if ((laptop_mode && pages_written) ||
+           (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+                              + global_page_state(NR_UNSTABLE_NFS))
+                                         > background_thresh)))
+               bdi_start_background_writeback(bdi);
+       else if (!laptop_mode &&
+                ub_dirty > ub_background_thresh)
+               bdi_start_background_writeback_ub(bdi, ub);
+}
+
+/*
+ * balance_dirty_pages() must be called by processes which are generating dirty
+ * data.  It looks at the number of dirty pages in the machine and will force
  * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
  * If we're over `background_thresh' then the writeback threads are woken to
  * perform some writeout.
@@ -1636,6 +1820,62 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
  */
 DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
 
+/*
+ * Size of the writeback chunk handed to balance_dirty_pages_ub() once the
+ * per-CPU ratelimit trips: one and a half times the number of pages dirtied,
+ * with the dirtied count first raised to at least ratelimit_pages so that
+ * reasonably large amounts of I/O are submitted.
+ */
+static inline long sync_writeback_pages(unsigned long dirtied)
+{
+       unsigned long base;
+
+       base = dirtied < ratelimit_pages ? ratelimit_pages : dirtied;
+       return base + base / 2;
+}
+
+/**
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * @mapping: address_space which was dirtied
+ * @nr_pages_dirtied: number of pages which the caller has just dirtied
+ *
+ * Adds @nr_pages_dirtied to this CPU's bdp_ratelimits counter and, once the
+ * counter reaches the ratelimit, resets it and calls balance_dirty_pages_ub()
+ * with a chunk sized by sync_writeback_pages().
+ *
+ * The ratelimit is ratelimit_pages normally.  It drops to 8 while the bdi
+ * has dirty_exceeded set or the caller's io beancounter has
+ * UB_DIRTY_EXCEEDED set, to prevent individual processes from overshooting
+ * the limit by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+                                       unsigned long nr_pages_dirtied)
+{
+       struct backing_dev_info *bdi = mapping->backing_dev_info;
+       unsigned long limit = ratelimit_pages;
+       unsigned long chunk;
+       int *count;
+
+       /* Check far more often once either dirty limit was exceeded. */
+       if (bdi->dirty_exceeded ||
+           test_bit(UB_DIRTY_EXCEEDED, &get_io_ub()->ub_flags))
+               limit = 8;
+
+       /* Preemption stays off while the per-CPU counter is updated. */
+       preempt_disable();
+       count = &__get_cpu_var(bdp_ratelimits);
+       *count += nr_pages_dirtied;
+       if (likely(*count < limit)) {
+               preempt_enable();
+               return;
+       }
+
+       chunk = sync_writeback_pages(*count);
+       *count = 0;
+       preempt_enable();
+       balance_dirty_pages_ub(mapping, chunk);
+}
+
 /**
  * balance_dirty_pages_ratelimited - balance dirty memory state
  * @mapping: address_space which was dirtied
@@ -1654,6 +1894,12 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ratelimit;
        int *p;
+       struct user_beancounter *ub = get_io_ub();
+
+       if (ub != get_ub0()) {
+               balance_dirty_pages_ratelimited_nr(mapping, 1);
+               return;
+       }
 
        if (!bdi_cap_account_dirty(bdi))
                return;
-- 
2.4.10

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH rh7 2/2] mm/page-writeback: Introduce per-CT dirty memory limit.

Reply via email to