On 2018/7/23 2:38 PM, Stefan Priebe - Profihost AG wrote:
> Hi Coly,
> 
> thanks for this patch. I was right on the way to debug why i have lot's
> of stalled I/O on latest SLES12-SP3 kernel. I'll add this patch and try
> if this solves my issues.
> 
> Each cache set has two backing devices.

Hi Stefan,

Thanks for the testing.

Coly Li


> 
> Am 22.07.2018 um 18:13 schrieb Coly Li:
>> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> allows the writeback rate to be faster if there is no I/O request on a
>> bcache device. It works well if there is only one bcache device attached
>> to the cache set. If there are many bcache devices attached to a cache
>> set, it may introduce performance regression because multiple faster
>> writeback threads of the idle bcache devices will compete the btree level
>> locks with the bcache device who have I/O requests coming.
>>
>> This patch fixes the above issue by only permitting fast writebac when
>> all bcache devices attached on the cache set are idle. And if one of the
>> bcache devices has new I/O request coming, minimized all writeback
>> throughput immediately and let PI controller __update_writeback_rate()
>> to decide the upcoming writeback rate for each bcache device.
>>
>> Also when all bcache devices are idle, limited wrieback rate to a small
>> number is wast of thoughput, especially when backing devices are slower
>> non-rotation devices (e.g. SATA SSD). This patch sets a max writeback
>> rate for each backing device if the whole cache set is idle. A faster
>> writeback rate in idle time means new I/Os may have more available space
>> for dirty data, and people may observe a better write performance then.
>>
>> Please note bcache may change its cache mode in run time, and this patch
>> still works if the cache mode is switched from writeback mode and there
>> is still dirty data on cache.
>>
>> Fixes: Commit b1092c9af9ed ("bcache: allow quick writeback when backing 
>> idle")
>> Cc: [email protected] #4.16+
>> Signed-off-by: Coly Li <[email protected]>
>> Cc: Michael Lyle <[email protected]>
>> ---
>>  drivers/md/bcache/bcache.h    |  9 +---
>>  drivers/md/bcache/request.c   | 42 ++++++++++++++-
>>  drivers/md/bcache/sysfs.c     | 14 +++--
>>  drivers/md/bcache/util.c      |  2 +-
>>  drivers/md/bcache/util.h      |  2 +-
>>  drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++----------
>>  6 files changed, 126 insertions(+), 41 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index d6bf294f3907..f7451e8be03c 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -328,13 +328,6 @@ struct cached_dev {
>>       */
>>      atomic_t                has_dirty;
>>  
>> -    /*
>> -     * Set to zero by things that touch the backing volume-- except
>> -     * writeback.  Incremented by writeback.  Used to determine when to
>> -     * accelerate idle writeback.
>> -     */
>> -    atomic_t                backing_idle;
>> -
>>      struct bch_ratelimit    writeback_rate;
>>      struct delayed_work     writeback_rate_update;
>>  
>> @@ -514,6 +507,8 @@ struct cache_set {
>>      struct cache_accounting accounting;
>>  
>>      unsigned long           flags;
>> +    atomic_t                idle_counter;
>> +    atomic_t                at_max_writeback_rate;
>>  
>>      struct cache_sb         sb;
>>  
>> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
>> index ae67f5fa8047..fe45f561a054 100644
>> --- a/drivers/md/bcache/request.c
>> +++ b/drivers/md/bcache/request.c
>> @@ -1104,6 +1104,34 @@ static void detached_dev_do_request(struct 
>> bcache_device *d, struct bio *bio)
>>  
>>  /* Cached devices - read & write stuff */
>>  
>> +static void quit_max_writeback_rate(struct cache_set *c)
>> +{
>> +    int i;
>> +    struct bcache_device *d;
>> +    struct cached_dev *dc;
>> +
>> +    mutex_lock(&bch_register_lock);
>> +
>> +    for (i = 0; i < c->devices_max_used; i++) {
>> +            if (!c->devices[i])
>> +                    continue;
>> +
>> +            if (UUID_FLASH_ONLY(&c->uuids[i]))
>> +                    continue;
>> +
>> +            d = c->devices[i];
>> +            dc = container_of(d, struct cached_dev, disk);
>> +            /*
>> +             * set writeback rate to default minimum value,
>> +             * then let update_writeback_rate() to decide the
>> +             * upcoming rate.
>> +             */
>> +            atomic64_set(&dc->writeback_rate.rate, 1);
>> +    }
>> +
>> +    mutex_unlock(&bch_register_lock);
>> +}
>> +
>>  static blk_qc_t cached_dev_make_request(struct request_queue *q,
>>                                      struct bio *bio)
>>  {
>> @@ -1119,7 +1147,19 @@ static blk_qc_t cached_dev_make_request(struct 
>> request_queue *q,
>>              return BLK_QC_T_NONE;
>>      }
>>  
>> -    atomic_set(&dc->backing_idle, 0);
>> +    if (d->c) {
>> +            atomic_set(&d->c->idle_counter, 0);
>> +            /*
>> +             * If at_max_writeback_rate of cache set is true and new I/O
>> +             * comes, quit max writeback rate of all cached devices
>> +             * attached to this cache set, and set at_max_writeback_rate
>> +             * to false.
>> +             */
>> +            if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
>> +                    atomic_set(&d->c->at_max_writeback_rate, 0);
>> +                    quit_max_writeback_rate(d->c);
>> +            }
>> +    }
>>      generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
>>  
>>      bio_set_dev(bio, dc->bdev);
>> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
>> index 225b15aa0340..d719021bff81 100644
>> --- a/drivers/md/bcache/sysfs.c
>> +++ b/drivers/md/bcache/sysfs.c
>> @@ -170,7 +170,8 @@ SHOW(__bch_cached_dev)
>>      var_printf(writeback_running,   "%i");
>>      var_print(writeback_delay);
>>      var_print(writeback_percent);
>> -    sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
>> +    sysfs_hprint(writeback_rate,
>> +                 atomic64_read(&dc->writeback_rate.rate) << 9);
>>      sysfs_hprint(io_errors,         atomic_read(&dc->io_errors));
>>      sysfs_printf(io_error_limit,    "%i", dc->error_limit);
>>      sysfs_printf(io_disable,        "%i", dc->io_disable);
>> @@ -188,7 +189,8 @@ SHOW(__bch_cached_dev)
>>              char change[20];
>>              s64 next_io;
>>  
>> -            bch_hprint(rate,        dc->writeback_rate.rate << 9);
>> +            bch_hprint(rate,
>> +                       atomic64_read(&dc->writeback_rate.rate) << 9);
>>              bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 
>> 9);
>>              bch_hprint(target,      dc->writeback_rate_target << 9);
>>              bch_hprint(proportional,dc->writeback_rate_proportional << 9);
>> @@ -255,8 +257,12 @@ STORE(__cached_dev)
>>  
>>      sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
>>  
>> -    sysfs_strtoul_clamp(writeback_rate,
>> -                        dc->writeback_rate.rate, 1, INT_MAX);
>> +    if (attr == &sysfs_writeback_rate) {
>> +            int v;
>> +
>> +            sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
>> +            atomic64_set(&dc->writeback_rate.rate, v);
>> +    }
>>  
>>      sysfs_strtoul_clamp(writeback_rate_update_seconds,
>>                          dc->writeback_rate_update_seconds,
>> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
>> index fc479b026d6d..84f90c3d996d 100644
>> --- a/drivers/md/bcache/util.c
>> +++ b/drivers/md/bcache/util.c
>> @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, 
>> uint64_t done)
>>  {
>>      uint64_t now = local_clock();
>>  
>> -    d->next += div_u64(done * NSEC_PER_SEC, d->rate);
>> +    d->next += div_u64(done * NSEC_PER_SEC, atomic64_read(&d->rate));
>>  
>>      /* Bound the time.  Don't let us fall further than 2 seconds behind
>>       * (this prevents unnecessary backlog that would make it impossible
>> diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
>> index cced87f8eb27..7e17f32ab563 100644
>> --- a/drivers/md/bcache/util.h
>> +++ b/drivers/md/bcache/util.h
>> @@ -442,7 +442,7 @@ struct bch_ratelimit {
>>       * Rate at which we want to do work, in units per second
>>       * The units here correspond to the units passed to bch_next_delay()
>>       */
>> -    uint32_t                rate;
>> +    atomic64_t              rate;
>>  };
>>  
>>  static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>> index ad45ebe1a74b..72059f910230 100644
>> --- a/drivers/md/bcache/writeback.c
>> +++ b/drivers/md/bcache/writeback.c
>> @@ -49,6 +49,63 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
>>      return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
>>  }
>>  
>> +static bool set_at_max_writeback_rate(struct cache_set *c,
>> +                                  struct cached_dev *dc)
>> +{
>> +    int i, dirty_dc_nr = 0;
>> +    struct bcache_device *d;
>> +
>> +    mutex_lock(&bch_register_lock);
>> +    for (i = 0; i < c->devices_max_used; i++) {
>> +            if (!c->devices[i])
>> +                    continue;
>> +            if (UUID_FLASH_ONLY(&c->uuids[i]))
>> +                    continue;
>> +            d = c->devices[i];
>> +            dc = container_of(d, struct cached_dev, disk);
>> +            if (atomic_read(&dc->has_dirty))
>> +                    dirty_dc_nr++;
>> +    }
>> +    mutex_unlock(&bch_register_lock);
>> +
>> +    /*
>> +     * Idle_counter is increased everytime when update_writeback_rate()
>> +     * is rescheduled in. If all backing devices attached to the same
>> +     * cache set has same dc->writeback_rate_update_seconds value, it
>> +     * is about 10 rounds of update_writeback_rate() is called on each
>> +     * backing device, then the code will fall through at set 1 to
>> +     * c->at_max_writeback_rate, and a max wrteback rate to each
>> +     * dc->writeback_rate.rate. This is not very accurate but works well
>> +     * to make sure the whole cache set has no new I/O coming before
>> +     * writeback rate is set to a max number.
>> +     */
>> +    if (atomic_inc_return(&c->idle_counter) < dirty_dc_nr * 10)
>> +            return false;
>> +
>> +    if (atomic_read(&c->at_max_writeback_rate) != 1)
>> +            atomic_set(&c->at_max_writeback_rate, 1);
>> +
>> +
>> +    atomic64_set(&dc->writeback_rate.rate, INT_MAX);
>> +
>> +    /* keep writeback_rate_target as existing value */
>> +    dc->writeback_rate_proportional = 0;
>> +    dc->writeback_rate_integral_scaled = 0;
>> +    dc->writeback_rate_change = 0;
>> +
>> +    /*
>> +     * Check c->idle_counter and c->at_max_writeback_rate agagain in case
>> +     * new I/O arrives during before set_at_max_writeback_rate() returns.
>> +     * Then the writeback rate is set to 1, and its new value should be
>> +     * decided via __update_writeback_rate().
>> +     */
>> +    if (atomic_read(&c->idle_counter) < dirty_dc_nr * 10 ||
>> +        !atomic_read(&c->at_max_writeback_rate))
>> +            return false;
>> +
>> +    return true;
>> +}
>> +
>>  static void __update_writeback_rate(struct cached_dev *dc)
>>  {
>>      /*
>> @@ -104,8 +161,9 @@ static void __update_writeback_rate(struct cached_dev 
>> *dc)
>>  
>>      dc->writeback_rate_proportional = proportional_scaled;
>>      dc->writeback_rate_integral_scaled = integral_scaled;
>> -    dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
>> -    dc->writeback_rate.rate = new_rate;
>> +    dc->writeback_rate_change = new_rate -
>> +                    atomic64_read(&dc->writeback_rate.rate);
>> +    atomic64_set(&dc->writeback_rate.rate, new_rate);
>>      dc->writeback_rate_target = target;
>>  }
>>  
>> @@ -138,9 +196,16 @@ static void update_writeback_rate(struct work_struct 
>> *work)
>>  
>>      down_read(&dc->writeback_lock);
>>  
>> -    if (atomic_read(&dc->has_dirty) &&
>> -        dc->writeback_percent)
>> -            __update_writeback_rate(dc);
>> +    if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
>> +            /*
>> +             * If the whole cache set is idle, set_at_max_writeback_rate()
>> +             * will set writeback rate to a max number. Then it is
>> +             * unncessary to update writeback rate for an idle cache set
>> +             * in maximum writeback rate number(s).
>> +             */
>> +            if (!set_at_max_writeback_rate(c, dc))
>> +                    __update_writeback_rate(dc);
>> +    }
>>  
>>      up_read(&dc->writeback_lock);
>>  
>> @@ -422,27 +487,6 @@ static void read_dirty(struct cached_dev *dc)
>>  
>>              delay = writeback_delay(dc, size);
>>  
>> -            /* If the control system would wait for at least half a
>> -             * second, and there's been no reqs hitting the backing disk
>> -             * for awhile: use an alternate mode where we have at most
>> -             * one contiguous set of writebacks in flight at a time.  If
>> -             * someone wants to do IO it will be quick, as it will only
>> -             * have to contend with one operation in flight, and we'll
>> -             * be round-tripping data to the backing disk as quickly as
>> -             * it can accept it.
>> -             */
>> -            if (delay >= HZ / 2) {
>> -                    /* 3 means at least 1.5 seconds, up to 7.5 if we
>> -                     * have slowed way down.
>> -                     */
>> -                    if (atomic_inc_return(&dc->backing_idle) >= 3) {
>> -                            /* Wait for current I/Os to finish */
>> -                            closure_sync(&cl);
>> -                            /* And immediately launch a new set. */
>> -                            delay = 0;
>> -                    }
>> -            }
>> -
>>              while (!kthread_should_stop() &&
>>                     !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
>>                     delay) {
>> @@ -715,7 +759,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>>      dc->writeback_running           = true;
>>      dc->writeback_percent           = 10;
>>      dc->writeback_delay             = 30;
>> -    dc->writeback_rate.rate         = 1024;
>> +    atomic64_set(&dc->writeback_rate.rate, 1024);
>>      dc->writeback_rate_minimum      = 8;
>>  
>>      dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
>>

Reply via email to