Signed-off-by: Kent Overstreet <[email protected]>
---
 drivers/md/bcache/movinggc.c  |  245 +++++++++++++++++++++++++
 drivers/md/bcache/writeback.c |  402 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 647 insertions(+)

diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
new file mode 100644
index 0000000..62be218
--- /dev/null
+++ b/drivers/md/bcache/movinggc.c
@@ -0,0 +1,245 @@
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+struct moving_io {
+       struct keybuf_key       *w;
+       struct search           s;
+       struct bbio             bio;
+};
+
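+/*
+ * Decide whether a key's data should be moved: true if any of its pointers
+ * point into a bucket that bch_moving_gc() flagged as mostly empty (live
+ * sectors below gc_move_threshold), so the data can be copied out and the
+ * bucket reused.
+ */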
+static bool moving_pred(struct keybuf *buf, struct bkey *k)
+{
+       struct cache_set *c = container_of(buf, struct cache_set,
+                                          moving_gc_keys);
+
+       for (unsigned i = 0; i < KEY_PTRS(k); i++) {
+               struct cache *ca = PTR_CACHE(c, k, i);
+               struct bucket *g = PTR_BUCKET(c, k, i);
+
+               if (GC_SECTORS_USED(g) < ca->gc_move_threshold)
+                       return true;
+       }
+
+       return false;
+}
+
+/* Moving GC - IO loop */
+
+static void moving_io_destructor(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, s.cl);
+       kfree(io);
+}
+
+static void write_moving_finish(struct closure *cl)
+{
+       struct moving_io *io = container_of(cl, struct moving_io, s.cl);
+       struct bio *bio = &io->bio.bio;
+       struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
+
+       while (bv-- != bio->bi_io_vec)
+               __free_page(bv->bv_page);
+
+       pr_debug("%s %s", io->s.op.insert_collision
+                ? "collision moving" : "moved",
+                pkey(&io->w->key));
+
+       bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
+
+       atomic_dec_bug(&io->s.op.c->in_flight);
+       closure_wake_up(&io->s.op.c->moving_gc_wait);
+
+       closure_return_with_destructor(cl, moving_io_destructor);
+}
+
+static void read_moving_endio(struct bio *bio, int error)
+{
+       struct moving_io *io = container_of(bio->bi_private,
+                                           struct moving_io, s.cl);
+
+       if (error)
+               io->s.error = error;
+
+       bch_bbio_endio(io->s.op.c, bio, error, "reading data to move");
+}
+
+static void moving_init(struct moving_io *io)
+{
+       struct bio *bio = &io->bio.bio;
+
+       bio_init(bio);
+       bio_get(bio);
+       bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+       bio->bi_size            = KEY_SIZE(&io->w->key) << 9;
+       bio->bi_max_vecs        = DIV_ROUND_UP(KEY_SIZE(&io->w->key),
+                                              PAGE_SECTORS);
+       bio->bi_private         = &io->s.cl;
+       bio->bi_io_vec          = bio->bi_inline_vecs;
+       bio_map(bio, NULL);
+}
+
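+/*
+ * The read completed; rewrite the data into the cache with a BTREE_REPLACE
+ * insert keyed on the original key, so the move is dropped as a collision if
+ * the key changed while the data was in flight.
+ */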
+static void write_moving(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct moving_io *io = container_of(s, struct moving_io, s);
+
+       if (!s->error) {
+               trace_bcache_write_moving(&io->bio.bio);
+
+               moving_init(io);
+
+               io->bio.bio.bi_sector   = KEY_START(&io->w->key);
+               s->op.lock              = -1;
+               s->op.write_prio        = 1;
+               s->op.cache_bio         = &io->bio.bio;
+
+               s->writeback            = KEY_DIRTY(&io->w->key);
+               s->op.csum              = KEY_CSUM(&io->w->key);
+
+               s->op.type = BTREE_REPLACE;
+               bkey_copy(&s->op.replace, &io->w->key);
+
+               closure_init(&s->op.cl, cl);
+               bch_bio_insert(&s->op.cl);
+       }
+
+       continue_at(cl, write_moving_finish, NULL);
+}
+
+static void read_moving_submit(struct closure *cl)
+{
+       struct search *s = container_of(cl, struct search, cl);
+       struct moving_io *io = container_of(s, struct moving_io, s);
+       struct bio *bio = &io->bio.bio;
+
+       trace_bcache_read_moving(bio);
+       bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
+
+       continue_at(cl, write_moving, bch_gc_wq);
+}
+
+static void read_moving(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, moving_gc);
+       struct keybuf_key *w;
+       struct moving_io *io;
+       struct bio *bio;
+
+       /* XXX: if we error, background writeback could stall indefinitely */
+
+       while (!atomic_read(&c->closing)) {
+               w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
+               if (!w)
+                       break;
+
+               io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec)
+                            * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+                            GFP_KERNEL);
+               if (!io)
+                       goto err;
+
+               w->private      = io;
+               io->w           = w;
+               io->s.op.inode  = KEY_INODE(&w->key);
+               io->s.op.c      = c;
+
+               moving_init(io);
+               bio = &io->bio.bio;
+
+               bio->bi_rw      = READ;
+               bio->bi_end_io  = read_moving_endio;
+
+               if (bio_alloc_pages(bio, GFP_KERNEL))
+                       goto err;
+
+               pr_debug("%s", pkey(&w->key));
+
+               closure_call(read_moving_submit, &io->s.cl, &c->gc.cl);
+
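+               /*
+                * Cap moving GC at 64 IOs in flight; past that, wait on
+                * moving_gc_wait and reschedule this loop.
+                */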
+               if (atomic_inc_return(&c->in_flight) >= 64) {
+                       closure_wait_event(&c->moving_gc_wait, cl,
+                                          atomic_read(&c->in_flight) < 64);
+                       continue_at(cl, read_moving, bch_gc_wq);
+               }
+       }
+
+       if (0) {
+err:           if (!IS_ERR_OR_NULL(w->private))
+                       kfree(w->private);
+
+               bch_keybuf_del(&c->moving_gc_keys, w);
+       }
+
+       closure_return(cl);
+}
+
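+/*
+ * For each cache, build a heap of the buckets with the fewest live sectors
+ * whose contents will fit in the free-bucket reserve, and set
+ * gc_move_threshold from that heap so moving_pred() selects keys pointing
+ * into those mostly-empty buckets; then kick off read_moving() to copy the
+ * data elsewhere.
+ */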
+void bch_moving_gc(struct closure *cl)
+{
+       struct cache_set *c = container_of(cl, struct cache_set, gc.cl);
+       struct cache *ca;
+       struct bucket *b;
+
+       bool bucket_cmp(struct bucket *l, struct bucket *r)
+       {
+               return GC_SECTORS_USED(l) < GC_SECTORS_USED(r);
+       }
+
+       unsigned top(struct cache *ca)
+       {
+               return GC_SECTORS_USED(heap_peek(&ca->heap));
+       }
+
+       if (!c->copy_gc_enabled)
+               closure_return(cl);
+
+       mutex_lock(&c->bucket_lock);
+
+       for_each_cache(ca, c) {
+               unsigned sectors_to_move = 0;
+               unsigned reserve_sectors = ca->sb.bucket_size *
+                       min(fifo_used(&ca->free), ca->free.size / 2);
+
+               ca->heap.used = 0;
+
+               for_each_bucket(b, ca) {
+                       if (!GC_SECTORS_USED(b))
+                               continue;
+
+                       if (!heap_full(&ca->heap)) {
+                               sectors_to_move += GC_SECTORS_USED(b);
+                               heap_add(&ca->heap, b, bucket_cmp);
+                       } else if (bucket_cmp(b, heap_peek(&ca->heap))) {
+                               sectors_to_move -= top(ca);
+                               sectors_to_move += GC_SECTORS_USED(b);
+
+                               ca->heap.data[0] = b;
+                               heap_sift(&ca->heap, 0, bucket_cmp);
+                       }
+               }
+
+               while (sectors_to_move > reserve_sectors) {
+                       heap_pop(&ca->heap, b, bucket_cmp);
+                       sectors_to_move -= GC_SECTORS_USED(b);
+               }
+
+               ca->gc_move_threshold = top(ca);
+
+               pr_debug("threshold %u", ca->gc_move_threshold);
+       }
+
+       mutex_unlock(&c->bucket_lock);
+
+       c->moving_gc_keys.last_scanned = ZERO_KEY;
+
+       closure_init(&c->moving_gc, cl);
+       read_moving(&c->moving_gc);
+
+       closure_return(cl);
+}
+
+void bch_moving_init_cache_set(struct cache_set *c)
+{
+       bch_keybuf_init(&c->moving_gc_keys, moving_pred);
+}
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
new file mode 100644
index 0000000..1a3110d
--- /dev/null
+++ b/drivers/md/bcache/writeback.c
@@ -0,0 +1,402 @@
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+static struct workqueue_struct *dirty_wq;
+
+static void read_dirty(struct closure *);
+
+struct dirty_io {
+       struct closure          cl;
+       struct cached_dev       *dc;
+       struct bio              bio;
+};
+
+/* Rate limiting */
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
+       struct cache_set *c = dc->disk.c;
+       uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+       uint64_t cache_dirty_target =
+               div_u64(cache_sectors * dc->writeback_percent, 100);
+
+       int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
+                                  c->cached_dev_sectors);
+
+       /* PD controller */
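+       /*
+        * error is proportional to how far the amount of dirty data is from
+        * the target, and the derivative term (smoothed with an EWMA) to how
+        * fast it's changing; the resulting change nudges
+        * writeback_rate.rate, which writeback_delay() turns into a delay
+        * between writeback IOs.
+        */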
+
+       int change = 0;
+       int64_t error;
+       int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
+       int64_t derivative = dirty - dc->disk.sectors_dirty_last;
+
+       dc->disk.sectors_dirty_last = dirty;
+
+       derivative *= dc->writeback_rate_d_term;
+       derivative = clamp(derivative, -dirty, dirty);
+
+       derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
+                             dc->writeback_rate_d_smooth, 0);
+
+       /* Avoid divide by zero */
+       if (!target)
+               goto out;
+
+       error = div64_s64((dirty + derivative - target) << 8, target);
+
+       change = div_s64((dc->writeback_rate.rate * error) >> 8,
+                        dc->writeback_rate_p_term_inverse);
+
+       /* Don't increase writeback rate if the device isn't keeping up */
+       if (change > 0 &&
+           time_after64(local_clock(),
+                        dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
+               change = 0;
+
+       dc->writeback_rate.rate =
+               clamp_t(int64_t, dc->writeback_rate.rate + change,
+                       1, NSEC_PER_MSEC);
+out:
+       dc->writeback_rate_derivative = derivative;
+       dc->writeback_rate_change = change;
+       dc->writeback_rate_target = target;
+
+       schedule_delayed_work(&dc->writeback_rate_update,
+                             dc->writeback_rate_update_seconds * HZ);
+}
+
+static void update_writeback_rate(struct work_struct *work)
+{
+       struct cached_dev *dc = container_of(to_delayed_work(work),
+                                            struct cached_dev,
+                                            writeback_rate_update);
+
+       down_read(&dc->writeback_lock);
+
+       if (atomic_read(&dc->has_dirty) &&
+           dc->writeback_percent)
+               __update_writeback_rate(dc);
+
+       up_read(&dc->writeback_lock);
+}
+
+static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
+{
+       if (atomic_read(&dc->disk.detaching) ||
+           !dc->writeback_percent)
+               return 0;
+
+       return next_delay(&dc->writeback_rate, sectors * 10000000ULL);
+}
+
+/* Background writeback */
+
+static bool dirty_pred(struct keybuf *buf, struct bkey *k)
+{
+       return KEY_DIRTY(k);
+}
+
+static void dirty_init(struct keybuf_key *w)
+{
+       struct dirty_io *io = w->private;
+       struct bio *bio = &io->bio;
+
+       bio_init(bio);
+       if (!io->dc->writeback_percent)
+               bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+
+       bio->bi_size            = KEY_SIZE(&w->key) << 9;
+       bio->bi_max_vecs        = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
+       bio->bi_private         = w;
+       bio->bi_io_vec          = bio->bi_inline_vecs;
+       bio_map(bio, NULL);
+}
+
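+/*
+ * Scan the btree for dirty keys belonging to this backing device and feed
+ * them to read_dirty(); once the device's whole keyspace has been searched,
+ * clear has_dirty if nothing was found and delay before rescanning.
+ */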
+static void refill_dirty(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev,
+                                            writeback.cl);
+       struct keybuf *buf = &dc->writeback_keys;
+       bool searched_from_start = false;
+       struct bkey end = MAX_KEY;
+       SET_KEY_INODE(&end, dc->disk.id);
+
+       if (!atomic_read(&dc->disk.detaching) &&
+           !dc->writeback_running)
+               closure_return(cl);
+
+       down_write(&dc->writeback_lock);
+
+       if (!atomic_read(&dc->has_dirty)) {
+               SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
+               bch_write_bdev_super(dc, NULL);
+               up_write(&dc->writeback_lock);
+               closure_return(cl);
+       }
+
+       if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
+               buf->last_scanned = KEY(dc->disk.id, 0, 0);
+               searched_from_start = true;
+       }
+
+       bch_refill_keybuf(dc->disk.c, buf, &end);
+
+       if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
+               /* Searched the entire btree - delay for awhile */
+
+               if (RB_EMPTY_ROOT(&buf->keys)) {
+                       atomic_set(&dc->has_dirty, 0);
+                       cached_dev_put(dc);
+               }
+
+               up_write(&dc->writeback_lock);
+               closure_delay(&dc->writeback, dc->writeback_delay * HZ);
+               continue_at(cl, refill_dirty, dirty_wq);
+       }
+
+       up_write(&dc->writeback_lock);
+
+       ratelimit_reset(&dc->writeback_rate);
+
+       read_dirty(cl);
+}
+
+void bch_writeback_queue(struct cached_dev *dc)
+{
+       if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
+               closure_delay(&dc->writeback, dc->writeback_delay * HZ);
+               continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
+       }
+}
+
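+/*
+ * Called when dirty data is written into the cache for this device: account
+ * the dirty sectors and, on the clean -> dirty transition, mark the backing
+ * device's superblock dirty, kick off writeback and start the rate update
+ * worker.
+ */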
+void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
+{
+       atomic_long_add(sectors, &dc->disk.sectors_dirty);
+
+       if (!atomic_read(&dc->has_dirty) &&
+           !atomic_xchg(&dc->has_dirty, 1)) {
+               atomic_inc(&dc->count);
+
+               if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
+                       SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
+                       /* XXX: should do this synchronously */
+                       bch_write_bdev_super(dc, NULL);
+               }
+
+               bch_writeback_queue(dc);
+
+               if (dc->writeback_percent)
+                       schedule_delayed_work(&dc->writeback_rate_update,
+                                     dc->writeback_rate_update_seconds * HZ);
+       }
+}
+
+/* Background writeback - IO loop */
+
+static void dirty_io_destructor(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+       kfree(io);
+}
+
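+/*
+ * The write to the backing device completed (or failed, in which case
+ * dirty_endio already cleared KEY_DIRTY); clear the dirty bit in the btree
+ * with a BTREE_REPLACE insert so we don't clobber a key that was rewritten
+ * while the write was in flight.
+ */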
+static void write_dirty_finish(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+       struct keybuf_key *w = io->bio.bi_private;
+       struct cached_dev *dc = io->dc;
+       struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
+
+       while (bv-- != io->bio.bi_io_vec)
+               __free_page(bv->bv_page);
+
+       /* This is kind of a dumb way of signalling errors. */
+       if (KEY_DIRTY(&w->key)) {
+               unsigned i;
+               struct btree_op op;
+               bch_btree_op_init_stack(&op);
+
+               op.type = BTREE_REPLACE;
+               bkey_copy(&op.replace, &w->key);
+
+               SET_KEY_DIRTY(&w->key, false);
+               bch_keylist_add(&op.keys, &w->key);
+
+               for (i = 0; i < KEY_PTRS(&w->key); i++)
+                       atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
+
+               pr_debug("clearing %s", pkey(&w->key));
+               bch_btree_insert(&op, dc->disk.c);
+               closure_sync(&op.cl);
+
+               atomic_long_inc(op.insert_collision
+                               ? &dc->disk.c->writeback_keys_failed
+                               : &dc->disk.c->writeback_keys_done);
+       }
+
+       bch_keybuf_del(&dc->writeback_keys, w);
+       atomic_dec_bug(&dc->in_flight);
+
+       closure_wake_up(&dc->writeback_wait);
+
+       closure_return_with_destructor(cl, dirty_io_destructor);
+}
+
+static void dirty_endio(struct bio *bio, int error)
+{
+       struct keybuf_key *w = bio->bi_private;
+       struct dirty_io *io = w->private;
+
+       if (error)
+               SET_KEY_DIRTY(&w->key, false);
+
+       closure_put(&io->cl);
+}
+
+static void write_dirty(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+       struct keybuf_key *w = io->bio.bi_private;
+
+       dirty_init(w);
+       io->bio.bi_rw           = WRITE;
+       io->bio.bi_sector       = KEY_START(&w->key);
+       io->bio.bi_bdev         = io->dc->bdev;
+       io->bio.bi_end_io       = dirty_endio;
+
+       trace_bcache_write_dirty(&io->bio);
+       closure_bio_submit(&io->bio, cl);
+
+       continue_at(cl, write_dirty_finish, dirty_wq);
+}
+
+static void read_dirty_endio(struct bio *bio, int error)
+{
+       struct keybuf_key *w = bio->bi_private;
+       struct dirty_io *io = w->private;
+
+       bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
+                           error, "reading dirty data from cache");
+
+       dirty_endio(bio, error);
+}
+
+static void read_dirty_submit(struct closure *cl)
+{
+       struct dirty_io *io = container_of(cl, struct dirty_io, cl);
+
+       trace_bcache_read_dirty(&io->bio);
+       closure_bio_submit(&io->bio, cl);
+
+       continue_at(cl, write_dirty, dirty_wq);
+}
+
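+/*
+ * The writeback IO loop: pull dirty keys off the keybuf, read the data from
+ * the cache, hand it to write_dirty() to write it to the backing device,
+ * then write_dirty_finish() clears the dirty bit in the btree. Throttled by
+ * the writeback rate and capped at 64 IOs in flight.
+ */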
+static void read_dirty(struct closure *cl)
+{
+       struct cached_dev *dc = container_of(cl, struct cached_dev,
+                                            writeback.cl);
+       unsigned delay = writeback_delay(dc, 0);
+       struct keybuf_key *w;
+       struct dirty_io *io;
+
+       /*
+        * XXX: if we error, background writeback just spins. Should use some
+        * mempools.
+        */
+
+       while (1) {
+               w = bch_keybuf_next(&dc->writeback_keys);
+               if (!w)
+                       break;
+
+               BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
+
+               if (delay > 0 &&
+                   (KEY_START(&w->key) != dc->last_read ||
+                    jiffies_to_msecs(delay) > 50)) {
+                       w->private = NULL;
+
+                       closure_delay(&dc->writeback, delay);
+                       continue_at(cl, read_dirty, dirty_wq);
+               }
+
+               dc->last_read   = KEY_OFFSET(&w->key);
+
+               io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
+                            * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+                            GFP_KERNEL);
+               if (!io)
+                       goto err;
+
+               w->private      = io;
+               io->dc          = dc;
+
+               dirty_init(w);
+               io->bio.bi_sector       = PTR_OFFSET(&w->key, 0);
+               io->bio.bi_bdev         = PTR_CACHE(dc->disk.c,
+                                                   &w->key, 0)->bdev;
+               io->bio.bi_rw           = READ;
+               io->bio.bi_end_io       = read_dirty_endio;
+
+               if (bio_alloc_pages(&io->bio, GFP_KERNEL))
+                       goto err_free;
+
+               pr_debug("%s", pkey(&w->key));
+
+               closure_call(read_dirty_submit, &io->cl, &dc->disk.cl);
+
+               delay = writeback_delay(dc, KEY_SIZE(&w->key));
+
+               atomic_inc(&dc->in_flight);
+
+               if (!closure_wait_event(&dc->writeback_wait, cl,
+                                       atomic_read(&dc->in_flight) < 64))
+                       continue_at(cl, read_dirty, dirty_wq);
+       }
+
+       if (0) {
+err_free:
+               kfree(w->private);
+err:
+               bch_keybuf_del(&dc->writeback_keys, w);
+       }
+
+       refill_dirty(cl);
+}
+
+void bch_writeback_init_cached_dev(struct cached_dev *dc)
+{
+       closure_init_unlocked(&dc->writeback);
+       init_rwsem(&dc->writeback_lock);
+
+       bch_keybuf_init(&dc->writeback_keys, dirty_pred);
+
+       dc->writeback_metadata          = true;
+       dc->writeback_running           = true;
+       dc->writeback_delay             = 30;
+       dc->writeback_rate.rate         = 1024;
+
+       dc->writeback_rate_update_seconds = 30;
+       dc->writeback_rate_d_term       = 16;
+       dc->writeback_rate_p_term_inverse = 64;
+       dc->writeback_rate_d_smooth     = 8;
+
+       INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
+       schedule_delayed_work(&dc->writeback_rate_update,
+                             dc->writeback_rate_update_seconds * HZ);
+}
+
+void bch_writeback_exit(void)
+{
+       if (dirty_wq)
+               destroy_workqueue(dirty_wq);
+}
+
+int __init bch_writeback_init(void)
+{
+       dirty_wq = create_singlethread_workqueue("bcache_writeback");
+       if (!dirty_wq)
+               return -ENOMEM;
+
+       return 0;
+}
-- 
1.7.9.3.327.g2980b
