Hello, Kent.

On Wed, May 09, 2012 at 11:10:48PM -0400, Kent Overstreet wrote:
> +#define BKEY_PADDED(key)                                     \
> +     union { struct bkey key; uint64_t key ## _pad[8]; }

What does the '8' mean?  And why does struct bbio use 3 instead?  Does
this have to be a macro?  Can't we have struct bkey_padded (or
whatever)?

> +struct io {
> +     /* Used to track sequential IO so it can be skipped */
> +     struct hlist_node       hash;
> +     struct list_head        lru;
> +
> +     unsigned long           jiffies;
> +     unsigned                sequential;
> +     sector_t                last;
> +};

I don't think bcache can have struct io. :P

> +struct dirty_io {
> +     struct closure          cl;
> +     struct cached_dev       *d;
> +     struct bio              bio;
> +};
> +
> +struct dirty {
> +     struct rb_node          node;
> +     BKEY_PADDED(key);
> +     struct dirty_io         *io;
> +};
...
> +struct cache {

Nor these and so on.

> +/* Bkey fields: all units are in sectors */
> +
> +#define KEY_FIELD(name, field, offset, size)                         \
> +     BITMASK(name, struct bkey, field, offset, size)
> +
> +#define PTR_FIELD(name, offset, size)                                        
> \
> +     static inline uint64_t name(const struct bkey *k, unsigned i)   \
> +     { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); }  \
> +                                                                     \
> +     static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\
> +     {                                                               \
> +             k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset);     \
> +             k->ptr[i] |= v << offset;                               \
> +     }
> +
> +KEY_FIELD(KEY_PTRS,  header, 60, 3)
> +KEY_FIELD(HEADER_SIZE,       header, 58, 2)
> +KEY_FIELD(KEY_CSUM,  header, 56, 2)
> +KEY_FIELD(KEY_PINNED,        header, 55, 1)
> +KEY_FIELD(KEY_DIRTY, header, 36, 1)
> +
> +KEY_FIELD(KEY_SIZE,  header, 20, 16)
> +KEY_FIELD(KEY_DEV,   header, 0,  20)
> +
> +KEY_FIELD(KEY_SECTOR,        key,    16, 47)
> +KEY_FIELD(KEY_SNAPSHOT,      key,    0,  16)
> +
> +PTR_FIELD(PTR_DEV,           51, 12)
> +PTR_FIELD(PTR_OFFSET,                8,  43)
> +PTR_FIELD(PTR_GEN,           0,  8)

So, apart from the the macros, key is 64bit containing the backend
device and extent offset and size with the ptr fields somehow point to
cache.  Am I understanding it correctly?  If right, I'm *tiny* bit
apprehensive about using only 43bits for offset.  While the block 9
bits means 52bit addressing, the 9 bit block size is now there mostly
to work as buffer between memory bitness growth and storage device
size growth so that we have 9 bit buffer as storage device reaches
ulong addressing limit.  Probably those days are far out enough.

> +void btree_read_done(struct closure *cl)
> +{
> +     struct btree *b = container_of(cl, struct btree, io.cl);
> +     struct bset *i = b->sets[0].data;
> +     struct btree_iter *iter = b->c->fill_iter;
> +     const char *err = "bad btree header";
> +     BUG_ON(b->nsets || b->written);
> +
> +     bbio_free(b->bio, b->c);
> +     b->bio = NULL;
> +
> +     mutex_lock(&b->c->fill_lock);
> +     iter->used = 0;
> +
> +     if (btree_node_io_error(b) ||
> +         !i->seq)
> +             goto err;
> +
> +     for (;
> +          b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
> +          i = write_block(b)) {
> +             err = "unsupported bset version";
> +             if (i->version > BCACHE_BSET_VERSION)
> +                     goto err;
> +
> +             err = "bad btree header";
> +             if (b->written + set_blocks(i, b->c) > btree_blocks(b))
> +                     goto err;
> +
> +             err = "bad magic";
> +             if (i->magic != bset_magic(b->c))
> +                     goto err;
> +
> +             err = "bad checksum";
> +             switch (i->version) {
> +             case 0:
> +                     if (i->csum != csum_set(i))
> +                             goto err;
> +                     break;
> +             case BCACHE_BSET_VERSION:
> +                     if (i->csum != btree_csum_set(b, i))
> +                             goto err;
> +                     break;
> +             }
> +
> +             err = "empty set";
> +             if (i != b->sets[0].data && !i->keys)
> +                     goto err;
> +
> +             btree_iter_push(iter, i->start, end(i));
> +
> +             b->written += set_blocks(i, b->c);
> +     }
> +
> +     err = "corrupted btree";
> +     for (i = write_block(b);
> +          index(i, b) < btree_blocks(b);
> +          i = ((void *) i) + block_bytes(b->c))
> +             if (i->seq == b->sets[0].data->seq)
> +                     goto err;
> +
> +     btree_sort_and_fix_extents(b, iter);
> +
> +     i = b->sets[0].data;
> +     err = "short btree key";
> +     if (b->sets[0].size &&
> +         bkey_cmp(&b->key, &b->sets[0].end) < 0)
> +             goto err;
> +
> +     if (b->written < btree_blocks(b))
> +             bset_init_next(b);
> +
> +     if (0) {
> +err:         set_btree_node_io_error(b);
> +             cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys",
> +                             err, PTR_BUCKET_NR(b->c, &b->key, 0),
> +                             index(i, b), i->keys);
> +     }

Please don't do that.  Just define out: label, move error specific
path to the end of the function and jump to out at the end of that.

> +
> +     mutex_unlock(&b->c->fill_lock);
> +
> +     spin_lock(&b->c->btree_read_time_lock);
> +     time_stats_update(&b->c->btree_read_time, b->io_start_time);
> +     spin_unlock(&b->c->btree_read_time_lock);
> +
> +     smp_wmb(); /* read_done is our write lock */
> +     set_btree_node_read_done(b);
> +
> +     closure_return(cl);
> +}
> +
> +static void btree_read_resubmit(struct closure *cl)
> +{
> +     struct btree *b = container_of(cl, struct btree, io.cl);
> +
> +     submit_bbio_split(b->bio, b->c, &b->key, 0);
> +     continue_at(&b->io.cl, btree_read_done, system_wq);
> +}

I suppose submit_bbio_split() can't fail here somehow unlike from
btree_read() path?  If so, please add a comment to explain and maybe
WARN_ON_ONCE() on failure.  Subtlety to comment ratio is *way* off.

> +static struct btree *mca_bucket_alloc(struct cache_set *c,
> +                                   struct bkey *k, gfp_t gfp)
> +{
> +     struct btree *b = kzalloc(sizeof(struct btree), gfp);
> +     if (!b)
> +             return NULL;
> +
> +     init_rwsem(&b->lock);
> +     INIT_LIST_HEAD(&b->list);
> +     INIT_DELAYED_WORK(&b->work, btree_write_work);
> +     b->c = c;
> +     closure_init_unlocked(&b->io);
> +
> +     mca_data_alloc(b, k, gfp);
> +     return b->sets[0].data ? b : NULL;
> +}

mca_data_alloc() failure path seems like a resource leak but it isn't
because mca_data_alloc() puts it in free list.  Is the extra level of
caching necessary?  How is it different from sl?b allocator cache?
And either way, comments please.

> +static int mca_reap(struct btree *b, struct closure *cl)
> +{
> +     lockdep_assert_held(&b->c->bucket_lock);
> +
> +     if (!down_write_trylock(&b->lock))
> +             return -1;
> +
> +     BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
> +
> +     if (cl && btree_node_dirty(b))
> +             btree_write(b, true, NULL);
> +
> +     if (cl)
> +             closure_wait_event_async(&b->io.wait, cl,
> +                      atomic_read(&b->io.cl.remaining) == -1);
> +
> +     if (btree_node_dirty(b) ||
> +         atomic_read(&b->io.cl.remaining) != -1 ||
> +         work_pending(&b->work.work)) {
> +             rw_unlock(true, b);
> +             return -EAGAIN;
> +     }
> +
> +     return 0;
> +}

Mixing -1 and -EAGAIN returns usually isn't a good idea.

> +static struct btree *alloc_bucket(struct cache_set *c, struct bkey *k,
> +                               int level, struct closure *cl)
> +{
> +     struct btree *b, *i;
> +     unsigned page_order = ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
> +
> +     lockdep_assert_held(&c->bucket_lock);
> +retry:
> +     if (find_bucket(c, k))
> +             return NULL;
> +
> +     /* btree_free() doesn't free memory; it sticks the node on the end of
> +      * the list. Check if there's any freed nodes there:
> +      */
> +     list_for_each_entry(b, &c->btree_cache_freeable, list)
> +             if (page_order <= b->page_order &&
> +                 !b->key.ptr[0] &&
> +                 !mca_reap(b, NULL))
> +                     goto out;
> +
> +     /* We never free struct btree itself, just the memory that holds the on
> +      * disk node. Check the freed list before allocating a new one:
> +      */
> +     list_for_each_entry(b, &c->btree_cache_freed, list)
> +             if (!mca_reap(b, NULL)) {
> +                     mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
> +                     if (!b->sets[0].data) {
> +                             rw_unlock(true, b);
> +                             goto err;
> +                     } else
> +                             goto out;
> +             }
> +
> +     b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
> +     if (!b)
> +             goto err;
> +
> +     BUG_ON(!down_write_trylock(&b->lock));
> +out:
> +     BUG_ON(!closure_is_unlocked(&b->io.cl));
> +
> +     bkey_copy(&b->key, k);
> +     list_move(&b->list, &c->btree_cache);
> +     hlist_del_init_rcu(&b->hash);
> +     hlist_add_head_rcu(&b->hash, hash_bucket(c, k));
> +     lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
> +
> +     b->flags        = 0;
> +     b->level        = level;
> +     b->written      = 0;
> +     b->nsets        = 0;
> +     for (int i = 0; i < MAX_BSETS; i++)
> +             b->sets[i].size = 0;
> +     for (int i = 1; i < MAX_BSETS; i++)
> +             b->sets[i].data = NULL;

Why separate loops?

> +
> +     return b;
> +err:
> +     if (current->bio_list)
> +             return ERR_PTR(-EAGAIN);

What does this test do?

Thanks.

-- 
tejun
--
To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to