On Sat, Mar 22, 2014 at 11:44:46AM +0100, Jiri Slaby wrote:
> From: Joe Thornber <[email protected]>
> 
> 3.12-stable review patch.  If anyone has any objections, please let me know.

This patch is tagged for stable 3.13+ so I'm not sure it's suitable for a
3.12 kernel.

Cheers,
--
Luís

> ===============
> 
> commit cebc2de44d3bce53e46476e774126c298ca2c8a9 upstream.
> 
> This has been a relatively long-standing issue that wasn't nailed down
> until Teng-Feng Yang's meticulous bug report to dm-devel on 3/7/2014,
> see: http://www.redhat.com/archives/dm-devel/2014-March/msg00021.html
> 
> From that report:
>   "When decreasing the reference count of a metadata block with its
>   reference count equals 3, we will call dm_btree_remove() to remove
>   this enrty from the B+tree which keeps the reference count info in
>   metadata device.
> 
>   The B+tree will try to rebalance the entry of the child nodes in each
>   node it traversed, and the rebalance process contains the following
>   steps.
> 
>   (1) Finding the corresponding children in current node (shadow_current(s))
>   (2) Shadow the children block (issue BOP_INC)
>   (3) redistribute keys among children, and free children if necessary (issue 
> BOP_DEC)
> 
>   Since the update of a metadata block's reference count could be
>   recursive, we will stash these reference count update operations in
>   smm->uncommitted and then process them in a FILO fashion.
> 
>   The problem is that step(3) could free the children which is created
>   in step(2), so the BOP_DEC issued in step(3) will be carried out
>   before the BOP_INC issued in step(2) since these BOPs will be
>   processed in FILO fashion. Once the BOP_DEC from step(3) tries to
>   decrease the reference count of newly shadow block, it will report
>   failure for its reference equals 0 before decreasing. It looks like we
>   can solve this issue by processing these BOPs in a FIFO fashion
>   instead of FILO."
> 
> Commit 5b564d80 ("dm space map: disallow decrementing a reference count
> below zero") changed the code to report an error for this temporary
> refcount decrement below zero.  So what was previously a harmless
> invalid refcount became a hard failure due to the new error path:
> 
>  device-mapper: space map common: unable to decrement a reference count below > 0
>  device-mapper: thin: 253:6: dm_thin_insert_block() failed: error = -22
>  device-mapper: thin: 253:6: switching pool to read-only mode
> 
> This bug is in dm persistent-data code that is common to the DM thin and
> cache targets.  So any users of those targets should apply this fix.
> 
> Fix this by applying recursive space map operations in FIFO order rather
> than FILO.
> 
> Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=68801
> 
> Reported-by: Apollon Oikonomopoulos <[email protected]>
> Reported-by: [email protected]
> Reported-by: Teng-Feng Yang <[email protected]>
> Signed-off-by: Joe Thornber <[email protected]>
> Signed-off-by: Mike Snitzer <[email protected]>
> Signed-off-by: Jiri Slaby <[email protected]>
> ---
>  drivers/md/persistent-data/dm-space-map-metadata.c | 113 
> +++++++++++++++++----
>  1 file changed, 92 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c 
> b/drivers/md/persistent-data/dm-space-map-metadata.c
> index afb419e514bf..579b58200bf2 100644
> --- a/drivers/md/persistent-data/dm-space-map-metadata.c
> +++ b/drivers/md/persistent-data/dm-space-map-metadata.c
> @@ -91,6 +91,69 @@ struct block_op {
>       dm_block_t block;
>  };
>  
> +struct bop_ring_buffer {
> +     unsigned begin;
> +     unsigned end;
> +     struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
> +};
> +
> +static void brb_init(struct bop_ring_buffer *brb)
> +{
> +     brb->begin = 0;
> +     brb->end = 0;
> +}
> +
> +static bool brb_empty(struct bop_ring_buffer *brb)
> +{
> +     return brb->begin == brb->end;
> +}
> +
> +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
> +{
> +     unsigned r = old + 1;
> +     return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
> +}
> +
> +static int brb_push(struct bop_ring_buffer *brb,
> +                 enum block_op_type type, dm_block_t b)
> +{
> +     struct block_op *bop;
> +     unsigned next = brb_next(brb, brb->end);
> +
> +     /*
> +      * We don't allow the last bop to be filled, this way we can
> +      * differentiate between full and empty.
> +      */
> +     if (next == brb->begin)
> +             return -ENOMEM;
> +
> +     bop = brb->bops + brb->end;
> +     bop->type = type;
> +     bop->block = b;
> +
> +     brb->end = next;
> +
> +     return 0;
> +}
> +
> +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
> +{
> +     struct block_op *bop;
> +
> +     if (brb_empty(brb))
> +             return -ENODATA;
> +
> +     bop = brb->bops + brb->begin;
> +     result->type = bop->type;
> +     result->block = bop->block;
> +
> +     brb->begin = brb_next(brb, brb->begin);
> +
> +     return 0;
> +}
> +
> +/*----------------------------------------------------------------*/
> +
>  struct sm_metadata {
>       struct dm_space_map sm;
>  
> @@ -101,25 +164,20 @@ struct sm_metadata {
>  
>       unsigned recursion_count;
>       unsigned allocated_this_transaction;
> -     unsigned nr_uncommitted;
> -     struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
> +     struct bop_ring_buffer uncommitted;
>  
>       struct threshold threshold;
>  };
>  
>  static int add_bop(struct sm_metadata *smm, enum block_op_type type, 
> dm_block_t b)
>  {
> -     struct block_op *op;
> +     int r = brb_push(&smm->uncommitted, type, b);
>  
> -     if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
> +     if (r) {
>               DMERR("too many recursive allocations");
>               return -ENOMEM;
>       }
>  
> -     op = smm->uncommitted + smm->nr_uncommitted++;
> -     op->type = type;
> -     op->block = b;
> -
>       return 0;
>  }
>  
> @@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
>               return -ENOMEM;
>       }
>  
> -     if (smm->recursion_count == 1 && smm->nr_uncommitted) {
> -             while (smm->nr_uncommitted && !r) {
> -                     smm->nr_uncommitted--;
> -                     r = commit_bop(smm, smm->uncommitted +
> -                                    smm->nr_uncommitted);
> +     if (smm->recursion_count == 1) {
> +             while (!brb_empty(&smm->uncommitted)) {
> +                     struct block_op bop;
> +
> +                     r = brb_pop(&smm->uncommitted, &bop);
> +                     if (r) {
> +                             DMERR("bug in bop ring buffer");
> +                             break;
> +                     }
> +
> +                     r = commit_bop(smm, &bop);
>                       if (r)
>                               break;
>               }
> @@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map 
> *sm, dm_block_t *count)
>  static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
>                                uint32_t *result)
>  {
> -     int r, i;
> +     int r;
> +     unsigned i;
>       struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
>       unsigned adjustment = 0;
>  
> @@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map 
> *sm, dm_block_t b,
>        * We may have some uncommitted adjustments to add.  This list
>        * should always be really short.
>        */
> -     for (i = 0; i < smm->nr_uncommitted; i++) {
> -             struct block_op *op = smm->uncommitted + i;
> +     for (i = smm->uncommitted.begin;
> +          i != smm->uncommitted.end;
> +          i = brb_next(&smm->uncommitted, i)) {
> +             struct block_op *op = smm->uncommitted.bops + i;
>  
>               if (op->block != b)
>                       continue;
> @@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, 
> dm_block_t b,
>  static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
>                                             dm_block_t b, int *result)
>  {
> -     int r, i, adjustment = 0;
> +     int r, adjustment = 0;
> +     unsigned i;
>       struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
>       uint32_t rc;
>  
> @@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct 
> dm_space_map *sm,
>        * We may have some uncommitted adjustments to add.  This list
>        * should always be really short.
>        */
> -     for (i = 0; i < smm->nr_uncommitted; i++) {
> -             struct block_op *op = smm->uncommitted + i;
> +     for (i = smm->uncommitted.begin;
> +          i != smm->uncommitted.end;
> +          i = brb_next(&smm->uncommitted, i)) {
> +
> +             struct block_op *op = smm->uncommitted.bops + i;
>  
>               if (op->block != b)
>                       continue;
> @@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
>       smm->begin = superblock + 1;
>       smm->recursion_count = 0;
>       smm->allocated_this_transaction = 0;
> -     smm->nr_uncommitted = 0;
> +     brb_init(&smm->uncommitted);
>       threshold_init(&smm->threshold);
>  
>       memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
> @@ -713,7 +784,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
>       smm->begin = 0;
>       smm->recursion_count = 0;
>       smm->allocated_this_transaction = 0;
> -     smm->nr_uncommitted = 0;
> +     brb_init(&smm->uncommitted);
>       threshold_init(&smm->threshold);
>  
>       memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
> -- 
> 1.9.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe stable" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to