Hi Alex,

On 2017/12/26 16:52, alex chen wrote:
> Hi Changwei,
> 
> On 2017/12/26 14:22, Changwei Ge wrote:
>> A crash issue was reported by John.
>> The call trace follows:
>> ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2]
>> ocfs2_change_extent_flag+0x33a/0x470 [ocfs2]
>> ocfs2_mark_extent_written+0x172/0x220 [ocfs2]
>> ocfs2_dio_end_io+0x62d/0x910 [ocfs2]
>> dio_complete+0x19a/0x1a0
>> do_blockdev_direct_IO+0x19dd/0x1eb0
>> __blockdev_direct_IO+0x43/0x50
>> ocfs2_direct_IO+0x8f/0xa0 [ocfs2]
>> generic_file_direct_write+0xb2/0x170
>> __generic_file_write_iter+0xc3/0x1b0
>> ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2]
>> __vfs_write+0xae/0xf0
>> vfs_write+0xb8/0x1b0
>> SyS_write+0x4f/0xb0
>> system_call_fastpath+0x16/0x75
>>
>> The BUG code told that extent tree wants to grow but no metadata
>> was reserved ahead of time.
>>   From my investigation into this issue, the root cause it that although
>> enough metadata is not reserved, there should be enough for following use.
>> Rightmost extent is merged into its left one due to a certain times of
>> marking extent written. Because during marking extent written, we got many
>> physically continuous extents. At last, an empty extent showed up and the
>> rightmost path is removed from extent tree.
>>
>> Add a new mechanism to reuse extent block cached in dealloc which were
>> just unlinked from extent tree to solve this crash issue.
>>
>> Criteria is that during marking extents *written*, if extent rotation
>> and merging results in unlinking extent with growing extent tree later
>> without any metadata reserved ahead of time, try to reuse those extents
>> in dealloc in which deleted extents are cached.
> How can you guarantee that those extent blocks cached by removed the 
> rightmost path
> are enough for next growing extent tree?

It can be definitely guaranteed that we can find enough extent block from 
dealloc,
since the estimation about metadata before operating b+ tree should be accurate.
All extent blocks in dealloc come from that extent tree.

> 
>>
>> Also, this patch addresses the issue John reported that ::dw_zero_count is
>> not calculated properly.
>>
>> After applying this patch, the issue John reported was gone.
>> Thanks for the reproducer provided by John.
>> And this patch has passed ocfs2-test(29 cases) suite running by New H3C 
>> Group.
>>
>> Reported-by: John Lightsey <j...@nixnuts.net>
>> Signed-off-by: Changwei Ge <ge.chang...@h3c.com>
>> ---
>>    fs/ocfs2/alloc.c | 128 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>    fs/ocfs2/alloc.h |   1 +
>>    fs/ocfs2/aops.c  |  14 ++++--
>>    3 files changed, 135 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..eb41074 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct 
>> ocfs2_extent_tree *et,
>>                                   struct ocfs2_extent_rec *rec);
>>    static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
>>    static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
>> +
>> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
>> +                                    struct ocfs2_extent_tree *et,
>> +                                    struct buffer_head **new_eb_bh,
>> +                                    int blk_cnt);
>> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et);
>> +
>>    static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
>>      .eo_set_last_eb_blk     = ocfs2_dinode_set_last_eb_blk,
>>      .eo_get_last_eb_blk     = ocfs2_dinode_get_last_eb_blk,
>> @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct 
>> ocfs2_extent_tree *et,
>>      if (!obj)
>>              obj = (void *)bh->b_data;
>>      et->et_object = obj;
>> +    et->et_dealloc = NULL;
>>    
>>      et->et_ops->eo_fill_root_el(et);
>>      if (!et->et_ops->eo_fill_max_leaf_clusters)
>> @@ -1213,8 +1221,15 @@ static int ocfs2_add_branch(handle_t *handle,
>>              goto bail;
>>      }
>>    
>> -    status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
>> -                                       meta_ac, new_eb_bhs);
>> +    if (meta_ac) {
>> +            status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
>> +                                               meta_ac, new_eb_bhs);
>> +    } else if (!ocfs2_is_dealloc_empty(et)) {
>> +            status = ocfs2_reuse_blk_from_dealloc(handle, et,
>> +                                                  new_eb_bhs, new_blocks);
>> +    } else {
>> +            BUG();
>> +    }
>>      if (status < 0) {
>>              mlog_errno(status);
>>              goto bail;
>> @@ -1347,8 +1362,15 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
>>      struct ocfs2_extent_list  *root_el;
>>      struct ocfs2_extent_list  *eb_el;
>>    
>> -    status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
>> -                                       &new_eb_bh);
>> +    if (meta_ac) {
>> +            status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
>> +                                               &new_eb_bh);
>> +    } else if (!ocfs2_is_dealloc_empty(et)) {
>> +            status = ocfs2_reuse_blk_from_dealloc(handle, et,
>> +                                                  &new_eb_bh, 1);
>> +    } else {
>> +            BUG();
>> +    }
>>      if (status < 0) {
>>              mlog_errno(status);
>>              goto bail;
>> @@ -1511,7 +1533,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct 
>> ocfs2_extent_tree *et,
>>      int depth = le16_to_cpu(el->l_tree_depth);
>>      struct buffer_head *bh = NULL;
>>    
>> -    BUG_ON(meta_ac == NULL);
>> +    BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et));
>>    
>>      shift = ocfs2_find_branch_target(et, &bh);
>>      if (shift < 0) {
>> @@ -6585,6 +6607,102 @@ ocfs2_find_per_slot_free_list(int type,
>>      return fl;
>>    }
>>    
>> +/* Return Value 1 indicates empty */
>> +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et)
>> +{
>> +    struct ocfs2_per_slot_free_list *fl;
>> +    struct ocfs2_super *osb =
>> +            OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
>> +
>> +    if (!et->et_dealloc)
>> +            return 1;
>> +
>> +    fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
>> +                                       osb->slot_num, et->et_dealloc);
>> +
>> +    return fl ? 0 : 1;
>> +}
>> +
>> +/* If extent was deleted from tree due to extent rotation and merging, and
>> + * no metadata is reserved ahead of time. Try to reuse some extents
>> + * just deleted. This is only used to reuse extent blocks.
>> + * It is supposed to find enough extent blocks in dealloc if our estimation
>> + * on metadata is accurate.
>> + */
>> +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle,
>> +                                    struct ocfs2_extent_tree *et,
>> +                                    struct buffer_head **new_eb_bh,
>> +                                    int blk_cnt)
>> +{
>> +    int i, status = -ENOSPC;
>> +    struct ocfs2_cached_dealloc_ctxt *dealloc;
>> +    struct ocfs2_per_slot_free_list *fl;
>> +    struct ocfs2_cached_block_free *bf;
>> +    struct ocfs2_extent_block *eb;
>> +    struct ocfs2_super *osb =
>> +            OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
>> +
>> +    dealloc = et->et_dealloc;
>> +    if (!dealloc)
>> +            goto bail;
>> +
>> +    for (i = 0; i < blk_cnt; i++) {
>> +            fl = ocfs2_find_per_slot_free_list(EXTENT_ALLOC_SYSTEM_INODE,
>> +                                               osb->slot_num, dealloc);
>> +            /* The caller should guarantee that dealloc has cached some
>> +             * extent blocks.
>> +             */
>> +            BUG_ON(!fl);
>> +
> It is too crude to BUGON here.

This can't happen. In my design, we must find a valid extent block here, 
otherwise something
goes wrong.
So we'd better BUG here to find out the code bug ASAP.

> 
>> +            dealloc->c_first_suballocator = fl->f_next_suballocator;
>> +            bf = fl->f_first;
>> +            new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk);
>> +            if (new_eb_bh[i] == NULL) {
>> +                    status = -ENOMEM;
>> +                    mlog_errno(status);
>> +                    goto bail;
>> +            }
>> +
>> +            mlog(0, "Reusing block(%llu) from dealloc\n", bf->free_blk);
>> +
>> +            ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]);
>> +
>> +            status = ocfs2_journal_access_eb(handle, et->et_ci,
>> +                                             new_eb_bh[i],
>> +                                             OCFS2_JOURNAL_ACCESS_CREATE);
>> +            if (status < 0) {
>> +                    mlog_errno(status);
>> +                    goto bail;
>> +            }
>> +
>> +            memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize);
>> +            eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data;
>> +
>> +            /* We can't guarantee that buffer head is still cached, so
>> +             * polutlate the extent block again.
>> +             */
>> +            strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
>> +            eb->h_blkno = cpu_to_le64(bf->free_blk);
>> +            eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
>> +            eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
>> +            eb->h_suballoc_loc = cpu_to_le64(bf->free_bg);
>> +            eb->h_suballoc_bit = cpu_to_le16(bf->free_bit);
>> +            eb->h_list.l_count =
>> +                    cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
>> +
>> +            /* We'll also be dirtied by the caller, so
>> +             * this isn't absolutely necessary.
>> +             */
>> +            ocfs2_journal_dirty(handle, new_eb_bh[i]);
>> +
>> +            kfree(fl);
>> +            kfree(bf);
>> +    }
>> +
>> +bail:
>> +    return status;
>> +}
>> +
>>    int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
>>                            int type, int slot, u64 suballoc,
>>                            u64 blkno, unsigned int bit)
>> diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
>> index 27b75cf..250bcac 100644
>> --- a/fs/ocfs2/alloc.h
>> +++ b/fs/ocfs2/alloc.h
>> @@ -61,6 +61,7 @@ struct ocfs2_extent_tree {
>>      ocfs2_journal_access_func               et_root_journal_access;
>>      void                                    *et_object;
>>      unsigned int                            et_max_leaf_clusters;
>> +    struct ocfs2_cached_dealloc_ctxt        *et_dealloc;
>>    };
>>    
>>    /*
>> diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
>> index d151632..1bbd5c8 100644
>> --- a/fs/ocfs2/aops.c
>> +++ b/fs/ocfs2/aops.c
>> @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt {
>>      struct ocfs2_cached_dealloc_ctxt w_dealloc;
>>    
>>      struct list_head                w_unwritten_list;
>> +    unsigned int                    w_unwritten_count;
>>    };
>>    
>>    void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
>> @@ -1386,6 +1387,7 @@ static int ocfs2_unwritten_check(struct inode *inode,
>>      desc->c_clear_unwritten = 0;
>>      list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list);
>>      list_add_tail(&new->ue_node, &wc->w_unwritten_list);
>> +    wc->w_unwritten_count++;
>>      new = NULL;
>>    unlock:
>>      spin_unlock(&oi->ip_lock);
>> @@ -1762,8 +1764,8 @@ int ocfs2_write_begin_nolock(struct address_space 
>> *mapping,
>>               */
>>              ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
>>                                            wc->w_di_bh);
>> -            ret = ocfs2_lock_allocators(inode, &et,
>> -                                        clusters_to_alloc, extents_to_split,
>> +            ret = ocfs2_lock_allocators(inode, &et, clusters_to_alloc,
>> +                                        2*extents_to_split,
> Why here is 2*extents_to_split not extents_to_split?

Like other code snippet where ocfs2_lock_allocators is invoked, we have to 
double extent number passing
it into ocfs2_lock_allocators.

BTW, I have already posted v2 of this patch.
Please comment in that patch.

Thanks,
Changwei

> 
>>                                          &data_ac, &meta_ac);
>>              if (ret) {
>>                      mlog_errno(ret);
>> @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, 
>> sector_t iblock,
>>              ue->ue_phys = desc->c_phys;
>>    
>>              list_splice_tail_init(&wc->w_unwritten_list, 
>> &dwc->dw_zero_list);
>> -            dwc->dw_zero_count++;
>> +            dwc->dw_zero_count += wc->w_unwritten_count;
>>      }
>>    
>>      ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
>> @@ -2330,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode,
>>    
>>      ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
>>    
>> +    /* Attach dealloc with extent tree in case that we may reuse extents
>> +     * which are already unlinked from current extent tree due to extent
>> +     * rotation and merging.
>> +     */
>> +    et.et_dealloc = &dealloc;
>> +
>>      ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2,
>>                                  &data_ac, &meta_ac);
>>      if (ret) {
>>
> 
> 


_______________________________________________
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel

Reply via email to