On Wed, Jan 23, 2013 at 10:54:39AM +0800, Miao Xie wrote:
> On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
> > On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
> >> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
> >> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
> >> btrfs_queue_worker for this inode, and then it locks the list, checks the
> >> head of the list again. But because we don't delete the first inode that it
> >> deals with before, it will fetch the same inode. As a result, this function
> >> allocates a huge amount of btrfs_delalloc_work structures, and OOM happens.
> >>
> >> Fix this problem by splice this delalloc list.
> >>
> >> Reported-by: Alex Lyakas <alex.bt...@zadarastorage.com>
> >> Signed-off-by: Miao Xie <mi...@cn.fujitsu.com>
> >> ---
> >>  fs/btrfs/inode.c |   55 
> >> ++++++++++++++++++++++++++++++++++++++++-------------
> >>  1 files changed, 41 insertions(+), 14 deletions(-)
> >>
> >> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> >> index 67ed24a..86f1d25 100644
> >> --- a/fs/btrfs/inode.c
> >> +++ b/fs/btrfs/inode.c
> >> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct 
> >> btrfs_delalloc_work *work)
> >>   */
> >>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
> >>  {
> >> -  struct list_head *head = &root->fs_info->delalloc_inodes;
> >>    struct btrfs_inode *binode;
> >>    struct inode *inode;
> >>    struct btrfs_delalloc_work *work, *next;
> >>    struct list_head works;
> >> +  struct list_head splice;
> >>    int ret = 0;
> >>  
> >>    if (root->fs_info->sb->s_flags & MS_RDONLY)
> >>            return -EROFS;
> >>  
> >>    INIT_LIST_HEAD(&works);
> >> -
> >> +  INIT_LIST_HEAD(&splice);
> >> +again:
> >>    spin_lock(&root->fs_info->delalloc_lock);
> >> -  while (!list_empty(head)) {
> >> -          binode = list_entry(head->next, struct btrfs_inode,
> >> +  list_splice_init(&root->fs_info->delalloc_inodes, &splice);
> >> +  while (!list_empty(&splice)) {
> >> +          binode = list_entry(splice.next, struct btrfs_inode,
> >>                                delalloc_inodes);
> >> +
> >> +          list_del_init(&binode->delalloc_inodes);
> >> +
> > 
> > I believe this patch can work well, but it's a little complex.
> > 
> > How about adding a flag in runtime_flags set?
> 
> I have tried to adding a flag in runtime_flags, but I found it is not a good
> way, because
> - it can not avoid traversing the delalloc list repeatedly when someone write
>   data into the file endlessly. In fact, it is unnecessary because we can just
>   see that data as the one which is written after the flush is done.
> - bit operation need lock the bus, but we have a spin lock to protect all
>   the relative variants, so it is unnecessary.
> 
> besides that, there is something wrong with the following patch.

Okay, I see the problem.

But with [PATCH 4/5], I think maybe we can merge these two patches and
simplify things as following?

Just flush them once,

        spin_lock(&root->fs_info->delalloc_lock);
        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
        spin_unlock(&root->fs_info->delalloc_lock);

        while (!list_empty(&splice)) {
                ...
        }

thanks,
liubo

> 
> > We can use the flag instead of 'delalloc_inodes' list to tell if we
> > have clear the delalloc bytes, and the most important thing is it
> > won't touch the original code logic too much.
> > 
> > thanks,
> > liubo
> > 
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 67ed24a..692ed0e 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
> >             BTRFS_I(inode)->delalloc_bytes -= len;
> >  
> >             if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
> > -               !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
> > -                   list_del_init(&BTRFS_I(inode)->delalloc_inodes);
> > +               test_bit(BTRFS_INODE_FLUSH, 
> > &BTRFS_I(inode)->runtime_flags)) {
> > +                   clear_bit(BTRFS_INODE_FLUSH, 
> > &BTRFS_I(inode)->runtime_flags);
> >             }
> 
> We can not remove list_del_init(), because the delalloc file can be flushed 
> not only
> by btrfs_start_delalloc_inodes(), but also by flusher or the task who invoke 
> btrfs_sync_file().
> 
> >             spin_unlock(&root->fs_info->delalloc_lock);
> >     }
> > @@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root 
> > *root, int delay_iput)
> >             binode = list_entry(head->next, struct btrfs_inode,
> >                                 delalloc_inodes);
> >             inode = igrab(&binode->vfs_inode);
> > -           if (!inode)
> > -                   list_del_init(&binode->delalloc_inodes);
> > +
> > +           list_del_init(&binode->delalloc_inodes);
> > +
> >             spin_unlock(&root->fs_info->delalloc_lock);
> >             if (inode) {
> >                     work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> > @@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root 
> > *root, int delay_iput)
> >                             goto out;
> >                     }
> >                     list_add_tail(&work->list, &works);
> > +                   set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);
> 
> if someone flush the file before set_bit(), no one will clear bit.
> 
> >                     btrfs_queue_worker(&root->fs_info->flush_workers,
> >                                        &work->work);
> >             }
> > @@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root 
> > *root, int delay_iput)
> >     }
> >     spin_unlock(&root->fs_info->delalloc_lock);
> >  
> > +   /* make sure we clear all delalloc bytes we have scheduled */
> > +   while (!list_empty(&works)) {
> > +           work = list_entry(works.next, struct btrfs_delalloc_work,
> > +                             list);
> > +           binode = btrfs_ino(work->inode);
> 
>                       ^^^^^^BTRFS_I(), not btrfs_ino()
> 
> > +           if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
> > +                   list_del_init(&work->list);
> > +                   btrfs_wait_and_free_delalloc_work(work);
> 
> We must wait and free all the delalloc work here, or memory leak will happen.
> 
> Thanks
> Miao
> 
> > +           }
> > +           cond_resched();
> > +   }
> > +
> >     /* the filemap_flush will queue IO into the worker threads, but
> >      * we have to make sure the IO is actually started and that
> >      * ordered extents get created before we return
> > 
> > 
> > 
> >>            inode = igrab(&binode->vfs_inode);
> >>            if (!inode)
> >> -                  list_del_init(&binode->delalloc_inodes);
> >> +                  continue;
> >> +
> >> +          list_add_tail(&binode->delalloc_inodes,
> >> +                        &root->fs_info->delalloc_inodes);
> >>            spin_unlock(&root->fs_info->delalloc_lock);
> >> -          if (inode) {
> >> -                  work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> >> -                  if (!work) {
> >> -                          ret = -ENOMEM;
> >> -                          goto out;
> >> -                  }
> >> -                  list_add_tail(&work->list, &works);
> >> -                  btrfs_queue_worker(&root->fs_info->flush_workers,
> >> -                                     &work->work);
> >> +
> >> +          work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
> >> +          if (unlikely(!work)) {
> >> +                  ret = -ENOMEM;
> >> +                  goto out;
> >>            }
> >> +          list_add_tail(&work->list, &works);
> >> +          btrfs_queue_worker(&root->fs_info->flush_workers,
> >> +                             &work->work);
> >> +
> >>            cond_resched();
> >>            spin_lock(&root->fs_info->delalloc_lock);
> >>    }
> >>    spin_unlock(&root->fs_info->delalloc_lock);
> >>  
> >> +  list_for_each_entry_safe(work, next, &works, list) {
> >> +          list_del_init(&work->list);
> >> +          btrfs_wait_and_free_delalloc_work(work);
> >> +  }
> >> +
> >> +  spin_lock(&root->fs_info->delalloc_lock);
> >> +  if (!list_empty(&root->fs_info->delalloc_inodes)) {
> >> +          spin_unlock(&root->fs_info->delalloc_lock);
> >> +          goto again;
> >> +  }
> >> +  spin_unlock(&root->fs_info->delalloc_lock);
> >> +
> >>    /* the filemap_flush will queue IO into the worker threads, but
> >>     * we have to make sure the IO is actually started and that
> >>     * ordered extents get created before we return
> >> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root 
> >> *root, int delay_iput)
> >>                atomic_read(&root->fs_info->async_delalloc_pages) == 0));
> >>    }
> >>    atomic_dec(&root->fs_info->async_submit_draining);
> >> +  return 0;
> >>  out:
> >>    list_for_each_entry_safe(work, next, &works, list) {
> >>            list_del_init(&work->list);
> >>            btrfs_wait_and_free_delalloc_work(work);
> >>    }
> >> +
> >> +  if (!list_empty_careful(&splice)) {
> >> +          spin_lock(&root->fs_info->delalloc_lock);
> >> +          list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
> >> +          spin_unlock(&root->fs_info->delalloc_lock);
> >> +  }
> >>    return ret;
> >>  }
> >>  
> >> -- 
> >> 1.6.5.2
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >> the body of a message to majord...@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to