On Wed, 23 Jan 2013 11:56:55 +0800, Liu Bo wrote:
> On Wed, Jan 23, 2013 at 10:54:39AM +0800, Miao Xie wrote:
>> On Tue, 22 Jan 2013 22:24:15 +0800, Liu Bo wrote:
>>> On Tue, Jan 22, 2013 at 06:49:00PM +0800, Miao Xie wrote:
>>>> btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the
>>>> first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/
>>>> btrfs_queue_worker for this inode, and then locks the list and checks the
>>>> head of the list again. But because the first inode is never removed from
>>>> the list, it fetches the same inode again. As a result, this function
>>>> allocates a huge number of btrfs_delalloc_work structures, and OOM happens.
>>>>
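For reference, the loop described above had roughly this shape (a condensed
sketch of the pattern, not the verbatim kernel code):

	spin_lock(&root->fs_info->delalloc_lock);
	while (!list_empty(head)) {
		/* the head is fetched but never removed from the list */
		binode = list_entry(head->next, struct btrfs_inode,
				    delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		spin_unlock(&root->fs_info->delalloc_lock);

		/* a work item is allocated, possibly for the same inode again */
		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
		btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);

		spin_lock(&root->fs_info->delalloc_lock);
		/*
		 * if the flush has not finished yet, head->next is still
		 * the same inode, so work items pile up until OOM
		 */
	}
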
>>>> Fix this problem by splicing the delalloc list.
>>>>
>>>> Reported-by: Alex Lyakas <[email protected]>
>>>> Signed-off-by: Miao Xie <[email protected]>
>>>> ---
>>>>  fs/btrfs/inode.c |   55 ++++++++++++++++++++++++++++++++++++++++-------------
>>>>  1 files changed, 41 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>>> index 67ed24a..86f1d25 100644
>>>> --- a/fs/btrfs/inode.c
>>>> +++ b/fs/btrfs/inode.c
>>>> @@ -7545,41 +7545,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
>>>>   */
>>>>  int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>>  {
>>>> -  struct list_head *head = &root->fs_info->delalloc_inodes;
>>>>    struct btrfs_inode *binode;
>>>>    struct inode *inode;
>>>>    struct btrfs_delalloc_work *work, *next;
>>>>    struct list_head works;
>>>> +  struct list_head splice;
>>>>    int ret = 0;
>>>>  
>>>>    if (root->fs_info->sb->s_flags & MS_RDONLY)
>>>>            return -EROFS;
>>>>  
>>>>    INIT_LIST_HEAD(&works);
>>>> -
>>>> +  INIT_LIST_HEAD(&splice);
>>>> +again:
>>>>    spin_lock(&root->fs_info->delalloc_lock);
>>>> -  while (!list_empty(head)) {
>>>> -          binode = list_entry(head->next, struct btrfs_inode,
>>>> +  list_splice_init(&root->fs_info->delalloc_inodes, &splice);
>>>> +  while (!list_empty(&splice)) {
>>>> +          binode = list_entry(splice.next, struct btrfs_inode,
>>>>                                delalloc_inodes);
>>>> +
>>>> +          list_del_init(&binode->delalloc_inodes);
>>>> +
>>>
>>> I believe this patch can work well, but it's a little complex.
>>>
>>> How about adding a flag in runtime_flags set?
>>
>> I have tried adding a flag in runtime_flags, but I found it is not a good
>> way, because:
>> - it cannot avoid traversing the delalloc list repeatedly when someone
>>   writes data into the file endlessly. In fact, re-scanning is unnecessary
>>   because we can treat that data as data written after the flush is done
>>   (see the sketch below).
>> - a bit operation needs to lock the bus, but we already have a spin lock
>>   protecting all the relevant variables, so it is unnecessary.
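The property the splice approach relies on is that list_splice_init() moves
the whole list in one step (a condensed sketch of that part of the patch):

	spin_lock(&root->fs_info->delalloc_lock);
	/*
	 * detach every inode that is dirty right now onto the private
	 * list; fs_info->delalloc_inodes is left empty, so an inode
	 * dirtied after this point lands on the (empty) global list and
	 * is not re-scanned by this pass; it counts as data written
	 * after the flush started
	 */
	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		/* entries are removed from splice while still under the lock */
		...
	}
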
>>
>> Besides that, there is something wrong with the following patch.
> 
> Okay, I see the problem.
> 
> But with [PATCH 4/5], I think maybe we can merge these two patches and
> simplify things as follows?
> 
> Just flush them once,
> 
>       spin_lock(&root->fs_info->delalloc_lock);
>       list_splice_init(&root->fs_info->delalloc_inodes, &splice);
>       spin_unlock(&root->fs_info->delalloc_lock);
> 
>       while (!list_empty(&splice)) {
>               ...
>       }

No, we can't. The other tasks that flush the delalloc data may remove the inode
from the delalloc list/splice list. If we release the lock, we will hit a race
between the list traversal and list_del().
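
Roughly, this is the interleaving we would risk if the splice list were walked
without the lock (hypothetical sketch):

	flushing task (lock dropped)       task clearing delalloc bytes
	----------------------------       ----------------------------
	binode = list_entry(splice.next,
			    struct btrfs_inode, delalloc_inodes);
	                                   spin_lock(&delalloc_lock);
	                                   list_del_init(&binode->delalloc_inodes);
	                                   spin_unlock(&delalloc_lock);
	advance via binode->delalloc_inodes.next
	  -> the entry now points to itself, so the walk never reaches
	     the rest of the spliced list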

Thanks
Miao

> 
> thanks,
> liubo
> 
>>
>>> We can use the flag instead of the 'delalloc_inodes' list to tell if we
>>> have cleared the delalloc bytes, and the most important thing is it
>>> won't touch the original code logic too much.
>>>
>>> thanks,
>>> liubo
>>>
>>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> index 67ed24a..692ed0e 100644
>>> --- a/fs/btrfs/inode.c
>>> +++ b/fs/btrfs/inode.c
>>> @@ -1555,8 +1555,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
>>>             BTRFS_I(inode)->delalloc_bytes -= len;
>>>  
>>>             if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
>>> -               !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
>>> -                   list_del_init(&BTRFS_I(inode)->delalloc_inodes);
>>> +               test_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags)) {
>>> +                   clear_bit(BTRFS_INODE_FLUSH, &BTRFS_I(inode)->runtime_flags);
>>>             }
>>
>> We cannot remove list_del_init(), because the delalloc file can be flushed
>> not only by btrfs_start_delalloc_inodes(), but also by the flusher or by a
>> task that invokes btrfs_sync_file().
>>
>>>             spin_unlock(&root->fs_info->delalloc_lock);
>>>     }
>>> @@ -7562,8 +7562,9 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>             binode = list_entry(head->next, struct btrfs_inode,
>>>                                 delalloc_inodes);
>>>             inode = igrab(&binode->vfs_inode);
>>> -           if (!inode)
>>> -                   list_del_init(&binode->delalloc_inodes);
>>> +
>>> +           list_del_init(&binode->delalloc_inodes);
>>> +
>>>             spin_unlock(&root->fs_info->delalloc_lock);
>>>             if (inode) {
>>>                     work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>>> @@ -7572,6 +7573,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>                             goto out;
>>>                     }
>>>                     list_add_tail(&work->list, &works);
>>> +                   set_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags);
>>
>> If someone flushes the file before set_bit(), no one will clear the bit.
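
That is, with the flag approach the following interleaving would leave the bit
set forever (sketch):

	btrfs_start_delalloc_inodes()      flusher / btrfs_sync_file()
	-----------------------------      ---------------------------
	btrfs_queue_worker(...);
	                                   delalloc_bytes drops to 0,
	                                   btrfs_clear_bit_hook() runs,
	                                   BTRFS_INODE_FLUSH is not set
	                                   yet, so nothing is cleared
	set_bit(BTRFS_INODE_FLUSH, ...);   <- never cleared afterwards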
>>
>>>                     btrfs_queue_worker(&root->fs_info->flush_workers,
>>>                                        &work->work);
>>>             }
>>> @@ -7580,6 +7582,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>     }
>>>     spin_unlock(&root->fs_info->delalloc_lock);
>>>  
>>> +   /* make sure we clear all delalloc bytes we have scheduled */
>>> +   while (!list_empty(&works)) {
>>> +           work = list_entry(works.next, struct btrfs_delalloc_work,
>>> +                             list);
>>> +           binode = btrfs_ino(work->inode);
>>
>>                      ^^^^^^BTRFS_I(), not btrfs_ino()
>>
>>> +           if (!test_bit(BTRFS_INODE_FLUSH, &binode->runtime_flags)) {
>>> +                   list_del_init(&work->list);
>>> +                   btrfs_wait_and_free_delalloc_work(work);
>>
>> We must wait for and free all the delalloc work here, or a memory leak will happen.
>>
>> Thanks
>> Miao
>>
>>> +           }
>>> +           cond_resched();
>>> +   }
>>> +
>>>     /* the filemap_flush will queue IO into the worker threads, but
>>>      * we have to make sure the IO is actually started and that
>>>      * ordered extents get created before we return
>>>
>>>
>>>
>>>>            inode = igrab(&binode->vfs_inode);
>>>>            if (!inode)
>>>> -                  list_del_init(&binode->delalloc_inodes);
>>>> +                  continue;
>>>> +
>>>> +          list_add_tail(&binode->delalloc_inodes,
>>>> +                        &root->fs_info->delalloc_inodes);
>>>>            spin_unlock(&root->fs_info->delalloc_lock);
>>>> -          if (inode) {
>>>> -                  work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>>>> -                  if (!work) {
>>>> -                          ret = -ENOMEM;
>>>> -                          goto out;
>>>> -                  }
>>>> -                  list_add_tail(&work->list, &works);
>>>> -                  btrfs_queue_worker(&root->fs_info->flush_workers,
>>>> -                                     &work->work);
>>>> +
>>>> +          work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
>>>> +          if (unlikely(!work)) {
>>>> +                  ret = -ENOMEM;
>>>> +                  goto out;
>>>>            }
>>>> +          list_add_tail(&work->list, &works);
>>>> +          btrfs_queue_worker(&root->fs_info->flush_workers,
>>>> +                             &work->work);
>>>> +
>>>>            cond_resched();
>>>>            spin_lock(&root->fs_info->delalloc_lock);
>>>>    }
>>>>    spin_unlock(&root->fs_info->delalloc_lock);
>>>>  
>>>> +  list_for_each_entry_safe(work, next, &works, list) {
>>>> +          list_del_init(&work->list);
>>>> +          btrfs_wait_and_free_delalloc_work(work);
>>>> +  }
>>>> +
>>>> +  spin_lock(&root->fs_info->delalloc_lock);
>>>> +  if (!list_empty(&root->fs_info->delalloc_inodes)) {
>>>> +          spin_unlock(&root->fs_info->delalloc_lock);
>>>> +          goto again;
>>>> +  }
>>>> +  spin_unlock(&root->fs_info->delalloc_lock);
>>>> +
>>>>    /* the filemap_flush will queue IO into the worker threads, but
>>>>     * we have to make sure the IO is actually started and that
>>>>     * ordered extents get created before we return
>>>> @@ -7592,11 +7612,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
>>>>                atomic_read(&root->fs_info->async_delalloc_pages) == 0));
>>>>    }
>>>>    atomic_dec(&root->fs_info->async_submit_draining);
>>>> +  return 0;
>>>>  out:
>>>>    list_for_each_entry_safe(work, next, &works, list) {
>>>>            list_del_init(&work->list);
>>>>            btrfs_wait_and_free_delalloc_work(work);
>>>>    }
>>>> +
>>>> +  if (!list_empty_careful(&splice)) {
>>>> +          spin_lock(&root->fs_info->delalloc_lock);
>>>> +          list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
>>>> +          spin_unlock(&root->fs_info->delalloc_lock);
>>>> +  }
>>>>    return ret;
>>>>  }
>>>>  
>>>> -- 
>>>> 1.6.5.2
>>>
>>
> 
