Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-16 Thread Changwei Ge
It looks good to me.

Reviewed-by: Changwei Ge 

On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
> 
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
> 
> Signed-off-by: Gang He 
> ---
>   fs/ocfs2/alloc.c | 44 
>   1 file changed, 44 insertions(+)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;
>   
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   
>   trace_ocfs2_trim_fs(start, len, minlen);
>   
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> + if (ret < 0) {
> + if (ret != -EAGAIN) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +  "finish, which is running from another node.\n",
> +  osb->dev_str);
> + ret = ocfs2_trim_fs_lock(osb, , 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + if (info.tf_valid && info.tf_success &&
> + info.tf_start == start && info.tf_len == len &&
> + info.tf_minlen == minlen) {
> + /* Avoid sending duplicated trim to a shared device */
> + mlog(ML_NOTICE, "The same trim on device (%s) was "
> +  "just done from node (%u), return.\n",
> +  osb->dev_str, info.tf_nodenum);
> + range->len = info.tf_trimlen;
> + goto out_trimunlock;
> + }
> + }
> +
> + info.tf_nodenum = osb->node_num;
> + info.tf_start = start;
> + info.tf_len = len;
> + info.tf_minlen = minlen;
> +
>   /* Determine first and last group to examine based on start and len */
>   first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>   if (first_group == osb->first_cluster_group_blkno)
> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   }
>   range->len = trimmed * sb->s_blocksize;
> +
> + info.tf_trimlen = range->len;
> + info.tf_success = (ret ? 0 : 1);
> + pinfo = 
> +out_trimunlock:
> + ocfs2_trim_fs_unlock(osb, pinfo);
> + ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>   ocfs2_inode_unlock(main_bm_inode, 0);
>   brelse(main_bm_bh);
> 


Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-16 Thread Changwei Ge
It looks good to me.

Reviewed-by: Changwei Ge 

On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
> 
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
> 
> Signed-off-by: Gang He 
> ---
>   fs/ocfs2/alloc.c | 44 
>   1 file changed, 44 insertions(+)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;
>   
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   
>   trace_ocfs2_trim_fs(start, len, minlen);
>   
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> + if (ret < 0) {
> + if (ret != -EAGAIN) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +  "finish, which is running from another node.\n",
> +  osb->dev_str);
> + ret = ocfs2_trim_fs_lock(osb, , 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + if (info.tf_valid && info.tf_success &&
> + info.tf_start == start && info.tf_len == len &&
> + info.tf_minlen == minlen) {
> + /* Avoid sending duplicated trim to a shared device */
> + mlog(ML_NOTICE, "The same trim on device (%s) was "
> +  "just done from node (%u), return.\n",
> +  osb->dev_str, info.tf_nodenum);
> + range->len = info.tf_trimlen;
> + goto out_trimunlock;
> + }
> + }
> +
> + info.tf_nodenum = osb->node_num;
> + info.tf_start = start;
> + info.tf_len = len;
> + info.tf_minlen = minlen;
> +
>   /* Determine first and last group to examine based on start and len */
>   first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>   if (first_group == osb->first_cluster_group_blkno)
> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   }
>   range->len = trimmed * sb->s_blocksize;
> +
> + info.tf_trimlen = range->len;
> + info.tf_success = (ret ? 0 : 1);
> + pinfo = 
> +out_trimunlock:
> + ocfs2_trim_fs_unlock(osb, pinfo);
> + ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>   ocfs2_inode_unlock(main_bm_inode, 0);
>   brelse(main_bm_bh);
> 


Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-11 Thread Changwei Ge
On 2018/1/11 15:57, Gang He wrote:
> 
> 
> 

>> On 2018/1/11 15:19, Gang He wrote:
>>>
>>>
>>>
>>
 On 2018/1/11 12:31, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/11 11:33, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/11 10:07, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 18:14, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
>
>

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' 
>>> returned
>>> value and behavior in case which meets a fstrim command is 
>>> running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>  fs/ocfs2/alloc.c | 44 
>>> 
>>>  1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>>> struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL 
> depend on the
 code logic.

 This point is OK for me.

>
>>>  
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block 
>>> *sb, struct
>> fstrim_range *range)
>>>  
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>  
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same 
>> lock again
>> later but wait until granted.
> Please think about the user case, the patch is only used to 
> handle this
 case.
> When the administer configures a fstrim schedule task on each 
> node, then
 each node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk 
> since this
 will waste CPU/IO resources and affect SSD lifetime sometimes.

 I'm not worrying about that trimfs will affect SSD's lifetime 
 quite a lot,
 since physical-logical address converting table resides in RAM 
 while SSD is
 working.
 And that table won't be at a big scale. My point here is not 
 affecting this
 patch. Just a tip here.
>>> This depend on SSD firmware implementation, but for secure-trim, it 
>>> really
>> possibly affect SSD lifetime.
>>>
> Firstly, we use try_lock to get fstrim dlm lock to identify if 
> there is any
 other node which is doing fstrim on the disk.
> If not, this node is 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-11 Thread Changwei Ge
On 2018/1/11 15:57, Gang He wrote:
> 
> 
> 

>> On 2018/1/11 15:19, Gang He wrote:
>>>
>>>
>>>
>>
 On 2018/1/11 12:31, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/11 11:33, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/11 10:07, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 18:14, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
>
>

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' 
>>> returned
>>> value and behavior in case which meets a fstrim command is 
>>> running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>  fs/ocfs2/alloc.c | 44 
>>> 
>>>  1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>>> struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL 
> depend on the
 code logic.

 This point is OK for me.

>
>>>  
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block 
>>> *sb, struct
>> fstrim_range *range)
>>>  
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>  
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same 
>> lock again
>> later but wait until granted.
> Please think about the user case, the patch is only used to 
> handle this
 case.
> When the administer configures a fstrim schedule task on each 
> node, then
 each node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk 
> since this
 will waste CPU/IO resources and affect SSD lifetime sometimes.

 I'm not worrying about that trimfs will affect SSD's lifetime 
 quite a lot,
 since physical-logical address converting table resides in RAM 
 while SSD is
 working.
 And that table won't be at a big scale. My point here is not 
 affecting this
 patch. Just a tip here.
>>> This depend on SSD firmware implementation, but for secure-trim, it 
>>> really
>> possibly affect SSD lifetime.
>>>
> Firstly, we use try_lock to get fstrim dlm lock to identify if 
> there is any
 other node which is doing fstrim on the disk.
> If not, this node is the first one, 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He



>>> 
> On 2018/1/11 15:19, Gang He wrote:
>> 
>> 
>> 
>
>>> On 2018/1/11 12:31, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/11 11:33, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/11 10:07, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 18:14, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/10 17:05, Gang He wrote:
 Hi Changwei,


>>>
> Hi Gang,
>
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>>
>> Compare with first version, I change the fstrim commands' 
>> returned
>> value and behavior in case which meets a fstrim command is 
>> running
>> on a shared disk.
>>
>> Signed-off-by: Gang He 
>> ---
>> fs/ocfs2/alloc.c | 44 
>> 
>> 1 file changed, 44 insertions(+)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>
> I think *pinfo* is not necessary.
 This pointer is necessary, since it can be NULL or non-NULL depend 
 on the
>>> code logic.
>>>
>>> This point is OK for me.
>>>

>> 
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>> 
>>  trace_ocfs2_trim_fs(start, len, minlen);
>> 
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>
> I don't get why try to lock here and if fails, acquire the same 
> lock again
> later but wait until granted.
 Please think about the user case, the patch is only used to handle 
 this
>>> case.
 When the administer configures a fstrim schedule task on each 
 node, then
>>> each node will trigger a fstrim on shared disks concurrently.
 In this case, we should avoid duplicated fstrim on a shared disk 
 since this
>>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>>
>>> I'm not worrying about that trimfs will affect SSD's lifetime quite 
>>> a lot,
>>> since physical-logical address converting table resides in RAM 
>>> while SSD is
>>> working.
>>> And that table won't be at a big scale. My point here is not 
>>> affecting this
>>> patch. Just a tip here.
>> This depend on SSD firmware implementation, but for secure-trim, it 
>> really
> possibly affect SSD lifetime.
>>
 Firstly, we use try_lock to get fstrim dlm lock to identify if 
 there is any
>>> other node which is doing fstrim on the disk.
 If not, this node is the first one, this node should do fstrim 
 operation on
>>> the disk.
 If yes, this node is not the first one, this node should wait 
 until the
>>> 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He



>>> 
> On 2018/1/11 15:19, Gang He wrote:
>> 
>> 
>> 
>
>>> On 2018/1/11 12:31, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/11 11:33, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/11 10:07, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 18:14, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/10 17:05, Gang He wrote:
 Hi Changwei,


>>>
> Hi Gang,
>
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>>
>> Compare with first version, I change the fstrim commands' 
>> returned
>> value and behavior in case which meets a fstrim command is 
>> running
>> on a shared disk.
>>
>> Signed-off-by: Gang He 
>> ---
>> fs/ocfs2/alloc.c | 44 
>> 
>> 1 file changed, 44 insertions(+)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>
> I think *pinfo* is not necessary.
 This pointer is necessary, since it can be NULL or non-NULL depend 
 on the
>>> code logic.
>>>
>>> This point is OK for me.
>>>

>> 
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>> 
>>  trace_ocfs2_trim_fs(start, len, minlen);
>> 
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>
> I don't get why try to lock here and if fails, acquire the same 
> lock again
> later but wait until granted.
 Please think about the user case, the patch is only used to handle 
 this
>>> case.
 When the administer configures a fstrim schedule task on each 
 node, then
>>> each node will trigger a fstrim on shared disks concurrently.
 In this case, we should avoid duplicated fstrim on a shared disk 
 since this
>>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>>
>>> I'm not worrying about that trimfs will affect SSD's lifetime quite 
>>> a lot,
>>> since physical-logical address converting table resides in RAM 
>>> while SSD is
>>> working.
>>> And that table won't be at a big scale. My point here is not 
>>> affecting this
>>> patch. Just a tip here.
>> This depend on SSD firmware implementation, but for secure-trim, it 
>> really
> possibly affect SSD lifetime.
>>
 Firstly, we use try_lock to get fstrim dlm lock to identify if 
 there is any
>>> other node which is doing fstrim on the disk.
 If not, this node is the first one, this node should do fstrim 
 operation on
>>> the disk.
 If yes, this node is not the first one, this node should wait 
 until the
>>> first node is 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 15:19, Gang He wrote:
> 
> 
> 

>> On 2018/1/11 12:31, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/11 11:33, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/11 10:07, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
> fs/ocfs2/alloc.c | 44 
> 
> 1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
> struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend 
>>> on the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
> 
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
> struct
 fstrim_range *range)
> 
>   trace_ocfs2_trim_fs(start, len, minlen);
> 
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same 
 lock again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle 
>>> this
>> case.
>>> When the administer configures a fstrim schedule task on each node, 
>>> then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk 
>>> since this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite 
>> a lot,
>> since physical-logical address converting table resides in RAM while 
>> SSD is
>> working.
>> And that table won't be at a big scale. My point here is not 
>> affecting this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it 
> really
 possibly affect SSD lifetime.
>
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if 
>>> there is any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim 
>>> operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until 
>>> the
>> first node is done for fstrim operation, then return the result from 
>> DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 15:19, Gang He wrote:
> 
> 
> 

>> On 2018/1/11 12:31, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/11 11:33, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/11 10:07, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
> fs/ocfs2/alloc.c | 44 
> 
> 1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
> struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend 
>>> on the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
> 
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
> struct
 fstrim_range *range)
> 
>   trace_ocfs2_trim_fs(start, len, minlen);
> 
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same 
 lock again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle 
>>> this
>> case.
>>> When the administer configures a fstrim schedule task on each node, 
>>> then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk 
>>> since this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite 
>> a lot,
>> since physical-logical address converting table resides in RAM while 
>> SSD is
>> working.
>> And that table won't be at a big scale. My point here is not 
>> affecting this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it 
> really
 possibly affect SSD lifetime.
>
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if 
>>> there is any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim 
>>> operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until 
>>> the
>> first node is done for fstrim operation, then return the result from 
>> DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He



>>> 
> On 2018/1/11 12:31, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/11 11:33, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/11 10:07, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/10 18:14, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 17:05, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> Hi Gang,
>>>
>>> On 2017/12/14 13:16, Gang He wrote:
 As you know, ocfs2 has support trim the underlying disk via
 fstrim command. But there is a problem, ocfs2 is a shared disk
 cluster file system, if the user configures a scheduled fstrim
 job on each file system node, this will trigger multiple nodes
 trim a shared disk simultaneously, it is very wasteful for CPU
 and IO consumption, also might negatively affect the lifetime
 of poor-quality SSD devices.
 Then, we introduce a trimfs dlm lock to communicate with each
 other in this case, which will make only one fstrim command to
 do the trimming on a shared disk among the cluster, the fstrim
 commands from the other nodes should wait for the first fstrim
 to finish and returned success directly, to avoid running a the
 same trim on the shared disk again.

 Compare with first version, I change the fstrim commands' returned
 value and behavior in case which meets a fstrim command is running
 on a shared disk.

 Signed-off-by: Gang He 
 ---
fs/ocfs2/alloc.c | 44 
 
1 file changed, 44 insertions(+)

 diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
 index ab5105f..5c9c3e2 100644
 --- a/fs/ocfs2/alloc.c
 +++ b/fs/ocfs2/alloc.c
 @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
 struct
>>> fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
 +  struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>
>>> I think *pinfo* is not necessary.
>> This pointer is necessary, since it can be NULL or non-NULL depend 
>> on the
> code logic.
>
> This point is OK for me.
>
>>

start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
 @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
 struct
>>> fstrim_range *range)

trace_ocfs2_trim_fs(start, len, minlen);

 +  ocfs2_trim_fs_lock_res_init(osb);
 +  ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>
>>> I don't get why try to lock here and if fails, acquire the same 
>>> lock again
>>> later but wait until granted.
>> Please think about the user case, the patch is only used to handle 
>> this
> case.
>> When the administer configures a fstrim schedule task on each node, 
>> then
> each node will trigger a fstrim on shared disks concurrently.
>> In this case, we should avoid duplicated fstrim on a shared disk 
>> since this
> will waste CPU/IO resources and affect SSD lifetime sometimes.
>
> I'm not worrying about that trimfs will affect SSD's lifetime quite a 
> lot,
> since physical-logical address converting table resides in RAM while 
> SSD is
> working.
> And that table won't be at a big scale. My point here is not 
> affecting this
> patch. Just a tip here.
 This depend on SSD firmware implementation, but for secure-trim, it 
 really
>>> possibly affect SSD lifetime.

>> Firstly, we use try_lock to get fstrim dlm lock to identify if there 
>> is any
> other node which is doing fstrim on the disk.
>> If not, this node is the first one, this node should do fstrim 
>> operation on
> the disk.
>> If yes, this node is not the first one, this node should wait until 
>> the
> first node is done for fstrim operation, then return the result from 
> DLM
> lock's value.
>>
>>> Can it just acquire the _trimfs_ lock as a blocking one directly 
>>> here?
>> We can not do a blocking lock directly, since we need to identify if 
>> there
> is any 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He



>>> 
> On 2018/1/11 12:31, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/11 11:33, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/11 10:07, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/10 18:14, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 17:05, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> Hi Gang,
>>>
>>> On 2017/12/14 13:16, Gang He wrote:
 As you know, ocfs2 has support trim the underlying disk via
 fstrim command. But there is a problem, ocfs2 is a shared disk
 cluster file system, if the user configures a scheduled fstrim
 job on each file system node, this will trigger multiple nodes
 trim a shared disk simultaneously, it is very wasteful for CPU
 and IO consumption, also might negatively affect the lifetime
 of poor-quality SSD devices.
 Then, we introduce a trimfs dlm lock to communicate with each
 other in this case, which will make only one fstrim command to
 do the trimming on a shared disk among the cluster, the fstrim
 commands from the other nodes should wait for the first fstrim
 to finish and returned success directly, to avoid running a the
 same trim on the shared disk again.

 Compare with first version, I change the fstrim commands' returned
 value and behavior in case which meets a fstrim command is running
 on a shared disk.

 Signed-off-by: Gang He 
 ---
fs/ocfs2/alloc.c | 44 
 
1 file changed, 44 insertions(+)

 diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
 index ab5105f..5c9c3e2 100644
 --- a/fs/ocfs2/alloc.c
 +++ b/fs/ocfs2/alloc.c
 @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
 struct
>>> fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
 +  struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>
>>> I think *pinfo* is not necessary.
>> This pointer is necessary, since it can be NULL or non-NULL depend 
>> on the
> code logic.
>
> This point is OK for me.
>
>>

start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
 @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
 struct
>>> fstrim_range *range)

trace_ocfs2_trim_fs(start, len, minlen);

 +  ocfs2_trim_fs_lock_res_init(osb);
 +  ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>
>>> I don't get why try to lock here and if fails, acquire the same 
>>> lock again
>>> later but wait until granted.
>> Please think about the user case, the patch is only used to handle 
>> this
> case.
>> When the administer configures a fstrim schedule task on each node, 
>> then
> each node will trigger a fstrim on shared disks concurrently.
>> In this case, we should avoid duplicated fstrim on a shared disk 
>> since this
> will waste CPU/IO resources and affect SSD lifetime sometimes.
>
> I'm not worrying about that trimfs will affect SSD's lifetime quite a 
> lot,
> since physical-logical address converting table resides in RAM while 
> SSD is
> working.
> And that table won't be at a big scale. My point here is not 
> affecting this
> patch. Just a tip here.
 This depend on SSD firmware implementation, but for secure-trim, it 
 really
>>> possibly affect SSD lifetime.

>> Firstly, we use try_lock to get fstrim dlm lock to identify if there 
>> is any
> other node which is doing fstrim on the disk.
>> If not, this node is the first one, this node should do fstrim 
>> operation on
> the disk.
>> If yes, this node is not the first one, this node should wait until 
>> the
> first node is done for fstrim operation, then return the result from 
> DLM
> lock's value.
>>
>>> Can it just acquire the _trimfs_ lock as a blocking one directly 
>>> here?
>> We can not do a blocking lock directly, since we need to identify if 
>> there
> is any other node has 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 12:31, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/11 11:33, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/11 10:07, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 18:14, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
>
>

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>fs/ocfs2/alloc.c | 44 
>>> 
>>>1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>>> struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL depend on 
> the
 code logic.

 This point is OK for me.

>
>>>
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
>>> struct
>> fstrim_range *range)
>>>
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same lock 
>> again
>> later but wait until granted.
> Please think about the user case, the patch is only used to handle 
> this
 case.
> When the administer configures a fstrim schedule task on each node, 
> then
 each node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk 
> since this
 will waste CPU/IO resources and affect SSD lifetime sometimes.

 I'm not worrying about that trimfs will affect SSD's lifetime quite a 
 lot,
 since physical-logical address converting table resides in RAM while 
 SSD is
 working.
 And that table won't be at a big scale. My point here is not affecting 
 this
 patch. Just a tip here.
>>> This depend on SSD firmware implementation, but for secure-trim, it 
>>> really
>> possibly affect SSD lifetime.
>>>
> Firstly, we use try_lock to get fstrim dlm lock to identify if there 
> is any
 other node which is doing fstrim on the disk.
> If not, this node is the first one, this node should do fstrim 
> operation on
 the disk.
> If yes, this node is not the first one, this node should wait until 
> the
 first node is done for fstrim operation, then return the result from 
 DLM
 lock's value.
>
>> Can it just acquire the _trimfs_ lock as a blocking one directly 
>> here?
> We can not do a blocking lock directly, since we need to identify if 
> there
 is any other node has being do fstrim operation when this node start 
 to do
 fstrim.

 Thanks for your 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 12:31, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/11 11:33, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/11 10:07, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 18:14, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
>
>

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>fs/ocfs2/alloc.c | 44 
>>> 
>>>1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>>> struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL depend on 
> the
 code logic.

 This point is OK for me.

>
>>>
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
>>> struct
>> fstrim_range *range)
>>>
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same lock 
>> again
>> later but wait until granted.
> Please think about the user case, the patch is only used to handle 
> this
 case.
> When the administer configures a fstrim schedule task on each node, 
> then
 each node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk 
> since this
 will waste CPU/IO resources and affect SSD lifetime sometimes.

 I'm not worrying about that trimfs will affect SSD's lifetime quite a 
 lot,
 since physical-logical address converting table resides in RAM while 
 SSD is
 working.
 And that table won't be at a big scale. My point here is not affecting 
 this
 patch. Just a tip here.
>>> This depend on SSD firmware implementation, but for secure-trim, it 
>>> really
>> possibly affect SSD lifetime.
>>>
> Firstly, we use try_lock to get fstrim dlm lock to identify if there 
> is any
 other node which is doing fstrim on the disk.
> If not, this node is the first one, this node should do fstrim 
> operation on
 the disk.
> If yes, this node is not the first one, this node should wait until 
> the
 first node is done for fstrim operation, then return the result from 
 DLM
 lock's value.
>
>> Can it just acquire the _trimfs_ lock as a blocking one directly 
>> here?
> We can not do a blocking lock directly, since we need to identify if 
> there
 is any other node has being do fstrim operation when this node start 
 to do
 fstrim.

 Thanks for your elaboration.

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/11 11:33, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/11 10:07, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 18:14, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/10 17:05, Gang He wrote:
 Hi Changwei,


>>>
> Hi Gang,
>
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>>
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>>
>> Signed-off-by: Gang He 
>> ---
>>   fs/ocfs2/alloc.c | 44 
>> 
>>   1 file changed, 44 insertions(+)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>
> I think *pinfo* is not necessary.
 This pointer is necessary, since it can be NULL or non-NULL depend on 
 the
>>> code logic.
>>>
>>> This point is OK for me.
>>>

>>   
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>>   
>>  trace_ocfs2_trim_fs(start, len, minlen);
>>   
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>
> I don't get why try to lock here and if fails, acquire the same lock 
> again
> later but wait until granted.
 Please think about the user case, the patch is only used to handle this
>>> case.
 When the administer configures a fstrim schedule task on each node, 
 then
>>> each node will trigger a fstrim on shared disks concurrently.
 In this case, we should avoid duplicated fstrim on a shared disk since 
 this
>>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>>
>>> I'm not worrying about that trimfs will affect SSD's lifetime quite a 
>>> lot,
>>> since physical-logical address converting table resides in RAM while 
>>> SSD is
>>> working.
>>> And that table won't be at a big scale. My point here is not affecting 
>>> this
>>> patch. Just a tip here.
>> This depend on SSD firmware implementation, but for secure-trim, it 
>> really
> possibly affect SSD lifetime.
>>
 Firstly, we use try_lock to get fstrim dlm lock to identify if there 
 is any
>>> other node which is doing fstrim on the disk.
 If not, this node is the first one, this node should do fstrim 
 operation on
>>> the disk.
 If yes, this node is not the first one, this node should wait until the
>>> first node is done for fstrim operation, then return the result from DLM
>>> lock's value.

> Can it just acquire the _trimfs_ lock as a blocking one directly here?
 We can not do a blocking lock directly, since we need to identify if 
 there
>>> is any other node has being do fstrim operation when this node start to 
>>> do
>>> fstrim.
>>>
>>> Thanks for your elaboration.
>>>
>>> Well how about the third node trying to trimming fs too?
>>> It needs LVB from the second node.
>>> But it seems that the second node can't provide a valid LVB.

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/11 11:33, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/11 10:07, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 18:14, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> On 2018/1/10 17:05, Gang He wrote:
 Hi Changwei,


>>>
> Hi Gang,
>
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>>
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>>
>> Signed-off-by: Gang He 
>> ---
>>   fs/ocfs2/alloc.c | 44 
>> 
>>   1 file changed, 44 insertions(+)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>
> I think *pinfo* is not necessary.
 This pointer is necessary, since it can be NULL or non-NULL depend on 
 the
>>> code logic.
>>>
>>> This point is OK for me.
>>>

>>   
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
>> struct
> fstrim_range *range)
>>   
>>  trace_ocfs2_trim_fs(start, len, minlen);
>>   
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>
> I don't get why try to lock here and if fails, acquire the same lock 
> again
> later but wait until granted.
 Please think about the user case, the patch is only used to handle this
>>> case.
 When the administer configures a fstrim schedule task on each node, 
 then
>>> each node will trigger a fstrim on shared disks concurrently.
 In this case, we should avoid duplicated fstrim on a shared disk since 
 this
>>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>>
>>> I'm not worrying about that trimfs will affect SSD's lifetime quite a 
>>> lot,
>>> since physical-logical address converting table resides in RAM while 
>>> SSD is
>>> working.
>>> And that table won't be at a big scale. My point here is not affecting 
>>> this
>>> patch. Just a tip here.
>> This depend on SSD firmware implementation, but for secure-trim, it 
>> really
> possibly affect SSD lifetime.
>>
 Firstly, we use try_lock to get fstrim dlm lock to identify if there 
 is any
>>> other node which is doing fstrim on the disk.
 If not, this node is the first one, this node should do fstrim 
 operation on
>>> the disk.
 If yes, this node is not the first one, this node should wait until the
>>> first node is done for fstrim operation, then return the result from DLM
>>> lock's value.

> Can it just acquire the _trimfs_ lock as a blocking one directly here?
 We can not do a blocking lock directly, since we need to identify if 
 there
>>> is any other node has being do fstrim operation when this node start to 
>>> do
>>> fstrim.
>>>
>>> Thanks for your elaboration.
>>>
>>> Well how about the third node trying to trimming fs too?
>>> It needs LVB from the second node.
>>> But it seems that the second node can't provide a valid LVB.
>>> So the 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 11:33, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/11 10:07, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
>   fs/ocfs2/alloc.c | 44 
> 
>   1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend on 
>>> the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
>   
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
> struct
 fstrim_range *range)
>   
>   trace_ocfs2_trim_fs(start, len, minlen);
>   
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same lock 
 again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle this
>> case.
>>> When the administer configures a fstrim schedule task on each node, then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk since 
>>> this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite a 
>> lot,
>> since physical-logical address converting table resides in RAM while SSD 
>> is
>> working.
>> And that table won't be at a big scale. My point here is not affecting 
>> this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it really
 possibly affect SSD lifetime.
>
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is 
>>> any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim 
>>> operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until the
>> first node is done for fstrim operation, then return the result from DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one directly here?
>>> We can not do a blocking lock directly, since we need to identify if 
>>> there
>> is any other node has being do fstrim operation when this node start to 
>> do
>> fstrim.
>>
>> Thanks for your elaboration.
>>
>> Well how about the third node trying to trimming fs too?
>> It needs LVB from the second node.
>> But it seems that the second node can't provide a valid LVB.
>> So the third node will perform trimfs once more.
> No, the second node does not change DLM lock's value, but the DLM lock's
 value is still valid.
> 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 11:33, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/11 10:07, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
>
>

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
>   fs/ocfs2/alloc.c | 44 
> 
>   1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend on 
>>> the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
>   
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, 
> struct
 fstrim_range *range)
>   
>   trace_ocfs2_trim_fs(start, len, minlen);
>   
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same lock 
 again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle this
>> case.
>>> When the administer configures a fstrim schedule task on each node, then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk since 
>>> this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite a 
>> lot,
>> since physical-logical address converting table resides in RAM while SSD 
>> is
>> working.
>> And that table won't be at a big scale. My point here is not affecting 
>> this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it really
 possibly affect SSD lifetime.
>
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is 
>>> any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim 
>>> operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until the
>> first node is done for fstrim operation, then return the result from DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one directly here?
>>> We can not do a blocking lock directly, since we need to identify if 
>>> there
>> is any other node has being do fstrim operation when this node start to 
>> do
>> fstrim.
>>
>> Thanks for your elaboration.
>>
>> Well how about the third node trying to trimming fs too?
>> It needs LVB from the second node.
>> But it seems that the second node can't provide a valid LVB.
>> So the third node will perform trimfs once more.
> No, the second node does not change DLM lock's value, but the DLM lock's
 value is still valid.
> The third node 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/11 10:07, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/10 18:14, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 17:05, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> Hi Gang,
>>>
>>> On 2017/12/14 13:16, Gang He wrote:
 As you know, ocfs2 has support trim the underlying disk via
 fstrim command. But there is a problem, ocfs2 is a shared disk
 cluster file system, if the user configures a scheduled fstrim
 job on each file system node, this will trigger multiple nodes
 trim a shared disk simultaneously, it is very wasteful for CPU
 and IO consumption, also might negatively affect the lifetime
 of poor-quality SSD devices.
 Then, we introduce a trimfs dlm lock to communicate with each
 other in this case, which will make only one fstrim command to
 do the trimming on a shared disk among the cluster, the fstrim
 commands from the other nodes should wait for the first fstrim
 to finish and returned success directly, to avoid running a the
 same trim on the shared disk again.

 Compare with first version, I change the fstrim commands' returned
 value and behavior in case which meets a fstrim command is running
 on a shared disk.

 Signed-off-by: Gang He 
 ---
  fs/ocfs2/alloc.c | 44 
  1 file changed, 44 insertions(+)

 diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
 index ab5105f..5c9c3e2 100644
 --- a/fs/ocfs2/alloc.c
 +++ b/fs/ocfs2/alloc.c
 @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
 +  struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>
>>> I think *pinfo* is not necessary.
>> This pointer is necessary, since it can be NULL or non-NULL depend on the
> code logic.
>
> This point is OK for me.
>
>>
  
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
 @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
  
trace_ocfs2_trim_fs(start, len, minlen);
  
 +  ocfs2_trim_fs_lock_res_init(osb);
 +  ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>
>>> I don't get why try to lock here and if fails, acquire the same lock 
>>> again
>>> later but wait until granted.
>> Please think about the user case, the patch is only used to handle this
> case.
>> When the administer configures a fstrim schedule task on each node, then
> each node will trigger a fstrim on shared disks concurrently.
>> In this case, we should avoid duplicated fstrim on a shared disk since 
>> this
> will waste CPU/IO resources and affect SSD lifetime sometimes.
>
> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
> since physical-logical address converting table resides in RAM while SSD 
> is
> working.
> And that table won't be at a big scale. My point here is not affecting 
> this
> patch. Just a tip here.
 This depend on SSD firmware implementation, but for secure-trim, it really
>>> possibly affect SSD lifetime.

>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is 
>> any
> other node which is doing fstrim on the disk.
>> If not, this node is the first one, this node should do fstrim operation 
>> on
> the disk.
>> If yes, this node is not the first one, this node should wait until the
> first node is done for fstrim operation, then return the result from DLM
> lock's value.
>>
>>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
>> We can not do a blocking lock directly, since we need to identify if 
>> there
> is any other node has being do fstrim operation when this node start to do
> fstrim.
>
> Thanks for your elaboration.
>
> Well how about the third node trying to trimming fs too?
> It needs LVB from the second node.
> But it seems that the second node can't provide a valid LVB.
> So the third node will perform trimfs once more.
 No, the second node does not change DLM lock's value, but the DLM lock's
>>> value is still valid.
 The third node also refer to this DLM lock's value, then do the same logic
>>> like the second node.
>>>
>>> Hi Gang,
>>> I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is
>>> set while flag LOCK_TYPE_USES_LVB is added.

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/11 10:07, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/10 18:14, Gang He wrote:
 Hi Changwei,


>>>
> On 2018/1/10 17:05, Gang He wrote:
>> Hi Changwei,
>>
>>
>
>>> Hi Gang,
>>>
>>> On 2017/12/14 13:16, Gang He wrote:
 As you know, ocfs2 has support trim the underlying disk via
 fstrim command. But there is a problem, ocfs2 is a shared disk
 cluster file system, if the user configures a scheduled fstrim
 job on each file system node, this will trigger multiple nodes
 trim a shared disk simultaneously, it is very wasteful for CPU
 and IO consumption, also might negatively affect the lifetime
 of poor-quality SSD devices.
 Then, we introduce a trimfs dlm lock to communicate with each
 other in this case, which will make only one fstrim command to
 do the trimming on a shared disk among the cluster, the fstrim
 commands from the other nodes should wait for the first fstrim
 to finish and returned success directly, to avoid running a the
 same trim on the shared disk again.

 Compare with first version, I change the fstrim commands' returned
 value and behavior in case which meets a fstrim command is running
 on a shared disk.

 Signed-off-by: Gang He 
 ---
  fs/ocfs2/alloc.c | 44 
  1 file changed, 44 insertions(+)

 diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
 index ab5105f..5c9c3e2 100644
 --- a/fs/ocfs2/alloc.c
 +++ b/fs/ocfs2/alloc.c
 @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
 +  struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>
>>> I think *pinfo* is not necessary.
>> This pointer is necessary, since it can be NULL or non-NULL depend on the
> code logic.
>
> This point is OK for me.
>
>>
  
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
 @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
  
trace_ocfs2_trim_fs(start, len, minlen);
  
 +  ocfs2_trim_fs_lock_res_init(osb);
 +  ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>
>>> I don't get why try to lock here and if fails, acquire the same lock 
>>> again
>>> later but wait until granted.
>> Please think about the user case, the patch is only used to handle this
> case.
>> When the administer configures a fstrim schedule task on each node, then
> each node will trigger a fstrim on shared disks concurrently.
>> In this case, we should avoid duplicated fstrim on a shared disk since 
>> this
> will waste CPU/IO resources and affect SSD lifetime sometimes.
>
> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
> since physical-logical address converting table resides in RAM while SSD 
> is
> working.
> And that table won't be at a big scale. My point here is not affecting 
> this
> patch. Just a tip here.
 This depend on SSD firmware implementation, but for secure-trim, it really
>>> possibly affect SSD lifetime.

>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is 
>> any
> other node which is doing fstrim on the disk.
>> If not, this node is the first one, this node should do fstrim operation 
>> on
> the disk.
>> If yes, this node is not the first one, this node should wait until the
> first node is done for fstrim operation, then return the result from DLM
> lock's value.
>>
>>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
>> We can not do a blocking lock directly, since we need to identify if 
>> there
> is any other node has being do fstrim operation when this node start to do
> fstrim.
>
> Thanks for your elaboration.
>
> Well how about the third node trying to trimming fs too?
> It needs LVB from the second node.
> But it seems that the second node can't provide a valid LVB.
> So the third node will perform trimfs once more.
 No, the second node does not change DLM lock's value, but the DLM lock's
>>> value is still valid.
 The third node also refer to this DLM lock's value, then do the same logic
>>> like the second node.
>>>
>>> Hi Gang,
>>> I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is
>>> set while flag LOCK_TYPE_USES_LVB is added.
>>>
>>> Are 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 10:07, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/10 18:14, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
>
>

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>  fs/ocfs2/alloc.c | 44 
>>>  1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL depend on the
 code logic.

 This point is OK for me.

>
>>>  
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>  
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>  
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same lock 
>> again
>> later but wait until granted.
> Please think about the user case, the patch is only used to handle this
 case.
> When the administer configures a fstrim schedule task on each node, then
 each node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk since 
> this
 will waste CPU/IO resources and affect SSD lifetime sometimes.

 I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
 since physical-logical address converting table resides in RAM while SSD is
 working.
 And that table won't be at a big scale. My point here is not affecting this
 patch. Just a tip here.
>>> This depend on SSD firmware implementation, but for secure-trim, it really
>> possibly affect SSD lifetime.
>>>
> Firstly, we use try_lock to get fstrim dlm lock to identify if there is 
> any
 other node which is doing fstrim on the disk.
> If not, this node is the first one, this node should do fstrim operation 
> on
 the disk.
> If yes, this node is not the first one, this node should wait until the
 first node is done for fstrim operation, then return the result from DLM
 lock's value.
>
>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
> We can not do a blocking lock directly, since we need to identify if there
 is any other node has being do fstrim operation when this node start to do
 fstrim.

 Thanks for your elaboration.

 Well how about the third node trying to trimming fs too?
 It needs LVB from the second node.
 But it seems that the second node can't provide a valid LVB.
 So the third node will perform trimfs once more.
>>> No, the second node does not change DLM lock's value, but the DLM lock's
>> value is still valid.
>>> The third node also refer to this DLM lock's value, then do the same logic
>> like the second node.
>>
>> Hi Gang,
>> I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is
>> set while flag LOCK_TYPE_USES_LVB is added.
>>
>> Are you sure below code path can work well?
> Yes, have done a full testing on two and three nodes.
> 
>> ocfs2_process_blocked_lock
>> 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/11 10:07, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/10 18:14, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
>
>

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>  fs/ocfs2/alloc.c | 44 
>>>  1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL depend on the
 code logic.

 This point is OK for me.

>
>>>  
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>  
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>  
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same lock 
>> again
>> later but wait until granted.
> Please think about the user case, the patch is only used to handle this
 case.
> When the administer configures a fstrim schedule task on each node, then
 each node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk since 
> this
 will waste CPU/IO resources and affect SSD lifetime sometimes.

 I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
 since physical-logical address converting table resides in RAM while SSD is
 working.
 And that table won't be at a big scale. My point here is not affecting this
 patch. Just a tip here.
>>> This depend on SSD firmware implementation, but for secure-trim, it really
>> possibly affect SSD lifetime.
>>>
> Firstly, we use try_lock to get fstrim dlm lock to identify if there is 
> any
 other node which is doing fstrim on the disk.
> If not, this node is the first one, this node should do fstrim operation 
> on
 the disk.
> If yes, this node is not the first one, this node should wait until the
 first node is done for fstrim operation, then return the result from DLM
 lock's value.
>
>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
> We can not do a blocking lock directly, since we need to identify if there
 is any other node has being do fstrim operation when this node start to do
 fstrim.

 Thanks for your elaboration.

 Well how about the third node trying to trimming fs too?
 It needs LVB from the second node.
 But it seems that the second node can't provide a valid LVB.
 So the third node will perform trimfs once more.
>>> No, the second node does not change DLM lock's value, but the DLM lock's
>> value is still valid.
>>> The third node also refer to this DLM lock's value, then do the same logic
>> like the second node.
>>
>> Hi Gang,
>> I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is
>> set while flag LOCK_TYPE_USES_LVB is added.
>>
>> Are you sure below code path can work well?
> Yes, have done a full testing on two and three nodes.
> 
>> ocfs2_process_blocked_lock
>> ocfs2_unblock_lock

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/10 18:14, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/10 17:05, Gang He wrote:
 Hi Changwei,


>>>
> Hi Gang,
>
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>>
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>>
>> Signed-off-by: Gang He 
>> ---
>> fs/ocfs2/alloc.c | 44 
>> 1 file changed, 44 insertions(+)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>
> I think *pinfo* is not necessary.
 This pointer is necessary, since it can be NULL or non-NULL depend on the
>>> code logic.
>>>
>>> This point is OK for me.
>>>

>> 
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> fstrim_range *range)
>> 
>>  trace_ocfs2_trim_fs(start, len, minlen);
>> 
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>
> I don't get why try to lock here and if fails, acquire the same lock again
> later but wait until granted.
 Please think about the user case, the patch is only used to handle this
>>> case.
 When the administer configures a fstrim schedule task on each node, then
>>> each node will trigger a fstrim on shared disks concurrently.
 In this case, we should avoid duplicated fstrim on a shared disk since this
>>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>>
>>> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
>>> since physical-logical address converting table resides in RAM while SSD is
>>> working.
>>> And that table won't be at a big scale. My point here is not affecting this
>>> patch. Just a tip here.
>> This depend on SSD firmware implementation, but for secure-trim, it really 
> possibly affect SSD lifetime.
>> 
 Firstly, we use try_lock to get fstrim dlm lock to identify if there is any
>>> other node which is doing fstrim on the disk.
 If not, this node is the first one, this node should do fstrim operation on
>>> the disk.
 If yes, this node is not the first one, this node should wait until the
>>> first node is done for fstrim operation, then return the result from DLM
>>> lock's value.

> Can it just acquire the _trimfs_ lock as a blocking one directly here?
 We can not do a blocking lock directly, since we need to identify if there
>>> is any other node has being do fstrim operation when this node start to do
>>> fstrim.
>>>
>>> Thanks for your elaboration.
>>>
>>> Well how about the third node trying to trimming fs too?
>>> It needs LVB from the second node.
>>> But it seems that the second node can't provide a valid LVB.
>>> So the third node will perform trimfs once more.
>> No, the second node does not change DLM lock's value, but the DLM lock's 
> value is still valid.
>> The third node also refer to this DLM lock's value, then do the same logic 
> like the second node.
> 
> Hi Gang,
> I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is 
> set while flag LOCK_TYPE_USES_LVB is added.
> 
> Are you sure below code path can work well?
Yes, have done a full testing on two and three nodes.

> ocfs2_process_blocked_lock
>ocfs2_unblock_lock
>Reference to ::set_lvb since LOCK_TYPE_USES_LVB is set.
> 
the set_lvb callback function is not necessary, if we update DLM lock value by 
ourselves 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/10 18:14, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> On 2018/1/10 17:05, Gang He wrote:
 Hi Changwei,


>>>
> Hi Gang,
>
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>>
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>>
>> Signed-off-by: Gang He 
>> ---
>> fs/ocfs2/alloc.c | 44 
>> 1 file changed, 44 insertions(+)
>>
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>
> I think *pinfo* is not necessary.
 This pointer is necessary, since it can be NULL or non-NULL depend on the
>>> code logic.
>>>
>>> This point is OK for me.
>>>

>> 
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
> fstrim_range *range)
>> 
>>  trace_ocfs2_trim_fs(start, len, minlen);
>> 
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>
> I don't get why try to lock here and if fails, acquire the same lock again
> later but wait until granted.
 Please think about the user case, the patch is only used to handle this
>>> case.
 When the administer configures a fstrim schedule task on each node, then
>>> each node will trigger a fstrim on shared disks concurrently.
 In this case, we should avoid duplicated fstrim on a shared disk since this
>>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>>
>>> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
>>> since physical-logical address converting table resides in RAM while SSD is
>>> working.
>>> And that table won't be at a big scale. My point here is not affecting this
>>> patch. Just a tip here.
>> This depend on SSD firmware implementation, but for secure-trim, it really 
> possibly affect SSD lifetime.
>> 
 Firstly, we use try_lock to get fstrim dlm lock to identify if there is any
>>> other node which is doing fstrim on the disk.
 If not, this node is the first one, this node should do fstrim operation on
>>> the disk.
 If yes, this node is not the first one, this node should wait until the
>>> first node is done for fstrim operation, then return the result from DLM
>>> lock's value.

> Can it just acquire the _trimfs_ lock as a blocking one directly here?
 We can not do a blocking lock directly, since we need to identify if there
>>> is any other node has being do fstrim operation when this node start to do
>>> fstrim.
>>>
>>> Thanks for your elaboration.
>>>
>>> Well how about the third node trying to trimming fs too?
>>> It needs LVB from the second node.
>>> But it seems that the second node can't provide a valid LVB.
>>> So the third node will perform trimfs once more.
>> No, the second node does not change DLM lock's value, but the DLM lock's 
> value is still valid.
>> The third node also refer to this DLM lock's value, then do the same logic 
> like the second node.
> 
> Hi Gang,
> I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is 
> set while flag LOCK_TYPE_USES_LVB is added.
> 
> Are you sure below code path can work well?
Yes, have done a full testing on two and three nodes.

> ocfs2_process_blocked_lock
>ocfs2_unblock_lock
>Reference to ::set_lvb since LOCK_TYPE_USES_LVB is set.
> 
the set_lvb callback function is not necessary, if we update DLM lock value by 
ourselves before unlock.
By 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
> fs/ocfs2/alloc.c | 44 
> 1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend on the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
> 
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
> 
>   trace_ocfs2_trim_fs(start, len, minlen);
> 
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same lock again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle this
>> case.
>>> When the administer configures a fstrim schedule task on each node, then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk since this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
>> since physical-logical address converting table resides in RAM while SSD is
>> working.
>> And that table won't be at a big scale. My point here is not affecting this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it really 
> possibly affect SSD lifetime.
> 
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until the
>> first node is done for fstrim operation, then return the result from DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one directly here?
>>> We can not do a blocking lock directly, since we need to identify if there
>> is any other node has being do fstrim operation when this node start to do
>> fstrim.
>>
>> Thanks for your elaboration.
>>
>> Well how about the third node trying to trimming fs too?
>> It needs LVB from the second node.
>> But it seems that the second node can't provide a valid LVB.
>> So the third node will perform trimfs once more.
> No, the second node does not change DLM lock's value, but the DLM lock's 
> value is still valid.
> The third node also refer to this DLM lock's value, then do the same logic 
> like the second node.

Hi Gang,
I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is set 
while flag LOCK_TYPE_USES_LVB is added.

Are you sure below code path can work well?
ocfs2_process_blocked_lock
   ocfs2_unblock_lock
   Reference to ::set_lvb since LOCK_TYPE_USES_LVB is set.

Thanks,
Changwei

> 
>>
>> IOW, three nodes are trying to trimming fs concurrently. Is your patch able
>> to handle such a scenario?
> Yes, the patch can handle this case.
> 
>>
>> Even the second lock request with QUEUE set just follows
>> ocfs2_trim_fs_lock_res_uninit() will not get rid of concurrent 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
> fs/ocfs2/alloc.c | 44 
> 1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend on the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
> 
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
> 
>   trace_ocfs2_trim_fs(start, len, minlen);
> 
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same lock again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle this
>> case.
>>> When the administer configures a fstrim schedule task on each node, then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk since this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
>> since physical-logical address converting table resides in RAM while SSD is
>> working.
>> And that table won't be at a big scale. My point here is not affecting this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it really 
> possibly affect SSD lifetime.
> 
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until the
>> first node is done for fstrim operation, then return the result from DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one directly here?
>>> We can not do a blocking lock directly, since we need to identify if there
>> is any other node has being do fstrim operation when this node start to do
>> fstrim.
>>
>> Thanks for your elaboration.
>>
>> Well how about the third node trying to trimming fs too?
>> It needs LVB from the second node.
>> But it seems that the second node can't provide a valid LVB.
>> So the third node will perform trimfs once more.
> No, the second node does not change DLM lock's value, but the DLM lock's 
> value is still valid.
> The third node also refer to this DLM lock's value, then do the same logic 
> like the second node.

Hi Gang,
I don't see any places where ocfs2_lock_res::ocfs2_lock_res_ops::set_lvb is set 
while flag LOCK_TYPE_USES_LVB is added.

Are you sure below code path can work well?
ocfs2_process_blocked_lock
   ocfs2_unblock_lock
   Reference to ::set_lvb since LOCK_TYPE_USES_LVB is set.

Thanks,
Changwei

> 
>>
>> IOW, three nodes are trying to trimming fs concurrently. Is your patch able
>> to handle such a scenario?
> Yes, the patch can handle this case.
> 
>>
>> Even the second lock request with QUEUE set just follows
>> ocfs2_trim_fs_lock_res_uninit() will not get rid of concurrent trimfs.
>>
>>>

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
Hi Gang,

On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
> fs/ocfs2/alloc.c | 44 
> 1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend on the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
> 
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
> 
>   trace_ocfs2_trim_fs(start, len, minlen);
> 
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same lock again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle this
>> case.
>>> When the administer configures a fstrim schedule task on each node, then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk since this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
>> since physical-logical address converting table resides in RAM while SSD is
>> working.
>> And that table won't be at a big scale. My point here is not affecting this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it really 
> possibly affect SSD lifetime.
> 
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until the
>> first node is done for fstrim operation, then return the result from DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one directly here?
>>> We can not do a blocking lock directly, since we need to identify if there
>> is any other node has being do fstrim operation when this node start to do
>> fstrim.
>>
>> Thanks for your elaboration.
>>
>> Well how about the third node trying to trimming fs too?
>> It needs LVB from the second node.
>> But it seems that the second node can't provide a valid LVB.
>> So the third node will perform trimfs once more.
> No, the second node does not change DLM lock's value, but the DLM lock's 
> value is still valid.
> The third node also refer to this DLM lock's value, then do the same logic 
> like the second node.

Um, as I know SUSE is using fs/dlm which I am not familiar with.
But for ocfs2/dlm, the LVB passing path should be like below:

NODE 1 lvb (ex granted at time1) -> NODE 2 lvb(ex granted at time2) -> NODE 3 
lvb(ex granted at time3).
time1 < time2  < time3

So I think NODE 3 can't obtain LVB from NODE 1 but from NODE 2.
Moreover, if node 1 is the master of trimfs lock resource, node 1's LVB will be 
updated to be the same as node 2.

Thanks,
Changwei

> 
>>
>> IOW, three nodes are trying to trimming fs concurrently. Is your patch able
>> to handle such a scenario?
> Yes, the 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
Hi Gang,

On 2018/1/10 18:14, Gang He wrote:
> Hi Changwei,
> 
> 

>> On 2018/1/10 17:05, Gang He wrote:
>>> Hi Changwei,
>>>
>>>
>>
 Hi Gang,

 On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
>
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
>
> Signed-off-by: Gang He 
> ---
> fs/ocfs2/alloc.c | 44 
> 1 file changed, 44 insertions(+)
>
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

 I think *pinfo* is not necessary.
>>> This pointer is necessary, since it can be NULL or non-NULL depend on the
>> code logic.
>>
>> This point is OK for me.
>>
>>>
> 
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
 fstrim_range *range)
> 
>   trace_ocfs2_trim_fs(start, len, minlen);
> 
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

 I don't get why try to lock here and if fails, acquire the same lock again
 later but wait until granted.
>>> Please think about the user case, the patch is only used to handle this
>> case.
>>> When the administer configures a fstrim schedule task on each node, then
>> each node will trigger a fstrim on shared disks concurrently.
>>> In this case, we should avoid duplicated fstrim on a shared disk since this
>> will waste CPU/IO resources and affect SSD lifetime sometimes.
>>
>> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot,
>> since physical-logical address converting table resides in RAM while SSD is
>> working.
>> And that table won't be at a big scale. My point here is not affecting this
>> patch. Just a tip here.
> This depend on SSD firmware implementation, but for secure-trim, it really 
> possibly affect SSD lifetime.
> 
>>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any
>> other node which is doing fstrim on the disk.
>>> If not, this node is the first one, this node should do fstrim operation on
>> the disk.
>>> If yes, this node is not the first one, this node should wait until the
>> first node is done for fstrim operation, then return the result from DLM
>> lock's value.
>>>
 Can it just acquire the _trimfs_ lock as a blocking one directly here?
>>> We can not do a blocking lock directly, since we need to identify if there
>> is any other node has being do fstrim operation when this node start to do
>> fstrim.
>>
>> Thanks for your elaboration.
>>
>> Well how about the third node trying to trimming fs too?
>> It needs LVB from the second node.
>> But it seems that the second node can't provide a valid LVB.
>> So the third node will perform trimfs once more.
> No, the second node does not change DLM lock's value, but the DLM lock's 
> value is still valid.
> The third node also refer to this DLM lock's value, then do the same logic 
> like the second node.

Um, as I know SUSE is using fs/dlm which I am not familiar with.
But for ocfs2/dlm, the LVB passing path should be like below:

NODE 1 lvb (ex granted at time1) -> NODE 2 lvb(ex granted at time2) -> NODE 3 
lvb(ex granted at time3).
time1 < time2  < time3

So I think NODE 3 can't obtain LVB from NODE 1 but from NODE 2.
Moreover, if node 1 is the master of trimfs lock resource, node 1's LVB will be 
updated to be the same as node 2.

Thanks,
Changwei

> 
>>
>> IOW, three nodes are trying to trimming fs concurrently. Is your patch able
>> to handle such a scenario?
> Yes, the patch can handle 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/10 17:05, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> Hi Gang,
>>>
>>> On 2017/12/14 13:16, Gang He wrote:
 As you know, ocfs2 has support trim the underlying disk via
 fstrim command. But there is a problem, ocfs2 is a shared disk
 cluster file system, if the user configures a scheduled fstrim
 job on each file system node, this will trigger multiple nodes
 trim a shared disk simultaneously, it is very wasteful for CPU
 and IO consumption, also might negatively affect the lifetime
 of poor-quality SSD devices.
 Then, we introduce a trimfs dlm lock to communicate with each
 other in this case, which will make only one fstrim command to
 do the trimming on a shared disk among the cluster, the fstrim
 commands from the other nodes should wait for the first fstrim
 to finish and returned success directly, to avoid running a the
 same trim on the shared disk again.

 Compare with first version, I change the fstrim commands' returned
 value and behavior in case which meets a fstrim command is running
 on a shared disk.

 Signed-off-by: Gang He 
 ---
fs/ocfs2/alloc.c | 44 
1 file changed, 44 insertions(+)

 diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
 index ab5105f..5c9c3e2 100644
 --- a/fs/ocfs2/alloc.c
 +++ b/fs/ocfs2/alloc.c
 @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
 +  struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>
>>> I think *pinfo* is not necessary.
>> This pointer is necessary, since it can be NULL or non-NULL depend on the 
> code logic.
> 
> This point is OK for me.
> 
>> 

start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
 @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)

trace_ocfs2_trim_fs(start, len, minlen);

 +  ocfs2_trim_fs_lock_res_init(osb);
 +  ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>
>>> I don't get why try to lock here and if fails, acquire the same lock again
>>> later but wait until granted.
>> Please think about the user case, the patch is only used to handle this 
> case.
>> When the administer configures a fstrim schedule task on each node, then 
> each node will trigger a fstrim on shared disks concurrently.
>> In this case, we should avoid duplicated fstrim on a shared disk since this 
> will waste CPU/IO resources and affect SSD lifetime sometimes.
> 
> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot, 
> since physical-logical address converting table resides in RAM while SSD is 
> working.
> And that table won't be at a big scale. My point here is not affecting this 
> patch. Just a tip here.
This depend on SSD firmware implementation, but for secure-trim, it really 
possibly affect SSD lifetime.

>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any 
> other node which is doing fstrim on the disk.
>> If not, this node is the first one, this node should do fstrim operation on 
> the disk.
>> If yes, this node is not the first one, this node should wait until the 
> first node is done for fstrim operation, then return the result from DLM 
> lock's value.
>> 
>>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
>> We can not do a blocking lock directly, since we need to identify if there 
> is any other node has being do fstrim operation when this node start to do 
> fstrim.
> 
> Thanks for your elaboration.
> 
> Well how about the third node trying to trimming fs too?
> It needs LVB from the second node.
> But it seems that the second node can't provide a valid LVB.
> So the third node will perform trimfs once more.
No, the second node does not change DLM lock's value, but the DLM lock's value 
is still valid.
The third node also refer to this DLM lock's value, then do the same logic like 
the second node.

> 
> IOW, three nodes are trying to trimming fs concurrently. Is your patch able 
> to handle such a scenario?
Yes, the patch can handle this case.

> 
> Even the second lock request with QUEUE set just follows 
> ocfs2_trim_fs_lock_res_uninit() will not get rid of concurrent trimfs.
> 
>> 
>>>
 +  if (ret < 0) {
 +  if (ret != -EAGAIN) {
 +  mlog_errno(ret);
 +  ocfs2_trim_fs_lock_res_uninit(osb);
 +  goto out_unlock;
 +  }
 +
 +  mlog(ML_NOTICE, "Wait for trim on device (%s) to "
 +   "finish, which is running from another node.\n",
 +   osb->dev_str);
 +  ret 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> On 2018/1/10 17:05, Gang He wrote:
>> Hi Changwei,
>> 
>> 
>
>>> Hi Gang,
>>>
>>> On 2017/12/14 13:16, Gang He wrote:
 As you know, ocfs2 has support trim the underlying disk via
 fstrim command. But there is a problem, ocfs2 is a shared disk
 cluster file system, if the user configures a scheduled fstrim
 job on each file system node, this will trigger multiple nodes
 trim a shared disk simultaneously, it is very wasteful for CPU
 and IO consumption, also might negatively affect the lifetime
 of poor-quality SSD devices.
 Then, we introduce a trimfs dlm lock to communicate with each
 other in this case, which will make only one fstrim command to
 do the trimming on a shared disk among the cluster, the fstrim
 commands from the other nodes should wait for the first fstrim
 to finish and returned success directly, to avoid running a the
 same trim on the shared disk again.

 Compare with first version, I change the fstrim commands' returned
 value and behavior in case which meets a fstrim command is running
 on a shared disk.

 Signed-off-by: Gang He 
 ---
fs/ocfs2/alloc.c | 44 
1 file changed, 44 insertions(+)

 diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
 index ab5105f..5c9c3e2 100644
 --- a/fs/ocfs2/alloc.c
 +++ b/fs/ocfs2/alloc.c
 @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)
struct buffer_head *gd_bh = NULL;
struct ocfs2_dinode *main_bm;
struct ocfs2_group_desc *gd = NULL;
 +  struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>
>>> I think *pinfo* is not necessary.
>> This pointer is necessary, since it can be NULL or non-NULL depend on the 
> code logic.
> 
> This point is OK for me.
> 
>> 

start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
 @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>>> fstrim_range *range)

trace_ocfs2_trim_fs(start, len, minlen);

 +  ocfs2_trim_fs_lock_res_init(osb);
 +  ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>>
>>> I don't get why try to lock here and if fails, acquire the same lock again
>>> later but wait until granted.
>> Please think about the user case, the patch is only used to handle this 
> case.
>> When the administer configures a fstrim schedule task on each node, then 
> each node will trigger a fstrim on shared disks concurrently.
>> In this case, we should avoid duplicated fstrim on a shared disk since this 
> will waste CPU/IO resources and affect SSD lifetime sometimes.
> 
> I'm not worrying about that trimfs will affect SSD's lifetime quite a lot, 
> since physical-logical address converting table resides in RAM while SSD is 
> working.
> And that table won't be at a big scale. My point here is not affecting this 
> patch. Just a tip here.
This depend on SSD firmware implementation, but for secure-trim, it really 
possibly affect SSD lifetime.

>> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any 
> other node which is doing fstrim on the disk.
>> If not, this node is the first one, this node should do fstrim operation on 
> the disk.
>> If yes, this node is not the first one, this node should wait until the 
> first node is done for fstrim operation, then return the result from DLM 
> lock's value.
>> 
>>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
>> We can not do a blocking lock directly, since we need to identify if there 
> is any other node has being do fstrim operation when this node start to do 
> fstrim.
> 
> Thanks for your elaboration.
> 
> Well how about the third node trying to trimming fs too?
> It needs LVB from the second node.
> But it seems that the second node can't provide a valid LVB.
> So the third node will perform trimfs once more.
No, the second node does not change DLM lock's value, but the DLM lock's value 
is still valid.
The third node also refer to this DLM lock's value, then do the same logic like 
the second node.

> 
> IOW, three nodes are trying to trimming fs concurrently. Is your patch able 
> to handle such a scenario?
Yes, the patch can handle this case.

> 
> Even the second lock request with QUEUE set just follows 
> ocfs2_trim_fs_lock_res_uninit() will not get rid of concurrent trimfs.
> 
>> 
>>>
 +  if (ret < 0) {
 +  if (ret != -EAGAIN) {
 +  mlog_errno(ret);
 +  ocfs2_trim_fs_lock_res_uninit(osb);
 +  goto out_unlock;
 +  }
 +
 +  mlog(ML_NOTICE, "Wait for trim on device (%s) to "
 +   "finish, which is running from another node.\n",
 +   osb->dev_str);
 +  ret = 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/4 10:09, Gang He wrote:
> Hi Alex,
> 
> 

>> Hi Gang,
>>
>> On 2017/12/14 13:14, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>> For the same purpose, can we take global bitmap meta lock EXMODE instead of
>> add a new trimfs dlm lock?
> I do not think using EXMODE lock to global bitmap meta can handle this case.
> this patch's purpose is to avoid duplicated trim when the nodes in cluster 
> are configured to a schedule fstrim (usually the administrator do like that).
> But, there are lots of path to use EXMODE lock to global bitmap meta, we can 
> not know which is used to trim fs.
> Second, we can not use global bitmap meta lock to save fstrim related data 
> and trylock.

Adding a new type of lock resource is acceptable, I suppose.

Thanks,
Changwei

> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Alex
>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>   fs/ocfs2/alloc.c | 44 
>>>   1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>   
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>   
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>   
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>> +   if (ret < 0) {
>>> +   if (ret != -EAGAIN) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> +"finish, which is running from another node.\n",
>>> +osb->dev_str);
>>> +   ret = ocfs2_trim_fs_lock(osb, , 0);
>>> +   if (ret < 0) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   if (info.tf_valid && info.tf_success &&
>>> +   info.tf_start == start && info.tf_len == len &&
>>> +   info.tf_minlen == minlen) {
>>> +   /* Avoid sending duplicated trim to a shared device */
>>> +   mlog(ML_NOTICE, "The same trim on device (%s) was "
>>> +"just done from node (%u), return.\n",
>>> +osb->dev_str, info.tf_nodenum);
>>> +   range->len = info.tf_trimlen;
>>> +   goto out_trimunlock;
>>> +   }
>>> +   }
>>> +
>>> +   info.tf_nodenum = osb->node_num;
>>> +   info.tf_start = start;
>>> +   info.tf_len = len;
>>> +   info.tf_minlen = minlen;
>>> +
>>> /* Determine first and last group to examine based on start and len */
>>> first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> if (first_group == osb->first_cluster_group_blkno)
>>> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>> }
>>> range->len = trimmed * sb->s_blocksize;
>>> +
>>> +   info.tf_trimlen = range->len;
>>> +   info.tf_success = (ret ? 0 : 1);
>>> +   pinfo = 
>>> +out_trimunlock:
>>> +   ocfs2_trim_fs_unlock(osb, pinfo);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>>   out_unlock:
>>> ocfs2_inode_unlock(main_bm_inode, 0);
>>> brelse(main_bm_bh);
>>>
> 
> ___
> Ocfs2-devel mailing list
> ocfs2-de...@oss.oracle.com
> 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/4 10:09, Gang He wrote:
> Hi Alex,
> 
> 

>> Hi Gang,
>>
>> On 2017/12/14 13:14, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>> For the same purpose, can we take global bitmap meta lock EXMODE instead of
>> add a new trimfs dlm lock?
> I do not think using EXMODE lock to global bitmap meta can handle this case.
> this patch's purpose is to avoid duplicated trim when the nodes in cluster 
> are configured to a schedule fstrim (usually the administrator do like that).
> But, there are lots of path to use EXMODE lock to global bitmap meta, we can 
> not know which is used to trim fs.
> Second, we can not use global bitmap meta lock to save fstrim related data 
> and trylock.

Adding a new type of lock resource is acceptable, I suppose.

Thanks,
Changwei

> 
> Thanks
> Gang
> 
>>
>> Thanks,
>> Alex
>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>   fs/ocfs2/alloc.c | 44 
>>>   1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>>   
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>   
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>   
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>> +   if (ret < 0) {
>>> +   if (ret != -EAGAIN) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> +"finish, which is running from another node.\n",
>>> +osb->dev_str);
>>> +   ret = ocfs2_trim_fs_lock(osb, , 0);
>>> +   if (ret < 0) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   if (info.tf_valid && info.tf_success &&
>>> +   info.tf_start == start && info.tf_len == len &&
>>> +   info.tf_minlen == minlen) {
>>> +   /* Avoid sending duplicated trim to a shared device */
>>> +   mlog(ML_NOTICE, "The same trim on device (%s) was "
>>> +"just done from node (%u), return.\n",
>>> +osb->dev_str, info.tf_nodenum);
>>> +   range->len = info.tf_trimlen;
>>> +   goto out_trimunlock;
>>> +   }
>>> +   }
>>> +
>>> +   info.tf_nodenum = osb->node_num;
>>> +   info.tf_start = start;
>>> +   info.tf_len = len;
>>> +   info.tf_minlen = minlen;
>>> +
>>> /* Determine first and last group to examine based on start and len */
>>> first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>> if (first_group == osb->first_cluster_group_blkno)
>>> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>> }
>>> range->len = trimmed * sb->s_blocksize;
>>> +
>>> +   info.tf_trimlen = range->len;
>>> +   info.tf_success = (ret ? 0 : 1);
>>> +   pinfo = 
>>> +out_trimunlock:
>>> +   ocfs2_trim_fs_unlock(osb, pinfo);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>>   out_unlock:
>>> ocfs2_inode_unlock(main_bm_inode, 0);
>>> brelse(main_bm_bh);
>>>
> 
> ___
> Ocfs2-devel mailing list
> ocfs2-de...@oss.oracle.com
> 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
> 
> 

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>fs/ocfs2/alloc.c | 44 
>>>1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL depend on the 
> code logic.

This point is OK for me.

> 
>>>
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same lock again
>> later but wait until granted.
> Please think about the user case, the patch is only used to handle this case.
> When the administer configures a fstrim schedule task on each node, then each 
> node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk since this 
> will waste CPU/IO resources and affect SSD lifetime sometimes.

I'm not worrying about that trimfs will affect SSD's lifetime quite a lot, 
since physical-logical address converting table resides in RAM while SSD is 
working.
And that table won't be at a big scale. My point here is not affecting this 
patch. Just a tip here.
> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any 
> other node which is doing fstrim on the disk.
> If not, this node is the first one, this node should do fstrim operation on 
> the disk.
> If yes, this node is not the first one, this node should wait until the first 
> node is done for fstrim operation, then return the result from DLM lock's 
> value.
> 
>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
> We can not do a blocking lock directly, since we need to identify if there is 
> any other node has being do fstrim operation when this node start to do 
> fstrim.

Thanks for your elaboration.

Well how about the third node trying to trimming fs too?
It needs LVB from the second node.
But it seems that the second node can't provide a valid LVB.
So the third node will perform trimfs once more.

IOW, three nodes are trying to trimming fs concurrently. Is your patch able to 
handle such a scenario?

Even the second lock request with QUEUE set just follows 
ocfs2_trim_fs_lock_res_uninit() will not get rid of concurrent trimfs.

> 
>>
>>> +   if (ret < 0) {
>>> +   if (ret != -EAGAIN) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> +"finish, which is running from another node.\n",
>>> +osb->dev_str);
>>> +   ret = ocfs2_trim_fs_lock(osb, , 0);
>>> +   if (ret < 0) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>
>> In ocfs2_trim_fs_lock_res_uninit(), you drop lock. But it is never granted.
>> Still need to drop lock resource?
> Yes, we need to init/uninit fstrim dlm lock resource for each time.
> Otherwise, trylock does not work, this is a little different from other dlm 
> lock usage in ocfs2.

This point is OK for now, too.
> 
>>
>>> +   

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Changwei Ge
On 2018/1/10 17:05, Gang He wrote:
> Hi Changwei,
> 
> 

>> Hi Gang,
>>
>> On 2017/12/14 13:16, Gang He wrote:
>>> As you know, ocfs2 has support trim the underlying disk via
>>> fstrim command. But there is a problem, ocfs2 is a shared disk
>>> cluster file system, if the user configures a scheduled fstrim
>>> job on each file system node, this will trigger multiple nodes
>>> trim a shared disk simultaneously, it is very wasteful for CPU
>>> and IO consumption, also might negatively affect the lifetime
>>> of poor-quality SSD devices.
>>> Then, we introduce a trimfs dlm lock to communicate with each
>>> other in this case, which will make only one fstrim command to
>>> do the trimming on a shared disk among the cluster, the fstrim
>>> commands from the other nodes should wait for the first fstrim
>>> to finish and returned success directly, to avoid running a the
>>> same trim on the shared disk again.
>>>
>>> Compare with first version, I change the fstrim commands' returned
>>> value and behavior in case which meets a fstrim command is running
>>> on a shared disk.
>>>
>>> Signed-off-by: Gang He 
>>> ---
>>>fs/ocfs2/alloc.c | 44 
>>>1 file changed, 44 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>>> index ab5105f..5c9c3e2 100644
>>> --- a/fs/ocfs2/alloc.c
>>> +++ b/fs/ocfs2/alloc.c
>>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>> struct buffer_head *gd_bh = NULL;
>>> struct ocfs2_dinode *main_bm;
>>> struct ocfs2_group_desc *gd = NULL;
>>> +   struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>
>> I think *pinfo* is not necessary.
> This pointer is necessary, since it can be NULL or non-NULL depend on the 
> code logic.

This point is OK for me.

> 
>>>
>>> start = range->start >> osb->s_clustersize_bits;
>>> len = range->len >> osb->s_clustersize_bits;
>>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct
>> fstrim_range *range)
>>>
>>> trace_ocfs2_trim_fs(start, len, minlen);
>>>
>>> +   ocfs2_trim_fs_lock_res_init(osb);
>>> +   ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>>
>> I don't get why try to lock here and if fails, acquire the same lock again
>> later but wait until granted.
> Please think about the user case, the patch is only used to handle this case.
> When the administer configures a fstrim schedule task on each node, then each 
> node will trigger a fstrim on shared disks concurrently.
> In this case, we should avoid duplicated fstrim on a shared disk since this 
> will waste CPU/IO resources and affect SSD lifetime sometimes.

I'm not worrying about that trimfs will affect SSD's lifetime quite a lot, 
since physical-logical address converting table resides in RAM while SSD is 
working.
And that table won't be at a big scale. My point here is not affecting this 
patch. Just a tip here.
> Firstly, we use try_lock to get fstrim dlm lock to identify if there is any 
> other node which is doing fstrim on the disk.
> If not, this node is the first one, this node should do fstrim operation on 
> the disk.
> If yes, this node is not the first one, this node should wait until the first 
> node is done for fstrim operation, then return the result from DLM lock's 
> value.
> 
>> Can it just acquire the _trimfs_ lock as a blocking one directly here?
> We can not do a blocking lock directly, since we need to identify if there is 
> any other node has being do fstrim operation when this node start to do 
> fstrim.

Thanks for your elaboration.

Well how about the third node trying to trimming fs too?
It needs LVB from the second node.
But it seems that the second node can't provide a valid LVB.
So the third node will perform trimfs once more.

IOW, three nodes are trying to trimming fs concurrently. Is your patch able to 
handle such a scenario?

Even the second lock request with QUEUE set just follows 
ocfs2_trim_fs_lock_res_uninit() will not get rid of concurrent trimfs.

> 
>>
>>> +   if (ret < 0) {
>>> +   if (ret != -EAGAIN) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>> +   goto out_unlock;
>>> +   }
>>> +
>>> +   mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>>> +"finish, which is running from another node.\n",
>>> +osb->dev_str);
>>> +   ret = ocfs2_trim_fs_lock(osb, , 0);
>>> +   if (ret < 0) {
>>> +   mlog_errno(ret);
>>> +   ocfs2_trim_fs_lock_res_uninit(osb);
>>
>> In ocfs2_trim_fs_lock_res_uninit(), you drop lock. But it is never granted.
>> Still need to drop lock resource?
> Yes, we need to init/uninit fstrim dlm lock resource for each time.
> Otherwise, trylock does not work, this is a little different from other dlm 
> lock usage in ocfs2.

This point is OK for now, too.
> 
>>
>>> +   goto out_unlock;

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> Hi Gang,
> 
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>> 
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>> 
>> Signed-off-by: Gang He 
>> ---
>>   fs/ocfs2/alloc.c | 44 
>>   1 file changed, 44 insertions(+)
>> 
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
> 
> I think *pinfo* is not necessary.
This pointer is necessary, since it can be NULL or non-NULL depend on the code 
logic.  

>>   
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>   
>>  trace_ocfs2_trim_fs(start, len, minlen);
>>   
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> 
> I don't get why try to lock here and if fails, acquire the same lock again 
> later but wait until granted.
Please think about the user case, the patch is only used to handle this case.
When the administer configures a fstrim schedule task on each node, then each 
node will trigger a fstrim on shared disks concurrently.
In this case, we should avoid duplicated fstrim on a shared disk since this 
will waste CPU/IO resources and affect SSD lifetime sometimes.
Firstly, we use try_lock to get fstrim dlm lock to identify if there is any 
other node which is doing fstrim on the disk.
If not, this node is the first one, this node should do fstrim operation on the 
disk.
If yes, this node is not the first one, this node should wait until the first 
node is done for fstrim operation, then return the result from DLM lock's value.

> Can it just acquire the _trimfs_ lock as a blocking one directly here?
We can not do a blocking lock directly, since we need to identify if there is 
any other node has being do fstrim operation when this node start to do fstrim.

> 
>> +if (ret < 0) {
>> +if (ret != -EAGAIN) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>> +goto out_unlock;
>> +}
>> +
>> +mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>> + "finish, which is running from another node.\n",
>> + osb->dev_str);
>> +ret = ocfs2_trim_fs_lock(osb, , 0);
>> +if (ret < 0) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
> 
> In ocfs2_trim_fs_lock_res_uninit(), you drop lock. But it is never granted.
> Still need to drop lock resource?
Yes, we need to init/uninit fstrim dlm lock resource for each time.
Otherwise, trylock does not work, this is a little different from other dlm 
lock usage in ocfs2.

> 
>> +goto out_unlock;
>> +}
>> +
>> +if (info.tf_valid && info.tf_success &&
>> +info.tf_start == start && info.tf_len == len &&
>> +info.tf_minlen == minlen) {
>> +/* Avoid sending duplicated trim to a shared device */
>> +mlog(ML_NOTICE, "The same trim on device (%s) was "
>> + "just done from node (%u), return.\n",
>> + osb->dev_str, info.tf_nodenum);
>> +range->len = info.tf_trimlen;
>> +goto out_trimunlock;
>> +}
>> +}
>> +
>> +info.tf_nodenum = osb->node_num;
>> +info.tf_start = start;
>> +info.tf_len = len;
>> +info.tf_minlen = minlen;
> 
> If we faild during dong trimfs, I think we should not cache above info in 
> LVB.
It is necessary, if the second node is waiting the first node, the first node 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-10 Thread Gang He
Hi Changwei,


>>> 
> Hi Gang,
> 
> On 2017/12/14 13:16, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>> 
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>> 
>> Signed-off-by: Gang He 
>> ---
>>   fs/ocfs2/alloc.c | 44 
>>   1 file changed, 44 insertions(+)
>> 
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
> 
> I think *pinfo* is not necessary.
This pointer is necessary, since it can be NULL or non-NULL depend on the code 
logic.  

>>   
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>   
>>  trace_ocfs2_trim_fs(start, len, minlen);
>>   
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> 
> I don't get why try to lock here and if fails, acquire the same lock again 
> later but wait until granted.
Please think about the user case, the patch is only used to handle this case.
When the administer configures a fstrim schedule task on each node, then each 
node will trigger a fstrim on shared disks concurrently.
In this case, we should avoid duplicated fstrim on a shared disk since this 
will waste CPU/IO resources and affect SSD lifetime sometimes.
Firstly, we use try_lock to get fstrim dlm lock to identify if there is any 
other node which is doing fstrim on the disk.
If not, this node is the first one, this node should do fstrim operation on the 
disk.
If yes, this node is not the first one, this node should wait until the first 
node is done for fstrim operation, then return the result from DLM lock's value.

> Can it just acquire the _trimfs_ lock as a blocking one directly here?
We can not do a blocking lock directly, since we need to identify if there is 
any other node has being do fstrim operation when this node start to do fstrim.

> 
>> +if (ret < 0) {
>> +if (ret != -EAGAIN) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>> +goto out_unlock;
>> +}
>> +
>> +mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>> + "finish, which is running from another node.\n",
>> + osb->dev_str);
>> +ret = ocfs2_trim_fs_lock(osb, , 0);
>> +if (ret < 0) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
> 
> In ocfs2_trim_fs_lock_res_uninit(), you drop lock. But it is never granted.
> Still need to drop lock resource?
Yes, we need to init/uninit fstrim dlm lock resource for each time.
Otherwise, trylock does not work, this is a little different from other dlm 
lock usage in ocfs2.

> 
>> +goto out_unlock;
>> +}
>> +
>> +if (info.tf_valid && info.tf_success &&
>> +info.tf_start == start && info.tf_len == len &&
>> +info.tf_minlen == minlen) {
>> +/* Avoid sending duplicated trim to a shared device */
>> +mlog(ML_NOTICE, "The same trim on device (%s) was "
>> + "just done from node (%u), return.\n",
>> + osb->dev_str, info.tf_nodenum);
>> +range->len = info.tf_trimlen;
>> +goto out_trimunlock;
>> +}
>> +}
>> +
>> +info.tf_nodenum = osb->node_num;
>> +info.tf_start = start;
>> +info.tf_len = len;
>> +info.tf_minlen = minlen;
> 
> If we faild during dong trimfs, I think we should not cache above info in 
> LVB.
It is necessary, if the second node is waiting the first node, the first node 
fails to do 

Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-09 Thread Changwei Ge
Hi Gang,

On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
> 
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
> 
> Signed-off-by: Gang He 
> ---
>   fs/ocfs2/alloc.c | 44 
>   1 file changed, 44 insertions(+)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

I think *pinfo* is not necessary.
>   
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   
>   trace_ocfs2_trim_fs(start, len, minlen);
>   
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

I don't get why try to lock here and if fails, acquire the same lock again 
later but wait until granted.
Can it just acquire the _trimfs_ lock as a blocking one directly here?

> + if (ret < 0) {
> + if (ret != -EAGAIN) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +  "finish, which is running from another node.\n",
> +  osb->dev_str);
> + ret = ocfs2_trim_fs_lock(osb, , 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);

In ocfs2_trim_fs_lock_res_uninit(), you drop lock. But it is never granted.
Still need to drop lock resource?

> + goto out_unlock;
> + }
> +
> + if (info.tf_valid && info.tf_success &&
> + info.tf_start == start && info.tf_len == len &&
> + info.tf_minlen == minlen) {
> + /* Avoid sending duplicated trim to a shared device */
> + mlog(ML_NOTICE, "The same trim on device (%s) was "
> +  "just done from node (%u), return.\n",
> +  osb->dev_str, info.tf_nodenum);
> + range->len = info.tf_trimlen;
> + goto out_trimunlock;
> + }
> + }
> +
> + info.tf_nodenum = osb->node_num;
> + info.tf_start = start;
> + info.tf_len = len;
> + info.tf_minlen = minlen;

If we faild during dong trimfs, I think we should not cache above info in LVB.
BTW, it seems that this patch is on top of  'try lock' patches which you 
previously sent out.
Are they related?

Thanks,
Changwei

> +
>   /* Determine first and last group to examine based on start and len */
>   first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>   if (first_group == osb->first_cluster_group_blkno)
> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   }
>   range->len = trimmed * sb->s_blocksize;
> +
> + info.tf_trimlen = range->len;
> + info.tf_success = (ret ? 0 : 1);
> + pinfo = 
> +out_trimunlock:
> + ocfs2_trim_fs_unlock(osb, pinfo);
> + ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>   ocfs2_inode_unlock(main_bm_inode, 0);
>   brelse(main_bm_bh);
> 



Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-09 Thread Changwei Ge
Hi Gang,

On 2017/12/14 13:16, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
> 
> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
> 
> Signed-off-by: Gang He 
> ---
>   fs/ocfs2/alloc.c | 44 
>   1 file changed, 44 insertions(+)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;

I think *pinfo* is not necessary.
>   
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   
>   trace_ocfs2_trim_fs(start, len, minlen);
>   
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);

I don't get why try to lock here and if fails, acquire the same lock again 
later but wait until granted.
Can it just acquire the _trimfs_ lock as a blocking one directly here?

> + if (ret < 0) {
> + if (ret != -EAGAIN) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +  "finish, which is running from another node.\n",
> +  osb->dev_str);
> + ret = ocfs2_trim_fs_lock(osb, , 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);

In ocfs2_trim_fs_lock_res_uninit(), you drop lock. But it is never granted.
Still need to drop lock resource?

> + goto out_unlock;
> + }
> +
> + if (info.tf_valid && info.tf_success &&
> + info.tf_start == start && info.tf_len == len &&
> + info.tf_minlen == minlen) {
> + /* Avoid sending duplicated trim to a shared device */
> + mlog(ML_NOTICE, "The same trim on device (%s) was "
> +  "just done from node (%u), return.\n",
> +  osb->dev_str, info.tf_nodenum);
> + range->len = info.tf_trimlen;
> + goto out_trimunlock;
> + }
> + }
> +
> + info.tf_nodenum = osb->node_num;
> + info.tf_start = start;
> + info.tf_len = len;
> + info.tf_minlen = minlen;

If we faild during dong trimfs, I think we should not cache above info in LVB.
BTW, it seems that this patch is on top of  'try lock' patches which you 
previously sent out.
Are they related?

Thanks,
Changwei

> +
>   /* Determine first and last group to examine based on start and len */
>   first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>   if (first_group == osb->first_cluster_group_blkno)
> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   }
>   range->len = trimmed * sb->s_blocksize;
> +
> + info.tf_trimlen = range->len;
> + info.tf_success = (ret ? 0 : 1);
> + pinfo = 
> +out_trimunlock:
> + ocfs2_trim_fs_unlock(osb, pinfo);
> + ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>   ocfs2_inode_unlock(main_bm_inode, 0);
>   brelse(main_bm_bh);
> 



Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-03 Thread Gang He
Hi Alex,


>>> 
> Hi Gang,
> 
> On 2017/12/14 13:14, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>> 
> For the same purpose, can we take global bitmap meta lock EXMODE instead of
> add a new trimfs dlm lock?
I do not think using EXMODE lock to global bitmap meta can handle this case.
this patch's purpose is to avoid duplicated trim when the nodes in cluster are 
configured to a schedule fstrim (usually the administrator do like that).
But, there are lots of path to use EXMODE lock to global bitmap meta, we can 
not know which is used to trim fs.
Second, we can not use global bitmap meta lock to save fstrim related data and 
trylock.

Thanks
Gang 

> 
> Thanks,
> Alex
> 
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>> 
>> Signed-off-by: Gang He 
>> ---
>>  fs/ocfs2/alloc.c | 44 
>>  1 file changed, 44 insertions(+)
>> 
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>  
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  
>>  trace_ocfs2_trim_fs(start, len, minlen);
>>  
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>> +if (ret < 0) {
>> +if (ret != -EAGAIN) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>> +goto out_unlock;
>> +}
>> +
>> +mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>> + "finish, which is running from another node.\n",
>> + osb->dev_str);
>> +ret = ocfs2_trim_fs_lock(osb, , 0);
>> +if (ret < 0) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>> +goto out_unlock;
>> +}
>> +
>> +if (info.tf_valid && info.tf_success &&
>> +info.tf_start == start && info.tf_len == len &&
>> +info.tf_minlen == minlen) {
>> +/* Avoid sending duplicated trim to a shared device */
>> +mlog(ML_NOTICE, "The same trim on device (%s) was "
>> + "just done from node (%u), return.\n",
>> + osb->dev_str, info.tf_nodenum);
>> +range->len = info.tf_trimlen;
>> +goto out_trimunlock;
>> +}
>> +}
>> +
>> +info.tf_nodenum = osb->node_num;
>> +info.tf_start = start;
>> +info.tf_len = len;
>> +info.tf_minlen = minlen;
>> +
>>  /* Determine first and last group to examine based on start and len */
>>  first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>  if (first_group == osb->first_cluster_group_blkno)
>> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>  }
>>  range->len = trimmed * sb->s_blocksize;
>> +
>> +info.tf_trimlen = range->len;
>> +info.tf_success = (ret ? 0 : 1);
>> +pinfo = 
>> +out_trimunlock:
>> +ocfs2_trim_fs_unlock(osb, pinfo);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>>  out_unlock:
>>  ocfs2_inode_unlock(main_bm_inode, 0);
>>  brelse(main_bm_bh);
>> 


Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-03 Thread Gang He
Hi Alex,


>>> 
> Hi Gang,
> 
> On 2017/12/14 13:14, Gang He wrote:
>> As you know, ocfs2 has support trim the underlying disk via
>> fstrim command. But there is a problem, ocfs2 is a shared disk
>> cluster file system, if the user configures a scheduled fstrim
>> job on each file system node, this will trigger multiple nodes
>> trim a shared disk simultaneously, it is very wasteful for CPU
>> and IO consumption, also might negatively affect the lifetime
>> of poor-quality SSD devices.
>> Then, we introduce a trimfs dlm lock to communicate with each
>> other in this case, which will make only one fstrim command to
>> do the trimming on a shared disk among the cluster, the fstrim
>> commands from the other nodes should wait for the first fstrim
>> to finish and returned success directly, to avoid running a the
>> same trim on the shared disk again.
>> 
> For the same purpose, can we take global bitmap meta lock EXMODE instead of
> add a new trimfs dlm lock?
I do not think using EXMODE lock to global bitmap meta can handle this case.
this patch's purpose is to avoid duplicated trim when the nodes in cluster are 
configured to a schedule fstrim (usually the administrator do like that).
But, there are lots of path to use EXMODE lock to global bitmap meta, we can 
not know which is used to trim fs.
Second, we can not use global bitmap meta lock to save fstrim related data and 
trylock.

Thanks
Gang 

> 
> Thanks,
> Alex
> 
>> Compare with first version, I change the fstrim commands' returned
>> value and behavior in case which meets a fstrim command is running
>> on a shared disk.
>> 
>> Signed-off-by: Gang He 
>> ---
>>  fs/ocfs2/alloc.c | 44 
>>  1 file changed, 44 insertions(+)
>> 
>> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
>> index ab5105f..5c9c3e2 100644
>> --- a/fs/ocfs2/alloc.c
>> +++ b/fs/ocfs2/alloc.c
>> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  struct buffer_head *gd_bh = NULL;
>>  struct ocfs2_dinode *main_bm;
>>  struct ocfs2_group_desc *gd = NULL;
>> +struct ocfs2_trim_fs_info info, *pinfo = NULL;
>>  
>>  start = range->start >> osb->s_clustersize_bits;
>>  len = range->len >> osb->s_clustersize_bits;
>> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  
>>  trace_ocfs2_trim_fs(start, len, minlen);
>>  
>> +ocfs2_trim_fs_lock_res_init(osb);
>> +ret = ocfs2_trim_fs_lock(osb, NULL, 1);
>> +if (ret < 0) {
>> +if (ret != -EAGAIN) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>> +goto out_unlock;
>> +}
>> +
>> +mlog(ML_NOTICE, "Wait for trim on device (%s) to "
>> + "finish, which is running from another node.\n",
>> + osb->dev_str);
>> +ret = ocfs2_trim_fs_lock(osb, , 0);
>> +if (ret < 0) {
>> +mlog_errno(ret);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>> +goto out_unlock;
>> +}
>> +
>> +if (info.tf_valid && info.tf_success &&
>> +info.tf_start == start && info.tf_len == len &&
>> +info.tf_minlen == minlen) {
>> +/* Avoid sending duplicated trim to a shared device */
>> +mlog(ML_NOTICE, "The same trim on device (%s) was "
>> + "just done from node (%u), return.\n",
>> + osb->dev_str, info.tf_nodenum);
>> +range->len = info.tf_trimlen;
>> +goto out_trimunlock;
>> +}
>> +}
>> +
>> +info.tf_nodenum = osb->node_num;
>> +info.tf_start = start;
>> +info.tf_len = len;
>> +info.tf_minlen = minlen;
>> +
>>  /* Determine first and last group to examine based on start and len */
>>  first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>>  if (first_group == osb->first_cluster_group_blkno)
>> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>>  group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>>  }
>>  range->len = trimmed * sb->s_blocksize;
>> +
>> +info.tf_trimlen = range->len;
>> +info.tf_success = (ret ? 0 : 1);
>> +pinfo = 
>> +out_trimunlock:
>> +ocfs2_trim_fs_unlock(osb, pinfo);
>> +ocfs2_trim_fs_lock_res_uninit(osb);
>>  out_unlock:
>>  ocfs2_inode_unlock(main_bm_inode, 0);
>>  brelse(main_bm_bh);
>> 


Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-03 Thread alex chen
Hi Gang,

On 2017/12/14 13:14, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
> 
For the same purpose, can we take global bitmap meta lock EXMODE instead of
add a new trimfs dlm lock?

Thanks,
Alex

> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
> 
> Signed-off-by: Gang He 
> ---
>  fs/ocfs2/alloc.c | 44 
>  1 file changed, 44 insertions(+)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;
>  
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>  
>   trace_ocfs2_trim_fs(start, len, minlen);
>  
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> + if (ret < 0) {
> + if (ret != -EAGAIN) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +  "finish, which is running from another node.\n",
> +  osb->dev_str);
> + ret = ocfs2_trim_fs_lock(osb, , 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + if (info.tf_valid && info.tf_success &&
> + info.tf_start == start && info.tf_len == len &&
> + info.tf_minlen == minlen) {
> + /* Avoid sending duplicated trim to a shared device */
> + mlog(ML_NOTICE, "The same trim on device (%s) was "
> +  "just done from node (%u), return.\n",
> +  osb->dev_str, info.tf_nodenum);
> + range->len = info.tf_trimlen;
> + goto out_trimunlock;
> + }
> + }
> +
> + info.tf_nodenum = osb->node_num;
> + info.tf_start = start;
> + info.tf_len = len;
> + info.tf_minlen = minlen;
> +
>   /* Determine first and last group to examine based on start and len */
>   first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>   if (first_group == osb->first_cluster_group_blkno)
> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   }
>   range->len = trimmed * sb->s_blocksize;
> +
> + info.tf_trimlen = range->len;
> + info.tf_success = (ret ? 0 : 1);
> + pinfo = 
> +out_trimunlock:
> + ocfs2_trim_fs_unlock(osb, pinfo);
> + ocfs2_trim_fs_lock_res_uninit(osb);
>  out_unlock:
>   ocfs2_inode_unlock(main_bm_inode, 0);
>   brelse(main_bm_bh);
> 



Re: [Ocfs2-devel] [PATCH v2 2/2] ocfs2: add trimfs lock to avoid duplicated trims in cluster

2018-01-03 Thread alex chen
Hi Gang,

On 2017/12/14 13:14, Gang He wrote:
> As you know, ocfs2 has support trim the underlying disk via
> fstrim command. But there is a problem, ocfs2 is a shared disk
> cluster file system, if the user configures a scheduled fstrim
> job on each file system node, this will trigger multiple nodes
> trim a shared disk simultaneously, it is very wasteful for CPU
> and IO consumption, also might negatively affect the lifetime
> of poor-quality SSD devices.
> Then, we introduce a trimfs dlm lock to communicate with each
> other in this case, which will make only one fstrim command to
> do the trimming on a shared disk among the cluster, the fstrim
> commands from the other nodes should wait for the first fstrim
> to finish and returned success directly, to avoid running a the
> same trim on the shared disk again.
> 
For the same purpose, can we take global bitmap meta lock EXMODE instead of
add a new trimfs dlm lock?

Thanks,
Alex

> Compare with first version, I change the fstrim commands' returned
> value and behavior in case which meets a fstrim command is running
> on a shared disk.
> 
> Signed-off-by: Gang He 
> ---
>  fs/ocfs2/alloc.c | 44 
>  1 file changed, 44 insertions(+)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index ab5105f..5c9c3e2 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7382,6 +7382,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   struct buffer_head *gd_bh = NULL;
>   struct ocfs2_dinode *main_bm;
>   struct ocfs2_group_desc *gd = NULL;
> + struct ocfs2_trim_fs_info info, *pinfo = NULL;
>  
>   start = range->start >> osb->s_clustersize_bits;
>   len = range->len >> osb->s_clustersize_bits;
> @@ -7419,6 +7420,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>  
>   trace_ocfs2_trim_fs(start, len, minlen);
>  
> + ocfs2_trim_fs_lock_res_init(osb);
> + ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> + if (ret < 0) {
> + if (ret != -EAGAIN) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +  "finish, which is running from another node.\n",
> +  osb->dev_str);
> + ret = ocfs2_trim_fs_lock(osb, , 0);
> + if (ret < 0) {
> + mlog_errno(ret);
> + ocfs2_trim_fs_lock_res_uninit(osb);
> + goto out_unlock;
> + }
> +
> + if (info.tf_valid && info.tf_success &&
> + info.tf_start == start && info.tf_len == len &&
> + info.tf_minlen == minlen) {
> + /* Avoid sending duplicated trim to a shared device */
> + mlog(ML_NOTICE, "The same trim on device (%s) was "
> +  "just done from node (%u), return.\n",
> +  osb->dev_str, info.tf_nodenum);
> + range->len = info.tf_trimlen;
> + goto out_trimunlock;
> + }
> + }
> +
> + info.tf_nodenum = osb->node_num;
> + info.tf_start = start;
> + info.tf_len = len;
> + info.tf_minlen = minlen;
> +
>   /* Determine first and last group to examine based on start and len */
>   first_group = ocfs2_which_cluster_group(main_bm_inode, start);
>   if (first_group == osb->first_cluster_group_blkno)
> @@ -7463,6 +7500,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>   group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>   }
>   range->len = trimmed * sb->s_blocksize;
> +
> + info.tf_trimlen = range->len;
> + info.tf_success = (ret ? 0 : 1);
> + pinfo = 
> +out_trimunlock:
> + ocfs2_trim_fs_unlock(osb, pinfo);
> + ocfs2_trim_fs_lock_res_uninit(osb);
>  out_unlock:
>   ocfs2_inode_unlock(main_bm_inode, 0);
>   brelse(main_bm_bh);
>