Hi Gang,

Most parts of this patch look sane to me, just a tiny question...

On 2019/1/11 17:01, Gang He wrote:
> The user reported this problem, the upper application IO was
> timeout when fstrim was running on this ocfs2 partition. the
> application monitoring resource agent considered that this
> application did not work, then this node was fenced by the cluster
> brain (e.g. pacemaker).
> The root cause is that fstrim thread always holds main_bm meta-file
> related locks until all the cluster groups are trimmed.
> This patch will make fstrim thread release main_bm meta-file
> related locks when each cluster group is trimmed, this will let
> the current application IO has a chance to claim the clusters from
> main_bm meta-file.
> 
> Signed-off-by: Gang He <g...@suse.com>
> ---
>   fs/ocfs2/alloc.c       | 159 +++++++++++++++++++++++++----------------
>   fs/ocfs2/dlmglue.c     |   5 ++
>   fs/ocfs2/ocfs2.h       |   1 +
>   fs/ocfs2/ocfs2_trace.h |   2 +
>   fs/ocfs2/super.c       |   2 +
>   5 files changed, 106 insertions(+), 63 deletions(-)
> 
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index d1cbb27808e2..6f0999015a44 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb,
>       return count;
>   }
>   
> -int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +static
> +int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range)
>   {
>       struct ocfs2_super *osb = OCFS2_SB(sb);
> -     u64 start, len, trimmed, first_group, last_group, group;
> +     u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0;
>       int ret, cnt;
>       u32 first_bit, last_bit, minlen;
>       struct buffer_head *main_bm_bh = NULL;
> @@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>       struct buffer_head *gd_bh = NULL;
>       struct ocfs2_dinode *main_bm;
>       struct ocfs2_group_desc *gd = NULL;
> -     struct ocfs2_trim_fs_info info, *pinfo = NULL;
>   
>       start = range->start >> osb->s_clustersize_bits;
>       len = range->len >> osb->s_clustersize_bits;
> @@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>       if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
>               return -EINVAL;
>   
> +     trace_ocfs2_trim_mainbm(start, len, minlen);
> +
> +next_group:
>       main_bm_inode = ocfs2_get_system_file_inode(osb,
>                                                   GLOBAL_BITMAP_SYSTEM_INODE,
>                                                   OCFS2_INVALID_SLOT);
> @@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>       }
>       main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data;
>   
> -     if (start >= le32_to_cpu(main_bm->i_clusters)) {
> -             ret = -EINVAL;
> -             goto out_unlock;
> -     }
> -
> -     len = range->len >> osb->s_clustersize_bits;
> -     if (start + len > le32_to_cpu(main_bm->i_clusters))
> -             len = le32_to_cpu(main_bm->i_clusters) - start;
> -
> -     trace_ocfs2_trim_fs(start, len, minlen);
> -
> -     ocfs2_trim_fs_lock_res_init(osb);
> -     ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> -     if (ret < 0) {
> -             if (ret != -EAGAIN) {
> -                     mlog_errno(ret);
> -                     ocfs2_trim_fs_lock_res_uninit(osb);
> +     /*
> +      * Do some check before trim the first group.
> +      */
> +     if (!group) {
> +             if (start >= le32_to_cpu(main_bm->i_clusters)) {
> +                     ret = -EINVAL;
>                       goto out_unlock;
>               }
>   
> -             mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> -                  "finish, which is running from another node.\n",
> -                  osb->dev_str);
> -             ret = ocfs2_trim_fs_lock(osb, &info, 0);
> -             if (ret < 0) {
> -                     mlog_errno(ret);
> -                     ocfs2_trim_fs_lock_res_uninit(osb);
> -                     goto out_unlock;
> -             }
> +             if (start + len > le32_to_cpu(main_bm->i_clusters))
> +                     len = le32_to_cpu(main_bm->i_clusters) - start;
>   
> -             if (info.tf_valid && info.tf_success &&
> -                 info.tf_start == start && info.tf_len == len &&
> -                 info.tf_minlen == minlen) {
> -                     /* Avoid sending duplicated trim to a shared device */
> -                     mlog(ML_NOTICE, "The same trim on device (%s) was "
> -                          "just done from node (%u), return.\n",
> -                          osb->dev_str, info.tf_nodenum);
> -                     range->len = info.tf_trimlen;
> -                     goto out_trimunlock;
> -             }
> +             /*
> +              * Determine first and last group to examine based on
> +              * start and len
> +              */
> +             first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> +             if (first_group == osb->first_cluster_group_blkno)
> +                     first_bit = start;
> +             else
> +                     first_bit = start - ocfs2_blocks_to_clusters(sb,
> +                                                             first_group);
> +             last_group = ocfs2_which_cluster_group(main_bm_inode,
> +                                                    start + len - 1);
> +             group = first_group;
>       }
>   
> -     info.tf_nodenum = osb->node_num;
> -     info.tf_start = start;
> -     info.tf_len = len;
> -     info.tf_minlen = minlen;
> -
> -     /* Determine first and last group to examine based on start and len */
> -     first_group = ocfs2_which_cluster_group(main_bm_inode, start);
> -     if (first_group == osb->first_cluster_group_blkno)
> -             first_bit = start;
> -     else
> -             first_bit = start - ocfs2_blocks_to_clusters(sb, first_group);
> -     last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
> -     last_bit = osb->bitmap_cpg;
> -
> -     trimmed = 0;
> -     for (group = first_group; group <= last_group;) {
> +     do {
>               if (first_bit + len >= osb->bitmap_cpg)
>                       last_bit = osb->bitmap_cpg;
>               else
> @@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct 
> fstrim_range *range)
>                       group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
>               else
>                       group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg);
> -     }
> -     range->len = trimmed * sb->s_blocksize;
> +     } while (0);
>   
> -     info.tf_trimlen = range->len;
> -     info.tf_success = (ret ? 0 : 1);
> -     pinfo = &info;
> -out_trimunlock:
> -     ocfs2_trim_fs_unlock(osb, pinfo);
> -     ocfs2_trim_fs_lock_res_uninit(osb);
>   out_unlock:
>       ocfs2_inode_unlock(main_bm_inode, 0);
>       brelse(main_bm_bh);
> +     main_bm_bh = NULL;
>   out_mutex:
>       inode_unlock(main_bm_inode);
>       iput(main_bm_inode);
> +
> +     /*
> +      * If all the groups trim are not done or failed, but we should release
> +      * main_bm related locks for avoiding the current IO starve, then go to
> +      * trim the next group
> +      */
> +     if (ret >= 0 && group <= last_group)
> +             goto next_group;
>   out:
> +     range->len = trimmed * sb->s_blocksize;
> +     return ret;
> +}
> +
> +int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
> +{
> +     int ret;
> +     struct ocfs2_super *osb = OCFS2_SB(sb);
> +     struct ocfs2_trim_fs_info info, *pinfo = NULL;
> +
> +     ocfs2_trim_fs_lock_res_init(osb);
> +
> +     trace_ocfs2_trim_fs(range->start, range->len, range->minlen);
> +
> +     ret = ocfs2_trim_fs_lock(osb, NULL, 1);
> +     if (ret < 0) {
> +             if (ret != -EAGAIN) {
> +                     mlog_errno(ret);
> +                     ocfs2_trim_fs_lock_res_uninit(osb);
> +                     return ret;
> +             }
> +
> +             mlog(ML_NOTICE, "Wait for trim on device (%s) to "
> +                  "finish, which is running from another node.\n",
> +                  osb->dev_str);
> +             ret = ocfs2_trim_fs_lock(osb, &info, 0);
> +             if (ret < 0) {
> +                     mlog_errno(ret);
> +                     ocfs2_trim_fs_lock_res_uninit(osb);
> +                     return ret;
> +             }
> +
> +             if (info.tf_valid && info.tf_success &&
> +                 info.tf_start == range->start &&
> +                 info.tf_len == range->len &&
> +                 info.tf_minlen == range->minlen) {
> +                     /* Avoid sending duplicated trim to a shared device */
> +                     mlog(ML_NOTICE, "The same trim on device (%s) was "
> +                          "just done from node (%u), return.\n",
> +                          osb->dev_str, info.tf_nodenum);
> +                     range->len = info.tf_trimlen;
> +                     goto out;
> +             }
> +     }
> +
> +     info.tf_nodenum = osb->node_num;
> +     info.tf_start = range->start;
> +     info.tf_len = range->len;
> +     info.tf_minlen = range->minlen;
> +
> +     ret = ocfs2_trim_mainbm(sb, range);
> +
> +     info.tf_trimlen = range->len;
> +     info.tf_success = (ret < 0 ? 0 : 1);
> +     pinfo = &info;
> +out:
> +     ocfs2_trim_fs_unlock(osb, pinfo);
> +     ocfs2_trim_fs_lock_res_uninit(osb);
>       return ret;
>   }
> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
> index 7c835824247e..af405586c5b1 100644
> --- a/fs/ocfs2/dlmglue.c
> +++ b/fs/ocfs2/dlmglue.c
> @@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
>   {
>       struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
>   
> +     /* Only one trimfs thread are allowed to work at the same time. */
> +     mutex_lock(&osb->obs_trim_fs_mutex);
> +

Cluster lock of fstrim have a trylock behavior, will it be better if we trylock 
here?

Thanks,
Changwei

>       ocfs2_lock_res_init_once(lockres);
>       ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name);
>       ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS,
> @@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super 
> *osb)
>   
>       ocfs2_simple_drop_lockres(osb, lockres);
>       ocfs2_lock_res_free(lockres);
> +
> +     mutex_unlock(&osb->obs_trim_fs_mutex);
>   }
>   
>   static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res,
> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
> index 4f86ac0027b5..1f029fbe8b8d 100644
> --- a/fs/ocfs2/ocfs2.h
> +++ b/fs/ocfs2/ocfs2.h
> @@ -407,6 +407,7 @@ struct ocfs2_super
>       struct ocfs2_lock_res osb_rename_lockres;
>       struct ocfs2_lock_res osb_nfs_sync_lockres;
>       struct ocfs2_lock_res osb_trim_fs_lockres;
> +     struct mutex obs_trim_fs_mutex;
>       struct ocfs2_dlm_debug *osb_dlm_debug;
>   
>       struct dentry *osb_debug_root;
> diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
> index 2ee76a90ba8f..dc4bce1649c1 100644
> --- a/fs/ocfs2/ocfs2_trace.h
> +++ b/fs/ocfs2/ocfs2_trace.h
> @@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent,
>   
>   DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group);
>   
> +DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm);
> +
>   DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs);
>   
>   /* End of trace events for fs/ocfs2/alloc.c. */
> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
> index 3415e0b09398..96ae7cedd487 100644
> --- a/fs/ocfs2/super.c
> +++ b/fs/ocfs2/super.c
> @@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb)
>       if (ocfs2_is_hard_readonly(osb))
>               goto leave;
>   
> +     mutex_init(&osb->obs_trim_fs_mutex);
> +
>       status = ocfs2_dlm_init(osb);
>       if (status < 0) {
>               mlog_errno(status);
> 

Reply via email to