On 2016/3/2 17:29, xuejiufei wrote:
> When o2hb detects that a node is down, it first sets the dead node in the
> recovery map and creates ocfs2rec, which will replay the journal of the
> dead node. The o2hb thread then calls dlm_do_local_recovery_cleanup() to
> delete the locks of the dead node. Once the locks of the dead node are
> gone, locks for other nodes can be granted, and those nodes may modify the
> metadata without the journal of the dead node having been replayed. In
> detail:
>
> N1                     N2                      N3 (master)
> modify the extent
> tree of an inode,
> commit the dirty
> metadata to the
> journal, then go
> down.
>                                               o2hb thread detects that
>                                               N1 is down, sets the
>                                               recovery map, and
>                                               deletes the locks of N1.
>                                               dlm_thread flushes the
>                                               ast for the lock of N2.
>                        has not yet detected
>                        the death of N1, so
>                        its recovery map is
>                        empty.
>                        reads the inode from
>                        disk without replaying
>                        the journal of N1 and
>                        modifies the extent
>                        tree of the inode that
>                        N1 had modified.
> ocfs2rec recovers the
> journal of N1.
> The modification of N2
> is lost.
>
> The modifications of N1 and N2 are not serialized, which can leave the
> file system read-only. We can set a recovery_waiting flag on the lock
> resource after deleting the locks of the dead node, to prevent other nodes
> from acquiring the lock before dlm recovery has run. After dlm recovery,
> the recovery map on N2 is no longer empty, so
> ocfs2_inode_lock_full_nested() will wait for ocfs2 recovery.
>
> Signed-off-by: Jiufei Xue <xuejiu...@huawei.com>
The $RECOVERY lockres must not have this flag set. Setting the flag in dlm_free_dead_locks(), which has already excluded the $RECOVERY lockres, takes care of that, so this fix looks good to me.

Reviewed-by: Joseph Qi <joseph...@huawei.com>

> ---
>  fs/ocfs2/dlm/dlmcommon.h   | 5 ++++-
>  fs/ocfs2/dlm/dlmmaster.c   | 3 ++-
>  fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++++
>  fs/ocfs2/dlm/dlmthread.c   | 6 ++++--
>  4 files changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index 68c607e..1276d91 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
>  #define DLM_LOCK_RES_DROPPING_REF     0x00000040
>  #define DLM_LOCK_RES_BLOCK_DIRTY      0x00001000
>  #define DLM_LOCK_RES_SETREF_INPROG    0x00002000
> +#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
>
>  /* max milliseconds to wait to sync up a network failure with a node death */
>  #define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
> @@ -789,7 +790,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
>
>  	assert_spin_locked(&res->spinlock);
>
> -	if (res->state & DLM_LOCK_RES_RECOVERING)
> +	if (res->state & (DLM_LOCK_RES_RECOVERING|
> +			DLM_LOCK_RES_RECOVERY_WAITING))
>  		status = DLM_RECOVERING;
>  	else if (res->state & DLM_LOCK_RES_MIGRATING)
>  		status = DLM_MIGRATING;
> @@ -1009,6 +1011,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
>  {
>  	__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
>  					  DLM_LOCK_RES_RECOVERING|
> +					  DLM_LOCK_RES_RECOVERY_WAITING|
>  					  DLM_LOCK_RES_MIGRATING));
>  }
>
> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
> index 9477d6e..12d385b 100644
> --- a/fs/ocfs2/dlm/dlmmaster.c
> +++ b/fs/ocfs2/dlm/dlmmaster.c
> @@ -2432,7 +2432,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
>  		return 0;
>
>  	/* delay migration when the lockres is in RECOCERING state */
> -	if (res->state & DLM_LOCK_RES_RECOVERING)
> +	if (res->state & (DLM_LOCK_RES_RECOVERING|
> +			DLM_LOCK_RES_RECOVERY_WAITING))
>  		return 0;
>
>  	if (res->owner != dlm->node_num)
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index b94a425..099714e 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -2163,6 +2163,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
>  	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
>  		bucket = dlm_lockres_hash(dlm, i);
>  		hlist_for_each_entry(res, bucket, hash_node) {
> +			if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
> +				spin_lock(&res->spinlock);
> +				res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
> +				spin_unlock(&res->spinlock);
> +				wake_up(&res->wq);
> +			}
> +
>  			if (!(res->state & DLM_LOCK_RES_RECOVERING))
>  				continue;
> @@ -2300,6 +2307,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
>  			res->lockname.len, res->lockname.name, freed,
>  			dead_node);
>  			__dlm_print_one_lock_resource(res);
>  		}
> +		res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
>  		dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
>  	} else if (test_bit(dead_node, res->refmap)) {
>  		mlog(0, "%s:%.*s: dead node %u had a ref, but had "
> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
> index c5f6c24..73c03be 100644
> --- a/fs/ocfs2/dlm/dlmthread.c
> +++ b/fs/ocfs2/dlm/dlmthread.c
> @@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
>  	if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
>  		return 0;
>
> -	if (res->state & DLM_LOCK_RES_RECOVERING)
> +	if (res->state & (DLM_LOCK_RES_RECOVERING|
> +			DLM_LOCK_RES_RECOVERY_WAITING))
>  		return 0;
>
>  	/* Another node has this resource with this node as the master */
> @@ -700,7 +701,8 @@ static int dlm_thread(void *data)
>  		 * dirty for a short while. */
>  		BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
>  		if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
> -				  DLM_LOCK_RES_RECOVERING)) {
> +				  DLM_LOCK_RES_RECOVERING |
> +				  DLM_LOCK_RES_RECOVERY_WAITING)) {
>  			/* move it to the tail and keep going */
>  			res->state &= ~DLM_LOCK_RES_DIRTY;
>  			spin_unlock(&res->spinlock);

_______________________________________________
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel