Hi Changwei,

On 2018/4/2 10:59, Changwei Ge wrote:
> Hi Jun,
> 
> It seems that you have posted this patch before, if I remember correctly.
> My concern is still that if you disable dlm recovery via ::migrate_done, then
> the flag DLM_LOCK_RES_RECOVERING can't be cleared.
After migrate_done is set, all lockres have already been migrated, so there
won't be any lockres left with DLM_LOCK_RES_RECOVERING set. And I have been
testing this patch for a few months.
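
To make the ordering explicit, here is a condensed sketch of the two hunks
quoted below (function bodies abridged, not the literal code):

	/* dlm_migrate_all_locks(): migrate_done is only set once no lockres
	 * is left to migrate AND no recovery is in flight, both checked
	 * under dlm->spinlock, so no lockres can remain behind with
	 * DLM_LOCK_RES_RECOVERING set. */
	spin_lock(&dlm->spinlock);
	if (!num) {
		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE)
			ret = -EAGAIN;		/* more lockres may show up after recovery */
		else
			dlm->migrate_done = 1;	/* nothing left to recover */
	}
	spin_unlock(&dlm->spinlock);

	/* dlm_do_recovery(): once migrate_done is set, recovery bails out
	 * early, so this node can no longer collect a new lockres from
	 * other live nodes before dlm_leave_domain(). */
	spin_lock(&dlm->spinlock);
	if (dlm->migrate_done) {
		spin_unlock(&dlm->spinlock);
		return 0;
	}

So once migrate_done is set there is no window in which a lockres with
DLM_LOCK_RES_RECOVERING could survive past dlm_migrate_all_locks().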

thanks,
Jun
> 
> So we can't purge the problematic lock resource, since __dlm_lockres_unused() 
> checks that flag.
> 
> As a result, umount will spin in the while loop in dlm_migrate_all_locks() forever.
> Or am I missing something?
> 
> Thanks,
> Changwei
> 
> On 2018/3/15 21:00, piaojun wrote:
>> Wait for dlm recovery to finish when migrating all lock resources, in case
>> a new lock resource is left behind after leaving the dlm domain. Such a
>> leftover lock resource will cause other nodes to BUG.
>>
>>        NodeA                       NodeB                NodeC
>>
>> umount:
>>    dlm_unregister_domain()
>>      dlm_migrate_all_locks()
>>
>>                                   NodeB down
>>
>> do recovery for NodeB
>> and collect a new lockres
>> from other live nodes:
>>
>>    dlm_do_recovery
>>      dlm_remaster_locks
>>        dlm_request_all_locks:
>>
>>    dlm_mig_lockres_handler
>>      dlm_new_lockres
>>        __dlm_insert_lockres
>>
>> at last NodeA becomes the
>> master of the new lockres
>> and leaves the domain:
>>    dlm_leave_domain()
>>
>>                                                    mount:
>>                                                      dlm_join_domain()
>>
>>                                                    touch file and request
>>                                                    for the owner of the new
>>                                                    lockres, but all the
>>                                                    other nodes said 'NO',
>>                                                    so NodeC decides to be
>>                                                    the owner, and sends an
>>                                                    assert msg to the other
>>                                                    nodes:
>>                                                    dlmlock()
>>                                                      dlm_get_lock_resource()
>>                                                        dlm_do_assert_master()
>>
>>                                                    other nodes receive the msg
>>                                                    and find two masters exist,
>>                                                    which at last causes a BUG in
>>                                                    dlm_assert_master_handler()
>>                                                    -->BUG();
>>
>> Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")
>>
>> Signed-off-by: Jun Piao <piao...@huawei.com>
>> Reviewed-by: Alex Chen <alex.c...@huawei.com>
>> Reviewed-by: Yiwen Jiang <jiangyi...@huawei.com>
>> ---
>>   fs/ocfs2/dlm/dlmcommon.h   |  1 +
>>   fs/ocfs2/dlm/dlmdomain.c   | 15 +++++++++++++++
>>   fs/ocfs2/dlm/dlmrecovery.c | 13 ++++++++++---
>>   3 files changed, 26 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
>> index 953c200..d06e27e 100644
>> --- a/fs/ocfs2/dlm/dlmcommon.h
>> +++ b/fs/ocfs2/dlm/dlmcommon.h
>> @@ -140,6 +140,7 @@ struct dlm_ctxt
>>      u8 node_num;
>>      u32 key;
>>      u8  joining_node;
>> +    u8 migrate_done; /* set to 1 means node has migrated all lock resources */
>>      wait_queue_head_t dlm_join_events;
>>      unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>>      unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>> index 25b76f0..425081b 100644
>> --- a/fs/ocfs2/dlm/dlmdomain.c
>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>> @@ -461,6 +461,19 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
>>              cond_resched_lock(&dlm->spinlock);
>>              num += n;
>>      }
>> +
>> +    if (!num) {
>> +            if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
>> +                    mlog(0, "%s: perhaps there are more lock resources "
>> +                         "need to be migrated after dlm recovery\n", dlm->name);
>> +                    ret = -EAGAIN;
>> +            } else {
>> +                    mlog(0, "%s: we won't do dlm recovery after migrating "
>> +                         "all lock resources\n", dlm->name);
>> +                    dlm->migrate_done = 1;
>> +            }
>> +    }
>> +
>>      spin_unlock(&dlm->spinlock);
>>      wake_up(&dlm->dlm_thread_wq);
>>
>> @@ -2038,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
>>      dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
>>      init_waitqueue_head(&dlm->dlm_join_events);
>>
>> +    dlm->migrate_done = 0;
>> +
>>      dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
>>      dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
>>
>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>> index 505ab42..d4336a5 100644
>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>> @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
>>
>>   static void dlm_begin_recovery(struct dlm_ctxt *dlm)
>>   {
>> -    spin_lock(&dlm->spinlock);
>> +    assert_spin_locked(&dlm->spinlock);
>>      BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
>>      printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
>>             dlm->name, dlm->reco.dead_node);
>>      dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
>> -    spin_unlock(&dlm->spinlock);
>>   }
>>
>>   static void dlm_end_recovery(struct dlm_ctxt *dlm)
>> @@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>>
>>      spin_lock(&dlm->spinlock);
>>
>> +    if (dlm->migrate_done) {
>> +            mlog(0, "%s: no need do recovery after migrating all "
>> +                 "lock resources\n", dlm->name);
>> +            spin_unlock(&dlm->spinlock);
>> +            return 0;
>> +    }
>> +
>>      /* check to see if the new master has died */
>>      if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
>>          test_bit(dlm->reco.new_master, dlm->recovery_map)) {
>> @@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>>      mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
>>           dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
>>           dlm->reco.dead_node);
>> -    spin_unlock(&dlm->spinlock);
>>
>>      /* take write barrier */
>>      /* (stops the list reshuffling thread, proxy ast handling) */
>>      dlm_begin_recovery(dlm);
>>
>> +    spin_unlock(&dlm->spinlock);
>> +
>>      if (dlm->reco.new_master == dlm->node_num)
>>              goto master_here;
>>
> .
> 
