dlm_lowcomms_stop() was not functioning properly. Correctly, we have to
wait until all processing is finished with send_workqueue and
recv_workqueue.
This problem causes the following issue. Scenario is

1. dlm_send thread:
    send_to_sock refers con->writequeue
2. main thread:
    dlm_lowcomms_stop calls list_del
3. dlm_send thread:
    send_to_sock calls list_del in writequeue_entry_complete

[ 1925.770305] dlm: canceled swork for node 4
[ 1925.772374] general protection fault: 0000 [#1] SMP
[ 1925.777930] Modules linked in: ocfs2_stack_user ocfs2 ocfs2_nodemanager 
ocfs2_stackglue dlm fmxnet(O) fmx_api(O) fmx_cu(O) igb(O) kvm_intel kvm 
irqbypass autofs4
[ 1925.794131] CPU: 3 PID: 6994 Comm: kworker/u8:0 Tainted: G           O    
4.4.39 #1
[ 1925.802684] Hardware name: TOSHIBA OX/OX, BIOS OX-P0015 12/03/2015
[ 1925.809595] Workqueue: dlm_send process_send_sockets [dlm]
[ 1925.815714] task: ffff8804398d3c00 ti: ffff88046910c000 task.ti: 
ffff88046910c000
[ 1925.824072] RIP: 0010:[<ffffffffa04bd158>]  [<ffffffffa04bd158>] 
process_send_sockets+0xf8/0x280 [dlm]
[ 1925.834480] RSP: 0018:ffff88046910fde0  EFLAGS: 00010246
[ 1925.840411] RAX: dead000000000200 RBX: 0000000000000001 RCX: 000000000000000a
[ 1925.848372] RDX: ffff88046bd980c0 RSI: 0000000000000000 RDI: ffff8804673c5670
[ 1925.856341] RBP: ffff88046910fe20 R08: 00000000000000c9 R09: 0000000000000010
[ 1925.864311] R10: ffffffff81e22fc0 R11: 0000000000000000 R12: ffff8804673c56d8
[ 1925.872281] R13: ffff8804673c5660 R14: ffff88046bd98440 R15: 0000000000000058
[ 1925.880251] FS:  0000000000000000(0000) GS:ffff88047fd80000(0000) 
knlGS:0000000000000000
[ 1925.889280] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 1925.895694] CR2: 00007fff09eadf58 CR3: 00000004690f5000 CR4: 00000000001006e0
[ 1925.903663] Stack:
[ 1925.905903]  ffff8804673c5630 ffff8804673c5620 ffff8804673c5670 
ffff88007d219b40
[ 1925.914181]  ffff88046f095800 0000000000000100 ffff8800717a1400 
ffff8804673c56d8
[ 1925.922459]  ffff88046910fe60 ffffffff81073db2 00ff880400000000 
ffff88007d219b40
[ 1925.930736] Call Trace:
[ 1925.933468]  [<ffffffff81073db2>] process_one_work+0x162/0x450
[ 1925.939983]  [<ffffffff81074459>] worker_thread+0x69/0x4a0
[ 1925.946109]  [<ffffffff810743f0>] ? rescuer_thread+0x350/0x350
[ 1925.952622]  [<ffffffff8107956f>] kthread+0xef/0x110
[ 1925.958165]  [<ffffffff81079480>] ? kthread_park+0x60/0x60
[ 1925.964283]  [<ffffffff8186ab2f>] ret_from_fork+0x3f/0x70
[ 1925.970312]  [<ffffffff81079480>] ? kthread_park+0x60/0x60
[ 1925.976436] Code: 01 00 00 48 8b 7d d0 e8 07 d3 3a e1 45 01 7e 18 45 29 7e 
1c 75 ab 41 8b 46 24 85 c0 75 a3 49 8b 16 49 8b 46 08 31 f6 48 89 42 08 <48> 89 
10 48 b8 00 01 00 00 00 00 ad de 49 8b 7e 10 49 89 06 66
[ 1925.997791] RIP  [<ffffffffa04bd158>] process_send_sockets+0xf8/0x280 [dlm]
[ 1926.005577]  RSP <ffff88046910fde0>

Signed-off-by: Tadashi Miyauchi <miyau...@toshiba-tops.co.jp>
Signed-off-by: Tsutomu Owa <tsutomu....@toshiba.co.jp>
---
 fs/dlm/lowcomms.c | 44 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index be62cf1..9f7932c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1628,11 +1628,20 @@ static int work_start(void)
        return 0;
 }
 
-static void stop_conn(struct connection *con)
+static void _stop_conn(struct connection *con, bool and_other)
 {
-       con->flags |= 0x0F;
+       mutex_lock(&con->sock_mutex);
+       set_bit(CF_READ_PENDING, &con->flags);
        if (con->sock && con->sock->sk)
                con->sock->sk->sk_user_data = NULL;
+       if (con->othercon && and_other)
+               _stop_conn(con->othercon, false);
+       mutex_unlock(&con->sock_mutex);
+}
+
+static void stop_conn(struct connection *con)
+{
+       _stop_conn(con, true);
 }
 
 static void free_conn(struct connection *con)
@@ -1644,6 +1653,32 @@ static void free_conn(struct connection *con)
        kmem_cache_free(con_cache, con);
 }
 
+static void work_flush(void)
+{
+       int ok;
+       int i;
+       struct hlist_node *n;
+       struct connection *con;
+
+       flush_workqueue(recv_workqueue);
+       flush_workqueue(send_workqueue);
+       do {
+               ok = 1;
+               foreach_conn(stop_conn);
+               flush_workqueue(recv_workqueue);
+               flush_workqueue(send_workqueue);
+               for (i = 0; i < CONN_HASH_SIZE && ok; i++) {
+                       hlist_for_each_entry_safe(con, n,
+                                                 &connection_hash[i], list) {
+                               ok &= test_bit(CF_READ_PENDING, &con->flags);
+                               if (con->othercon)
+                                       ok &= test_bit(CF_READ_PENDING,
+                                                      &con->othercon->flags);
+                       }
+               }
+       } while (!ok);
+}
+
 void dlm_lowcomms_stop(void)
 {
        /* Set all the flags to prevent any
@@ -1651,11 +1686,10 @@ void dlm_lowcomms_stop(void)
        */
        mutex_lock(&connections_lock);
        dlm_allow_conn = 0;
-       foreach_conn(stop_conn);
+       mutex_unlock(&connections_lock);
+       work_flush();
        clean_writequeues();
        foreach_conn(free_conn);
-       mutex_unlock(&connections_lock);
-
        work_stop();
 
        kmem_cache_destroy(con_cache);
-- 
2.7.4




Reply via email to