Hi folks,
Thank you in advance for your time and support.
We have had a problem for a while, and it took a lot of time to narrow down
the cause.
Our conclusion is that a DRBD resource is apparently stuck, which in turn
causes a cascade of other problems.
The setup is:
* RHEL cluster, Pacemaker, DRBD
* httpd, httpd-fs, httpd-storage (configured as a drbd resource with 2
nodes and a diskless tiebreaker node)
Kernel stack (the problematic one):
[<ffffffffc09ffeb3>] drbd_flush_workqueue+0x63/0x80 [drbd] (<== stuck)
[<ffffffffc09ed2e2>] conn_disconnect+0x102/0x450 [drbd]
[<ffffffffc09ee438>] drbd_receiver+0x1f8/0x780 [drbd]
[<ffffffffc09fc206>] drbd_thread_setup+0xb6/0x1f0 [drbd]
[<ffffffff84ac61f1>] kthread+0xd1/0xe0
[<ffffffff8518dd1d>] ret_from_fork_nospec_begin+0x7/0x21
[<ffffffffffffffff>] 0xffffffffffffffff
Software versions used:
[root@dep-1 myusername]# rpm -qa | grep drbd
kmod-drbd90-9.0.20-1.el7_7.elrepo.x86_64
drbd90-utils-9.12.2-1.el7.elrepo.x86_64
[root@dep-1 myusername]# rpm -qa | grep pacema
pacemaker-libs-1.1.20-5.el7_7.2.x86_64
pacemaker-cli-1.1.20-5.el7_7.2.x86_64
pacemaker-1.1.20-5.el7_7.2.x86_64
pacemaker-cluster-libs-1.1.20-5.el7_7.2.x86_64
[root@dep-1 myusername]# rpm -qa | grep corosync
corosync-2.4.3-6.el7_7.1.x86_64
corosync-qdevice-2.4.3-6.el7_7.1.x86_64
corosynclib-2.4.3-6.el7_7.1.x86_64
[root@dep-1 frqadmin]# cat /etc/redhat-release
Red Hat Enterprise Linux Server release 7.7 (Maipo)
Resource config attached: drbd-resource-conf.txt.
Stack for all threads attached: drbd-resource-full-stack.txt.
Do you have any idea what could be wrong, or what is causing the thread to get stuck?
Best Regards,
Szabi.
[root@dep-1 myusername]# cat /etc/drbd.conf
include "/etc/drbd.d/global_common.conf";
include "/etc/drbd.d/puppet/*.res";
include "/etc/drbd.d/custom/*.res";
[root@cj-fw-dev-phys-dep-1 myusername]# cat "/etc/drbd.d/global_common.conf"
global {
usage-count no;
}
common {
protocol C;
}
[root@dep-1 myusername]# cat /etc/drbd.d/custom/httpd-storage.res
resource httpd-storage {
options {
quorum 2; # tested now with 2 but the same happened with majority;
}
handlers {
split-brain "/usr/lib/drbd/custom-notify-split-brain.sh";
quorum-lost "/usr/lib/drbd/custom-notify-quorum-lost.sh";
}
protocol C;
startup {
wfc-timeout 10;
degr-wfc-timeout 5;
}
disk {
on-io-error detach;
c-max-rate 900M;
c-min-rate 100M;
c-fill-target 1M;
resync-rate 300M;
}
on dep-1.domain.com {
device /dev/drbd7501;
disk /dev/mapper/lvmdata-httpd--storage;
node-id 1;
meta-disk internal;
address 10.21.24.11:7501;
}
on dep-2.domain.com {
device /dev/drbd7501;
disk /dev/mapper/lvmdata-httpd--storage;
meta-disk internal;
node-id 2;
address 10.21.24.12:7501;
}
on doc-1.domain.com {
device /dev/drbd7501;
disk none;
meta-disk internal;
node-id 3;
address 10.21.24.13:7501;
}
connection-mesh {
hosts dep-1.domain.com dep-2.domain.com doc-1.domain.com;
}
}
[root@dep-1 myusername]# for pid in `ps aux | grep '_httpd-st' | awk '{ print $2 }'`; do echo $pid; cat /proc/$pid/stack; done
12402
[<ffffffffc09ffeb3>] drbd_flush_workqueue+0x63/0x80 [drbd]
[<ffffffffc09ed2e2>] conn_disconnect+0x102/0x450 [drbd]
[<ffffffffc09ee438>] drbd_receiver+0x1f8/0x780 [drbd]
[<ffffffffc09fc206>] drbd_thread_setup+0xb6/0x1f0 [drbd]
[<ffffffff84ac61f1>] kthread+0xd1/0xe0
[<ffffffff8518dd1d>] ret_from_fork_nospec_begin+0x7/0x21
[<ffffffffffffffff>] 0xffffffffffffffff
12405
[<ffffffff84acc191>] down+0x41/0x50
[<ffffffffc0a1ebe6>] state_change_lock+0x66/0xa0 [drbd]
[<ffffffffc0a253b6>] change_cluster_wide_state+0x46/0x1170 [drbd]
[<ffffffffc0a2686d>] change_cstate_es+0x8d/0xc0 [drbd]
[<ffffffffc09ee2c2>] drbd_receiver+0x82/0x780 [drbd]
[<ffffffffc09fc206>] drbd_thread_setup+0xb6/0x1f0 [drbd]
[<ffffffff84ac61f1>] kthread+0xd1/0xe0
[<ffffffff8518dd1d>] ret_from_fork_nospec_begin+0x7/0x21
[<ffffffffffffffff>] 0xffffffffffffffff
16937
[<ffffffffc09d7f2c>] drbd_worker+0x46c/0x530 [drbd]
[<ffffffffc09fc206>] drbd_thread_setup+0xb6/0x1f0 [drbd]
[<ffffffff84ac61f1>] kthread+0xd1/0xe0
[<ffffffff8518dd1d>] ret_from_fork_nospec_begin+0x7/0x21
[<ffffffffffffffff>] 0xffffffffffffffff
16941
[<ffffffff84acc191>] down+0x41/0x50
[<ffffffffc0a1ebe6>] state_change_lock+0x66/0xa0 [drbd]
[<ffffffffc0a253b6>] change_cluster_wide_state+0x46/0x1170 [drbd]
[<ffffffffc0a2686d>] change_cstate_es+0x8d/0xc0 [drbd]
[<ffffffffc09e4ea1>] connect_work+0x191/0x260 [drbd]
[<ffffffffc09d7821>] drbd_sender+0x181/0x420 [drbd]
[<ffffffffc09fc206>] drbd_thread_setup+0xb6/0x1f0 [drbd]
[<ffffffff84ac61f1>] kthread+0xd1/0xe0
[<ffffffff8518dd1d>] ret_from_fork_nospec_begin+0x7/0x21
[<ffffffffffffffff>] 0xffffffffffffffff
16948
[<ffffffffc09d15c5>] wait_for_sender_todo+0x165/0x2b0 [drbd]
[<ffffffffc09d7945>] drbd_sender+0x2a5/0x420 [drbd]
[<ffffffffc09fc206>] drbd_thread_setup+0xb6/0x1f0 [drbd]
[<ffffffff84ac61f1>] kthread+0xd1/0xe0
[<ffffffff8518dd1d>] ret_from_fork_nospec_begin+0x7/0x21
[<ffffffffffffffff>] 0xffffffffffffffff
_______________________________________________
Star us on GITHUB: https://github.com/LINBIT
drbd-user mailing list
[email protected]
https://lists.linbit.com/mailman/listinfo/drbd-user