** Changed in: ubuntu-power-systems
Status: Triaged => Incomplete
--
You received this bug notification because you are a member of Kernel
Packages, which is subscribed to linux in Ubuntu.
https://bugs.launchpad.net/bugs/1768115
Title:
ISST-LTE:KVM:Ubuntu1804:BostonLC:boslcp3g1: Migration guest running
with IO stress crashed@security_file_permission+0xf4/0x160.
Status in The Ubuntu-power-systems project:
Incomplete
Status in linux package in Ubuntu:
New
Bug description:
Problem Description: Migration Guest running with IO stress
crashed@security_file_permission+0xf4/0x160 after a couple of
migrations.
Steps to re-create:
Source host - boslcp3
Destination host - boslcp4
1.boslcp3 & boslcp4 installed with latest kernel
root@boslcp3:~# uname -a
Linux boslcp3 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT
2018 ppc64le ppc64le ppc64le GNU/Linux
root@boslcp3:~#
root@boslcp4:~# uname -a
Linux boslcp4 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT
2018 ppc64le ppc64le ppc64le GNU/Linux
root@boslcp3:~#
2. Installed guest boslcp3g1 with kernel and started LTP run from
boslcp3 host
root@boslcp3g1:~# uname -a
Linux boslcp3g1 4.15.0-15-generic #16+bug166877 SMP Wed Apr 18 14:47:30 CDT
2018 ppc64le ppc64le ppc64le GNU/Linux
3. Started migrating boslcp3g1 guest from source to destination & vice versa.
4. After a couple of migrations it crashed at boslcp4 & entered xmon
8:mon> t
[c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
[c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
[c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
[c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
[c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
--- Exception: c01 (System Call) at 000071f1779fe280
SP (7fffe99ece50) is in userspace
8:mon> S
msr = 8000000000001033 sprg0 = 0000000000000000
pvr = 00000000004e1202 sprg1 = c000000007a85800
dec = 00000000591e3e03 sprg2 = c000000007a85800
sp = c0000004f8a234a0 sprg3 = 0000000000010008
toc = c0000000016eae00 dar = 000000000000023c
srr0 = c0000000000c355c srr1 = 8000000000001033 dsisr = 40000000
dscr = 0000000000000000 ppr = 0010000000000000 pir = 00000011
amr = 0000000000000000 uamor = 0000000000000000
dpdes = 0000000000000000 tir = 0000000000000000 cir = 00000000
fscr = 0500000000000180 tar = 0000000000000000 pspb = 00000000
mmcr0 = 0000000080000000 mmcr1 = 0000000000000000 mmcr2 = 0000000000000000
pmc1 = 00000000 pmc2 = 00000000 pmc3 = 00000000 pmc4 = 00000000
mmcra = 0000000000000000 siar = 0000000000000000 pmc5 = 0000026c
sdar = 0000000000000000 sier = 0000000000000000 pmc6 = 00000861
ebbhr = 0000000000000000 ebbrr = 0000000000000000 bescr = 0000000000000000
iamr = 4000000000000000
pidr = 0000000000000034 tidr = 0000000000000000
cpu 0x8: Vector: 700 (Program Check) at [c0000004f8a23220]
pc: c0000000000e4854: xmon_core+0x1f24/0x3520
lr: c0000000000e4850: xmon_core+0x1f20/0x3520
sp: c0000004f8a234a0
msr: 8000000000041033
current = 0xc0000004f89faf00
paca = 0xc000000007a85800 softe: 0 irq_happened: 0x01
pid = 24028, comm = top
Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0
(Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu
4.15.0-20.21-generic 4.15.17)
cpu 0x8: Exception 700 (Program Check) in xmon, returning to main loop
[c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
[c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
[c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
[c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
[c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
--- Exception: c01 (System Call) at 000071f1779fe280
SP (7fffe99ece50) is in userspace
8:mon> r
R00 = c00000000043b7fc R16 = 0000000000000000
R01 = c0000004f8a23c90 R17 = ffffffffffffff70
R02 = c0000000016eae00 R18 = 00000a51b4bebfc8
R03 = c000000279557200 R19 = 00007fffe99edbb0
R04 = c0000003242499c0 R20 = 00000a51b4c04db0
R05 = 0000000000020000 R21 = 00000a51b4c20e90
R06 = 0000000000000004 R22 = 0000000000040f00
R07 = ffffff8100000000 R23 = 00000a51b4c06560
R08 = ffffff8000000000 R24 = ffffffffffffff80
R09 = 0000000000000000 R25 = 00000a51b4bec2b8
R10 = 0000000000000000 R26 = 000071f177bb0b20
R11 = 0000000000000000 R27 = 0000000000000000
R12 = 0000000000002000 R28 = c000000279557200
R13 = c000000007a85800 R29 = c0000004c7734210
R14 = 0000000000000000 R30 = 0000000000000000
R15 = 0000000000000000 R31 = c0000003242499c0
pc = c00000000043b808 __fsnotify_parent+0x88/0x1a0
cfar= c0000000003f9e78 dget_parent+0xe8/0x150
lr = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0
msr = 8000000000009033 cr = 28002222
ctr = c0000000006252b0 xer = 0000000000000000 trap = 300
dar = 000000000000023c dsisr = 40000000
8:mon> e
cpu 0x8: Vector: 300 (Data Access) at [c0000004f8a23a10]
pc: c00000000043b808: __fsnotify_parent+0x88/0x1a0
lr: c00000000043b7fc: __fsnotify_parent+0x7c/0x1a0
sp: c0000004f8a23c90
msr: 8000000000009033
dar: 23c
dsisr: 40000000
current = 0xc0000004f89faf00
paca = 0xc000000007a85800 softe: 0 irq_happened: 0x01
pid = 24028, comm = top
Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0
(Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu
4.15.0-20.21-generic 4.15.17)
6. Guest enters into xmon after migrating from boslcp3 to boslcp4.
>
> 8:mon> t
> [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
> [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
> [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
> [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
> [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
> --- Exception: c01 (System Call) at 000071f1779fe280
> SP (7fffe99ece50) is in userspace
> 8:mon> r
> R00 = c00000000043b7fc R16 = 0000000000000000
> R01 = c0000004f8a23c90 R17 = ffffffffffffff70
> R02 = c0000000016eae00 R18 = 00000a51b4bebfc8
> R03 = c000000279557200 R19 = 00007fffe99edbb0
> R04 = c0000003242499c0 R20 = 00000a51b4c04db0
> R05 = 0000000000020000 R21 = 00000a51b4c20e90
> R06 = 0000000000000004 R22 = 0000000000040f00
> R07 = ffffff8100000000 R23 = 00000a51b4c06560
> R08 = ffffff8000000000 R24 = ffffffffffffff80
> R09 = 0000000000000000 R25 = 00000a51b4bec2b8
> R10 = 0000000000000000 R26 = 000071f177bb0b20
> R11 = 0000000000000000 R27 = 0000000000000000
> R12 = 0000000000002000 R28 = c000000279557200
> R13 = c000000007a85800 R29 = c0000004c7734210
> R14 = 0000000000000000 R30 = 0000000000000000
> R15 = 0000000000000000 R31 = c0000003242499c0
> pc = c00000000043b808 __fsnotify_parent+0x88/0x1a0
> cfar= c0000000003f9e78 dget_parent+0xe8/0x150
> lr = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0
> msr = 8000000000009033 cr = 28002222
> ctr = c0000000006252b0 xer = 0000000000000000 trap = 300
> dar = 000000000000023c dsisr = 40000000
> BUG_ON in jbd2_journal_write_metadata_buffer
I've included xmon crash data from a more recent crash, this time a
BUG_ON in fs/jbd2/journal.c:jbd2_journal_write_metadata_buffer():
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct journal_head *jh_in,
struct buffer_head **bh_out,
sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
struct buffer_head *new_bh;
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
journal_t *journal = transaction->t_journal;
/*
* The buffer really shouldn't be locked: only the current committing
* transaction is allowed to write it, so nobody else is allowed
* to do any IO.
*
* akpm: except if we're journalling data, and write() output is
* also part of a shared mapping, and another thread has
* decided to launch a writepage() against this buffer.
*/
J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
This is not the same as the original bug, but I suspect they are part
of a class of issues we're hitting while running under very particular
circumstances which might not generally be seen during normal
operation and triggering various corner cases. As such I think it
makes sense to group them under this bug for the time being.
The general workload is running IO-heavy disk workloads on large
guests (20GB memory, 16 vcpus) with SAN-based storage, and then
performing migration during the workload. During migration we begin to
see a high occurrence of rcu_sched stall warnings, and after 1-3
hours of operations we hit filesystem-related crashes like the ones
posted. We've seen this with 2 separate FC cards, emulex and qlogic,
where we invoke QEMU through libvirt as:
C_ALL=C
PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
QEMU_AUDIO_DRV=none /usr/bin/qemu-system-ppc64 -name guest=boslcp3g1
,debug-threads=on -S -object
secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-7-boslcp3g1
/master-key.aes -machine pseries-2.10,accel=kvm,usb=off,dump-guest-
core=off,max-cpu-compat=power9 -cpu host -m 20480 -realtime mlock=off
-smp 16,maxcpus=32,sockets=4,cores=8,threads=1 -object memory-backend-
file,id=ram-node0,prealloc=yes,mem-
path=/dev/hugepages/libvirt/qemu/7-boslcp3g1,size=10737418240 -numa
node,nodeid=0,cpus=0-7,memdev=ram-node0 -object memory-backend-ram,id
=ram-node1,size=10737418240 -numa node,nodeid=1,cpus=8-15,memdev=ram-
node1 -uuid bd110ed9-dcfc-4470-b4ae-6166a56819f0 -display none -no-
user-config -nodefaults -chardev
socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-7-boslcp3g1/monitor.sock,server,nowait
-mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-
shutdown -boot menu=on,strict=on -device spapr-pci-host-
bridge,index=1,id=pci.1 -device nec-usb-xhci,id=usb,bus=pci.0,addr=0x3
-device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x2 -drive file=/home
/bionic-server-ppc64el.iso,format=raw,if=none,id=drive-
scsi0-0-0-2,readonly=on,cache=none -device scsi-
cd,bus=scsi0.0,channel=0,scsi-id=0,lun=2,drive=drive-
scsi0-0-0-2,id=scsi0-0-0-2 -drive file=/dev/disk/by-id/dm-uuid-part1
-mpath-3600507680183050d28000000000002a4,format=raw,if=none,id=drive-
virtio-disk0,cache=none -device virtio-blk-
pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-
disk0,bootindex=1 -drive file=/dev/disk/by-id/dm-uuid-part2-mpath-
3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-
disk1,cache=none -device virtio-blk-
pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk1,id=virtio-
disk1 -drive file=/dev/disk/by-id/dm-uuid-part3-mpath-
3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-
disk2,cache=none -device virtio-blk-
pci,scsi=off,bus=pci.0,addr=0x7,drive=drive-virtio-disk2,id=virtio-
disk2 -netdev tap,fd=27,id=hostnet0,vhost=on,vhostfd=30 -device
virtio-net-
pci,netdev=hostnet0,id=net0,mac=52:54:00:72:d2:69,bus=pci.0,addr=0x1,bootindex=2
-chardev pty,id=charserial0 -device spapr-
vty,chardev=charserial0,id=serial0,reg=0x30000000 -device virtio-
balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -msg timestamp=on
I will attach the libvirt XML separately
IBM is requesting general filesystem debugging assistance from
Canonical, if available, as we continue debugging...
To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1768115/+subscriptions
--
Mailing list: https://launchpad.net/~kernel-packages
Post to : [email protected]
Unsubscribe : https://launchpad.net/~kernel-packages
More help : https://help.launchpad.net/ListHelp