** Description changed:

  Problem Description: Migration Guest running with IO stress
  crashed@security_file_permission+0xf4/0x160 after a couple of migrations.
  
  Steps to re-create:
  
  Source host - boslcp3
  Destination host - boslcp4
  
- 1.boslcp3 & boslcp4 installed with latest kernel 
+ 1.boslcp3 & boslcp4 installed with latest kernel
  root@boslcp3:~# uname -a
  Linux boslcp3 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT 
2018 ppc64le ppc64le ppc64le GNU/Linux
  root@boslcp3:~#
  
  root@boslcp4:~# uname -a
  Linux boslcp4 4.15.0-20-generic #21+bug166588 SMP Thu Apr 26 15:05:59 CDT 
2018 ppc64le ppc64le ppc64le GNU/Linux
  root@boslcp3:~#
  
  2. Installed guest boslcp3g1 with kernel and started LTP run from
  boslcp3 host
  
  root@boslcp3g1:~# uname -a
  Linux boslcp3g1 4.15.0-15-generic #16+bug166877 SMP Wed Apr 18 14:47:30 CDT 
2018 ppc64le ppc64le ppc64le GNU/Linux
  
  3. Started migrating boslcp3g1 guest from source to destination & vice versa.
  4. After a couple of migrations it crashed on boslcp4 & entered xmon
  
  8:mon> t
  [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
  [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
  [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
  [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
  [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
  --- Exception: c01 (System Call) at 000071f1779fe280
  SP (7fffe99ece50) is in userspace
  8:mon> S
  msr    = 8000000000001033  sprg0 = 0000000000000000
  pvr    = 00000000004e1202  sprg1 = c000000007a85800
  dec    = 00000000591e3e03  sprg2 = c000000007a85800
  sp     = c0000004f8a234a0  sprg3 = 0000000000010008
  toc    = c0000000016eae00  dar   = 000000000000023c
  srr0   = c0000000000c355c  srr1  = 8000000000001033 dsisr  = 40000000
  dscr   = 0000000000000000  ppr   = 0010000000000000 pir    = 00000011
  amr    = 0000000000000000  uamor = 0000000000000000
  dpdes  = 0000000000000000  tir   = 0000000000000000 cir    = 00000000
  fscr   = 0500000000000180  tar   = 0000000000000000 pspb   = 00000000
  mmcr0  = 0000000080000000  mmcr1 = 0000000000000000 mmcr2  = 0000000000000000
  pmc1   = 00000000 pmc2 = 00000000  pmc3 = 00000000  pmc4   = 00000000
  mmcra  = 0000000000000000   siar = 0000000000000000 pmc5   = 0000026c
  sdar   = 0000000000000000   sier = 0000000000000000 pmc6   = 00000861
  ebbhr  = 0000000000000000  ebbrr = 0000000000000000 bescr  = 0000000000000000
  iamr   = 4000000000000000
  pidr   = 0000000000000034  tidr  = 0000000000000000
  cpu 0x8: Vector: 700 (Program Check) at [c0000004f8a23220]
-     pc: c0000000000e4854: xmon_core+0x1f24/0x3520
-     lr: c0000000000e4850: xmon_core+0x1f20/0x3520
-     sp: c0000004f8a234a0
-    msr: 8000000000041033
-   current = 0xc0000004f89faf00
-   paca    = 0xc000000007a85800   softe: 0        irq_happened: 0x01
-     pid   = 24028, comm = top
+     pc: c0000000000e4854: xmon_core+0x1f24/0x3520
+     lr: c0000000000e4850: xmon_core+0x1f20/0x3520
+     sp: c0000004f8a234a0
+    msr: 8000000000041033
+   current = 0xc0000004f89faf00
+   paca    = 0xc000000007a85800   softe: 0        irq_happened: 0x01
+     pid   = 24028, comm = top
  Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0 
(Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu 
4.15.0-20.21-generic 4.15.17)
  cpu 0x8: Exception 700 (Program Check) in xmon, returning to main loop
  [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
  [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
  [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
  [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
  [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
  --- Exception: c01 (System Call) at 000071f1779fe280
  SP (7fffe99ece50) is in userspace
  8:mon> r
  R00 = c00000000043b7fc   R16 = 0000000000000000
  R01 = c0000004f8a23c90   R17 = ffffffffffffff70
  R02 = c0000000016eae00   R18 = 00000a51b4bebfc8
  R03 = c000000279557200   R19 = 00007fffe99edbb0
  R04 = c0000003242499c0   R20 = 00000a51b4c04db0
  R05 = 0000000000020000   R21 = 00000a51b4c20e90
  R06 = 0000000000000004   R22 = 0000000000040f00
  R07 = ffffff8100000000   R23 = 00000a51b4c06560
  R08 = ffffff8000000000   R24 = ffffffffffffff80
  R09 = 0000000000000000   R25 = 00000a51b4bec2b8
  R10 = 0000000000000000   R26 = 000071f177bb0b20
  R11 = 0000000000000000   R27 = 0000000000000000
  R12 = 0000000000002000   R28 = c000000279557200
  R13 = c000000007a85800   R29 = c0000004c7734210
  R14 = 0000000000000000   R30 = 0000000000000000
  R15 = 0000000000000000   R31 = c0000003242499c0
  pc  = c00000000043b808 __fsnotify_parent+0x88/0x1a0
  cfar= c0000000003f9e78 dget_parent+0xe8/0x150
  lr  = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0
  msr = 8000000000009033   cr  = 28002222
  ctr = c0000000006252b0   xer = 0000000000000000   trap =  300
  dar = 000000000000023c   dsisr = 40000000
  8:mon> e
  cpu 0x8: Vector: 300 (Data Access) at [c0000004f8a23a10]
-     pc: c00000000043b808: __fsnotify_parent+0x88/0x1a0
-     lr: c00000000043b7fc: __fsnotify_parent+0x7c/0x1a0
-     sp: c0000004f8a23c90
-    msr: 8000000000009033
-    dar: 23c
-  dsisr: 40000000
-   current = 0xc0000004f89faf00
-   paca    = 0xc000000007a85800   softe: 0        irq_happened: 0x01
-     pid   = 24028, comm = top
+     pc: c00000000043b808: __fsnotify_parent+0x88/0x1a0
+     lr: c00000000043b7fc: __fsnotify_parent+0x7c/0x1a0
+     sp: c0000004f8a23c90
+    msr: 8000000000009033
+    dar: 23c
+  dsisr: 40000000
+   current = 0xc0000004f89faf00
+   paca    = 0xc000000007a85800   softe: 0        irq_happened: 0x01
+     pid   = 24028, comm = top
  Linux version 4.15.0-20-generic (buildd@bos02-ppc64el-002) (gcc version 7.3.0 
(Ubuntu 7.3.0-16ubuntu3)) #21-Ubuntu SMP Tue Apr 24 06:14:44 UTC 2018 (Ubuntu 
4.15.0-20.21-generic 4.15.17)
  
  6. Guest enters xmon after migrating from boslcp3 to boslcp4.
  
- 
- > 
+ >
  > 8:mon> t
  > [c0000004f8a23d20] c0000000005a7674 security_file_permission+0xf4/0x160
  > [c0000004f8a23d60] c0000000003d1d30 rw_verify_area+0x70/0x120
  > [c0000004f8a23d90] c0000000003d375c vfs_read+0x8c/0x1b0
  > [c0000004f8a23de0] c0000000003d3d88 SyS_read+0x68/0x110
  > [c0000004f8a23e30] c00000000000b184 system_call+0x58/0x6c
  > --- Exception: c01 (System Call) at 000071f1779fe280
  > SP (7fffe99ece50) is in userspace
  
  > 8:mon> r
  > R00 = c00000000043b7fc   R16 = 0000000000000000
  > R01 = c0000004f8a23c90   R17 = ffffffffffffff70
  > R02 = c0000000016eae00   R18 = 00000a51b4bebfc8
  > R03 = c000000279557200   R19 = 00007fffe99edbb0
  > R04 = c0000003242499c0   R20 = 00000a51b4c04db0
  > R05 = 0000000000020000   R21 = 00000a51b4c20e90
  > R06 = 0000000000000004   R22 = 0000000000040f00
  > R07 = ffffff8100000000   R23 = 00000a51b4c06560
  > R08 = ffffff8000000000   R24 = ffffffffffffff80
  > R09 = 0000000000000000   R25 = 00000a51b4bec2b8
  > R10 = 0000000000000000   R26 = 000071f177bb0b20
  > R11 = 0000000000000000   R27 = 0000000000000000
  > R12 = 0000000000002000   R28 = c000000279557200
  > R13 = c000000007a85800   R29 = c0000004c7734210
  > R14 = 0000000000000000   R30 = 0000000000000000
  > R15 = 0000000000000000   R31 = c0000003242499c0
  > pc  = c00000000043b808 __fsnotify_parent+0x88/0x1a0
  > cfar= c0000000003f9e78 dget_parent+0xe8/0x150
  > lr  = c00000000043b7fc __fsnotify_parent+0x7c/0x1a0
  > msr = 8000000000009033   cr  = 28002222
  > ctr = c0000000006252b0   xer = 0000000000000000   trap =  300
  > dar = 000000000000023c   dsisr = 40000000
  
- 
  > BUG_ON in jbd2_journal_write_metadata_buffer
  
  I've included xmon crash data from a more recent crash, this time a
  BUG_ON in fs/jbd2/journal.c:jbd2_journal_write_metadata_buffer():
  
  int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
-                                   struct journal_head  *jh_in,
-                                   struct buffer_head **bh_out,
-                                   sector_t blocknr)
+                                   struct journal_head  *jh_in,
+                                   struct buffer_head **bh_out,
+                                   sector_t blocknr)
  {
-         int need_copy_out = 0;
-         int done_copy_out = 0;
-         int do_escape = 0;
-         char *mapped_data;
-         struct buffer_head *new_bh;
-         struct page *new_page;
-         unsigned int new_offset;
-         struct buffer_head *bh_in = jh2bh(jh_in);
-         journal_t *journal = transaction->t_journal;
- 
-         /*
-          * The buffer really shouldn't be locked: only the current committing
-          * transaction is allowed to write it, so nobody else is allowed
-          * to do any IO.
-          *
-          * akpm: except if we're journalling data, and write() output is
-          * also part of a shared mapping, and another thread has
-          * decided to launch a writepage() against this buffer.
-          */
-         J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
- 
- 
- This is not the same as the original bug, but I suspect they are part of a 
class of issues we're hitting while running under very particular circumstances 
which might not generally be seen during normal operation and triggering 
various corner cases. As such I think it makes sense to group them under this 
bug for the time being.
+         int need_copy_out = 0;
+         int done_copy_out = 0;
+         int do_escape = 0;
+         char *mapped_data;
+         struct buffer_head *new_bh;
+         struct page *new_page;
+         unsigned int new_offset;
+         struct buffer_head *bh_in = jh2bh(jh_in);
+         journal_t *journal = transaction->t_journal;
+ 
+         /*
+          * The buffer really shouldn't be locked: only the current committing
+          * transaction is allowed to write it, so nobody else is allowed
+          * to do any IO.
+          *
+          * akpm: except if we're journalling data, and write() output is
+          * also part of a shared mapping, and another thread has
+          * decided to launch a writepage() against this buffer.
+          */
+         J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in));
+ 
+ This is not the same as the original bug, but I suspect they are part of
+ a class of issues we're hitting while running under very particular
+ circumstances which might not generally be seen during normal operation
+ and triggering various corner cases. As such I think it makes sense to
+ group them under this bug for the time being.
  
  The general workload is running IO-heavy disk workloads on large guests
  (20GB memory, 16 vcpus) with SAN-based storage, and then performing
  migration during the workload. During migration we begin to see a high
  occurrence of rcu_sched stall warnings, and after 1-3 hours of
  operations we hit filesystem-related crashes like the ones posted. We've
  seen this with 2 separate FC cards, emulex and qlogic, where we invoke
  QEMU through libvirt as:
  
  LC_ALL=C
  PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
  QEMU_AUDIO_DRV=none /usr/bin/qemu-system-ppc64 -name guest=boslcp3g1
  ,debug-threads=on -S -object
  secret,id=masterKey0,format=raw,file=/var/lib/libvirt/qemu/domain-7-boslcp3g1
  /master-key.aes -machine pseries-2.10,accel=kvm,usb=off,dump-guest-
  core=off,max-cpu-compat=power9 -cpu host -m 20480 -realtime mlock=off
  -smp 16,maxcpus=32,sockets=4,cores=8,threads=1 -object memory-backend-
  file,id=ram-node0,prealloc=yes,mem-
  path=/dev/hugepages/libvirt/qemu/7-boslcp3g1,size=10737418240 -numa
  node,nodeid=0,cpus=0-7,memdev=ram-node0 -object memory-backend-ram,id
  =ram-node1,size=10737418240 -numa node,nodeid=1,cpus=8-15,memdev=ram-
  node1 -uuid bd110ed9-dcfc-4470-b4ae-6166a56819f0 -display none -no-user-
  config -nodefaults -chardev
  
socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-7-boslcp3g1/monitor.sock,server,nowait
  -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc -no-
  shutdown -boot menu=on,strict=on -device spapr-pci-host-
  bridge,index=1,id=pci.1 -device nec-usb-xhci,id=usb,bus=pci.0,addr=0x3
  -device virtio-scsi-pci,id=scsi0,bus=pci.0,addr=0x2 -drive file=/home
  /bionic-server-ppc64el.iso,format=raw,if=none,id=drive-
  scsi0-0-0-2,readonly=on,cache=none -device scsi-cd,bus=scsi0.0,channel=0
  ,scsi-id=0,lun=2,drive=drive-scsi0-0-0-2,id=scsi0-0-0-2 -drive
  file=/dev/disk/by-id/dm-uuid-part1-mpath-
  3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-
  disk0,cache=none -device virtio-blk-
  pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-
  disk0,bootindex=1 -drive file=/dev/disk/by-id/dm-uuid-part2-mpath-
  3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-
  disk1,cache=none -device virtio-blk-
  pci,scsi=off,bus=pci.0,addr=0x6,drive=drive-virtio-disk1,id=virtio-disk1
  -drive file=/dev/disk/by-id/dm-uuid-part3-mpath-
  3600507680183050d28000000000002a4,format=raw,if=none,id=drive-virtio-
  disk2,cache=none -device virtio-blk-
  pci,scsi=off,bus=pci.0,addr=0x7,drive=drive-virtio-disk2,id=virtio-disk2
  -netdev tap,fd=27,id=hostnet0,vhost=on,vhostfd=30 -device virtio-net-
  
pci,netdev=hostnet0,id=net0,mac=52:54:00:72:d2:69,bus=pci.0,addr=0x1,bootindex=2
  -chardev pty,id=charserial0 -device spapr-
  vty,chardev=charserial0,id=serial0,reg=0x30000000 -device virtio-
  balloon-pci,id=balloon0,bus=pci.0,addr=0x4 -msg timestamp=on
  
  I will attach the libvirt XML separately.
  
  IBM is requesting general filesystem expertise from Canonical, if
  available, as we continue debugging...

-- 
You received this bug notification because you are a member of Ubuntu
Bugs, which is subscribed to Ubuntu.
https://bugs.launchpad.net/bugs/1768115

Title:
  ISST-LTE:KVM:Ubuntu1804:BostonLC:boslcp3g1: Migration guest running
  with IO stress crashed@security_file_permission+0xf4/0x160.

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1768115/+subscriptions

-- 
ubuntu-bugs mailing list
ubuntu-bugs@lists.ubuntu.com
https://lists.ubuntu.com/mailman/listinfo/ubuntu-bugs

Reply via email to