*** This bug is a duplicate of bug 1709171 ***
    https://bugs.launchpad.net/bugs/1709171

** This bug has been marked a duplicate of bug 1709171
   Disable CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE

-- 
You received this bug notification because you are a member of Kernel
Packages, which is subscribed to linux in Ubuntu.
https://bugs.launchpad.net/bugs/1710922

Title:
  Ubuntu 17.10 ppc64el guest with MEMORY_HOTPLUG_DEFAULT_ONLINE=y gets a
  "kernel BUG at mm/memory_hotplug.c:2185" when hotplugging LMBs with
  QEMU upstream

Status in The Ubuntu-power-systems project:
  Triaged
Status in linux package in Ubuntu:
  Triaged

Bug description:
  == Comment: #0 - Daniel Henrique Barboza <danie...@br.ibm.com> - 2017-08-08 
09:13:31 ==
  - Host information: Ubuntu 16.10 running upstream QEMU

  $ uname -a
  Linux louis 4.10.0-20-generic #22-Ubuntu SMP Thu Apr 20 09:22:16 UTC 2017 
ppc64le ppc64le ppc64le GNU/Linux

  $ cat /proc/cpuinfo
  processor     : 0
  cpu           : POWER8E (raw), altivec supported
  clock         : 2061.000000MHz
  revision      : 2.1 (pvr 004b 0201)
  (...)
  timebase      : 512000000
  platform      : PowerNV
  model         : 8247-42L
  machine               : PowerNV 8247-42L
  firmware      : OPAL

  
  - qemu command line that launched the Ubuntu 17.10 ppc64el guest:

  sudo ./qemu-system-ppc64 -name migrate_qemu -boot strict=on --enable-
  kvm -device nec-usb-xhci,id=usb,bus=pci.0,addr=0xf -device spapr-
  vscsi,id=scsi0,reg=0x2000 -smp 1,maxcpus=4,sockets=4,cores=1,threads=1
  --machine pseries,accel=kvm,usb=off,dump-guest-core=off -m
  4G,slots=32,maxmem=32G -drive
  file=/home/danielhb/vm_imgs/ub1710.qcow2,format=qcow2,if=none,id
  =drive-virtio-disk0,cache=none -device virtio-blk-
  pci,scsi=off,bus=pci.0,addr=0x2,drive=drive-virtio-disk0,id=virtio-
  disk0,bootindex=1 -nographic

  
  - guest information: Ubuntu 17.10 ppc64el:

  root@ubuntu1710:~# uname -a
  Linux ubuntu1710 4.11.0-10-generic #15-Ubuntu SMP Thu Jun 29 15:02:54 UTC 
2017 ppc64le ppc64le ppc64le GNU/Linux
  root@ubuntu1710:~# 

  
  - Problem: hotplugging a LMB generates a guest kernel Oops:

  root@ubuntu1710:~# QEMU 2.9.90 monitor - type 'help' for more information
  (qemu) 
  (qemu) object_add memory-backend-ram,id=ram1,size=1G
  (qemu) device_add pc-dimm,id=dimm1,memdev=ram1
  (qemu) [  126.850952] kernel BUG at 
/build/linux-S1V_3d/linux-4.11.0/mm/memory_hotplug.c:2185!
  [  126.851285] Oops: Exception in kernel mode, sig: 5 [#1]
  [  126.851428] SMP NR_CPUS=2048 
  [  126.851428] NUMA 
  [  126.851546] pSeries
  [  126.851714] Modules linked in: vmx_crypto ib_iser rdma_cm iw_cm ib_cm 
ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ip_tables 
x_tables autofs4 btrfs raid10 raid456 async_raid6_recov async_memcpy async_pq 
async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear ibmvscsi 
crc32c_vpmsum virtio_blk
  [  126.852447] CPU: 0 PID: 5 Comm: kworker/u8:0 Not tainted 4.11.0-10-generic 
#15-Ubuntu
  [  126.852656] Workqueue: pseries hotplug workque pseries_hp_work_fn
  [  126.852828] task: c0000000fea80000 task.stack: c0000000fe118000
  [  126.853000] NIP: c000000000350268 LR: c0000000003501e0 CTR: 
0000000000000000
  [  126.853190] REGS: c0000000fe11b780 TRAP: 0700   Not tainted  
(4.11.0-10-generic)
  [  126.853390] MSR: 800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>
  [  126.853396]   CR: 42002422  XER: 20000000
  [  126.853672] CFAR: c0000000003501e4 SOFTE: 1 
  [  126.853672] GPR00: c0000000003501e0 c0000000fe11ba00 c00000000149eb00 
0000000000000001 
  [  126.853672] GPR04: c0000000f9901480 c0000000ffe21c00 000000000000003e 
0000000000000003 
  [  126.853672] GPR08: 0000000000000002 0000000000000003 0000000000000003 
303078302d303030 
  [  126.853672] GPR12: 0000000000002200 c00000000fb80000 c000000000110008 
c0000000fe1810c0 
  [  126.853672] GPR16: c0000000fe050ea8 0000000000000010 c0000000fffffc30 
c0000000fffffea0 
  [  126.853672] GPR20: c0000000f951a1a4 0000000000000004 0000000000000001 
0000000000000010 
  [  126.853672] GPR24: 0000000000000001 c0000000f951a1a0 0000000000000004 
0000000000000000 
  [  126.853672] GPR28: 0000000000000000 0000000000000001 0000000010000000 
0000000140000000 
  [  126.855221] NIP [c000000000350268] remove_memory+0xf8/0x100
  [  126.855338] LR [c0000000003501e0] remove_memory+0x70/0x100
  [  126.855453] Call Trace:
  [  126.855520] [c0000000fe11ba00] [c0000000003501e0] remove_memory+0x70/0x100 
(unreliable)
  [  126.855684] [c0000000fe11ba40] [c0000000000b0880] dlpar_add_lmb+0x370/0x3f0
  [  126.855822] [c0000000fe11bb20] [c0000000000b174c] dlpar_memory+0x7cc/0xd20
  [  126.855959] [c0000000fe11bbf0] [c0000000000a9af8] 
handle_dlpar_errorlog+0xa8/0x170
  [  126.856118] [c0000000fe11bc60] [c0000000000a9c54] 
pseries_hp_work_fn+0x94/0xa0
  [  126.856275] [c0000000fe11bc90] [c0000000001071d0] 
process_one_work+0x2b0/0x5a0
  [  126.856430] [c0000000fe11bd20] [c000000000107568] worker_thread+0xa8/0x670
  [  126.856563] [c0000000fe11bdc0] [c000000000110164] kthread+0x164/0x1b0
  [  126.856695] [c0000000fe11be30] [c00000000000b4e8] 
ret_from_kernel_thread+0x5c/0x74
  [  126.856846] Instruction dump:
  [  126.856931] 60000000 387f0060 48824b19 60000000 38210040 e8010010 eb81ffe0 
eba1ffe8 
  [  126.857088] ebc1fff0 ebe1fff8 7c0803a6 4e800020 <0fe00000> 00000000 
3c4c0115 3842e890 
  [  126.857243] ---[ end trace 76fab848b8f01d0a ]---
  [  126.859577] 

  
  Investigating the cause I've found this kernel commit:

  commit 943db62c316c578f8e2cc6fb81a5f641096b29bf
  Author: Nathan Fontenot <nf...@linux.vnet.ibm.com>
  Date:   Wed Feb 15 13:45:30 2017 -0500

      powerpc/pseries: Revert 'Auto-online hotplugged memory'
      
      This reverts commit ec999072442a ("powerpc/pseries: Auto-online
      hotplugged memory"), and 9dc512819e4b ("powerpc: Fix unused function
      warning 'lmb_to_memblock'").
      
      Using the auto-online capability does online added memory but does not
      update the associated device struct to indicate that the memory is
      online. This causes the pseries memory DLPAR code to fail when trying to
      remove a LMB that was previously removed and added back. This happens
      when validating that the LMB is removable.
      
      This patch reverts to the previous behavior of calling device_online()
      to online the LMB when it is DLPAR added and moves the lmb_to_memblock()
      routine out of CONFIG_MEMORY_HOTREMOVE now that we call it for add.

  
  This commit removed a specific kernel configuration in the revert:

  --- a/arch/powerpc/configs/pseries_defconfig
  +++ b/arch/powerpc/configs/pseries_defconfig
  @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y
   CONFIG_IRQ_ALL_CPUS=y
   CONFIG_MEMORY_HOTPLUG=y
   CONFIG_MEMORY_HOTREMOVE=y
  -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
   CONFIG_KSM=y
   CONFIG_TRANSPARENT_HUGEPAGE=y

  
  Using the vanilla kernel from Linus I've got the following default config for 
pseries:

  [danielhb@arthas linux]$ ARCH=powerpc make pseries_defconfig
  #
  # configuration written to .config
  #
  [danielhb@arthas linux]$ grep -R 'HOTPLUG_DEFAULT' .
  ./mm/Kconfig:config MEMORY_HOTPLUG_DEFAULT_ONLINE
  ./mm/memory_hotplug.c:#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
  ./.config:# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set
  ./Documentation/admin-guide/kernel-parameters.txt:                    
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
  ./Documentation/memory-hotplug.txt:The default depends on the 
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
  [danielhb@arthas linux]$ 

  
  As we can see from the grep result, the .config was generated without the 
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE option and there is no other place in the 
code where it is set to Y.

  In the mm/Kconfig we have:

  (...)
  config MEMORY_HOTPLUG_DEFAULT_ONLINE
          bool "Online the newly added memory blocks by default"
          default n
          depends on MEMORY_HOTPLUG
          help
  (...)  

  This shows that the default value for this option is N, which makes
  sense with the change made in the patch - the absence of the option in
  .config disables the auto_online_blocks feature.

  However, the guest  Ubuntu 17.10 ppc64el kernel is setting this option
  to Y:

  root@ubuntu1710:~# cat /boot/config-4.11.0-10-generic | grep HOTPLUG_DEFAULT
  CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
  root@ubuntu1710:~# 

  
  I am not sure if the intent was to enable it just for x86 or for all 
architectures but, as is, this is breaking memory hotplug in pseries after the 
mentioned kernel commit 943db62c316c578f8e2cc6fb81a5f641096b29bf. Given that the 
default behavior when the option is not set is N, my suggestion is to change 
MEMORY_HOTPLUG_DEFAULT_ONLINE to 'not set' in any ppc64el config file in the 
Ubuntu build, following the defconfig we have in the vanilla kernel.

  
  - Workarounds:

  The most obvious one: if I recompile the Ubuntu 17.10 kernel without
  this option (or setting it to 'n'), LMB hotplug works.

  Another possible workaround, further documented in the kernel commits
  that introduced the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE option, is to
  manually set the auto_online blocks to 'offline'. Doing that prior to
  the hotplug avoids the kernel Oops:

  root@ubuntu1710:~# echo offline > 
/sys/devices/system/memory/auto_online_blocks
  root@ubuntu1710:~# 
  root@ubuntu1710:~# grep Mem /proc/meminfo
  MemTotal:        4169088 kB
  MemFree:         3725632 kB
  MemAvailable:    3917056 kB
  root@ubuntu1710:~# 
  root@ubuntu1710:~# QEMU 2.9.90 monitor - type 'help' for more information
  (qemu) 
  (qemu) object_add memory-backend-ram,id=ram0,size=1G
  (qemu) device_add pc-dimm,id=dimm0,memdev=ram0
  (qemu) 

  root@ubuntu1710:~# 
  root@ubuntu1710:~# grep Mem /proc/meminfo
  MemTotal:        5217664 kB
  MemFree:         4772864 kB
  MemAvailable:    4956928 kB
  root@ubuntu1710:~# 


  As we can see, it is possible to hotplug memory normally if we manually
  disable auto_online_blocks.

  
  Additional notes:

  - This same problem was also reported against Fedora 26 guests here: 
https://bugzilla.redhat.com/show_bug.cgi?id=1476380. 
  - Since I opened this Red Hat bug there have been some developments on the 
PPC kernel mailing list, but none that actually solves the problem seen here in 
the upstream 4.13+ kernel. The safest course of action is to disable this option 
until things are sorted out upstream.


  Let me know if you need any extra information about the issue or the
  tests.

  
  Thanks,

  
  Daniel

  == Comment: #5 - Nathan D. Fontenot <nfont...@us.ibm.com> - 2017-08-15 
11:23:54 ==
  This problem is resolved by a recent commit upstream (commit id 1a367063ca0c)

  commit 1a367063ca0c1c6f6f54b5abd7b4836b0866a07b
  Author: Nathan Fontenot <nf...@linux.vnet.ibm.com>
  Date:   Wed Aug 2 14:03:22 2017 -0400

      powerpc/pseries: Check memory device state before onlining/offlining
      
      When DLPAR adding or removing memory we need to check the device
      offline status before trying to online/offline the memory. This is
      needed because calls to device_online() and device_offline() will
      return non-zero for memory that is already online and offline
      respectively.
      
      This update resolves two scenarios. First, for a kernel built with
      auto-online memory enabled (CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y),
      memory will be onlined as part of calls to add_memory(). After adding
      the memory the pseries DLPAR code tries to online it and fails since
      the memory is already online. The DLPAR code then tries to remove the
      memory which produces the oops message below because the memory is not
      offline.
      
      The second scenario occurs when removing memory that is already
      offline, i.e. marking memory offline (via sysfs) and then trying to
      remove that memory. This doesn't work because offlining the already
      offline memory does not succeed and the DLPAR code then fails the
      DLPAR remove operation.
      
      The fix for both scenarios is to check the device.offline status
      before making the calls to device_online() or device_offline().
      
        kernel BUG at mm/memory_hotplug.c:1936!
        ...
        NIP [c0000000002ca428] .remove_memory+0xb8/0xc0
        LR [c0000000002ca3cc] .remove_memory+0x5c/0xc0
        Call Trace:
          .remove_memory+0x5c/0xc0 (unreliable)
          .dlpar_add_lmb+0x384/0x400
          .dlpar_memory+0x5dc/0xca0
          .handle_dlpar_errorlog+0x74/0xe0
          .pseries_hp_work_fn+0x2c/0x90
          .process_one_work+0x17c/0x460
          .worker_thread+0x88/0x500
          .kthread+0x15c/0x1a0
          .ret_from_kernel_thread+0x58/0xc0
      
      Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'")
      Signed-off-by: Nathan Fontenot <nf...@linux.vnet.ibm.com>
      [mpe: Use bool, add explicit rc=0 case, change log typos & formatting]
      Signed-off-by: Michael Ellerman <m...@ellerman.id.au>

  diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
  index ca9b2f4..9e3afd2 100644
  --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
  +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
  @@ -336,7 +336,38 @@ static struct memory_block *lmb_to_memblock(struct of_drc
          return mem_block;
   }
   
  +static int dlpar_change_lmb_state(struct of_drconf_cell *lmb, bool online)
  +{
  +       struct memory_block *mem_block;
  +       int rc;
  +
  +       mem_block = lmb_to_memblock(lmb);
  +       if (!mem_block)
  +               return -EINVAL;
  +
  +       if (online && mem_block->dev.offline)
  +               rc = device_online(&mem_block->dev);
  +       else if (!online && !mem_block->dev.offline)
  +               rc = device_offline(&mem_block->dev);
  +       else
  +               rc = 0;
  +
  +       put_device(&mem_block->dev);
  +
  +       return rc;
  +}
  +
  +static int dlpar_online_lmb(struct of_drconf_cell *lmb)
  +{
  +       return dlpar_change_lmb_state(lmb, true);
  +}
  +
   #ifdef CONFIG_MEMORY_HOTREMOVE
  +static int dlpar_offline_lmb(struct of_drconf_cell *lmb)
  +{
  +       return dlpar_change_lmb_state(lmb, false);
  +}
  +
   static int pseries_remove_memblock(unsigned long base, unsigned int memblock_
   {
          unsigned long block_sz, start_pfn;
  @@ -431,19 +462,13 @@ static int dlpar_add_lmb(struct of_drconf_cell *);
   
   static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
   {
  -       struct memory_block *mem_block;
          unsigned long block_sz;
          int nid, rc;
   
          if (!lmb_is_removable(lmb))
                  return -EINVAL;
   
  -       mem_block = lmb_to_memblock(lmb);
  -       if (!mem_block)
  -               return -EINVAL;
  -
  -       rc = device_offline(&mem_block->dev);
  -       put_device(&mem_block->dev);
  +       rc = dlpar_offline_lmb(lmb);
          if (rc)
                  return rc;
   
  @@ -737,20 +762,6 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
   }
   #endif /* CONFIG_MEMORY_HOTREMOVE */
   
  -static int dlpar_online_lmb(struct of_drconf_cell *lmb)
  -{
  -       struct memory_block *mem_block;
  -       int rc;
  -
  -       mem_block = lmb_to_memblock(lmb);
  -       if (!mem_block)
  -               return -EINVAL;
  -
  -       rc = device_online(&mem_block->dev);
  -       put_device(&mem_block->dev);
  -       return rc;
  -}
  -
   static int dlpar_add_lmb(struct of_drconf_cell *lmb)
   {
          unsigned long block_sz;

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu-power-systems/+bug/1710922/+subscriptions

-- 
Mailing list: https://launchpad.net/~kernel-packages
Post to     : kernel-packages@lists.launchpad.net
Unsubscribe : https://launchpad.net/~kernel-packages
More help   : https://help.launchpad.net/ListHelp

Reply via email to