[PATCH v4 3/3] PCI/DPC: Disable DPC interrupt during suspend
PCIe service that shares IRQ with PME may cause spurious wakeup on system suspend. Since AER is conditionally disabled in previous patch, also apply the same logic to disable DPC which depends on AER to work. PCIe Base Spec 5.0, section 5.2 "Link State Power Management" states that TLP and DLLP transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold with aux power) and L3 (D3cold), so we don't lose much here to disable DPC during system suspend. This is very similar to previous attempts to suspend AER and DPC [1], but with a different reason. [1] https://lore.kernel.org/linux-pci/20220408153159.106741-1-kai.heng.f...@canonical.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216295 Reviewed-by: Mika Westerberg Signed-off-by: Kai-Heng Feng --- drivers/pci/pcie/dpc.c | 26 ++ 1 file changed, 26 insertions(+) diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c index a5d7c69b764e..98bdefde6df1 100644 --- a/drivers/pci/pcie/dpc.c +++ b/drivers/pci/pcie/dpc.c @@ -385,6 +385,30 @@ static int dpc_probe(struct pcie_device *dev) return status; } +static int dpc_suspend(struct pcie_device *dev) +{ + struct pci_dev *pdev = dev->port; + u16 ctl; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ); + ctl &= ~PCI_EXP_DPC_CTL_INT_EN; + pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); + + return 0; +} + +static int dpc_resume(struct pcie_device *dev) +{ + struct pci_dev *pdev = dev->port; + u16 ctl; + + pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ); + ctl |= PCI_EXP_DPC_CTL_INT_EN; + pci_write_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_CTL, ctl); + + return 0; +} + static void dpc_remove(struct pcie_device *dev) { struct pci_dev *pdev = dev->port; @@ -400,6 +424,8 @@ static struct pcie_port_service_driver dpcdriver = { .port_type = PCIE_ANY_PORT, .service= PCIE_PORT_SERVICE_DPC, .probe = dpc_probe, + .suspend= dpc_suspend, + .resume = dpc_resume, .remove = dpc_remove, }; -- 2.34.1
[PATCH v4 1/3] PCI/AER: Factor out interrupt toggling into helpers
There are many places that enable and disable AER interrupt, so move them into helpers. Reviewed-by: Mika Westerberg Reviewed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kai-Heng Feng --- drivers/pci/pcie/aer.c | 45 +- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index f6c24ded134c..1420e1f27105 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1227,6 +1227,28 @@ static irqreturn_t aer_irq(int irq, void *context) return IRQ_WAKE_THREAD; } +static void aer_enable_irq(struct pci_dev *pdev) +{ + int aer = pdev->aer_cap; + u32 reg32; + + /* Enable Root Port's interrupt in response to error messages */ + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, ); + reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); +} + +static void aer_disable_irq(struct pci_dev *pdev) +{ + int aer = pdev->aer_cap; + u32 reg32; + + /* Disable Root's interrupt in response to error messages */ + pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, ); + reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; + pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); +} + /** * aer_enable_rootport - enable Root Port's interrupts when receiving messages * @rpc: pointer to a Root Port data structure @@ -1256,10 +1278,7 @@ static void aer_enable_rootport(struct aer_rpc *rpc) pci_read_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, ); pci_write_config_dword(pdev, aer + PCI_ERR_UNCOR_STATUS, reg32); - /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, ); - reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); + aer_enable_irq(pdev); } /** @@ -1274,10 +1293,7 @@ static void aer_disable_rootport(struct aer_rpc *rpc) int aer = pdev->aer_cap; u32 reg32; - /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(pdev, 
aer + PCI_ERR_ROOT_COMMAND, ); - reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(pdev, aer + PCI_ERR_ROOT_COMMAND, reg32); + aer_disable_irq(pdev); /* Clear Root's error status reg */ pci_read_config_dword(pdev, aer + PCI_ERR_ROOT_STATUS, ); @@ -1372,12 +1388,8 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) */ aer = root ? root->aer_cap : 0; - if ((host->native_aer || pcie_ports_native) && aer) { - /* Disable Root's interrupt in response to error messages */ - pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, ); - reg32 &= ~ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); - } + if ((host->native_aer || pcie_ports_native) && aer) + aer_disable_irq(root); if (type == PCI_EXP_TYPE_RC_EC || type == PCI_EXP_TYPE_RC_END) { rc = pcie_reset_flr(dev, PCI_RESET_DO_RESET); @@ -1396,10 +1408,7 @@ static pci_ers_result_t aer_root_reset(struct pci_dev *dev) pci_read_config_dword(root, aer + PCI_ERR_ROOT_STATUS, ); pci_write_config_dword(root, aer + PCI_ERR_ROOT_STATUS, reg32); - /* Enable Root Port's interrupt in response to error messages */ - pci_read_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, ); - reg32 |= ROOT_PORT_INTR_ON_MESG_MASK; - pci_write_config_dword(root, aer + PCI_ERR_ROOT_COMMAND, reg32); + aer_enable_irq(root); } return rc ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED; -- 2.34.1
[PATCH v4 2/3] PCI/AER: Disable AER interrupt on suspend
PCIe service that shares IRQ with PME may cause spurious wakeup on system suspend. PCIe Base Spec 5.0, section 5.2 "Link State Power Management" states that TLP and DLLP transmission is disabled for a Link in L2/L3 Ready (D3hot), L2 (D3cold with aux power) and L3 (D3cold), so we don't lose much here to disable AER during system suspend. This is very similar to previous attempts to suspend AER and DPC [1], but with a different reason. [1] https://lore.kernel.org/linux-pci/20220408153159.106741-1-kai.heng.f...@canonical.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216295 Reviewed-by: Mika Westerberg Signed-off-by: Kai-Heng Feng --- drivers/pci/pcie/aer.c | 22 ++ 1 file changed, 22 insertions(+) diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c index 1420e1f27105..9c07fdbeb52d 100644 --- a/drivers/pci/pcie/aer.c +++ b/drivers/pci/pcie/aer.c @@ -1356,6 +1356,26 @@ static int aer_probe(struct pcie_device *dev) return 0; } +static int aer_suspend(struct pcie_device *dev) +{ + struct aer_rpc *rpc = get_service_data(dev); + struct pci_dev *pdev = rpc->rpd; + + aer_disable_irq(pdev); + + return 0; +} + +static int aer_resume(struct pcie_device *dev) +{ + struct aer_rpc *rpc = get_service_data(dev); + struct pci_dev *pdev = rpc->rpd; + + aer_enable_irq(pdev); + + return 0; +} + /** * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP * @dev: pointer to Root Port, RCEC, or RCiEP @@ -1420,6 +1440,8 @@ static struct pcie_port_service_driver aerdriver = { .service= PCIE_PORT_SERVICE_AER, .probe = aer_probe, + .suspend= aer_suspend, + .resume = aer_resume, .remove = aer_remove, }; -- 2.34.1
Re: BUG : PowerPC RCU: torture test failed with __stack_chk_fail
Thank Boqun for your wonderful analysis! On Mon, Apr 24, 2023 at 8:33 AM Boqun Feng wrote: > > On Sat, Apr 22, 2023 at 09:28:39PM +0200, Joel Fernandes wrote: > > On Sat, Apr 22, 2023 at 2:47 PM Zhouyi Zhou wrote: > > > > > > Dear PowerPC and RCU developers: > > > During the RCU torture test on mainline (on the VM of Opensource Lab > > > of Oregon State University), SRCU-P failed with __stack_chk_fail: > > > [ 264.381952][ T99] [c6c7bab0] [c10c67c0] > > > dump_stack_lvl+0x94/0xd8 (unreliable) > > > [ 264.383786][ T99] [c6c7bae0] [c014fc94] > > > panic+0x19c/0x468 > > > [ 264.385128][ T99] [c6c7bb80] [c10fca24] > > > __stack_chk_fail+0x24/0x30 > > > [ 264.386610][ T99] [c6c7bbe0] [c02293b4] > > > srcu_gp_start_if_needed+0x5c4/0x5d0 > > > [ 264.388188][ T99] [c6c7bc70] [c022f7f4] > > > srcu_torture_call+0x34/0x50 > > > [ 264.389611][ T99] [c6c7bc90] [c022b5e8] > > > rcu_torture_fwd_prog+0x8c8/0xa60 > > > [ 264.391439][ T99] [c6c7be00] [c018e37c] > > > kthread+0x15c/0x170 > > > [ 264.392792][ T99] [c6c7be50] [c000df94] > > > ret_from_kernel_thread+0x5c/0x64 > > > The kernel config file can be found in [1]. > > > And I write a bash script to accelerate the bug reproducing [2]. > > > After a week's debugging, I found the cause of the bug is because the > > > register r10 used to judge for stack overflow is not constant between > > > context switches. > > > The assembly code for srcu_gp_start_if_needed is located at [3]: > > > c0226eb4: 78 6b aa 7d mr r10,r13 > > > c0226eb8: 14 42 29 7d add r9,r9,r8 > > > c0226ebc: ac 04 00 7c hwsync > > > c0226ec0: 10 00 7b 3b addir27,r27,16 > > > c0226ec4: 14 da 29 7d add r9,r9,r27 > > > c0226ec8: a8 48 00 7d ldarx r8,0,r9 > > > c0226ecc: 01 00 08 31 addic r8,r8,1 > > > c0226ed0: ad 49 00 7d stdcx. 
r8,0,r9 > > > c0226ed4: f4 ff c2 40 bne-c0226ec8 > > > > > > c0226ed8: 28 00 21 e9 ld r9,40(r1) > > > c0226edc: 78 0c 4a e9 ld r10,3192(r10) > > > c0226ee0: 79 52 29 7d xor.r9,r9,r10 > > > c0226ee4: 00 00 40 39 li r10,0 > > > c0226ee8: b8 03 82 40 bne c02272a0 > > > > > > by debugging, I see the r10 is assigned with r13 on c0226eb4, > > > but if there is a context-switch before c0226edc, a false > > > positive will be reported. > > > > > > [1] http://154.220.3.115/logs/0422/configformainline.txt > > > [2] 154.220.3.115/logs/0422/whilebash.sh > > > [3] http://154.220.3.115/logs/0422/srcu_gp_start_if_needed.txt > > > > > > My analysis and debugging may not be correct, but the bug is easily > > > reproducible. > > > > If this is a bug in the stack smashing protection as you seem to hint, > > I wonder if you see the issue with a specific gcc version and is a > > compiler-specific issue. It's hard to say, but considering this I > > Very likely, more asm code from Zhouyi's link: > > This is the __srcu_read_unlock_nmisafe(), since "hwsync" is > smp_mb__{after,before}_atomic(), and the following code is first > barrier then atomic, so it's the unlock. > > c0226eb4: 78 6b aa 7d mr r10,r13 > > ^ r13 is the pointer to percpu data on PPC64 kernel, and it's also > the pointer to TLS data for userspace code. > > c0226eb8: 14 42 29 7d add r9,r9,r8 > c0226ebc: ac 04 00 7c hwsync > c0226ec0: 10 00 7b 3b addir27,r27,16 > c0226ec4: 14 da 29 7d add r9,r9,r27 > c0226ec8: a8 48 00 7d ldarx r8,0,r9 > c0226ecc: 01 00 08 31 addic r8,r8,1 > c0226ed0: ad 49 00 7d stdcx. r8,0,r9 > c0226ed4: f4 ff c2 40 bne-c0226ec8 > > c0226ed8: 28 00 21 e9 ld r9,40(r1) > c0226edc: 78 0c 4a e9 ld r10,3192(r10) > > here I think that the compiler is using r10 as an alias to r13, since > for userspace program, it's safe to assume the TLS pointer doesn't > change. However this is not true for kernel percpu pointer. 
I learned a lot from your analysis, this is a fruitful learning journey for me ;-) > > The real intention here is to compare 40(r1) vs 3192(r13) for stack > guard checking, however since r13 is the percpu pointer in kernel, so > the value of r13 can be changed if the thread gets scheduled to a > different CPU after reading r13 for r10. > > __srcu_read_unlock_nmisafe() triggers this issue, because: > > * it contains a read from r13 > * it locates at the very end of srcu_gp_start_if_needed(). > > This gives the compiler more opportunity to "optimize" a read from r13 >
[PATCH v2 05/13] ASoC: fsl: use asoc_dummy_dlc
Now we can share asoc_dummy_dlc. This patch use it. Signed-off-by: Kuninori Morimoto --- sound/soc/fsl/imx-audmix.c | 14 +- sound/soc/fsl/imx-card.c | 11 +-- sound/soc/fsl/imx-rpmsg.c | 3 +-- sound/soc/fsl/imx-spdif.c | 8 +++- 4 files changed, 10 insertions(+), 26 deletions(-) diff --git a/sound/soc/fsl/imx-audmix.c b/sound/soc/fsl/imx-audmix.c index b2c5aca92c6b..efbcd4a65ca8 100644 --- a/sound/soc/fsl/imx-audmix.c +++ b/sound/soc/fsl/imx-audmix.c @@ -207,8 +207,8 @@ static int imx_audmix_probe(struct platform_device *pdev) for (i = 0; i < num_dai; i++) { struct snd_soc_dai_link_component *dlc; - /* for CPU/Codec x 2 */ - dlc = devm_kcalloc(>dev, 4, sizeof(*dlc), GFP_KERNEL); + /* for CPU x 2 */ + dlc = devm_kcalloc(>dev, 2, sizeof(*dlc), GFP_KERNEL); if (!dlc) return -ENOMEM; @@ -244,7 +244,7 @@ static int imx_audmix_probe(struct platform_device *pdev) */ priv->dai[i].cpus = priv->dai[i].platforms = [0]; - priv->dai[i].codecs = [1]; + priv->dai[i].codecs = _dummy_dlc; priv->dai[i].num_cpus = 1; priv->dai[i].num_codecs = 1; @@ -252,8 +252,6 @@ static int imx_audmix_probe(struct platform_device *pdev) priv->dai[i].name = dai_name; priv->dai[i].stream_name = "HiFi-AUDMIX-FE"; - priv->dai[i].codecs->dai_name = "snd-soc-dummy-dai"; - priv->dai[i].codecs->name = "snd-soc-dummy"; priv->dai[i].cpus->of_node = args.np; priv->dai[i].cpus->dai_name = dev_name(_pdev->dev); priv->dai[i].dynamic = 1; @@ -270,15 +268,13 @@ static int imx_audmix_probe(struct platform_device *pdev) be_cp = devm_kasprintf(>dev, GFP_KERNEL, "AUDMIX-Capture-%d", i); - priv->dai[num_dai + i].cpus = [2]; - priv->dai[num_dai + i].codecs = [3]; + priv->dai[num_dai + i].cpus = [1]; + priv->dai[num_dai + i].codecs = _dummy_dlc; priv->dai[num_dai + i].num_cpus = 1; priv->dai[num_dai + i].num_codecs = 1; priv->dai[num_dai + i].name = be_name; - priv->dai[num_dai + i].codecs->dai_name = "snd-soc-dummy-dai"; - priv->dai[num_dai + i].codecs->name = "snd-soc-dummy"; priv->dai[num_dai + i].cpus->of_node = 
audmix_np; priv->dai[num_dai + i].cpus->dai_name = be_name; priv->dai[num_dai + i].no_pcm = 1; diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c index 64a4d7e9db60..78e2e3932ba5 100644 --- a/sound/soc/fsl/imx-card.c +++ b/sound/soc/fsl/imx-card.c @@ -615,17 +615,8 @@ static int imx_card_parse_of(struct imx_card_data *data) plat_data->type = CODEC_AK5552; } else { - dlc = devm_kzalloc(dev, sizeof(*dlc), GFP_KERNEL); - if (!dlc) { - ret = -ENOMEM; - goto err; - } - - link->codecs = dlc; + link->codecs = _dummy_dlc; link->num_codecs = 1; - - link->codecs->dai_name = "snd-soc-dummy-dai"; - link->codecs->name = "snd-soc-dummy"; } if (!strncmp(link->name, "HiFi-ASRC-FE", 12)) { diff --git a/sound/soc/fsl/imx-rpmsg.c b/sound/soc/fsl/imx-rpmsg.c index 89178106fe2c..93fc976e98dc 100644 --- a/sound/soc/fsl/imx-rpmsg.c +++ b/sound/soc/fsl/imx-rpmsg.c @@ -92,8 +92,7 @@ static int imx_rpmsg_probe(struct platform_device *pdev) /* Optional codec node */ ret = of_parse_phandle_with_fixed_args(np, "audio-codec", 0, 0, ); if (ret) { - data->dai.codecs->dai_name = "snd-soc-dummy-dai"; - data->dai.codecs->name = "snd-soc-dummy"; + *data->dai.codecs = asoc_dummy_dlc; } else { struct clk *clk; diff --git a/sound/soc/fsl/imx-spdif.c b/sound/soc/fsl/imx-spdif.c index ab978431ac98..44463f92e522 100644 --- a/sound/soc/fsl/imx-spdif.c +++ b/sound/soc/fsl/imx-spdif.c @@ -26,7 +26,7 @@ static int imx_spdif_audio_probe(struct platform_device *pdev) } data = devm_kzalloc(>dev, sizeof(*data), GFP_KERNEL); - comp = devm_kzalloc(>dev, 2 * sizeof(*comp), GFP_KERNEL); + comp = devm_kzalloc(>dev, sizeof(*comp), GFP_KERNEL); if (!data || !comp) { ret = -ENOMEM; goto end; @@ -37,8 +37,8 @@ static int imx_spdif_audio_probe(struct platform_device *pdev) * platform is using soc-generic-dmaengine-pcm */ data->dai.cpus = -
[PATCH v2 00/13] ASoC: add and use asoc_dummy_dlc
Hi Mark, This is the v2 patch-set for asoc_dummy_dlc. Many ASoC drivers use a dummy DAI. I have 2 concerns about it. The 1st one is that there is no guarantee that local strings ("snd-soc-dummy-dai", "snd-soc-dummy") are kept until the card is bound if they were added in a subfunction. The 2nd one is that we can use a common snd_soc_dai_link_component for it. This patch-set adds a common asoc_dummy_dlc and uses it. v1 -> v2 - Separate intel patch into 3 - Topology codec doesn't use asoc_dummy_dlc Link: https://lore.kernel.org/r/874jpe3uqh.wl-kuninori.morimoto...@renesas.com Kuninori Morimoto (13): ASoC: soc-utils.c: add asoc_dummy_dlc ASoC: ti: use asoc_dummy_dlc ASoC: sof: use asoc_dummy_dlc ASoC: amd: use asoc_dummy_dlc ASoC: fsl: use asoc_dummy_dlc ASoC: qcom: use asoc_dummy_dlc ASoC: atmel: use asoc_dummy_dlc ASoC: meson: use asoc_dummy_dlc ASoC: intel: avs: use asoc_dummy_dlc ASoC: intel: sof: use asoc_dummy_dlc ASoC: intel: skylake: use asoc_dummy_dlc ASoC: simple_card_utils.c: use asoc_dummy_dlc ASoC: soc-topology.c: add comment for Platform/Codec include/sound/simple_card_utils.h| 1 - include/sound/soc.h | 1 + sound/soc/amd/acp/acp-mach-common.c | 43 sound/soc/atmel/atmel-classd.c | 8 ++-- sound/soc/atmel/atmel-pdmic.c| 8 ++-- sound/soc/fsl/imx-audmix.c | 14 +++ sound/soc/fsl/imx-card.c | 11 + sound/soc/fsl/imx-rpmsg.c| 3 +- sound/soc/fsl/imx-spdif.c| 8 ++-- sound/soc/generic/simple-card-utils.c| 9 +--- sound/soc/intel/avs/boards/i2s_test.c| 6 +-- sound/soc/intel/boards/ehl_rt5660.c | 8 +--- sound/soc/intel/boards/skl_hda_dsp_generic.c | 8 +--- sound/soc/intel/boards/sof_cs42l42.c | 11 + sound/soc/intel/boards/sof_es8336.c | 11 + sound/soc/intel/boards/sof_nau8825.c | 11 + sound/soc/intel/boards/sof_pcm512x.c | 3 +- sound/soc/intel/boards/sof_rt5682.c | 14 ++- sound/soc/intel/boards/sof_sdw.c | 13 +- sound/soc/intel/boards/sof_ssp_amp.c | 18 +++- sound/soc/meson/axg-card.c | 8 ++-- sound/soc/meson/meson-card-utils.c | 10 + sound/soc/qcom/common.c | 11 + sound/soc/soc-topology.c | 22 
+- sound/soc/soc-utils.c| 7 sound/soc/sof/nocodec.c | 8 ++-- sound/soc/ti/omap-hdmi.c | 8 ++-- 27 files changed, 89 insertions(+), 194 deletions(-) -- 2.25.1
Re: BUG : PowerPC RCU: torture test failed with __stack_chk_fail
On Sat, Apr 22, 2023 at 09:28:39PM +0200, Joel Fernandes wrote: > On Sat, Apr 22, 2023 at 2:47 PM Zhouyi Zhou wrote: > > > > Dear PowerPC and RCU developers: > > During the RCU torture test on mainline (on the VM of Opensource Lab > > of Oregon State University), SRCU-P failed with __stack_chk_fail: > > [ 264.381952][ T99] [c6c7bab0] [c10c67c0] > > dump_stack_lvl+0x94/0xd8 (unreliable) > > [ 264.383786][ T99] [c6c7bae0] [c014fc94] > > panic+0x19c/0x468 > > [ 264.385128][ T99] [c6c7bb80] [c10fca24] > > __stack_chk_fail+0x24/0x30 > > [ 264.386610][ T99] [c6c7bbe0] [c02293b4] > > srcu_gp_start_if_needed+0x5c4/0x5d0 > > [ 264.388188][ T99] [c6c7bc70] [c022f7f4] > > srcu_torture_call+0x34/0x50 > > [ 264.389611][ T99] [c6c7bc90] [c022b5e8] > > rcu_torture_fwd_prog+0x8c8/0xa60 > > [ 264.391439][ T99] [c6c7be00] [c018e37c] > > kthread+0x15c/0x170 > > [ 264.392792][ T99] [c6c7be50] [c000df94] > > ret_from_kernel_thread+0x5c/0x64 > > The kernel config file can be found in [1]. > > And I write a bash script to accelerate the bug reproducing [2]. > > After a week's debugging, I found the cause of the bug is because the > > register r10 used to judge for stack overflow is not constant between > > context switches. > > The assembly code for srcu_gp_start_if_needed is located at [3]: > > c0226eb4: 78 6b aa 7d mr r10,r13 > > c0226eb8: 14 42 29 7d add r9,r9,r8 > > c0226ebc: ac 04 00 7c hwsync > > c0226ec0: 10 00 7b 3b addir27,r27,16 > > c0226ec4: 14 da 29 7d add r9,r9,r27 > > c0226ec8: a8 48 00 7d ldarx r8,0,r9 > > c0226ecc: 01 00 08 31 addic r8,r8,1 > > c0226ed0: ad 49 00 7d stdcx. 
r8,0,r9 > > c0226ed4: f4 ff c2 40 bne-c0226ec8 > > > > c0226ed8: 28 00 21 e9 ld r9,40(r1) > > c0226edc: 78 0c 4a e9 ld r10,3192(r10) > > c0226ee0: 79 52 29 7d xor.r9,r9,r10 > > c0226ee4: 00 00 40 39 li r10,0 > > c0226ee8: b8 03 82 40 bne c02272a0 > > > > by debugging, I see the r10 is assigned with r13 on c0226eb4, > > but if there is a context-switch before c0226edc, a false > > positive will be reported. > > > > [1] http://154.220.3.115/logs/0422/configformainline.txt > > [2] 154.220.3.115/logs/0422/whilebash.sh > > [3] http://154.220.3.115/logs/0422/srcu_gp_start_if_needed.txt > > > > My analysis and debugging may not be correct, but the bug is easily > > reproducible. > > If this is a bug in the stack smashing protection as you seem to hint, > I wonder if you see the issue with a specific gcc version and is a > compiler-specific issue. It's hard to say, but considering this I Very likely, more asm code from Zhouyi's link: This is the __srcu_read_unlock_nmisafe(), since "hwsync" is smp_mb__{after,before}_atomic(), and the following code is first barrier then atomic, so it's the unlock. c0226eb4: 78 6b aa 7d mr r10,r13 ^ r13 is the pointer to percpu data on PPC64 kernel, and it's also the pointer to TLS data for userspace code. c0226eb8: 14 42 29 7d add r9,r9,r8 c0226ebc: ac 04 00 7c hwsync c0226ec0: 10 00 7b 3b addir27,r27,16 c0226ec4: 14 da 29 7d add r9,r9,r27 c0226ec8: a8 48 00 7d ldarx r8,0,r9 c0226ecc: 01 00 08 31 addic r8,r8,1 c0226ed0: ad 49 00 7d stdcx. r8,0,r9 c0226ed4: f4 ff c2 40 bne-c0226ec8 c0226ed8: 28 00 21 e9 ld r9,40(r1) c0226edc: 78 0c 4a e9 ld r10,3192(r10) here I think that the compiler is using r10 as an alias to r13, since for userspace program, it's safe to assume the TLS pointer doesn't change. However this is not true for kernel percpu pointer. 
The real intention here is to compare 40(r1) vs 3192(r13) for stack guard checking, however since r13 is the percpu pointer in kernel, so the value of r13 can be changed if the thread gets scheduled to a different CPU after reading r13 for r10. __srcu_read_unlock_nmisafe() triggers this issue, because: * it contains a read from r13 * it locates at the very end of srcu_gp_start_if_needed(). This gives the compiler more opportunity to "optimize" a read from r13 away. c0226ee0: 79 52 29 7d xor.r9,r9,r10 c0226ee4: 00 00 40 39 li r10,0 c0226ee8: b8 03 82 40 bne c02272a0 As a result, here triggers __stack_chk_fail if mis-match. If I'm correct, the following should be a workaround: diff --git
[PATCH v10 5/5] powerpc/kexec: add crash memory hotplug support
Extend the PowerPC arch crash hotplug handler to support memory hotplug events. Since the elfcorehdr is used to exchange the memory info between the kernels, it needs to be recreated to reflect the changes due to memory hotplug events. Because of the way memory hotplug events are handled on PowerPC and the notifier call chain used in generic code to trigger the arch crash handler, the process to recreate the elfcorehdr is different for the memory add and remove cases. For the memory remove case, the memory change notifier call chain is triggered first and then the memblock regions are updated. Whereas for the memory hot add case, memblock regions are updated before invoking the memory change notifier call chain. On PowerPC, the memblock regions list is used to prepare the elfcorehdr. In case of memory hot remove, the memblock regions are updated after the arch crash hotplug handler is triggered, hence an additional step is taken to ensure that the memory ranges used to prepare the elfcorehdr do not include hot removed memory. When memory is hot removed, it is possible that the memory regions count may increase. So to accommodate a growing number of memory regions, the elfcorehdr kexec segment is built with additional buffer space. The changes done here will also work for the kexec_load system call, given that the kexec tool builds the elfcorehdr with additional space to accommodate future memory regions, as is done for the kexec_file_load system call in the kernel. 
Signed-off-by: Sourabh Jain Reviewed-by: Laurent Dufour --- arch/powerpc/include/asm/kexec_ranges.h | 1 + arch/powerpc/kexec/core_64.c| 77 +- arch/powerpc/kexec/file_load_64.c | 36 ++- arch/powerpc/kexec/ranges.c | 85 + 4 files changed, 195 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kexec_ranges.h b/arch/powerpc/include/asm/kexec_ranges.h index f83866a19e870..802abf580cf0f 100644 --- a/arch/powerpc/include/asm/kexec_ranges.h +++ b/arch/powerpc/include/asm/kexec_ranges.h @@ -7,6 +7,7 @@ void sort_memory_ranges(struct crash_mem *mrngs, bool merge); struct crash_mem *realloc_mem_ranges(struct crash_mem **mem_ranges); int add_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); +int remove_mem_range(struct crash_mem **mem_ranges, u64 base, u64 size); int add_tce_mem_ranges(struct crash_mem **mem_ranges); int add_initrd_mem_range(struct crash_mem **mem_ranges); #ifdef CONFIG_PPC_64S_HASH_MMU diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 147ea6288a526..01a764b1c9b07 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -547,6 +548,76 @@ int update_cpus_node(void *fdt) #undef pr_fmt #define pr_fmt(fmt) "crash hp: " fmt +/** + * update_crash_elfcorehdr() - Recreate the elfcorehdr and replace it with old + *elfcorehdr in the kexec segment array. 
+ * @image: the active struct kimage + * @arg: struct memory_notify data handler + */ +static void update_crash_elfcorehdr(struct kimage *image, struct memory_notify *mn) +{ + int ret; + struct crash_mem *cmem = NULL; + struct kexec_segment *ksegment; + void *ptr, *mem, *elfbuf = NULL; + unsigned long elfsz, memsz, base_addr, size; + + ksegment = >segment[image->elfcorehdr_index]; + mem = (void *) ksegment->mem; + memsz = ksegment->memsz; + + ret = get_crash_memory_ranges(); + if (ret) { + pr_err("Failed to get crash mem range\n"); + return; + } + + /* +* The hot unplugged memory is not yet removed from crash memory +* ranges, remove it here. +*/ + if (image->hp_action == KEXEC_CRASH_HP_REMOVE_MEMORY) { + base_addr = PFN_PHYS(mn->start_pfn); + size = mn->nr_pages * PAGE_SIZE; + ret = remove_mem_range(, base_addr, size); + if (ret) { + pr_err("Failed to remove hot-unplugged from crash memory ranges.\n"); + return; + } + } + + ret = crash_prepare_elf64_headers(cmem, false, , ); + if (ret) { + pr_err("Failed to prepare elf header\n"); + return; + } + + /* +* It is unlikely that kernel hit this because elfcorehdr kexec +* segment (memsz) is built with addition space to accommodate growing +* number of crash memory ranges while loading the kdump kernel. It is +* Just to avoid any unforeseen case. +*/ + if (elfsz > memsz) { + pr_err("Updated crash elfcorehdr elfsz %lu > memsz %lu", elfsz, memsz); + goto out; + } + + ptr = __va(mem); + if (ptr) { + /* Temporarily invalidate the crash image while it is replaced */ + xchg(_crash_image, NULL); + +
[PATCH v10 4/5] crash: forward memory_notify args to arch crash hotplug handler
On PowerPC, memblock regions are used to prepare the elfcorehdr, which describes the memory regions of the running kernel to the kdump kernel. Since the notifier used for the memory hotplug crash handler gets initiated before the update of the memblock regions happens (as depicted below), the newly prepared elfcorehdr still holds the old memory regions. If the elfcorehdr is prepared with stale memblock regions, then the newly prepared elfcorehdr will still be holding stale memory regions. And dump collection with a stale elfcorehdr will lead to dump collection failure or incomplete dump collection. The sequence of actions done on PowerPC when an LMB memory is hot removed: Initiate memory hot remove | v offline pages | v initiate memory notify call chain for MEM_OFFLINE event <---> Prepare new elfcorehdr and replace it with old one | v update memblock regions Such challenges only exist for the memory remove case. For the memory add case, the memory regions are updated first and then memory notify calls the arch crash hotplug handler to update the elfcorehdr. This patch passes additional information about the hot removed LMB to the arch crash hotplug handler in the form of a memory_notify object. How will passing memory_notify to the arch crash hotplug handler help? memory_notify holds the start PFN and page count of the hot removed memory. With that, the base address and the size of the hot removed memory can be calculated, and the same can be used to avoid adding the hot removed memory region to the elfcorehdr. 
Signed-off-by: Sourabh Jain Reviewed-by: Laurent Dufour --- arch/powerpc/include/asm/kexec.h | 2 +- arch/powerpc/kexec/core_64.c | 3 ++- arch/x86/include/asm/kexec.h | 2 +- arch/x86/kernel/crash.c | 3 ++- include/linux/kexec.h| 2 +- kernel/crash_core.c | 14 +++--- 6 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index f01ba767af56e..7e811bad5ec92 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -104,7 +104,7 @@ struct crash_mem; int update_cpus_node(void *fdt); int get_crash_memory_ranges(struct crash_mem **mem_ranges); #if defined(CONFIG_CRASH_HOTPLUG) -void arch_crash_handle_hotplug_event(struct kimage *image); +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event #endif #endif diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 611b89bcea2be..147ea6288a526 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -551,10 +551,11 @@ int update_cpus_node(void *fdt) * arch_crash_hotplug_handler() - Handle crash CPU/Memory hotplug events to update the *necessary kexec segments based on the hotplug event. * @image: the active struct kimage + * @arg: struct memory_notify handler for memory add/remove case and NULL for CPU case. * * Update FDT segment to include newly added CPU. No action for CPU remove case. 
*/ -void arch_crash_handle_hotplug_event(struct kimage *image) +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { void *fdt, *ptr; unsigned long mem; diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 1bc852ce347d4..70c3b23b468b6 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -213,7 +213,7 @@ extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; extern void kdump_nmi_shootdown_cpus(void); #ifdef CONFIG_CRASH_HOTPLUG -void arch_crash_handle_hotplug_event(struct kimage *image); +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg); #define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index ead602636f3e0..b45d13193b579 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -445,11 +445,12 @@ int crash_load_segments(struct kimage *image) /** * arch_crash_handle_hotplug_event() - Handle hotplug elfcorehdr changes * @image: the active struct kimage + * @arg: struct memory_notify handler for memory add/remove case and NULL for CPU case. * * The new elfcorehdr is prepared in a kernel buffer, and then it is * written on top of the existing/old elfcorehdr. */ -void arch_crash_handle_hotplug_event(struct kimage *image) +void arch_crash_handle_hotplug_event(struct kimage *image, void *arg) { void *elfbuf = NULL, *old_elfcorehdr; unsigned long nr_mem_ranges; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 0ac41f48de0b1..69765e6a92d0d 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -506,7 +506,7 @@ static inline void
[PATCH v10 3/5] powerpc/crash: add crash CPU hotplug support
Introduce a powerpc crash hotplug handler to update the necessary kexec segments in the kernel on CPU/Memory hotplug events. Currently, these updates are done by monitoring CPU/Memory hotplug events in userspace. A common crash hotplug handler is triggered from generic infrastructure for both CPU/Memory hotplug events. But in this patch, crash updates are handled only for CPU hotplug events. Support for the crash update on memory hotplug events is added in upcoming patches. The elfcorehdr segment is used to exchange the CPU and other dump-related information between the kernels. Ideally, the elfcorehdr segment needs to be recreated on CPU hotplug events to reflect the changes. But on powerpc, the elfcorehdr is built with possible CPUs, hence there is no need to update/recreate the elfcorehdr on CPU hotplug events. In addition to the elfcorehdr, there is another kexec segment that holds CPU data on powerpc: the FDT (Flattened Device Tree). During the kdump kernel boot, it is expected that the crashing CPU must be present in the FDT, else the kdump kernel boot fails. Now the only action needed on powerpc to handle the crash CPU hotplug event is to add hot added CPUs to the kdump FDT segment to avoid kdump kernel boot failure. So for the CPU hot add event, the FDT segment is updated with the hot added CPU, and since there is no need to remove the hot unplugged CPUs from the FDT segment, no action is taken for the CPU hot remove event. To accommodate a growing number of CPUs, the FDT is built with additional buffer space to ensure that it can hold possible CPU nodes. The changes done here will also work for the kexec_load system call, given that the kexec tool builds the FDT segment with additional space to accommodate possible CPU nodes. Since memory crash hotplug support is not there yet, the crash hotplug handler simply warns the user and returns. 
Signed-off-by: Sourabh Jain Reviewed-by: Laurent Dufour --- arch/powerpc/include/asm/kexec.h | 4 ++ arch/powerpc/kexec/core_64.c | 61 +++ arch/powerpc/kexec/elf_64.c | 12 +- arch/powerpc/kexec/file_load_64.c | 14 +++ 4 files changed, 90 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 8090ad7d97d9d..f01ba767af56e 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -103,6 +103,10 @@ void kexec_copy_flush(struct kimage *image); struct crash_mem; int update_cpus_node(void *fdt); int get_crash_memory_ranges(struct crash_mem **mem_ranges); +#if defined(CONFIG_CRASH_HOTPLUG) +void arch_crash_handle_hotplug_event(struct kimage *image); +#define arch_crash_handle_hotplug_event arch_crash_handle_hotplug_event +#endif #endif #if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_PPC_RTAS) diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 0b292f93a74cc..611b89bcea2be 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -543,6 +543,67 @@ int update_cpus_node(void *fdt) return ret; } +#if defined(CONFIG_CRASH_HOTPLUG) +#undef pr_fmt +#define pr_fmt(fmt) "crash hp: " fmt + +/** + * arch_crash_hotplug_handler() - Handle crash CPU/Memory hotplug events to update the + *necessary kexec segments based on the hotplug event. + * @image: the active struct kimage + * + * Update FDT segment to include newly added CPU. No action for CPU remove case. + */ +void arch_crash_handle_hotplug_event(struct kimage *image) +{ + void *fdt, *ptr; + unsigned long mem; + int i, fdt_index = -1; + unsigned int hp_action = image->hp_action; + + /* +* Since the hot-unplugged CPU is already part of crash FDT, +* no action is needed for CPU remove case. 
+*/ + if (hp_action == KEXEC_CRASH_HP_REMOVE_CPU) + return; + + /* crash update on memory hotplug events is not supported yet */ + if (hp_action == KEXEC_CRASH_HP_REMOVE_MEMORY || hp_action == KEXEC_CRASH_HP_ADD_MEMORY) { + pr_info_once("Crash update is not supported for memory hotplug\n"); + return; + } + + /* Find the FDT segment index in kexec segment array. */ + for (i = 0; i < image->nr_segments; i++) { + mem = image->segment[i].mem; + ptr = __va(mem); + + if (ptr && fdt_magic(ptr) == FDT_MAGIC) { + fdt_index = i; + break; + } + } + + if (fdt_index < 0) { + pr_err("Unable to locate FDT segment.\n"); + return; + } + + fdt = __va((void *)image->segment[fdt_index].mem); + + /* Temporarily invalidate the crash image while it is replaced */ + xchg(_crash_image, NULL); + + /* update FDT to refelect changes in CPU resrouces */ + if (update_cpus_node(fdt)) + pr_err("Failed to update crash FDT"); +
[PATCH v10 0/5] PowerPC: In-kernel handling of CPU/Memory hotplug/online/offline events for kdump kernel
The Problem: Post CPU/Memory hot plug/unplug and online/offline events, the kernel holds stale information about the system. Dump collection with stale kdump kernel might end up in dump capture failure or an inaccurate dump collection. Existing solution: == The existing solution keeps the kdump kernel up-to-date by monitoring CPU/Memory hotplug/online/offline events via a udev rule and triggering a full kdump kernel reload for every hotplug event. Shortcomings: - Leaves a window where kernel crash might not lead to a successful dump collection. - Reloading all kexec components for each hotplug is inefficient. - udev rules are prone to races if hotplug events are frequent. More about the issues with the existing solution is posted here: - https://lkml.org/lkml/2020/12/14/532 - https://lists.ozlabs.org/pipermail/linuxppc-dev/2022-February/240254.html Proposed Solution: == Instead of reloading all kexec segments on CPU/Memory hotplug/online/offline event, this patch series focuses on updating only the relevant kexec segment. Once the kexec segments are loaded in the kernel reserved area then an arch-specific hotplug handler will update the relevant kexec segment based on hotplug event type. Series Dependencies This patch series implements the crash hotplug handler on PowerPC. The generic crash hotplug handler is introduced by https://lkml.org/lkml/2023/4/4/1136 patch series. Git tree for testing: = The below git tree has this patch series applied on top of dependent patch series. https://github.com/sourabhjains/linux/tree/e21-s10 To realise the feature, the kdump udev rule must be updated to avoid reloading the kdump kernel on CPU/Memory hotplug/online/offline events. 
RHEL: /usr/lib/udev/rules.d/98-kexec.rules -SUBSYSTEM=="cpu", ACTION=="online", GOTO="kdump_reload_cpu" -SUBSYSTEM=="memory", ACTION=="online", GOTO="kdump_reload_mem" -SUBSYSTEM=="memory", ACTION=="offline", GOTO="kdump_reload_mem" +SUBSYSTEM=="cpu", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end" +SUBSYSTEM=="memory", ATTRS{crash_hotplug}=="1", GOTO="kdump_reload_end" Note: only kexec_file_load syscall will work. For kexec_load minor changes are required in kexec tool. --- Changelog: v10: - Drop the patch that adds fdt_index attribute to struct kimage_arch Find the fdt segment index when needed. - Added more details into commits messages. - Rebased onto 6.3.0-rc5 v9: - Removed patch to prepare elfcorehdr crash notes for possible CPUs. The patch is moved to generic patch series that introduces generic infrastructure for in kernel crash update. - Removed patch to pass the hotplug action type to the arch crash hotplug handler function. The generic patch series has introduced the hotplug action type in kimage struct. - Add detail commit message for better understanding. v8: - Restrict fdt_index initialization to machine_kexec_post_load it work for both kexec_load and kexec_file_load.[3/8] Laurent Dufour - Updated the logic to find the number of offline core. [6/8] - Changed the logic to find the elfcore program header to accommodate future memory ranges due memory hotplug events. [8/8] v7 - added a new config to configure this feature - pass hotplug action type to arch specific handler v6 - Added crash memory hotplug support v5: - Replace COFNIG_CRASH_HOTPLUG with CONFIG_HOTPLUG_CPU. - Move fdt segment identification for kexec_load case to load path instead of crash hotplug handler - Keep new attribute defined under kimage_arch to track FDT segment under CONFIG_HOTPLUG_CPU config. v4: - Update the logic to find the additional space needed for hotadd CPUs post kexec load. 
Refer "[RFC v4 PATCH 4/5] powerpc/crash hp: add crash hotplug support for kexec_file_load" patch to know more about the change. - Fix a couple of typos. - Replace pr_err to pr_info_once to warn user about memory hotplug support. - In crash hotplug handler exit the for loop if FDT segment is found. v3 - Move fdt_index and fdt_index_vaild variables to kimage_arch struct. - Rebase patches on top of https://lkml.org/lkml/2022/3/3/674 [v5] - Fixed warning reported by checkpatch script v2: - Use generic hotplug handler introduced by https://lkml.org/lkml/2022/2/9/1406, a significant change from v1. Sourabh Jain (5): powerpc/kexec: turn some static helper functions public powerpc/crash: introduce a new config option CRASH_HOTPLUG powerpc/crash: add crash CPU hotplug support crash: forward memory_notify args to arch crash hotplug handler powerpc/kexec: add crash memory hotplug support arch/powerpc/Kconfig| 12 + arch/powerpc/include/asm/kexec.h| 10 + arch/powerpc/include/asm/kexec_ranges.h | 1 + arch/powerpc/kexec/core_64.c| 301
[PATCH v10 2/5] powerpc/crash: introduce a new config option CRASH_HOTPLUG
Due to CPU/Memory hot plug/unplug or online/offline events the system resources change. A similar change should reflect in the loaded kdump kernel kexec segments that describe the state of the CPU and memory of the running kernel. If the kdump kernel kexec segments are not updated after the CPU/Memory hot plug/unplug or online/offline events and kdump kernel tries to collect the dump with the stale system resource data then this might lead to dump collection failure or an inaccurate dump collection. The current method to keep the kdump kernel kexec segments up to date is by reloading the complete kdump kernel whenever a CPU/Memory hot plug/unplug or online/offline event is observed in userspace. Reloading the kdump kernel for every CPU/Memory hot plug/unplug or online/offline event is inefficient and creates a large window where the kdump service is not available. It can be improved by doing in-kernel updates to only necessary kdump kernel kexec segments which describe CPU and Memory resources of the running kernel to the kdump kernel. The kernel changes related to in-kernel updates to the kdump kernel kexec segments are kept under the CRASH_HOTPLUG config option. Later in the series, a powerpc crash hotplug handler is introduced to update the kdump kernel kexec segments on CPU/Memory hotplug events. This arch-specific handler is triggered from a generic crash handler that registers with the CPU/Memory add/remove notifiers. The CRASH_HOTPLUG config option is enabled by default. Signed-off-by: Sourabh Jain Reviewed-by: Laurent Dufour --- arch/powerpc/Kconfig | 12 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a6c4407d3ec83..ac0dc0ffe89b4 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -681,6 +681,18 @@ config CRASH_DUMP The same kernel binary can be used as production kernel and dump capture kernel. 
+config CRASH_HOTPLUG + bool "In-kernel update to kdump kernel on system configuration changes" + default y + depends on CRASH_DUMP && (HOTPLUG_CPU || MEMORY_HOTPLUG) + help + Quick and efficient mechanism to update the kdump kernel in the + event of CPU/Memory hot plug/unplug or online/offline events. This + approach does the in-kernel update to only necessary kexec segment + instead of unload-reload entire kdump kernel from userspace. + + If unsure, say Y. + config FA_DUMP bool "Firmware-assisted dump" depends on PPC64 && (PPC_RTAS || PPC_POWERNV) -- 2.39.2
[PATCH v10 1/5] powerpc/kexec: turn some static helper functions public
Move update_cpus_node and get_crash_memory_ranges functions from kexec/file_load_64.c to kexec/core_64.c to make these functions usable by other kexec components. Later in the series, these functions are utilized to do in-kernel update to kexec segments on CPU/Memory hot plug/unplug or online/offline events for both kexec_load and kexec_file_load syscalls. No functional change intended. Signed-off-by: Sourabh Jain Reviewed-by: Laurent Dufour --- arch/powerpc/include/asm/kexec.h | 6 ++ arch/powerpc/kexec/core_64.c | 166 ++ arch/powerpc/kexec/file_load_64.c | 162 - 3 files changed, 172 insertions(+), 162 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index a1ddba01e7d13..8090ad7d97d9d 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -99,6 +99,12 @@ void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_co void kexec_copy_flush(struct kimage *image); +#ifdef CONFIG_PPC64 +struct crash_mem; +int update_cpus_node(void *fdt); +int get_crash_memory_ranges(struct crash_mem **mem_ranges); +#endif + #if defined(CONFIG_CRASH_DUMP) && defined(CONFIG_PPC_RTAS) void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); #define crash_free_reserved_phys_range crash_free_reserved_phys_range diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index a79e28c91e2be..0b292f93a74cc 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include @@ -30,6 +32,8 @@ #include #include #include +#include +#include int machine_kexec_prepare(struct kimage *image) { @@ -377,6 +381,168 @@ void default_machine_kexec(struct kimage *image) /* NOTREACHED */ } +/** + * get_crash_memory_ranges - Get crash memory ranges. This list includes + * first/crashing kernel's memory regions that + * would be exported via an elfcore. 
+ * @mem_ranges: Range list to add the memory ranges to. + * + * Returns 0 on success, negative errno on error. + */ +int get_crash_memory_ranges(struct crash_mem **mem_ranges) +{ + phys_addr_t base, end; + struct crash_mem *tmem; + u64 i; + int ret; + + for_each_mem_range(i, , ) { + u64 size = end - base; + + /* Skip backup memory region, which needs a separate entry */ + if (base == BACKUP_SRC_START) { + if (size > BACKUP_SRC_SIZE) { + base = BACKUP_SRC_END + 1; + size -= BACKUP_SRC_SIZE; + } else + continue; + } + + ret = add_mem_range(mem_ranges, base, size); + if (ret) + goto out; + + /* Try merging adjacent ranges before reallocation attempt */ + if ((*mem_ranges)->nr_ranges == (*mem_ranges)->max_nr_ranges) + sort_memory_ranges(*mem_ranges, true); + } + + /* Reallocate memory ranges if there is no space to split ranges */ + tmem = *mem_ranges; + if (tmem && (tmem->nr_ranges == tmem->max_nr_ranges)) { + tmem = realloc_mem_ranges(mem_ranges); + if (!tmem) + goto out; + } + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(tmem, crashk_res.start, crashk_res.end); + if (ret) + goto out; + + /* +* FIXME: For now, stay in parity with kexec-tools but if RTAS/OPAL +*regions are exported to save their context at the time of +*crash, they should actually be backed up just like the +*first 64K bytes of memory. +*/ + ret = add_rtas_mem_range(mem_ranges); + if (ret) + goto out; + + ret = add_opal_mem_range(mem_ranges); + if (ret) + goto out; + + /* create a separate program header for the backup region */ + ret = add_mem_range(mem_ranges, BACKUP_SRC_START, BACKUP_SRC_SIZE); + if (ret) + goto out; + + sort_memory_ranges(*mem_ranges, false); +out: + if (ret) + pr_err("Failed to setup crash memory ranges\n"); + return ret; +} + +/** + * add_node_props - Reads node properties from device node structure and add + * them to fdt. 
+ * @fdt:Flattened device tree of the kernel + * @node_offset:offset of the node to add a property at + * @dn: device node pointer + * + * Returns 0 on success, negative errno on error. + */ +static int add_node_props(void *fdt, int node_offset, const struct device_node *dn) +{ + int ret = 0; + struct property