[RFC patch 4/5] ASoC: tlv320aic31xx: Handle BCLK set as PLL input configuration

2021-11-19 Thread Ariel D'Alessandro
If BCLK is used as the PLL input, the sysclk rate is determined by the
hw_params, so it must be updated here to match the input frequency,
which is derived from the sample rate, format and number of channels.
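
For example (worked out here for illustration, not part of the original
posting): a 2-channel S16_LE stream at 48 kHz gives

    sysclk = rate * width * channels = 48000 * 16 * 2 = 1536000 Hz

which is exactly the 1536000 entry added to aic31xx_divs for the 48k
rate in patch 3/5.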

Signed-off-by: Ariel D'Alessandro 
Signed-off-by: Michael Trimarchi 
---
 sound/soc/codecs/tlv320aic31xx.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/sound/soc/codecs/tlv320aic31xx.c b/sound/soc/codecs/tlv320aic31xx.c
index e8307f0737f2..4224b4b3cae6 100644
--- a/sound/soc/codecs/tlv320aic31xx.c
+++ b/sound/soc/codecs/tlv320aic31xx.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -169,6 +170,7 @@ struct aic31xx_priv {
struct regulator_bulk_data supplies[AIC31XX_NUM_SUPPLIES];
struct aic31xx_disable_nb disable_nb[AIC31XX_NUM_SUPPLIES];
struct snd_soc_jack *jack;
+   u32 sysclk_id;
unsigned int sysclk;
u8 p_div;
int rate_div_line;
@@ -962,6 +964,7 @@ static int aic31xx_hw_params(struct snd_pcm_substream *substream,
 struct snd_soc_dai *dai)
 {
struct snd_soc_component *component = dai->component;
+   struct aic31xx_priv *aic31xx = snd_soc_component_get_drvdata(component);
u8 data = 0;
 
dev_dbg(component->dev, "## %s: width %d rate %d\n",
@@ -993,6 +996,16 @@ static int aic31xx_hw_params(struct snd_pcm_substream *substream,
AIC31XX_IFACE1_DATALEN_MASK,
data);
 
+   /*
+* If BCLK is used as PLL input, the sysclk is determined by the hw
+* params. So it must be updated here to match the input frequency.
+*/
+   if (aic31xx->sysclk_id == AIC31XX_PLL_CLKIN_BCLK) {
+   aic31xx->sysclk = params_rate(params) * params_width(params) *
+ params_channels(params);
+   aic31xx->p_div = 1;
+   }
+
return aic31xx_setup_pll(component, params);
 }
 
@@ -1177,6 +1190,7 @@ static int aic31xx_set_dai_sysclk(struct snd_soc_dai *codec_dai,
snd_soc_component_update_bits(component, AIC31XX_CLKMUX, 
AIC31XX_PLL_CLKIN_MASK,
clk_id << AIC31XX_PLL_CLKIN_SHIFT);
 
+   aic31xx->sysclk_id = clk_id;
aic31xx->sysclk = freq;
 
return 0;
-- 
2.30.2



[RFC patch 3/5] ASoC: tlv320aic31xx: Add divs for bclk as clk_in

2021-11-19 Thread Ariel D'Alessandro
Add divisors for rates needed when the clk_in is set to BCLK.
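
As a consistency check (worked out here, not part of the original
posting), take the new 8k entry {512000, 8000, 4, 48, 0, 128, 48, 2, ...}:

    PLL out = mclk_p * R * J = 512000 * 4 * 48 = 98304000 Hz
    rate    = PLL out / (NDAC * MDAC * DOSR)
            = 98304000 / (48 * 2 * 128) = 8000 Hz

so the new divisors reproduce the target rate from the BCLK-derived
input frequency.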

Signed-off-by: Michael Trimarchi 
Signed-off-by: Ariel D'Alessandro 
---
 sound/soc/codecs/tlv320aic31xx.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/sound/soc/codecs/tlv320aic31xx.c b/sound/soc/codecs/tlv320aic31xx.c
index 1aec03d834d0..e8307f0737f2 100644
--- a/sound/soc/codecs/tlv320aic31xx.c
+++ b/sound/soc/codecs/tlv320aic31xx.c
@@ -195,46 +195,66 @@ struct aic31xx_rate_divs {
 static const struct aic31xx_rate_divs aic31xx_divs[] = {
/* mclk/p     rate  pll: r  j     d dosr ndac mdac  aosr nadc madc */
/* 8k rate */
+   {  512000,   8000,  4, 48,   0, 128,  48,  2,   128,  48,  2},
{12000000,   8000,  1, 8, 1920, 128,  48,  2,   128,  48,  2},
{12000000,   8000,  1, 8, 1920, 128,  32,  3,   128,  32,  3},
{12500000,   8000,  1, 7, 8643, 128,  48,  2,   128,  48,  2},
/* 11.025k rate */
+   {  705600,  11025,  3, 48,   0, 128,  24,  3,   128,  24,  3},
{12000000,  11025,  1, 7, 5264, 128,  32,  2,   128,  32,  2},
{12000000,  11025,  1, 8, 4672, 128,  24,  3,   128,  24,  3},
{12500000,  11025,  1, 7, 2253, 128,  32,  2,   128,  32,  2},
/* 16k rate */
+   {  512000,  16000,  4, 48,   0, 128,  16,  3,   128,  16,  3},
+   { 1024000,  16000,  2, 48,   0, 128,  16,  3,   128,  16,  3},
{12000000,  16000,  1, 8, 1920, 128,  24,  2,   128,  24,  2},
{12000000,  16000,  1, 8, 1920, 128,  16,  3,   128,  16,  3},
{12500000,  16000,  1, 7, 8643, 128,  24,  2,   128,  24,  2},
/* 22.05k rate */
+   {  705600,  22050,  4, 36,   0, 128,  12,  3,   128,  12,  3},
+   { 1411200,  22050,  2, 36,   0, 128,  12,  3,   128,  12,  3},
{12000000,  22050,  1, 7, 5264, 128,  16,  2,   128,  16,  2},
{12000000,  22050,  1, 8, 4672, 128,  12,  3,   128,  12,  3},
{12500000,  22050,  1, 7, 2253, 128,  16,  2,   128,  16,  2},
/* 32k rate */
+   { 1024000,  32000,  2, 48,   0, 128,  12,  2,   128,  12,  2},
+   { 2048000,  32000,  1, 48,   0, 128,  12,  2,   128,  12,  2},
{12000000,  32000,  1, 8, 1920, 128,  12,  2,   128,  12,  2},
{12000000,  32000,  1, 8, 1920, 128,   8,  3,   128,   8,  3},
{12500000,  32000,  1, 7, 8643, 128,  12,  2,   128,  12,  2},
/* 44.1k rate */
+   { 1411200,  44100,  2, 32,   0, 128,   8,  2,   128,   8,  2},
+   { 2822400,  44100,  1, 32,   0, 128,   8,  2,   128,   8,  2},
{12000000,  44100,  1, 7, 5264, 128,   8,  2,   128,   8,  2},
{12000000,  44100,  1, 8, 4672, 128,   6,  3,   128,   6,  3},
{12500000,  44100,  1, 7, 2253, 128,   8,  2,   128,   8,  2},
/* 48k rate */
+   { 1536000,  48000,  2, 32,   0, 128,   8,  2,   128,   8,  2},
+   { 3072000,  48000,  1, 32,   0, 128,   8,  2,   128,   8,  2},
{12000000,  48000,  1, 8, 1920, 128,   8,  2,   128,   8,  2},
{12000000,  48000,  1, 7, 6800,  96,   5,  4,    96,   5,  4},
{12500000,  48000,  1, 7, 8643, 128,   8,  2,   128,   8,  2},
/* 88.2k rate */
+   { 2822400,  88200,  2, 16,   0,  64,   8,  2,    64,   8,  2},
+   { 5644800,  88200,  1, 16,   0,  64,   8,  2,    64,   8,  2},
{12000000,  88200,  1, 7, 5264,  64,   8,  2,    64,   8,  2},
{12000000,  88200,  1, 8, 4672,  64,   6,  3,    64,   6,  3},
{12500000,  88200,  1, 7, 2253,  64,   8,  2,    64,   8,  2},
/* 96k rate */
+   { 3072000,  96000,  2, 16,   0,  64,   8,  2,    64,   8,  2},
+   { 6144000,  96000,  1, 16,   0,  64,   8,  2,    64,   8,  2},
{12000000,  96000,  1, 8, 1920,  64,   8,  2,    64,   8,  2},
{12000000,  96000,  1, 7, 6800,  48,   5,  4,    48,   5,  4},
{12500000,  96000,  1, 7, 8643,  64,   8,  2,    64,   8,  2},
/* 176.4k rate */
+   { 5644800, 176400,  2, 8,    0,  32,   8,  2,    32,   8,  2},
+   {11289600, 176400,  1, 8,    0,  32,   8,  2,    32,   8,  2},
{12000000, 176400,  1, 7, 5264,  32,   8,  2,    32,   8,  2},
{12000000, 176400,  1, 8, 4672,  32,   6,  3,    32,   6,  3},
{12500000, 176400,  1, 7, 2253,  32,   8,  2,    32,   8,  2},
/* 192k rate */
+   { 6144000, 192000,  2, 8,    0,  32,   8,  2,    32,   8,  2},
+   {12288000, 192000,  1, 8,    0,  32,   8,  2,    32,   8,  2},
{12000000, 192000,  1, 8, 1920,  32,   8,  2,    32,   8,  2},
{12000000, 192000,  1, 7, 6800,  24,   5,  4,    24,   5,  4},
  

[RFC patch 5/5] ASoC: fsl-asoc-card: Support fsl, imx-audio-tlv320aic31xx codec

2021-11-19 Thread Ariel D'Alessandro
Add entry for fsl,imx-audio-tlv320aic31xx audio codec. This codec is
configured to use BCLK as clock input.

Signed-off-by: Michael Trimarchi 
Signed-off-by: Ariel D'Alessandro 
---
 sound/soc/fsl/fsl-asoc-card.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c
index 6e6494f9f399..90cbed496f98 100644
--- a/sound/soc/fsl/fsl-asoc-card.c
+++ b/sound/soc/fsl/fsl-asoc-card.c
@@ -26,6 +26,7 @@
 #include "../codecs/wm8962.h"
 #include "../codecs/wm8960.h"
 #include "../codecs/wm8994.h"
+#include "../codecs/tlv320aic31xx.h"
 
 #define CS427x_SYSCLK_MCLK 0
 
@@ -629,6 +630,16 @@ static int fsl_asoc_card_probe(struct platform_device *pdev)
} else if (of_device_is_compatible(np, "fsl,imx-audio-tlv320aic32x4")) {
codec_dai_name = "tlv320aic32x4-hifi";
priv->dai_fmt |= SND_SOC_DAIFMT_CBP_CFP;
+   } else if (of_device_is_compatible(np, "fsl,imx-audio-tlv320aic31xx")) {
+   codec_dai_name = "tlv320dac31xx-hifi";
+   priv->dai_fmt |= SND_SOC_DAIFMT_CBS_CFS;
+   priv->dai_link[1].dpcm_capture = 0;
+   priv->dai_link[2].dpcm_capture = 0;
+   priv->cpu_priv.sysclk_dir[TX] = SND_SOC_CLOCK_OUT;
+   priv->cpu_priv.sysclk_dir[RX] = SND_SOC_CLOCK_OUT;
+   priv->codec_priv.mclk_id = AIC31XX_PLL_CLKIN_BCLK;
+   priv->card.dapm_routes = audio_map_tx;
+   priv->card.num_dapm_routes = ARRAY_SIZE(audio_map_tx);
} else if (of_device_is_compatible(np, "fsl,imx-audio-wm8962")) {
codec_dai_name = "wm8962";
priv->codec_priv.mclk_id = WM8962_SYSCLK_MCLK;
@@ -888,6 +899,7 @@ static const struct of_device_id fsl_asoc_card_dt_ids[] = {
{ .compatible = "fsl,imx-audio-cs42888", },
{ .compatible = "fsl,imx-audio-cs427x", },
{ .compatible = "fsl,imx-audio-tlv320aic32x4", },
+   { .compatible = "fsl,imx-audio-tlv320aic31xx", },
{ .compatible = "fsl,imx-audio-sgtl5000", },
{ .compatible = "fsl,imx-audio-wm8962", },
{ .compatible = "fsl,imx-audio-wm8960", },
-- 
2.30.2



[RFC patch 0/5] Support BCLK input clock in tlv320aic31xx

2021-11-19 Thread Ariel D'Alessandro
The tlv320aic31xx codec allows using BCLK as the input clock for PLL,
deriving all the frequencies through a set of divisors.

In this case, the codec sysclk is determined by the hw_params sample
rate/format, so its frequency must be updated by the codec driver itself
when these change.

This patchset modifies the tlv320aic31xx driver to update its sysclk if
BCLK is used as the input clock. This allows the codec to be used with
the generic fsl-asoc-card machine driver, without having to add a
board-specific one.

Ariel D'Alessandro (5):
  ASoC: tlv320aic31xx: Fix typo in BCLK clock name
  ASoC: tlv320aic31xx: Add support for pll_r coefficient
  ASoC: tlv320aic31xx: Add divs for bclk as clk_in
  ASoC: tlv320aic31xx: Handle BCLK set as PLL input configuration
  ASoC: fsl-asoc-card: Support fsl,imx-audio-tlv320aic31xx codec

 sound/soc/codecs/tlv320aic31xx.c | 105 ---
 sound/soc/codecs/tlv320aic31xx.h |   2 +-
 sound/soc/fsl/fsl-asoc-card.c|  12 
 3 files changed, 83 insertions(+), 36 deletions(-)

-- 
2.30.2



[RFC patch 2/5] ASoC: tlv320aic31xx: Add support for pll_r coefficient

2021-11-19 Thread Ariel D'Alessandro
When the clock used by the codec is BCLK, the operating parameters need
to be calculated from the input sample rate and format. Low sample rates
require different R multipliers in order to achieve a high enough PLL
output frequency.
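
To make that concrete (numbers worked out here for illustration,
assuming the usual J limit of 63): at 8 kHz the BCLK-derived input can
be as low as 512 kHz. With R = 1 the PLL output could not exceed
512000 * 63 = 32.256 MHz, far below the roughly 90-98 MHz that the
existing 12 MHz entries produce; the R = 4, J = 48 entry added in the
next patch gives 512000 * 4 * 48 = 98.304 MHz instead.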

Signed-off-by: Michael Trimarchi 
Signed-off-by: Ariel D'Alessandro 
---
 sound/soc/codecs/tlv320aic31xx.c | 71 
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/sound/soc/codecs/tlv320aic31xx.c b/sound/soc/codecs/tlv320aic31xx.c
index 52d2c968b5c0..1aec03d834d0 100644
--- a/sound/soc/codecs/tlv320aic31xx.c
+++ b/sound/soc/codecs/tlv320aic31xx.c
@@ -180,6 +180,7 @@ struct aic31xx_priv {
 struct aic31xx_rate_divs {
u32 mclk_p;
u32 rate;
+   u8 pll_r;
u8 pll_j;
u16 pll_d;
u16 dosr;
@@ -192,51 +193,51 @@ struct aic31xx_rate_divs {
 
 /* ADC dividers can be disabled by configuring them to 0 */
 static const struct aic31xx_rate_divs aic31xx_divs[] = {
-   /* mclk/p    rate  pll: j     d    dosr ndac mdac  aosr nadc madc */
+   /* mclk/p    rate  pll: r  j     d dosr ndac mdac  aosr nadc madc */
/* 8k rate */
-   {12000000,   8000,  8, 1920, 128,  48,  2,   128,  48,  2},
-   {12000000,   8000,  8, 1920, 128,  32,  3,   128,  32,  3},
-   {12500000,   8000,  7, 8643, 128,  48,  2,   128,  48,  2},
+   {12000000,   8000,  1, 8, 1920, 128,  48,  2,   128,  48,  2},
+   {12000000,   8000,  1, 8, 1920, 128,  32,  3,   128,  32,  3},
+   {12500000,   8000,  1, 7, 8643, 128,  48,  2,   128,  48,  2},
/* 11.025k rate */
-   {12000000,  11025,  7, 5264, 128,  32,  2,   128,  32,  2},
-   {12000000,  11025,  8, 4672, 128,  24,  3,   128,  24,  3},
-   {12500000,  11025,  7, 2253, 128,  32,  2,   128,  32,  2},
+   {12000000,  11025,  1, 7, 5264, 128,  32,  2,   128,  32,  2},
+   {12000000,  11025,  1, 8, 4672, 128,  24,  3,   128,  24,  3},
+   {12500000,  11025,  1, 7, 2253, 128,  32,  2,   128,  32,  2},
/* 16k rate */
-   {12000000,  16000,  8, 1920, 128,  24,  2,   128,  24,  2},
-   {12000000,  16000,  8, 1920, 128,  16,  3,   128,  16,  3},
-   {12500000,  16000,  7, 8643, 128,  24,  2,   128,  24,  2},
+   {12000000,  16000,  1, 8, 1920, 128,  24,  2,   128,  24,  2},
+   {12000000,  16000,  1, 8, 1920, 128,  16,  3,   128,  16,  3},
+   {12500000,  16000,  1, 7, 8643, 128,  24,  2,   128,  24,  2},
/* 22.05k rate */
-   {12000000,  22050,  7, 5264, 128,  16,  2,   128,  16,  2},
-   {12000000,  22050,  8, 4672, 128,  12,  3,   128,  12,  3},
-   {12500000,  22050,  7, 2253, 128,  16,  2,   128,  16,  2},
+   {12000000,  22050,  1, 7, 5264, 128,  16,  2,   128,  16,  2},
+   {12000000,  22050,  1, 8, 4672, 128,  12,  3,   128,  12,  3},
+   {12500000,  22050,  1, 7, 2253, 128,  16,  2,   128,  16,  2},
/* 32k rate */
-   {12000000,  32000,  8, 1920, 128,  12,  2,   128,  12,  2},
-   {12000000,  32000,  8, 1920, 128,   8,  3,   128,   8,  3},
-   {12500000,  32000,  7, 8643, 128,  12,  2,   128,  12,  2},
+   {12000000,  32000,  1, 8, 1920, 128,  12,  2,   128,  12,  2},
+   {12000000,  32000,  1, 8, 1920, 128,   8,  3,   128,   8,  3},
+   {12500000,  32000,  1, 7, 8643, 128,  12,  2,   128,  12,  2},
/* 44.1k rate */
-   {12000000,  44100,  7, 5264, 128,   8,  2,   128,   8,  2},
-   {12000000,  44100,  8, 4672, 128,   6,  3,   128,   6,  3},
-   {12500000,  44100,  7, 2253, 128,   8,  2,   128,   8,  2},
+   {12000000,  44100,  1, 7, 5264, 128,   8,  2,   128,   8,  2},
+   {12000000,  44100,  1, 8, 4672, 128,   6,  3,   128,   6,  3},
+   {12500000,  44100,  1, 7, 2253, 128,   8,  2,   128,   8,  2},
/* 48k rate */
-   {12000000,  48000,  8, 1920, 128,   8,  2,   128,   8,  2},
-   {12000000,  48000,  7, 6800,  96,   5,  4,    96,   5,  4},
-   {12500000,  48000,  7, 8643, 128,   8,  2,   128,   8,  2},
+   {12000000,  48000,  1, 8, 1920, 128,   8,  2,   128,   8,  2},
+   {12000000,  48000,  1, 7, 6800,  96,   5,  4,    96,   5,  4},
+   {12500000,  48000,  1, 7, 8643, 128,   8,  2,   128,   8,  2},
/* 88.2k rate */
-   {12000000,  88200,  7, 5264,  64,   8,  2,    64,   8,  2},
-   {12000000,  88200,  8, 4672,  64,   6,  3,    64,   6,  3},
-   {12500000,  88200,  7, 2253,  64,   8,  2,    64,   8,  2},
+   {12000000,  88200,  1, 7, 5264,  64,   8,  2,    64,   8,  2},
+   {12000000,  88200,

[RFC patch 1/5] ASoC: tlv320aic31xx: Fix typo in BCLK clock name

2021-11-19 Thread Ariel D'Alessandro
Signed-off-by: Ariel D'Alessandro 
---
 sound/soc/codecs/tlv320aic31xx.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sound/soc/codecs/tlv320aic31xx.h b/sound/soc/codecs/tlv320aic31xx.h
index 2513922a0292..80d062578fb5 100644
--- a/sound/soc/codecs/tlv320aic31xx.h
+++ b/sound/soc/codecs/tlv320aic31xx.h
@@ -118,7 +118,7 @@ struct aic31xx_pdata {
 #define AIC31XX_PLL_CLKIN_MASK GENMASK(3, 2)
 #define AIC31XX_PLL_CLKIN_SHIFT(2)
 #define AIC31XX_PLL_CLKIN_MCLK 0x00
-#define AIC31XX_PLL_CLKIN_BCKL 0x01
+#define AIC31XX_PLL_CLKIN_BCLK 0x01
 #define AIC31XX_PLL_CLKIN_GPIO10x02
 #define AIC31XX_PLL_CLKIN_DIN  0x03
 #define AIC31XX_CODEC_CLKIN_MASK   GENMASK(1, 0)
-- 
2.30.2



[PATCH] powerpc/prom_init: fix the improper check of prom_getprop

2021-11-19 Thread Peiwei Hu
prom_getprop() can return PROM_ERROR on failure, which the existing
boolean check cannot identify.
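
For context (reconstructed from prom_init.c, not part of the original
mail): prom_getprop() returns the property length on success and
PROM_ERROR, defined as (-1u), on failure, so:

	rv = prom_getprop(node, "phy-handle", prop, sizeof(prop));
	/* old check: if (!rv)     - true only for a zero-length property */
	/* new check: if (rv <= 0) - also true for PROM_ERROR (-1 as an int) */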

Signed-off-by: Peiwei Hu 
---
 arch/powerpc/kernel/prom_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 18b04b08b983..f845065c860e 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -2991,7 +2991,7 @@ static void __init fixup_device_tree_efika_add_phy(void)
 
/* Check if the phy-handle property exists - bail if it does */
rv = prom_getprop(node, "phy-handle", prop, sizeof(prop));
-   if (!rv)
+   if (rv <= 0)
return;
 
/*
-- 
2.25.1



Re: [PATCH 2/2] of: reserved_mem: Remove reserved regions count restriction

2021-11-19 Thread Calvin Zhang
On Fri, Nov 19, 2021 at 11:56:08AM +0200, Andy Shevchenko wrote:
>On Fri, Nov 19, 2021 at 03:58:19PM +0800, Calvin Zhang wrote:
>> Change to allocate reserved_mems dynamically. Static reserved regions
>> must be reserved before any memblock allocations. The reserved_mems
>> array couldn't be allocated until memblock and linear mapping are ready.
>> 
>> So move the allocation and initialization of records and reserved memory
>> from early_init_fdt_scan_reserved_mem() to of_reserved_mem_init().
>
>>  arch/arc/mm/init.c |  3 ++
>>  arch/arm/kernel/setup.c|  2 +
>>  arch/arm64/kernel/setup.c  |  3 ++
>>  arch/csky/kernel/setup.c   |  3 ++
>>  arch/h8300/kernel/setup.c  |  2 +
>>  arch/mips/kernel/setup.c   |  3 ++
>>  arch/nds32/kernel/setup.c  |  3 ++
>>  arch/nios2/kernel/setup.c  |  2 +
>>  arch/openrisc/kernel/setup.c   |  3 ++
>>  arch/powerpc/kernel/setup-common.c |  3 ++
>>  arch/riscv/kernel/setup.c  |  2 +
>>  arch/sh/kernel/setup.c |  3 ++
>>  arch/xtensa/kernel/setup.c |  2 +
>
>Isn't x86 missed? Is it on purpose?
>Would be nice to have this in the commit message or fixed accordingly.
AFAIK, x86 doesn't reserve memory through "/reserved-memory" node until now.
Actually, I got the arch list from callers of
early_init_fdt_scan_reserved_mem().
>
>-- 
>With Best Regards,
>Andy Shevchenko
>
>
>
Thanks,
Calvin



Re: [PATCH 2/2] of: reserved_mem: Remove reserved regions count restriction

2021-11-19 Thread Andy Shevchenko
On Fri, Nov 19, 2021 at 03:58:19PM +0800, Calvin Zhang wrote:
> Change to allocate reserved_mems dynamically. Static reserved regions
> must be reserved before any memblock allocations. The reserved_mems
> array couldn't be allocated until memblock and linear mapping are ready.
> 
> So move the allocation and initialization of records and reserved memory
> from early_init_fdt_scan_reserved_mem() to of_reserved_mem_init().

>  arch/arc/mm/init.c |  3 ++
>  arch/arm/kernel/setup.c|  2 +
>  arch/arm64/kernel/setup.c  |  3 ++
>  arch/csky/kernel/setup.c   |  3 ++
>  arch/h8300/kernel/setup.c  |  2 +
>  arch/mips/kernel/setup.c   |  3 ++
>  arch/nds32/kernel/setup.c  |  3 ++
>  arch/nios2/kernel/setup.c  |  2 +
>  arch/openrisc/kernel/setup.c   |  3 ++
>  arch/powerpc/kernel/setup-common.c |  3 ++
>  arch/riscv/kernel/setup.c  |  2 +
>  arch/sh/kernel/setup.c |  3 ++
>  arch/xtensa/kernel/setup.c |  2 +

Isn't x86 missed? Is it on purpose?
Would be nice to have this in the commit message or fixed accordingly.

-- 
With Best Regards,
Andy Shevchenko




Re: [PATCH 0/3] KEXEC_SIG with appended signature

2021-11-19 Thread Mimi Zohar
On Fri, 2021-11-19 at 12:18 +0100, Michal Suchánek wrote:
> Maybe I was not clear enough. If you happen to focus on an architecture
> that supports IMA fully it's great.
> 
> My point of view is maintaining multiple architectures. Both end users
> and people concerned with security are rarely familiar with
> architecture specifics. Portability of documentation and debugging
> instructions across architectures is a concern.
> 
> IMA has a large number of options with varying availability across
> architectures for no apparent reason. The situation is complex and hard
> to grasp.

IMA measures, verifies, and audits the integrity of files based on a
system wide policy.  The known "good" integrity value may be stored in
the security.ima xattr or more recently as an appended signature.

With both IMA kexec appraise and measurement policy rules, not only is
the kernel image signature verified and the file hash included in the
IMA measurement list, but the signature used to verify the integrity of
the kexec kernel image is also included in the IMA measurement list
(ima_template=ima-sig).

Even without PECOFF support in IMA, IMA kexec measurement policy rules
can be defined to supplement the KEXEC_SIG signature verification.
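
As an illustration (an assumed policy fragment, not quoted from the
thread), such rules could look like:

	measure func=KEXEC_KERNEL_CHECK template=ima-sig
	appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig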

> 
> In comparison the *_SIG options are widely available. The missing
> support for KEXEC_SIG on POWER is trivial to add by cut from s390.
> With that all the documentation that exists already is also trivially
> applicable to POWER. Any additional code cleanup is a bonus but not
> really needed to enable the kexec lockdown on POWER.

Before lockdown was upstreamed, Matthew made sure that IMA signature
verification could co-exist.   Refer to commit 29d3c1c8dfe7 ("kexec:
Allow kexec_file() with appropriate IMA policy when locked down").   If
there is a problem with the downstream kexec lockdown patches, they
should be fixed.

The kexec kselftest might provide some insight into how the different
signature verification methods and lockdown co-exist.

As for adding KEXEC_SIG appended signature support on PowerPC based on
the s390 code, it sounds reasonable.

thanks,

Mimi



Re: [PATCH] powerpc/signal32: Use struct_group() to zero spe regs

2021-11-19 Thread Kees Cook
On Fri, Nov 19, 2021 at 05:35:00PM +0100, Christophe Leroy wrote:
> Neither do I. I was just scared by what I saw while reviewing your patch. A
> cleanup is probably required but it can be another patch.

Heh, understood! For my end, my objective with the fortify work is to
either split cross-member memcpy() calls (which is usually undesirable) or
add a struct group so it can be seen as a "single member" memcpy again
(and usually results in 0 differences in binary output). :)
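
(A minimal sketch of that pattern, using the struct_group() macro from
include/linux/stddef.h; the struct and field names below are invented
for illustration:)

	struct example {
		int before;
		struct_group(grp,	/* named mirror struct; &e->grp spans a and b */
			int a;
			int b;
		);
		int after;
	};

	/* FORTIFY_SOURCE now sees a single field of sizeof(e->grp) bytes: */
	memset(&e->grp, 0, sizeof(e->grp));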

-- 
Kees Cook


Re: [PATCH] powerpc/signal32: Use struct_group() to zero spe regs

2021-11-19 Thread Christophe Leroy




Le 19/11/2021 à 17:28, Kees Cook a écrit :

On Fri, Nov 19, 2021 at 08:46:27AM +, LEROY Christophe wrote:



Le 18/11/2021 à 21:36, Kees Cook a écrit :

In preparation for FORTIFY_SOURCE performing compile-time and run-time
field bounds checking for memset(), avoid intentionally writing across
neighboring fields.

Add a struct_group() for the spe registers so that memset() can correctly reason
about the size:

 In function 'fortify_memset_chk',
 inlined from 'restore_user_regs.part.0' at 
arch/powerpc/kernel/signal_32.c:539:3:
 >> include/linux/fortify-string.h:195:4: error: call to 
'__write_overflow_field' declared with attribute warning: detected write beyond size 
of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning]
   195 |__write_overflow_field();
   |^~~~

Reported-by: kernel test robot 
Signed-off-by: Kees Cook 


Reviewed-by: Christophe Leroy 

However, is it really worth adding that grouping ? Wouldn't it be
cleaner to handle evr[] and acc separately ? Now that we are using
unsafe variants of get/put user performance wouldn't be impacted.


I'm fine with whatever is desired here. I reworked an earlier version of
this patch based on mpe's feedback, so I can certainly rework it again. :)


Well, with oddities like the below, it may not be straight forward. If 
the objective is to enable FORTIFY_SOURCE, maybe that's good enough.


Let see if Michael has any opinion.






I have some doubts about things like:

unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
                    ELF_NEVRREG * sizeof(u32), failed);

Because as far as I can see, ELF_NEVRREG is 34 but mc_vregs is a table
of 33 u32 and is at the end of the structure:

struct mcontext {
elf_gregset_t   mc_gregs;
elf_fpregset_t  mc_fregs;
unsigned long   mc_pad[2];
elf_vrregset_t  mc_vregs __attribute__((__aligned__(16)));
};

typedef elf_vrreg_t elf_vrregset_t[ELF_NVRREG];

# define ELF_NEVRREG34  /* includes acc (as 2) */
# define ELF_NVRREG 33  /* includes vscr */


I don't know these internals very well -- do you want me to change this
specifically somehow? With the BUILD_BUG_ON()s added, there's no binary
change here -- I wanted to make sure nothing was different in the
output.



Neither do I. I was just scared by what I saw while reviewing your 
patch. A cleanup is probably required but it can be another patch.


Christophe


Re: [PATCH] powerpc/signal32: Use struct_group() to zero spe regs

2021-11-19 Thread Kees Cook
On Fri, Nov 19, 2021 at 08:46:27AM +, LEROY Christophe wrote:
> 
> 
> Le 18/11/2021 à 21:36, Kees Cook a écrit :
> > In preparation for FORTIFY_SOURCE performing compile-time and run-time
> > field bounds checking for memset(), avoid intentionally writing across
> > neighboring fields.
> > 
> > Add a struct_group() for the spe registers so that memset() can correctly 
> > reason
> > about the size:
> > 
> > In function 'fortify_memset_chk',
> > inlined from 'restore_user_regs.part.0' at 
> > arch/powerpc/kernel/signal_32.c:539:3:
> > >> include/linux/fortify-string.h:195:4: error: call to 
> > '__write_overflow_field' declared with attribute warning: detected write 
> > beyond size of field (1st parameter); maybe use struct_group()? 
> > [-Werror=attribute-warning]
> >   195 |__write_overflow_field();
> >   |^~~~
> > 
> > Reported-by: kernel test robot 
> > Signed-off-by: Kees Cook 
> 
> Reviewed-by: Christophe Leroy 
> 
> However, is it really worth adding that grouping ? Wouldn't it be 
> cleaner to handle evr[] and acc separately ? Now that we are using 
> unsafe variants of get/put user performance wouldn't be impacted.

I'm fine with whatever is desired here. I reworked an earlier version of
this patch based on mpe's feedback, so I can certainly rework it again. :)

> 
> I have some doubts about things like:
> 
>   unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
>   ELF_NEVRREG * sizeof(u32), failed);
> 
> Because as far as I can see, ELF_NEVRREG is 34 but mc_vregs is a table 
> of 33 u32 and is at the end of the structure:
> 
>   struct mcontext {
>   elf_gregset_t   mc_gregs;
>   elf_fpregset_t  mc_fregs;
>   unsigned long   mc_pad[2];
>   elf_vrregset_t  mc_vregs __attribute__((__aligned__(16)));
>   };
> 
>   typedef elf_vrreg_t elf_vrregset_t[ELF_NVRREG];
> 
>   # define ELF_NEVRREG34  /* includes acc (as 2) */
>   # define ELF_NVRREG 33  /* includes vscr */

I don't know these internals very well -- do you want me to change this
specifically somehow? With the BUILD_BUG_ON()s added, there's no binary
change here -- I wanted to make sure nothing was different in the
output.

-Kees

> 
> 
> 
> > ---
> >   arch/powerpc/include/asm/processor.h |  6 --
> >   arch/powerpc/kernel/signal_32.c  | 14 +-
> >   2 files changed, 13 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/processor.h 
> > b/arch/powerpc/include/asm/processor.h
> > index e39bd0ff69f3..978a80308466 100644
> > --- a/arch/powerpc/include/asm/processor.h
> > +++ b/arch/powerpc/include/asm/processor.h
> > @@ -191,8 +191,10 @@ struct thread_struct {
> > int used_vsr;   /* set if process has used VSX */
> >   #endif /* CONFIG_VSX */
> >   #ifdef CONFIG_SPE
> > -   unsigned long   evr[32];/* upper 32-bits of SPE regs */
> > -   u64 acc;/* Accumulator */
> > +   struct_group(spe,
> > +   unsigned long   evr[32];/* upper 32-bits of SPE regs */
> > +   u64 acc;/* Accumulator */
> > +   );
> > unsigned long   spefscr;/* SPE & eFP status */
> > unsigned long   spefscr_last;   /* SPEFSCR value on last prctl
> >call or trap return */
> > diff --git a/arch/powerpc/kernel/signal_32.c 
> > b/arch/powerpc/kernel/signal_32.c
> > index 00a9c9cd6d42..5e1664b501e4 100644
> > --- a/arch/powerpc/kernel/signal_32.c
> > +++ b/arch/powerpc/kernel/signal_32.c
> > @@ -527,16 +527,20 @@ static long restore_user_regs(struct pt_regs *regs,
> > regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1));
> >   
> >   #ifdef CONFIG_SPE
> > -   /* force the process to reload the spe registers from
> > -  current->thread when it next does spe instructions */
> > +   /*
> > +* Force the process to reload the spe registers from
> > +* current->thread when it next does spe instructions.
> > +* Since this is user ABI, we must enforce the sizing.
> > +*/
> > +   BUILD_BUG_ON(sizeof(current->thread.spe) != ELF_NEVRREG * sizeof(u32));
> > regs_set_return_msr(regs, regs->msr & ~MSR_SPE);
> > if (msr & MSR_SPE) {
> > /* restore spe registers from the stack */
> > -   unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs,
> > - ELF_NEVRREG * sizeof(u32), failed);
> > +   unsafe_copy_from_user(&current->thread.spe, &sr->mc_vregs,
> > + sizeof(current->thread.spe), failed);
> > current->thread.used_spe = true;
> > } else if (current->thread.used_spe)
> > -   memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
> > +   memset(&current->thread.spe, 0, sizeof(current->thread.spe));
> >   
> > /* Always get SPEFSCR back */
> > 

Re: [PATCH v4 5/5] powerpc/watchdog: help remote CPUs to flush NMI printk output

2021-11-19 Thread Laurent Dufour

Le 19/11/2021 à 12:31, Nicholas Piggin a écrit :

The printk layer at the moment does not seem to have a good way to force
flush printk messages that are created in NMI context, except in the
panic path.

NMI-context printk messages normally get to the console with irq_work,
but that won't help if the CPU is stuck with irqs disabled, as can be
the case for hard lockup watchdog messages.

The watchdog currently flushes the printk buffers after detecting a
lockup on remote CPUs, but they may not have processed their NMI IPI
yet by that stage, or they may have self-detected a lockup in which
case they won't go via this NMI IPI path.

Improve the situation by having NMI-context mark a flag if it called
printk, and have watchdog timer interrupts check if that flag was set
and try to flush if it was. Latency is not a big problem because we
were already stuck for a while, just need to try to make sure the
messages eventually make it out.

Cc: Laurent Dufour 
Signed-off-by: Nicholas Piggin 


Reviewed-by: Laurent Dufour 


---
This patch requires commit 5d5e4522a7f4 ("printk: restore flushing of
NMI buffers on remote CPUs after NMI backtraces"). If backporting this
to a kernel without commit 93d102f094be ("printk: remove safe buffers"),
then printk_safe_flush() should be used in place of
printk_trigger_flush().

Thanks,
Nick

  arch/powerpc/kernel/watchdog.c | 37 --
  1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 23745af38d62..bfc27496fe7e 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -86,6 +86,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
  /* SMP checker bits */
  static unsigned long __wd_smp_lock;
  static unsigned long __wd_reporting;
+static unsigned long __wd_nmi_output;
  static cpumask_t wd_smp_cpus_pending;
  static cpumask_t wd_smp_cpus_stuck;
  static u64 wd_smp_last_reset_tb;
@@ -154,6 +155,23 @@ static void wd_lockup_ipi(struct pt_regs *regs)
else
dump_stack();
  
+	/*

+* __wd_nmi_output must be set after we printk from NMI context.
+*
+* printk from NMI context defers printing to the console to irq_work.
+* If that NMI was taken in some code that is hard-locked, then irqs
+* are disabled so irq_work will never fire. That can result in the
+* hard lockup messages being delayed (indefinitely, until something
+* else kicks the console drivers).
+*
+* Setting __wd_nmi_output will cause another CPU to notice and kick
+* the console drivers for us.
+*
+* xchg is not needed here (it could be a smp_mb and store), but xchg
+* gives the memory ordering and atomicity required.
+*/
+   xchg(&__wd_nmi_output, 1);
+
/* Do not panic from here because that can recurse into NMI IPI layer */
  }
  
@@ -227,12 +245,6 @@ static void watchdog_smp_panic(int cpu)

cpumask_clear(&wd_smp_cpus_ipi);
}
  
-	/*

-* Force flush any remote buffers that might be stuck in IRQ context
-* and therefore could not run their irq_work.
-*/
-   printk_trigger_flush();
-
if (hardlockup_panic)
nmi_panic(NULL, "Hard LOCKUP");
  
@@ -337,6 +349,17 @@ static void watchdog_timer_interrupt(int cpu)
  
  	if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)

watchdog_smp_panic(cpu);
+
+   if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) {
+   /*
+* Something has called printk from NMI context. It might be
+* stuck, so this triggers a flush that will get that
+* printk output to the console.
+*
+* See wd_lockup_ipi.
+*/
+   printk_trigger_flush();
+   }
  }
  
  DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)

@@ -386,6 +409,8 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
print_irqtrace_events(current);
show_regs(regs);
  
+		xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi

+
if (sysctl_hardlockup_all_cpu_backtrace)
trigger_allbutself_cpu_backtrace();
  





Re: [RFC PATCH 0/3] Use pageblock_order for cma and alloc_contig_range alignment.

2021-11-19 Thread Zi Yan
On 19 Nov 2021, at 7:33, Vlastimil Babka wrote:

> On 11/15/21 20:37, Zi Yan wrote:
>> From: Zi Yan 
>>
>> Hi David,
>>
>> You suggested to make alloc_contig_range() deal with pageblock_order instead 
>> of
>> MAX_ORDER - 1 and get rid of MAX_ORDER - 1 dependency in virtio_mem[1]. This
>> patchset is my attempt to achieve that. Please take a look and let me know if
>> I am doing it correctly or not.
>>
>> From my understanding, cma required alignment of
>> max(MAX_ORDER - 1, pageblock_order), because when MIGRATE_CMA was introduced,
>> __free_one_page() does not prevent merging two different pageblocks, when
>> MAX_ORDER - 1 > pageblock_order. But current __free_one_page() implementation
>> does prevent that.
>
> But it does prevent that only for isolated pageblock, not CMA, and your
> patchset doesn't seem to expand that to CMA? Or am I missing something.

Yeah, you are right. Originally, I thought preventing merging isolated pageblock
with other types of pageblocks is sufficient, since MIGRATE_CMA is always
converted from MIGRATE_ISOLATE. But that is not true. I will rework the code.
Thanks for pointing this out.

>
>
>> It should be OK to just align cma to pageblock_order.
>> alloc_contig_range() relies on MIGRATE_CMA to get free pages, so it can use
>> pageblock_order as alignment too.
>>
>> In terms of virtio_mem, if I understand correctly, it relies on
>> alloc_contig_range() to obtain contiguous free pages and offlines them to 
>> reduce
>> guest memory size. As the result of alloc_contig_range() alignment change,
>> virtio_mem should be able to just align PFNs to pageblock_order.
>>
>> Thanks.
>>
>>
>> [1] 
>> https://lore.kernel.org/linux-mm/28b57903-fae6-47ac-7e1b-a1dd41421...@redhat.com/
>>
>> Zi Yan (3):
>>   mm: cma: alloc_contig_range: use pageblock_order as the single
>> alignment.
>>   drivers: virtio_mem: use pageblock size as the minimum virtio_mem
>> size.
>>   arch: powerpc: adjust fadump alignment to be pageblock aligned.
>>
>>  arch/powerpc/include/asm/fadump-internal.h |  4 +---
>>  drivers/virtio/virtio_mem.c|  6 ++
>>  include/linux/mmzone.h |  5 +
>>  kernel/dma/contiguous.c|  2 +-
>>  mm/cma.c   |  6 ++
>>  mm/page_alloc.c| 12 +---
>>  6 files changed, 12 insertions(+), 23 deletions(-)
>>

--
Best Regards,
Yan, Zi




Re: [PATCH V4 0/1] powerpc/perf: Clear pending PMI in ppmu callbacks

2021-11-19 Thread Athira Rajeev



> On 21-Jul-2021, at 11:18 AM, Athira Rajeev  
> wrote:
> 
> Running perf fuzzer testsuite popped up below messages
> in the dmesg logs:
> 
> "Can't find PMC that caused IRQ"
> 
> This means a PMU exception happened, but none of the PMC's (Performance
> Monitor Counter) were found to be overflown. The perf interrupt handler checks
> the PMCs to see which PMC has overflown, and if none of the PMCs are
> overflown (counter value not >= 0x80000000), it throws the warning:
> "Can't find PMC that caused IRQ".
> 
> Powerpc has capability to mask and replay a performance monitoring
> interrupt (PMI). In case of replayed PMI, there are some corner cases
> that clears the PMCs after masking. In such cases, the perf interrupt
> handler will not find the active PMC values that had caused the overflow
> and thus leading to this message. This patchset attempts to fix those
> corner cases.
> 
> However there is one more case in PowerNV where these messages are
> emitted during system wide profiling or when a specific CPU is monitored
> for an event. That is, when a counter overflow just before entering idle
> and a PMI gets triggered after wakeup from idle. Since PMCs
> are not saved in the idle path, perf interrupt handler will not
> find overflown counter value and emits the "Can't find PMC" messages.
> This patch documents this race condition in powerpc core-book3s.
> 
> Patch fixes the ppmu callbacks to disable pending interrupt before clearing
> the overflown PMC and documents the race condition in idle path.
> 
> Changelog:
> changes from v3 -> v4
>   Addressed review comments from Nicholas Piggin
>   - Added comment explaining the need to clear MMCR0 PMXE bit in
> pmu disable callback.
>   - Added a check to display warning if there is a PMI pending
> bit set in Paca without any overflown PMC.
>   - Removed the condition check before clearing pending PMI
> in 'clear_pmi_irq_pending' function.
>   - Added reviewed by from Nicholas Piggin.
> 
> Changes from v2 -> v3
>   Addressed review comments from Nicholas Piggin
>   - Moved the clearing of PMI bit to power_pmu_disable.
> In previous versions, this was done in power_pmu_del,
> power_pmu_stop/enable callbacks before clearing of PMC's.
>   - power_pmu_disable is called before any event gets deleted
> or stopped. If more than one event is running in the PMU,
> we may clear the PMI bit for an event which is not going
> to be deleted/stopped. Hence introduced check in
> power_pmu_enable to set back PMI to avoid dropping of valid
> samples in such cases.
>   - Disable MMCR0 PMXE bit in pmu disable callback which otherwise
> could trigger PMI when PMU is getting disabled.
> Changes from v1 -> v2
>   Addressed review comments from Nicholas Piggin
>   - Moved the PMI pending check and clearing function
> to arch/powerpc/include/asm/hw_irq.h and renamed
> function to "get_clear_pmi_irq_pending"
>   - Along with checking for pending PMI bit in Paca,
> look for PMAO bit in MMCR0 register to decide on
> pending PMI interrupt.
> 
> Athira Rajeev (1):
>  powerpc/perf: Fix PMU callbacks to clear pending PMI before resetting
>an overflown PMC

Hi,

Please let me know if there are any review comments for this patch.

Thanks
Athira
> 
> arch/powerpc/include/asm/hw_irq.h | 38 +
> arch/powerpc/perf/core-book3s.c   | 59 ++-
> 2 files changed, 96 insertions(+), 1 deletion(-)
> 
> -- 
> 1.8.3.1
> 



Re: [RFC PATCH 0/3] Use pageblock_order for cma and alloc_contig_range alignment.

2021-11-19 Thread Vlastimil Babka
On 11/15/21 20:37, Zi Yan wrote:
> From: Zi Yan 
> 
> Hi David,
> 
> You suggested to make alloc_contig_range() deal with pageblock_order instead 
> of
> MAX_ORDER - 1 and get rid of MAX_ORDER - 1 dependency in virtio_mem[1]. This
> patchset is my attempt to achieve that. Please take a look and let me know if
> I am doing it correctly or not.
> 
> From my understanding, cma required alignment of
> max(MAX_ORDER - 1, pageblock_order), because when MIGRATE_CMA was introduced,
> __free_one_page() does not prevent merging two different pageblocks, when
> MAX_ORDER - 1 > pageblock_order. But current __free_one_page() implementation
> does prevent that.

But it does prevent that only for isolated pageblock, not CMA, and your
patchset doesn't seem to expand that to CMA? Or am I missing something.


> It should be OK to just align cma to pageblock_order.
> alloc_contig_range() relies on MIGRATE_CMA to get free pages, so it can use
> pageblock_order as alignment too.
> 
> In terms of virtio_mem, if I understand correctly, it relies on
> alloc_contig_range() to obtain contiguous free pages and offlines them to 
> reduce
> guest memory size. As the result of alloc_contig_range() alignment change,
> virtio_mem should be able to just align PFNs to pageblock_order.
> 
> Thanks.
> 
> 
> [1] 
> https://lore.kernel.org/linux-mm/28b57903-fae6-47ac-7e1b-a1dd41421...@redhat.com/
> 
> Zi Yan (3):
>   mm: cma: alloc_contig_range: use pageblock_order as the single
> alignment.
>   drivers: virtio_mem: use pageblock size as the minimum virtio_mem
> size.
>   arch: powerpc: adjust fadump alignment to be pageblock aligned.
> 
>  arch/powerpc/include/asm/fadump-internal.h |  4 +---
>  drivers/virtio/virtio_mem.c|  6 ++
>  include/linux/mmzone.h |  5 +
>  kernel/dma/contiguous.c|  2 +-
>  mm/cma.c   |  6 ++
>  mm/page_alloc.c| 12 +---
>  6 files changed, 12 insertions(+), 23 deletions(-)
> 



[PATCH v4 5/5] powerpc/watchdog: help remote CPUs to flush NMI printk output

2021-11-19 Thread Nicholas Piggin
The printk layer at the moment does not seem to have a good way to force
flush printk messages that are created in NMI context, except in the
panic path.

NMI-context printk messages normally get to the console with irq_work,
but that won't help if the CPU is stuck with irqs disabled, as can be
the case for hard lockup watchdog messages.

The watchdog currently flushes the printk buffers after detecting a
lockup on remote CPUs, but they may not have processed their NMI IPI
yet by that stage, or they may have self-detected a lockup in which
case they won't go via this NMI IPI path.

Improve the situation by having NMI-context mark a flag if it called
printk, and have watchdog timer interrupts check if that flag was set
and try to flush if it was. Latency is not a big problem because we
were already stuck for a while, just need to try to make sure the
messages eventually make it out.

Cc: Laurent Dufour 
Signed-off-by: Nicholas Piggin 
---
This patch requires commit 5d5e4522a7f4 ("printk: restore flushing of
NMI buffers on remote CPUs after NMI backtraces"). If backporting this
to a kernel without commit 93d102f094be ("printk: remove safe buffers"),
then printk_safe_flush() should be used in place of
printk_trigger_flush().

Thanks,
Nick

 arch/powerpc/kernel/watchdog.c | 37 --
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 23745af38d62..bfc27496fe7e 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -86,6 +86,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
 /* SMP checker bits */
 static unsigned long __wd_smp_lock;
 static unsigned long __wd_reporting;
+static unsigned long __wd_nmi_output;
 static cpumask_t wd_smp_cpus_pending;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
@@ -154,6 +155,23 @@ static void wd_lockup_ipi(struct pt_regs *regs)
else
dump_stack();
 
+   /*
+* __wd_nmi_output must be set after we printk from NMI context.
+*
+* printk from NMI context defers printing to the console to irq_work.
+* If that NMI was taken in some code that is hard-locked, then irqs
+* are disabled so irq_work will never fire. That can result in the
+* hard lockup messages being delayed (indefinitely, until something
+* else kicks the console drivers).
+*
+* Setting __wd_nmi_output will cause another CPU to notice and kick
+* the console drivers for us.
+*
+* xchg is not needed here (it could be a smp_mb and store), but xchg
+* gives the memory ordering and atomicity required.
+*/
+   xchg(&__wd_nmi_output, 1);
+
/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
@@ -227,12 +245,6 @@ static void watchdog_smp_panic(int cpu)
cpumask_clear(&wd_smp_cpus_ipi);
}
 
-   /*
-* Force flush any remote buffers that might be stuck in IRQ context
-* and therefore could not run their irq_work.
-*/
-   printk_trigger_flush();
-
if (hardlockup_panic)
nmi_panic(NULL, "Hard LOCKUP");
 
@@ -337,6 +349,17 @@ static void watchdog_timer_interrupt(int cpu)
 
if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
watchdog_smp_panic(cpu);
+
+   if (__wd_nmi_output && xchg(&__wd_nmi_output, 0)) {
+   /*
+* Something has called printk from NMI context. It might be
+* stuck, so this triggers a flush that will get that
+* printk output to the console.
+*
+* See wd_lockup_ipi.
+*/
+   printk_trigger_flush();
+   }
 }
 
 DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
@@ -386,6 +409,8 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
print_irqtrace_events(current);
show_regs(regs);
 
+   xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi
+
if (sysctl_hardlockup_all_cpu_backtrace)
trigger_allbutself_cpu_backtrace();
 
-- 
2.23.0



[PATCH v4 4/5] powerpc/watchdog: read TB close to where it is used

2021-11-19 Thread Nicholas Piggin
When taking watchdog actions, printing messages, comparing and
re-setting wd_smp_last_reset_tb, etc., read TB close to the point of use
and under wd_smp_lock or printing lock (if applicable).

This should keep timebase mostly monotonic with kernel log messages, and
could prevent (in theory) a laggy CPU updating wd_smp_last_reset_tb to
something a long way in the past, and causing other CPUs to appear to be
stuck.

These additional TB reads are all slowpath (lockup has been detected),
so performance does not matter.

Reviewed-by: Laurent Dufour 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/watchdog.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index cfd45049ec7f..23745af38d62 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -157,7 +157,7 @@ static void wd_lockup_ipi(struct pt_regs *regs)
/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
-static bool set_cpu_stuck(int cpu, u64 tb)
+static bool set_cpu_stuck(int cpu)
 {
	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
@@ -166,7 +166,7 @@ static bool set_cpu_stuck(int cpu, u64 tb)
 */
smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
-   wd_smp_last_reset_tb = tb;
+   wd_smp_last_reset_tb = get_tb();
	cpumask_andnot(&wd_smp_cpus_pending,
	&wd_cpus_enabled,
	&wd_smp_cpus_stuck);
@@ -175,15 +175,16 @@ static bool set_cpu_stuck(int cpu, u64 tb)
return false;
 }
 
-static void watchdog_smp_panic(int cpu, u64 tb)
+static void watchdog_smp_panic(int cpu)
 {
static cpumask_t wd_smp_cpus_ipi; // protected by reporting
unsigned long flags;
-   u64 last_reset;
+   u64 tb, last_reset;
int c;
 
	wd_smp_lock(&flags);
/* Double check some things under lock */
+   tb = get_tb();
last_reset = wd_smp_last_reset_tb;
if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb)
goto out;
@@ -198,7 +199,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
continue; // should not happen
 
	__cpumask_set_cpu(c, &wd_smp_cpus_ipi);
-   if (set_cpu_stuck(c, tb))
+   if (set_cpu_stuck(c))
break;
}
	if (cpumask_empty(&wd_smp_cpus_ipi)) {
@@ -243,7 +244,7 @@ static void watchdog_smp_panic(int cpu, u64 tb)
	wd_smp_unlock(&flags);
 }
 
-static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
+static void wd_smp_clear_cpu_pending(int cpu)
 {
	if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
	if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
@@ -251,7 +252,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
unsigned long flags;
 
pr_emerg("CPU %d became unstuck TB:%lld\n",
-		 cpu, tb);
+		 cpu, get_tb());
print_irqtrace_events(current);
if (regs)
show_regs(regs);
@@ -317,7 +318,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 */
	wd_smp_lock(&flags);
	if (cpumask_empty(&wd_smp_cpus_pending)) {
-   wd_smp_last_reset_tb = tb;
+   wd_smp_last_reset_tb = get_tb();
	cpumask_andnot(&wd_smp_cpus_pending,
	&wd_cpus_enabled,
	&wd_smp_cpus_stuck);
@@ -332,10 +333,10 @@ static void watchdog_timer_interrupt(int cpu)
 
per_cpu(wd_timer_tb, cpu) = tb;
 
-   wd_smp_clear_cpu_pending(cpu, tb);
+   wd_smp_clear_cpu_pending(cpu);
 
if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
-   watchdog_smp_panic(cpu, tb);
+   watchdog_smp_panic(cpu);
 }
 
 DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
@@ -372,7 +373,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
return 0;
}
 
-   set_cpu_stuck(cpu, tb);
+   set_cpu_stuck(cpu);
 
	wd_smp_unlock(&flags);
 
@@ -433,7 +434,7 @@ void arch_touch_nmi_watchdog(void)
tb = get_tb();
if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
per_cpu(wd_timer_tb, cpu) = tb;
-   wd_smp_clear_cpu_pending(cpu, tb);
+   wd_smp_clear_cpu_pending(cpu);
}
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
@@ -491,7 +492,7 @@ static void stop_watchdog(void *arg)
	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
	wd_smp_unlock(&flags);
 
-   wd_smp_clear_cpu_pending(cpu, get_tb());
+   wd_smp_clear_cpu_pending(cpu);
 }
 
 static int stop_watchdog_on_cpu(unsigned 

[PATCH v4 3/5] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi

2021-11-19 Thread Nicholas Piggin
There is a deadlock with the console_owner lock and the wd_smp_lock:

CPU x takes the console_owner lock
CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
CPU x takes a soft-NMI interrupt, detects deadlock, spins on __wd_smp_lock
CPU y detects deadlock, tries to print something and spins on console_owner
-> deadlock

Change the watchdog locking scheme so wd_smp_lock protects the watchdog
internal data, but "reporting" (printing, issuing NMI IPIs, taking any
action outside of watchdog) uses a non-waiting exclusion. If a CPU detects
a problem but can not take the reporting lock, it just returns because
something else is already reporting. It will try again at some point.

Typically the usefulness of a hard lockup watchdog report is hurt not by
failing to spew a large enough amount of data in as short a time as
possible, but by messages getting garbled.

Laurent debugged this and found the deadlock, and this patch is based on
his general approach to avoid expensive operations while holding the lock.
With the addition of the reporting exclusion.

Signed-off-by: Laurent Dufour 
[np: rework to add reporting exclusion update changelog]
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/watchdog.c | 100 +
 1 file changed, 78 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 588f54350d19..cfd45049ec7f 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -85,10 +85,36 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
 
 /* SMP checker bits */
 static unsigned long __wd_smp_lock;
+static unsigned long __wd_reporting;
 static cpumask_t wd_smp_cpus_pending;
 static cpumask_t wd_smp_cpus_stuck;
 static u64 wd_smp_last_reset_tb;
 
+/*
+ * Try to take the exclusive watchdog action / NMI IPI / printing lock.
+ * wd_smp_lock must be held. If this fails, we should return and wait
+ * for the watchdog to kick in again (or another CPU to trigger it).
+ *
+ * Importantly, if hardlockup_panic is set, wd_try_report failure should
+ * not delay the panic, because whichever other CPU is reporting will
+ * call panic.
+ */
+static bool wd_try_report(void)
+{
+   if (__wd_reporting)
+   return false;
+   __wd_reporting = 1;
+   return true;
+}
+
+/* End printing after successful wd_try_report. wd_smp_lock not required. */
+static void wd_end_reporting(void)
+{
+   smp_mb(); /* End printing "critical section" */
+   WARN_ON_ONCE(__wd_reporting == 0);
+   WRITE_ONCE(__wd_reporting, 0);
+}
+
 static inline void wd_smp_lock(unsigned long *flags)
 {
/*
@@ -151,45 +177,54 @@ static bool set_cpu_stuck(int cpu, u64 tb)
 
 static void watchdog_smp_panic(int cpu, u64 tb)
 {
+   static cpumask_t wd_smp_cpus_ipi; // protected by reporting
unsigned long flags;
+   u64 last_reset;
int c;
 
	wd_smp_lock(&flags);
/* Double check some things under lock */
-   if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
+   last_reset = wd_smp_last_reset_tb;
+   if ((s64)(tb - last_reset) < (s64)wd_smp_panic_timeout_tb)
goto out;
	if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
	goto out;
-   if (cpumask_weight(&wd_smp_cpus_pending) == 0)
+   if (!wd_try_report())
goto out;
+   for_each_online_cpu(c) {
+   if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
+   continue;
+   if (c == cpu)
+   continue; // should not happen
+
+   __cpumask_set_cpu(c, &wd_smp_cpus_ipi);
+   if (set_cpu_stuck(c, tb))
+   break;
+   }
+   if (cpumask_empty(&wd_smp_cpus_ipi)) {
+   wd_end_reporting();
+   goto out;
+   }
+   wd_smp_unlock(&flags);
 
pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
-cpu, cpumask_pr_args(_smp_cpus_pending));
+cpu, cpumask_pr_args(_smp_cpus_ipi));
pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
-cpu, tb, wd_smp_last_reset_tb,
-tb_to_ns(tb - wd_smp_last_reset_tb) / 100);
+cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 100);
 
if (!sysctl_hardlockup_all_cpu_backtrace) {
/*
 * Try to trigger the stuck CPUs, unless we are going to
 * get a backtrace on all of them anyway.
 */
-   for_each_cpu(c, &wd_smp_cpus_pending) {
-   bool empty;
-   if (c == cpu)
-   continue;
-   /* Take the stuck CPUs out of the watch group */
-   empty = set_cpu_stuck(c, tb);
+   for_each_cpu(c, &wd_smp_cpus_ipi) {
	smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
-   if 

[PATCH v4 2/5] powerpc/watchdog: tighten non-atomic read-modify-write access

2021-11-19 Thread Nicholas Piggin
Most updates to wd_smp_cpus_pending are under lock except the watchdog
interrupt bit clear.

This can race with non-atomic RMW updates to the mask under lock, which
can happen in two instances:

Firstly, if another CPU detects this one is stuck, removes it from the
mask, mask becomes empty and is re-filled with non-atomic stores. This
is okay because it would re-fill the mask with this CPU's bit clear
anyway (because this CPU is now stuck), so it doesn't matter that the
bit clear update got "lost". Add a comment for this.

Secondly, if another CPU detects a different CPU is stuck and removes it
from the pending mask with a non-atomic store to bytes which also
include the bit of this CPU. This case can result in the bit clear being
lost and the end result being the bit is set. This should be so rare it
hardly matters, but to make things simpler to reason about just avoid
the non-atomic access for that case.

Reviewed-by: Laurent Dufour 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/watchdog.c | 36 --
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index ad94a2c6b733..588f54350d19 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -131,10 +131,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
/* Do not panic from here because that can recurse into NMI IPI layer */
 }
 
-static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
+static bool set_cpu_stuck(int cpu, u64 tb)
 {
-   cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
-   cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+   cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
+   cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
/*
 * See wd_smp_clear_cpu_pending()
 */
@@ -144,11 +144,9 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
	cpumask_andnot(&wd_smp_cpus_pending,
	&wd_cpus_enabled,
	&wd_smp_cpus_stuck);
+   return true;
}
-}
-static void set_cpu_stuck(int cpu, u64 tb)
-{
-   set_cpumask_stuck(cpumask_of(cpu), tb);
+   return false;
 }
 
 static void watchdog_smp_panic(int cpu, u64 tb)
@@ -177,15 +175,17 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 * get a backtrace on all of them anyway.
 */
	for_each_cpu(c, &wd_smp_cpus_pending) {
+   bool empty;
if (c == cpu)
continue;
+   /* Take the stuck CPUs out of the watch group */
+   empty = set_cpu_stuck(c, tb);
	smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+   if (empty)
+   break;
}
}
 
-   /* Take the stuck CPUs out of the watch group */
-   set_cpumask_stuck(&wd_smp_cpus_pending, tb);
-
wd_smp_unlock();
 
if (sysctl_hardlockup_all_cpu_backtrace)
@@ -243,6 +243,22 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
return;
}
 
+   /*
+* All other updates to wd_smp_cpus_pending are performed under
+* wd_smp_lock. All of them are atomic except the case where the
+* mask becomes empty and is reset. This will not happen here because
+* cpu was tested to be in the bitmap (above), and a CPU only clears
+* its own bit. _Except_ in the case where another CPU has detected a
+* hard lockup on our CPU and takes us out of the pending mask. So in
+* normal operation there will be no race here, no problem.
+*
+* In the lockup case, this atomic clear-bit vs a store that refills
+* other bits in the accessed word will not be a problem. The bit clear
+* is atomic so it will not cause the store to get lost, and the store
+* will never set this bit so it will not overwrite the bit clear. The
+* only way for a stuck CPU to return to the pending bitmap is to
+* become unstuck itself.
+*/
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
 
/*
-- 
2.23.0



[PATCH v4 1/5] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race

2021-11-19 Thread Nicholas Piggin
It is possible for all CPUs to miss the pending cpumask becoming clear,
and then nobody resetting it, which will cause the lockup detector to
stop working. It will eventually expire, but watchdog_smp_panic will
avoid doing anything if the pending mask is clear and it will never be
reset.

Order the cpumask clear vs the subsequent test to close this race.
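The race is the classic store-buffering pattern. It can arise here
because the cpumask spans more than one word, so a CPU's clear of its
own bit and its check of the other words are independent memory
accesses. A minimal sketch of one side (illustration only, using C11
atomics rather than kernel primitives; cpu1 is the mirror image on
word1/word0):

    #include <stdatomic.h>

    atomic_ulong word0 = 1, word1 = 1;  /* one pending bit per CPU, in different words */

    void cpu0_clear_and_check(void)
    {
            atomic_store_explicit(&word0, 0, memory_order_relaxed);
            /* the smp_mb() this patch adds */
            atomic_thread_fence(memory_order_seq_cst);
            /* word0 is known clear; the mask is empty iff word1 is too */
            if (atomic_load_explicit(&word1, memory_order_relaxed) == 0)
                    ;   /* empty mask: reset the watchdog */
    }

Without the fences, both CPUs' loads may return 1, so each sees a
non-empty mask, and the reset is missed on both sides.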

Add an extra check for an empty pending mask when the watchdog fires and
finds its bit still clear, to try to catch any other possible races or
bugs here and keep the watchdog working. The extra test in
arch_touch_nmi_watchdog is required to prevent the new warning from
firing off.

Debugged-by: Laurent Dufour 
Reviewed-by: Laurent Dufour 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/watchdog.c | 41 +-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 3fa6d240bade..ad94a2c6b733 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -135,6 +135,10 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
 {
	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+   /*
+* See wd_smp_clear_cpu_pending()
+*/
+   smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
		wd_smp_last_reset_tb = tb;
		cpumask_andnot(&wd_smp_cpus_pending,
@@ -221,13 +225,44 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
 
	cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
wd_smp_unlock();
+   } else {
+   /*
+* The last CPU to clear pending should have reset the
+* watchdog so we generally should not find it empty
+* here if our CPU was clear. However it could happen
+* due to a rare race with another CPU taking the
+* last CPU out of the mask concurrently.
+*
+* We can't add a warning for it. But just in case
+* there is a problem with the watchdog that is causing
+* the mask to not be reset, try to kick it along here.
+*/
+   if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
+   goto none_pending;
}
return;
}
+
	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+
+   /*
+* Order the store to clear pending with the load(s) to check all
+* words in the pending mask to check they are all empty. This orders
+* with the same barrier on another CPU. This prevents two CPUs
+* clearing the last 2 pending bits, but neither seeing the other's
+* store when checking if the mask is empty, and missing an empty
+* mask, which ends with a false positive.
+*/
+   smp_mb();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
unsigned long flags;
 
+none_pending:
+   /*
+* Double check under lock because more than one CPU could see
+* a clear mask with the lockless check after clearing their
+* pending bits.
+*/
wd_smp_lock();
	if (cpumask_empty(&wd_smp_cpus_pending)) {
wd_smp_last_reset_tb = tb;
@@ -318,8 +353,12 @@ void arch_touch_nmi_watchdog(void)
 {
unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
int cpu = smp_processor_id();
-   u64 tb = get_tb();
+   u64 tb;
 
+   if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
+   return;
+
+   tb = get_tb();
if (tb - per_cpu(wd_timer_tb, cpu) >= ticks) {
per_cpu(wd_timer_tb, cpu) = tb;
wd_smp_clear_cpu_pending(cpu, tb);
-- 
2.23.0



[PATCH v4 0/5] powerpc: watchdog fixes

2021-11-19 Thread Nicholas Piggin
These are some watchdog fixes and improvements, in particular a
deadlock between the wd_smp_lock and console lock when the watchdog
fires, found by Laurent.

Thanks,
Nick

Since v3:
- Rebased on upstream.
- Brought patch 5 into the series.
- Fix bug with SMP watchdog last heartbeat time reporting.

Since v2:
- Fix a false positive warning in patch 1 found by Laurent.
- Move a comment change hunk to the correct patch.
- Drop the patch that removed the unstuck backtrace which is considered
  useful.

Since v1:
- Fixes noticed by Laurent in v1.
- Correct the description of the ABBA deadlock I wrote incorrectly in
  v1.
- Made several other improvements (patches 2,4,5).

Nicholas Piggin (5):
  powerpc/watchdog: Fix missed watchdog reset due to memory ordering
race
  powerpc/watchdog: tighten non-atomic read-modify-write access
  powerpc/watchdog: Avoid holding wd_smp_lock over printk and
smp_send_nmi_ipi
  powerpc/watchdog: read TB close to where it is used
  powerpc/watchdog: help remote CPUs to flush NMI printk output

 arch/powerpc/kernel/watchdog.c | 223 ++---
 1 file changed, 180 insertions(+), 43 deletions(-)

-- 
2.23.0



Re: [PATCH 0/3] KEXEC_SIG with appended signature

2021-11-19 Thread Michal Suchánek
Hello,

On Thu, Nov 18, 2021 at 05:34:01PM -0500, Nayna wrote:
> 
> On 11/16/21 04:53, Michal Suchánek wrote:
> > On Mon, Nov 15, 2021 at 06:53:53PM -0500, Nayna wrote:
> > > On 11/12/21 03:30, Michal Suchánek wrote:
> > > > Hello,
> > > > 
> > > > On Thu, Nov 11, 2021 at 05:26:41PM -0500, Nayna wrote:
> > > > > On 11/8/21 07:05, Michal Suchánek wrote:
> > > > > > Hello,
> > > > > > 
> > > > > > The other part is that distributions apply 'lockdown' patches that 
> > > > > > change
> > > > > > the security policy depending on secure boot status which were 
> > > > > > rejected
> > > > > > by upstream which only hook into the _SIG options, and not into the 
> > > > > > IMA_
> > > > > > options. Of course, I expect this to change when the IMA options are
> > > > > > universally available across architectures and the support picked 
> > > > > > up by
> > > > > > distributions.
> > > > > > 
> > > > > > Which brings the third point: IMA features vary across 
> > > > > > architectures,
> > > > > > and KEXEC_SIG is more common than IMA_KEXEC.
> > > > > > 
> > > > > > config/arm64/default:CONFIG_HAVE_IMA_KEXEC=y
> > > > > > config/ppc64le/default:CONFIG_HAVE_IMA_KEXEC=y
> > > > > > 
> > > > > > config/arm64/default:CONFIG_KEXEC_SIG=y
> > > > > > config/s390x/default:CONFIG_KEXEC_SIG=y
> > > > > > config/x86_64/default:CONFIG_KEXEC_SIG=y
> > > > > > 
> > > > > > KEXEC_SIG makes it much easier to get uniform features across
> > > > > > architectures.
> > > > > Architectures use KEXEC_SIG vs IMA_KEXEC based on their requirement.
> > > > > IMA_KEXEC is for the kernel images signed using sign-file (appended
> > > > > signatures, not PECOFF), provides measurement along with 
> > > > > verification, and
> > > > That's certainly not the case. S390 uses appended signatures with
> > > > KEXEC_SIG, arm64 uses PECOFF with both KEXEC_SIG and IMA_KEXEC.
> > > Yes, S390 uses appended signature, but they also do not support
> > > measurements.
> > > 
> > > On the other hand for arm64/x86, PECOFF works only with KEXEC_SIG. Look at
> > > the KEXEC_IMAGE_VERIFY_SIG config dependencies in arch/arm64/Kconfig and
> > > KEXEC_BZIMAGE_VERIFY_SIG config dependencies in arch/x86/Kconfig. Now, if
> > > KEXEC_SIG is not enabled, then IMA appraisal policies are enforced if 
> > > secure
> > > boot is enabled, refer to security/integrity/ima_efi.c . IMA would fail
> > > verification if kernel is not signed with module sig appended signatures 
> > > or
> > > signature verification fails.
> > > 
> > > In short, IMA is used to enforce the existence of a policy if secure boot 
> > > is
> > > enabled. If they don't support module sig appended signatures, by 
> > > definition
> > > it fails. Thus PECOFF doesn't work with both KEXEC_SIG and IMA_KEXEC, but
> > > only with KEXEC_SIG.
> > Then IMA_KEXEC is a no-go. It is not supported on all architectures and
> > it principially cannot be supported because it does not support PECOFF
> > which is needed to boot the kernel on EFI platforms. To get feature
> > parity across architectures KEXEC_SIG is required.
> 
> I would not say "a no-go", it is based on user requirements.
> 
> The key takeaway from this discussion is that both KEXEC_SIG and IMA_KEXEC
> support functionality with some small degree of overlap, and that
> documenting the differences is needed.  This will help kernel consumers to
> understand the difference and enable the appropriate functionality for their
> environment.

Maybe I was not clear enough. If you happen to focus on an architecture
that supports IMA fully it's great.

My point of view is that of maintaining multiple architectures. Both
end users and people concerned with security are rarely familiar with
architecture specifics. Portability of documentation and debugging
instructions across architectures is a concern.

IMA has a large number of options with varying availability across
architectures for no apparent reason. The situation is complex and hard
to grasp.

In comparison, the *_SIG options are widely available. The missing
support for KEXEC_SIG on POWER is trivial to add by copying from s390.
With that, all the documentation that already exists is also trivially
applicable to POWER. Any additional code cleanup is a bonus, but not
really needed to enable kexec lockdown on POWER.

Thanks

Michal


Re: [PATCH v3 3/4] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi

2021-11-19 Thread Nicholas Piggin
Excerpts from Nicholas Piggin's message of November 10, 2021 12:50 pm:
> @@ -160,11 +187,26 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   goto out;
>   if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
>   goto out;
> - if (cpumask_weight(&wd_smp_cpus_pending) == 0)
> + if (!wd_try_report())
>   goto out;
> + for_each_online_cpu(c) {
> + if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
> + continue;
> + if (c == cpu)
> + continue; // should not happen
> +
> + __cpumask_set_cpu(c, &wd_smp_cpus_ipi);
> + if (set_cpu_stuck(c, tb))
> + break;
> + }
> + if (cpumask_empty(&wd_smp_cpus_ipi)) {
> + wd_end_reporting();
> + goto out;
> + }
> + wd_smp_unlock();
>  
>   pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
> -  cpu, cpumask_pr_args(&wd_smp_cpus_pending));
> +  cpu, cpumask_pr_args(&wd_smp_cpus_ipi));
>   pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
>cpu, tb, wd_smp_last_reset_tb,
>tb_to_ns(tb - wd_smp_last_reset_tb) / 1000000);

Oops, this has a bug: wd_smp_last_reset_tb gets reset above by
set_cpu_stuck when all the stuck CPUs are taken out of the pending
mask, so this prints nonsense last-reset times.

I might just send out an updated series, because the fix has a slight
clash with the next patch. All I do is take a local copy of
wd_smp_last_reset_tb near the start of the function.
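The shape of that fix would be roughly (a sketch of the intent, not
the posted patch):

    u64 tb = get_tb();
    u64 last_reset = wd_smp_last_reset_tb;  /* snapshot before set_cpu_stuck() can reset it */

    /* ... report and mark the stuck CPUs ... */

    pr_emerg("CPU %d TB:%lld, last SMP heartbeat TB:%lld (%lldms ago)\n",
             cpu, tb, last_reset, tb_to_ns(tb - last_reset) / 1000000);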

Thanks,
Nick


Re: [PATCH] powerpc/prom_init: fix the improper check of prom_getprop

2021-11-19 Thread Andy Shevchenko
On Fri, Nov 19, 2021 at 05:12:18PM +0800, Peiwei Hu wrote:
> prom_getprop() can return PROM_ERROR. Binary operator can not identify it.

Fixes: 94d2dde738a5 ("[POWERPC] Efika: prune fixups and make them more carefull")

?

P.S. Try to use my script [1] to send patches, it should be smart enough to not
include my name, for example.

[1]: https://github.com/andy-shev/home-bin-tools/blob/master/ge2maintainer.sh

-- 
With Best Regards,
Andy Shevchenko




[PATCH 2/2] of: reserved_mem: Remove reserved regions count restriction

2021-11-19 Thread Calvin Zhang
Change to allocate reserved_mems dynamically. Static reserved regions
must be reserved before any memblock allocations. The reserved_mems
array couldn't be allocated until memblock and linear mapping are ready.

So move the allocation and initialization of records and reserved memory
from early_init_fdt_scan_reserved_mem() to of_reserved_mem_init().
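The core of the change looks roughly like this (a sketch of the idea;
the exact allocation site, sizing, and error handling in the patch may
differ):

    static struct reserved_mem *reserved_mems;
    static int reserved_mem_count;

    void __init of_reserved_mem_init(void)
    {
            /*
             * memblock is up by now, so the records array can be sized
             * from the number of /reserved-memory nodes seen in the
             * earlier reservation-only scan.
             */
            reserved_mems = memblock_alloc(reserved_mem_count *
                                           sizeof(*reserved_mems),
                                           SMP_CACHE_BYTES);
            if (!reserved_mems)
                    return;
            /* ... fill in each record and run the region init hooks ... */
    }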

Signed-off-by: Calvin Zhang 
---
 arch/arc/mm/init.c |  3 ++
 arch/arm/kernel/setup.c|  2 +
 arch/arm64/kernel/setup.c  |  3 ++
 arch/csky/kernel/setup.c   |  3 ++
 arch/h8300/kernel/setup.c  |  2 +
 arch/mips/kernel/setup.c   |  3 ++
 arch/nds32/kernel/setup.c  |  3 ++
 arch/nios2/kernel/setup.c  |  2 +
 arch/openrisc/kernel/setup.c   |  3 ++
 arch/powerpc/kernel/setup-common.c |  3 ++
 arch/riscv/kernel/setup.c  |  2 +
 arch/sh/kernel/setup.c |  3 ++
 arch/xtensa/kernel/setup.c |  2 +
 drivers/of/fdt.c   |  1 -
 drivers/of/of_reserved_mem.c   | 66 --
 15 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index ce4e939a7f07..a75f0e693f37 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -10,6 +10,7 @@
 #include 
 #endif
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -165,6 +166,8 @@ void __init setup_arch_memory(void)
 
 #endif /* CONFIG_HIGHMEM */
 
+   of_reserved_mem_init();
+
free_area_init(max_zone_pfn);
 }
 
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 284a80c0b6e1..e76737effbf4 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1153,6 +1154,7 @@ void __init setup_arch(char **cmdline_p)
early_ioremap_reset();
 
paging_init(mdesc);
+   of_reserved_mem_init();
kasan_init();
request_standard_resources(mdesc);
 
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index be5f85b0a24d..4624e5193d6e 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -339,6 +340,8 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p)
 
paging_init();
 
+   of_reserved_mem_init();
+
acpi_table_upgrade();
 
/* Parse the ACPI tables for possible boot-time configuration */
diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index c64e7be2045b..40878906644d 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -64,6 +65,8 @@ static void __init csky_memblock_init(void)
 #endif
memblock_set_current_limit(PFN_PHYS(max_low_pfn));
 
+   of_reserved_mem_init();
+
dma_contiguous_reserve(0);
 
free_area_init(max_zone_pfn);
diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c
index 61091a76eb7e..0f0ec72a260e 100644
--- a/arch/h8300/kernel/setup.c
+++ b/arch/h8300/kernel/setup.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -87,6 +88,7 @@ static void __init bootmem_init(void)
 
early_init_fdt_reserve_self();
early_init_fdt_scan_reserved_mem();
+   of_reserved_mem_init();
 
memblock_dump_all();
 }
diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c
index f979adfd4fc2..053a10d80cb9 100644
--- a/arch/mips/kernel/setup.c
+++ b/arch/mips/kernel/setup.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -776,6 +777,8 @@ void __init setup_arch(char **cmdline_p)
cpu_cache_init();
paging_init();
 
+   of_reserved_mem_init();
+
memblock_dump_all();
 }
 
diff --git a/arch/nds32/kernel/setup.c b/arch/nds32/kernel/setup.c
index b3d34d646652..1054804526c5 100644
--- a/arch/nds32/kernel/setup.c
+++ b/arch/nds32/kernel/setup.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -301,6 +302,8 @@ void __init setup_arch(char **cmdline_p)
/* paging_init() sets up the MMU and marks all pages as reserved */
paging_init();
 
+   of_reserved_mem_init();
+
/* invalidate all TLB entries because the new mapping is created */
__nds32__tlbop_flua();
 
diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c
index 40bc8fb75e0b..7e40e90bc3cd 100644
--- a/arch/nios2/kernel/setup.c
+++ b/arch/nios2/kernel/setup.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -173,6 +174,7 @@ void __init setup_arch(char **cmdline_p)
 
early_init_fdt_reserve_self();
early_init_fdt_scan_reserved_mem();
+   of_reserved_mem_init();
 
unflatten_and_copy_device_tree();
 
diff --git 

[PATCH 1/2] of: Sort reserved_mem related code

2021-11-19 Thread Calvin Zhang
Move code about parsing /reserved-memory and initializing of
reserved_mems array to of_reserved_mem.c for better modularity.

Rename array name from reserved_mem to reserved_mems to distinguish
from type definition.

Signed-off-by: Calvin Zhang 
---
 drivers/of/fdt.c| 108 +
 drivers/of/of_private.h |  12 ++-
 drivers/of/of_reserved_mem.c| 163 ++--
 include/linux/of_reserved_mem.h |   4 +
 4 files changed, 149 insertions(+), 138 deletions(-)

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index bdca35284ceb..445af4e69300 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -80,7 +80,7 @@ void __init of_fdt_limit_memory(int limit)
}
 }
 
-static bool of_fdt_device_is_available(const void *blob, unsigned long node)
+bool of_fdt_device_is_available(const void *blob, unsigned long node)
 {
const char *status = fdt_getprop(blob, node, "status", NULL);
 
@@ -476,7 +476,7 @@ void *initial_boot_params __ro_after_init;
 
 static u32 of_fdt_crc32;
 
-static int __init early_init_dt_reserve_memory_arch(phys_addr_t base,
+int __init early_init_dt_reserve_memory_arch(phys_addr_t base,
phys_addr_t size, bool nomap)
 {
if (nomap) {
@@ -492,108 +492,6 @@ static int __init early_init_dt_reserve_memory_arch(phys_addr_t base,
return memblock_reserve(base, size);
 }
 
-/*
- * __reserved_mem_reserve_reg() - reserve all memory described in 'reg' property
- */
-static int __init __reserved_mem_reserve_reg(unsigned long node,
-const char *uname)
-{
-   int t_len = (dt_root_addr_cells + dt_root_size_cells) * sizeof(__be32);
-   phys_addr_t base, size;
-   int len;
-   const __be32 *prop;
-   int first = 1;
-   bool nomap;
-
-   prop = of_get_flat_dt_prop(node, "reg", &len);
-   if (!prop)
-   return -ENOENT;
-
-   if (len && len % t_len != 0) {
-   pr_err("Reserved memory: invalid reg property in '%s', skipping node.\n",
-  uname);
-   return -EINVAL;
-   }
-
-   nomap = of_get_flat_dt_prop(node, "no-map", NULL) != NULL;
-
-   while (len >= t_len) {
-   base = dt_mem_next_cell(dt_root_addr_cells, &prop);
-   size = dt_mem_next_cell(dt_root_size_cells, &prop);
-
-   if (size &&
-   early_init_dt_reserve_memory_arch(base, size, nomap) == 0)
-   pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
-   uname, &base, (unsigned long)(size / SZ_1M));
-   else
-   pr_info("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
-   uname, &base, (unsigned long)(size / SZ_1M));
-
-   len -= t_len;
-   if (first) {
-   fdt_reserved_mem_save_node(node, uname, base, size);
-   first = 0;
-   }
-   }
-   return 0;
-}
-
-/*
- * __reserved_mem_check_root() - check if #size-cells, #address-cells provided
- * in /reserved-memory matches the values supported by the current implementation,
- * also check if ranges property has been provided
- */
-static int __init __reserved_mem_check_root(unsigned long node)
-{
-   const __be32 *prop;
-
-   prop = of_get_flat_dt_prop(node, "#size-cells", NULL);
-   if (!prop || be32_to_cpup(prop) != dt_root_size_cells)
-   return -EINVAL;
-
-   prop = of_get_flat_dt_prop(node, "#address-cells", NULL);
-   if (!prop || be32_to_cpup(prop) != dt_root_addr_cells)
-   return -EINVAL;
-
-   prop = of_get_flat_dt_prop(node, "ranges", NULL);
-   if (!prop)
-   return -EINVAL;
-   return 0;
-}
-
-/*
- * fdt_scan_reserved_mem() - scan a single FDT node for reserved memory
- */
-static int __init fdt_scan_reserved_mem(void)
-{
-   int node, child;
-   const void *fdt = initial_boot_params;
-
-   node = fdt_path_offset(fdt, "/reserved-memory");
-   if (node < 0)
-   return -ENODEV;
-
-   if (__reserved_mem_check_root(node) != 0) {
-   pr_err("Reserved memory: unsupported node format, ignoring\n");
-   return -EINVAL;
-   }
-
-   fdt_for_each_subnode(child, fdt, node) {
-   const char *uname;
-   int err;
-
-   if (!of_fdt_device_is_available(fdt, child))
-   continue;
-
-   uname = fdt_get_name(fdt, child, NULL);
-
-   err = __reserved_mem_reserve_reg(child, uname);
-   if (err == -ENOENT && of_get_flat_dt_prop(child, "size", NULL))
-   fdt_reserved_mem_save_node(child, uname, 0, 0);
-   }
-   return 0;
-}
-
 /*
  * fdt_reserve_elfcorehdr() - reserves memory for elf core header
  *
@@ -642,7 +540,7 @@ 

[PATCH 0/2] of: remove reserved regions count restriction

2021-11-19 Thread Calvin Zhang
The count of reserved regions in /reserved-memory was limited because
the struct reserved_mem array was defined statically. This series sorts
out reserved memory code and allocates that array from early allocator.

Note: reserved regions with a fixed location must be reserved before
any memory allocation, while the struct reserved_mem array can only be
allocated after the allocator is activated. We make
early_init_fdt_scan_reserved_mem() do the reservation only and add
another call to initialize reserved memory, so arch code has to change
for it.

I'm only familiar with arm and arm64 architectures. Approvals from arch
maintainers are required. Thank you all.

Calvin Zhang (2):
  of: Sort reserved_mem related code
  of: reserved_mem: Remove reserved regions count restriction

 arch/arc/mm/init.c |   3 +
 arch/arm/kernel/setup.c|   2 +
 arch/arm64/kernel/setup.c  |   3 +
 arch/csky/kernel/setup.c   |   3 +
 arch/h8300/kernel/setup.c  |   2 +
 arch/mips/kernel/setup.c   |   3 +
 arch/nds32/kernel/setup.c  |   3 +
 arch/nios2/kernel/setup.c  |   2 +
 arch/openrisc/kernel/setup.c   |   3 +
 arch/powerpc/kernel/setup-common.c |   3 +
 arch/riscv/kernel/setup.c  |   2 +
 arch/sh/kernel/setup.c |   3 +
 arch/xtensa/kernel/setup.c |   2 +
 drivers/of/fdt.c   | 107 +---
 drivers/of/of_private.h|  12 +-
 drivers/of/of_reserved_mem.c   | 189 -
 include/linux/of_reserved_mem.h|   4 +
 17 files changed, 207 insertions(+), 139 deletions(-)

-- 
2.30.2



Re: [PATCH v3 1/4] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race

2021-11-19 Thread Laurent Dufour

On 19/11/2021 at 10:05, Nicholas Piggin wrote:

Excerpts from Laurent Dufour's message of November 16, 2021 1:09 am:

On 10/11/2021 at 03:50, Nicholas Piggin wrote:

It is possible for all CPUs to miss the pending cpumask becoming clear,
and then nobody resetting it, which will cause the lockup detector to
stop working. It will eventually expire, but watchdog_smp_panic will
avoid doing anything if the pending mask is clear and it will never be
reset.

Order the cpumask clear vs the subsequent test to close this race.

Add an extra check for an empty pending mask when the watchdog fires and
finds its bit still clear, to try to catch any other possible races or
bugs here and keep the watchdog working. The extra test in
arch_touch_nmi_watchdog is required to prevent the new warning from
firing off.

Debugged-by: Laurent Dufour 
Signed-off-by: Nicholas Piggin 
---
   arch/powerpc/kernel/watchdog.c | 41 +-
   1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index f9ea0e5357f9..3c60872b6a2c 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -135,6 +135,10 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
   {
	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+   /*
+* See wd_smp_clear_cpu_pending()
+*/
+   smp_mb();
if (cpumask_empty(&wd_smp_cpus_pending)) {
wd_smp_last_reset_tb = tb;
cpumask_andnot(&wd_smp_cpus_pending,
@@ -215,13 +219,44 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
   
			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);

wd_smp_unlock();
+   } else {
+   /*
+* The last CPU to clear pending should have reset the
+* watchdog so we generally should not find it empty
+* here if our CPU was clear. However it could happen
+* due to a rare race with another CPU taking the
+* last CPU out of the mask concurrently.
+*
+* We can't add a warning for it. But just in case
+* there is a problem with the watchdog that is causing
+* the mask to not be reset, try to kick it along here.
+*/
+   if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
+   goto none_pending;


If I understand correctly, that branch is a security in case the code is not
working as expected. But I'm really wondering if that's really needed, and we
will end up with a contention on the watchdog lock while this path should be
lockless, and I'd say that in most of the case there is nothing to do after
grabbing that lock. Am I missing something risky here?


I'm thinking it should not hit very much because that first test

 if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {

I think it should not be true too often, it would mean a CPU has taken
two timer interrupts while another one has not taken any, so hopefully
that's pretty rare in normal operation.


Thanks, Nick, for the clarification.

Reviewed-by: Laurent Dufour 


Re: [PATCH v3 08/12] KVM: Propagate vcpu explicitly to mark_page_dirty_in_slot()

2021-11-19 Thread David Woodhouse
On Thu, 2021-11-18 at 19:46 +, Sean Christopherson wrote:
> It is sufficient for the current physical CPU to have an active vCPU, which is
> generally guaranteed in the MMU code because, with a few exceptions, 
> populating
> SPTEs is done in vCPU context.
> 
> mmap() will never directly trigger SPTE creation, KVM first requires a vCPU to
> fault on the new address.  munmap() is a pure zap flow, i.e. won't create a
> present SPTE and trigger the writeback of the dirty bit.

OK, thanks.

> That's also why I dislike using kvm_get_running_vcpu(); when it's needed, 
> there's
> a valid vCPU from the caller, but it deliberately gets dropped and indirectly
> picked back up.

Yeah. So as things stand we have a kvm_write_guest() function which
takes a 'struct kvm *', as well as a kvm_vcpu_write_guest() function
which takes a 'struct kvm_vcpu *'.

But it is verboten to *use* the kvm_write_guest() or mark_page_dirty()
functions unless you actually *do* have an active vCPU. Do so, and the
kernel might just crash; not even a graceful failure mode.

That's a fairly awful bear trap that has now caught me *twice*. I'm
kind of amused that in all my hairy inline asm and pinning and crap for
guest memory access, the thing that's been *broken* is where I just
used the *existing* kvm_write_wall_clock() which does the simple
kvm_write_guest() thing.

I think at the very least perhaps we should do something like this in
mark_page_dirty_in_slot():

 WARN_ON_ONCE(!kvm_get_running_vcpu() || kvm_get_running_vcpu()->kvm != kvm);

(For illustration only; I'd actually use a local vcpu variable *and*
pass that vcpu to kvm_dirty_ring_get())

On propagating the caller's vcpu through and killing off the non-vCPU
versions of the functions, I'm torn... because even if we insist on *a*
vCPU being passed, it might be *the* vCPU, and that's just setting a
more subtle trap (which would have bitten my GPC invalidate code, for
example).

There are other more complex approaches like adding an extra ring, with
spinlocks, for the 'not from a vCPU' cases. But I think that's
overkill.










Re: [PATCH v1] powerpc/watchdog: help remote CPUs to flush NMI printk output

2021-11-19 Thread Nicholas Piggin
Excerpts from Daniel Axtens's message of November 12, 2021 4:08 pm:
> Hi,
> 
>> The printk layer at the moment does not seem to have a good way to force
>> flush printk messages that are created in NMI context, except in the
>> panic path.
>>
>> NMI-context printk messages normally get to the console with irq_work,
>> but that won't help if the CPU is stuck with irqs disabled, as can be
>> the case for hard lockup watchdog messages.
>>
>> The watchdog currently flushes the printk buffers after detecting a
>> lockup on remote CPUs, but they may not have processed their NMI IPI
>> yet by that stage, or they may have self-detected a lockup in which
>> case they won't go via this NMI IPI path.
>>
>> Improve the situation by having NMI-context mark a flag if it called
>> printk, and have watchdog timer interrupts check if that flag was set
>> and try to flush if it was. Latency is not a big problem because we
>> were already stuck for a while, just need to try to make sure the
>> messages eventually make it out.

Sorry just coming back to this now the printk fix was merged upstream.

> Initially I was surprised that this doesn't affect the printk code
> itself, just the powerpc code...

Yeah I'm actually not sure how other watchdogs handle this. If they
use nmi_trigger_cpumask_backtrace() then that does have a flush.
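For reference, the consumer side of the flag described in the patch
would run from the per-CPU watchdog timer interrupt, roughly like this
(a sketch based on the commit message; printk_trigger_flush() is
assumed to be the flush entry point from the printk fix referenced
below):

    /*
     * Any CPU still taking timer interrupts helps flush output that a
     * stuck CPU produced in NMI context. xchg() keeps the clear atomic
     * so each round of output is flushed once.
     */
    if (__wd_nmi_output && xchg(&__wd_nmi_output, 0))
            printk_trigger_flush();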

>> Cc: Laurent Dufour 
>> Signed-off-by: Nicholas Piggin 
>> ---
>> This patch is actually based on top of this one which is planned to go
>> upstream in rc1/2. https://marc.info/?l=linux-kernel&m=163626070312052&w=2
>>
>> Prior to commit 93d102f094be that is fixed by the above, we had a printk
>> flush function with a different name but basically does the same job, so
>> this patch can be backported, just needs some care. I'm posting it for
>> review now for feedback so it's ready to go when the printk patch is
>> upstream.
>>
>> Thanks,
>> Nick
>>
>>  arch/powerpc/kernel/watchdog.c | 29 +++--
>>  1 file changed, 23 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
>> index b6533539386b..a7b6b0691203 100644
>> --- a/arch/powerpc/kernel/watchdog.c
>> +++ b/arch/powerpc/kernel/watchdog.c
>> @@ -86,6 +86,7 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
>>  /* SMP checker bits */
>>  static unsigned long __wd_smp_lock;
>>  static unsigned long __wd_reporting;
>> +static unsigned long __wd_nmi_output;
>>  static cpumask_t wd_smp_cpus_pending;
>>  static cpumask_t wd_smp_cpus_stuck;
>>  static u64 wd_smp_last_reset_tb;
>> @@ -154,6 +155,18 @@ static void wd_lockup_ipi(struct pt_regs *regs)
>>  else
>>  dump_stack();
>>  
>> +/*
>> + * We printk()ed from NMI context, the contents may not get flushed
>> + * if we return to a context with interrupts disabled because
>> + * printk uses irq_work to schedule flushes of NMI output.
>> + * __wd_nmi_output says there has been output from NMI context, so
>> + * other CPUs are recruited to help flush it.
>> + *
>> + * xchg is not needed here (it could be a simple atomic store), but
>> + * it gives the memory ordering and atomicity required.
>> + */
>> +xchg(&__wd_nmi_output, 1);
>> +
>>  /* Do not panic from here because that can recurse into NMI IPI layer */
>>  }
> 
> I think, looking at this and the other site where __wd_nmi_output is
> set, that this works because you set the flag only when you are done
> printing from the non-panic lockup context on this CPU. I was initially
> worried that you set this flag part way through printing, and then it
> might get cleared by another CPU while you're still trying to print.
> However, in this function it's right at the end - there's nothing else
> left to do, and ...
> 
>>  DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>> @@ -386,6 +401,8 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>>  print_irqtrace_events(current);
>>  show_regs(regs);
>>  
>> +xchg(&__wd_nmi_output, 1); // see wd_lockup_ipi
>> +
>>  if (sysctl_hardlockup_all_cpu_backtrace)
>>  trigger_allbutself_cpu_backtrace();
> 
> in this one, the only things that can happen afterwards are
>  - a panic, which does its own flushing, and
> 
> - trigger_allbutself_cpu_backtrace(), which seems to just send IPIs, not
>  do any printing of its own.

Yeah, on powerpc that actually ends up not using NMI IPIs: they are
dangerous, and core code uses them for less severe things than a hard
lockup, so we disabled that. If you can take a regular IPI then you can
run irq work when it returns and print things. If we changed it to use
NMI IPIs I guess we might want to do something, maybe.

> All of which is fine, but I wonder if we need a more descriptive
> variable name or if the comment needs to specify that the flag should
> only be set at certain times.

I did try to add some explanation in that comment. I'll see if that can 
be made a bit more 

[PATCH] w1: Misuse of get_user()/put_user() reported by sparse

2021-11-19 Thread Christophe Leroy
sparse warnings: (new ones prefixed by >>)
>> drivers/w1/slaves/w1_ds28e04.c:342:13: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected char [noderef] __user *_pu_addr @@ got char *buf @@
   drivers/w1/slaves/w1_ds28e04.c:342:13: sparse: expected char [noderef] __user *_pu_addr
   drivers/w1/slaves/w1_ds28e04.c:342:13: sparse: got char *buf
>> drivers/w1/slaves/w1_ds28e04.c:356:13: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected char const [noderef] __user *_gu_addr @@ got char const *buf @@
   drivers/w1/slaves/w1_ds28e04.c:356:13: sparse: expected char const [noderef] __user *_gu_addr
   drivers/w1/slaves/w1_ds28e04.c:356:13: sparse: got char const *buf

The buffer buf is a failsafe buffer in kernel space, it's not user
memory hence doesn't deserve the use of get_user() or put_user().

Access 'buf' content directly.

Reported-by: kernel test robot 
Link: https://lore.kernel.org/lkml/20290526.k5vb7nwc-...@intel.com/T/
Signed-off-by: Christophe Leroy 
---
 drivers/w1/slaves/w1_ds28e04.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/w1/slaves/w1_ds28e04.c b/drivers/w1/slaves/w1_ds28e04.c
index e4f336111edc..d75bb16fb7a1 100644
--- a/drivers/w1/slaves/w1_ds28e04.c
+++ b/drivers/w1/slaves/w1_ds28e04.c
@@ -339,10 +339,7 @@ static BIN_ATTR_RW(pio, 1);
 static ssize_t crccheck_show(struct device *dev, struct device_attribute *attr,
 char *buf)
 {
-   if (put_user(w1_enable_crccheck + 0x30, buf))
-   return -EFAULT;
-
-   return sizeof(w1_enable_crccheck);
+   return sprintf(buf, "%d", w1_enable_crccheck);
 }
 
 static ssize_t crccheck_store(struct device *dev, struct device_attribute *attr,
@@ -353,11 +350,8 @@ static ssize_t crccheck_store(struct device *dev, struct device_attribute *attr,
if (count != 1 || !buf)
return -EINVAL;
 
-   if (get_user(val, buf))
-   return -EFAULT;
-
/* convert to decimal */
-   val = val - 0x30;
+   val = *buf - 0x30;
if (val != 0 && val != 1)
return -EINVAL;
 
-- 
2.33.1



Re: [PATCH v3 1/4] powerpc/watchdog: Fix missed watchdog reset due to memory ordering race

2021-11-19 Thread Nicholas Piggin
Excerpts from Laurent Dufour's message of November 16, 2021 1:09 am:
> On 10/11/2021 at 03:50, Nicholas Piggin wrote:
>> It is possible for all CPUs to miss the pending cpumask becoming clear,
>> and then nobody resetting it, which will cause the lockup detector to
>> stop working. It will eventually expire, but watchdog_smp_panic will
>> avoid doing anything if the pending mask is clear and it will never be
>> reset.
>> 
>> Order the cpumask clear vs the subsequent test to close this race.
>> 
>> Add an extra check for an empty pending mask when the watchdog fires and
>> finds its bit still clear, to try to catch any other possible races or
>> bugs here and keep the watchdog working. The extra test in
>> arch_touch_nmi_watchdog is required to prevent the new warning from
>> firing off.
>> 
>> Debugged-by: Laurent Dufour 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>   arch/powerpc/kernel/watchdog.c | 41 +-
>>   1 file changed, 40 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
>> index f9ea0e5357f9..3c60872b6a2c 100644
>> --- a/arch/powerpc/kernel/watchdog.c
>> +++ b/arch/powerpc/kernel/watchdog.c
>> @@ -135,6 +135,10 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
>>   {
>>  cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
>>  cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
>> +/*
>> + * See wd_smp_clear_cpu_pending()
>> + */
>> +smp_mb();
>>  if (cpumask_empty(&wd_smp_cpus_pending)) {
>>  wd_smp_last_reset_tb = tb;
>>  cpumask_andnot(&wd_smp_cpus_pending,
>> @@ -215,13 +219,44 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>>   
>>  cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
>>  wd_smp_unlock();
>> +} else {
>> +/*
>> + * The last CPU to clear pending should have reset the
>> + * watchdog so we generally should not find it empty
>> + * here if our CPU was clear. However it could happen
>> + * due to a rare race with another CPU taking the
>> + * last CPU out of the mask concurrently.
>> + *
>> + * We can't add a warning for it. But just in case
>> + * there is a problem with the watchdog that is causing
>> + * the mask to not be reset, try to kick it along here.
>> + */
>> +if (unlikely(cpumask_empty(&wd_smp_cpus_pending)))
>> +goto none_pending;
> 
> If I understand correctly, that branch is a security in case the code is not 
> working as expected. But I'm really wondering if that's really needed, and we 
> will end up with a contention on the watchdog lock while this path should be 
> lockless, and I'd say that in most of the case there is nothing to do after 
> grabbing that lock. Am I missing something risky here?

I'm thinking it should not hit very much because that first test

if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {

I think it should not be true too often, it would mean a CPU has taken 
two timer interrupts while another one has not taken any, so hopefully
that's pretty rare in normal operation.

Thanks,
Nick


Re: [PATCH] powerpc/signal32: Use struct_group() to zero spe regs

2021-11-19 Thread LEROY Christophe


On 18/11/2021 at 21:36, Kees Cook wrote:
> In preparation for FORTIFY_SOURCE performing compile-time and run-time
> field bounds checking for memset(), avoid intentionally writing across
> neighboring fields.
> 
> Add a struct_group() for the spe registers so that memset() can correctly 
> reason
> about the size:
> 
> In function 'fortify_memset_chk',
> inlined from 'restore_user_regs.part.0' at arch/powerpc/kernel/signal_32.c:539:3:
> >> include/linux/fortify-string.h:195:4: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning]
>   195 |__write_overflow_field();
>   |^~~~
> 
> Reported-by: kernel test robot 
> Signed-off-by: Kees Cook 

Reviewed-by: Christophe Leroy 

However, is it really worth adding that grouping? Wouldn't it be
cleaner to handle evr[] and acc separately? Now that we are using the
unsafe variants of get/put_user, performance wouldn't be impacted.
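For comparison, handling them separately would make the zeroing hunk
look something like this (hypothetical alternative, not a submitted
patch):

    } else if (current->thread.used_spe) {
            memset(current->thread.evr, 0, sizeof(current->thread.evr));
            current->thread.acc = 0;
    }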

I have some doubts about things like:

unsafe_copy_to_user(&frame->mc_vregs, current->thread.evr,
ELF_NEVRREG * sizeof(u32), failed);

Because as far as I can see, ELF_NEVRREG is 34 but mc_vregs is a table 
of 33 u32 and is at the end of the structure:

struct mcontext {
elf_gregset_t   mc_gregs;
elf_fpregset_t  mc_fregs;
unsigned long   mc_pad[2];
elf_vrregset_t  mc_vregs __attribute__((__aligned__(16)));
};

typedef elf_vrreg_t elf_vrregset_t[ELF_NVRREG];

# define ELF_NEVRREG34  /* includes acc (as 2) */
# define ELF_NVRREG 33  /* includes vscr */



> ---
>   arch/powerpc/include/asm/processor.h |  6 --
>   arch/powerpc/kernel/signal_32.c  | 14 +-
>   2 files changed, 13 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
> index e39bd0ff69f3..978a80308466 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -191,8 +191,10 @@ struct thread_struct {
>   int used_vsr;   /* set if process has used VSX */
>   #endif /* CONFIG_VSX */
>   #ifdef CONFIG_SPE
> - unsigned long   evr[32];/* upper 32-bits of SPE regs */
> - u64 acc;/* Accumulator */
> + struct_group(spe,
> + unsigned long   evr[32];/* upper 32-bits of SPE regs */
> + u64 acc;/* Accumulator */
> + );
>   unsigned long   spefscr;/* SPE & eFP status */
>   unsigned long   spefscr_last;   /* SPEFSCR value on last prctl
>  call or trap return */
> diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
> index 00a9c9cd6d42..5e1664b501e4 100644
> --- a/arch/powerpc/kernel/signal_32.c
> +++ b/arch/powerpc/kernel/signal_32.c
> @@ -527,16 +527,20 @@ static long restore_user_regs(struct pt_regs *regs,
>   regs_set_return_msr(regs, regs->msr & ~(MSR_FP | MSR_FE0 | MSR_FE1));
>   
>   #ifdef CONFIG_SPE
> - /* force the process to reload the spe registers from
> -current->thread when it next does spe instructions */
> + /*
> +  * Force the process to reload the spe registers from
> +  * current->thread when it next does spe instructions.
> +  * Since this is user ABI, we must enforce the sizing.
> +  */
> + BUILD_BUG_ON(sizeof(current->thread.spe) != ELF_NEVRREG * sizeof(u32));
>   regs_set_return_msr(regs, regs->msr & ~MSR_SPE);
>   if (msr & MSR_SPE) {
>   /* restore spe registers from the stack */
> - unsafe_copy_from_user(current->thread.evr, &sr->mc_vregs,
> -   ELF_NEVRREG * sizeof(u32), failed);
> + unsafe_copy_from_user(&current->thread.spe, &sr->mc_vregs,
> +   sizeof(current->thread.spe), failed);
>   current->thread.used_spe = true;
>   } else if (current->thread.used_spe)
> - memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
> + memset(&current->thread.spe, 0, sizeof(current->thread.spe));
>   
>   /* Always get SPEFSCR back */
> unsafe_get_user(current->thread.spefscr, (u32 __user *)&sr->mc_vregs + ELF_NEVRREG, failed);
>