Re: [PATCH 02/13] nvme-multipath: add error handling support for add_disk()

2021-10-15 Thread Christoph Hellwig
Thanks,

applied to nvme-5.16.


Re: [PATCH 06/13] nvdimm/blk: avoid calling del_gendisk() on early failures

2021-10-15 Thread Dan Williams
On Fri, Oct 15, 2021 at 4:53 PM Luis Chamberlain  wrote:
>
> If nd_integrity_init() fails we'd get del_gendisk() called,
> but that's not correct as we should only call that if we're
> done with device_add_disk(). Fix this by providing unwinding
> prior to the devm call being registered and moving the devm
> registration to the very end.
>
> This should fix calling del_gendisk() if nd_integrity_init()
> fails. I only spotted this issue through code inspection. It
> does not fix any real world bug.
>

Just fyi, I'm preparing patches to delete this driver completely as it
is unused by any shipping platform. I hope to get that removal into
v5.16.


Re: [PATCH 02/13] nvme-multipath: add error handling support for add_disk()

2021-10-15 Thread Keith Busch
On Fri, Oct 15, 2021 at 04:52:08PM -0700, Luis Chamberlain wrote:
> We never checked for errors on add_disk() as this function
> returned void. Now that this is fixed, use the shiny new
> error handling.
> 
> Since we now can tell for sure when a disk was added, move
> setting the bit NVME_NSHEAD_DISK_LIVE only when we did
> add the disk successfully.
> 
> Nothing to do here as the cleanup is done elsewhere. We take
> care and use test_and_set_bit() because it is protects against
> two nvme paths simultaneously calling device_add_disk() on the
> same namespace head.

Looks good, thank you.

Reviewed-by: Keith Busch 


[PATCH 08/13] zram: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Signed-off-by: Luis Chamberlain 
---
 drivers/block/zram/zram_drv.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 0a9309a2ef54..bdbded107e6b 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1983,7 +1983,9 @@ static int zram_add(void)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
 
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
-   device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
+   ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
+   if (ret)
+   goto out_cleanup_disk;
 
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
 
@@ -1991,6 +1993,8 @@ static int zram_add(void)
pr_info("Added device: %s\n", zram->disk->disk_name);
return device_id;
 
+out_cleanup_disk:
+   blk_cleanup_disk(zram->disk);
 out_free_idr:
idr_remove(_index_idr, device_id);
 out_free_dev:
-- 
2.30.2



[PATCH 11/13] ps3vram: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Signed-off-by: Luis Chamberlain 
---
 drivers/block/ps3vram.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index c7b19e128b03..af2a0d09c598 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -755,9 +755,14 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
dev_info(>core, "%s: Using %llu MiB of GPU memory\n",
 gendisk->disk_name, get_capacity(gendisk) >> 11);
 
-   device_add_disk(>core, gendisk, NULL);
+   error = device_add_disk(>core, gendisk, NULL);
+   if (error)
+   goto out_cleanup_disk;
+
return 0;
 
+out_cleanup_disk:
+   blk_cleanup_disk(gendisk);
 out_cache_cleanup:
remove_proc_entry(DEVICE_NAME, NULL);
ps3vram_cache_cleanup(dev);
-- 
2.30.2



[PATCH 05/13] nvdimm/btt: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Signed-off-by: Luis Chamberlain 
---
 drivers/nvdimm/btt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 23ee8c005db5..57b921c5fbb5 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1542,7 +1542,9 @@ static int btt_blk_init(struct btt *btt)
}
 
set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
-   device_add_disk(>nd_btt->dev, btt->btt_disk, NULL);
+   rc = device_add_disk(>nd_btt->dev, btt->btt_disk, NULL);
+   if (rc)
+   goto out_cleanup_disk;
 
btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
nvdimm_check_and_set_ro(btt->btt_disk);
-- 
2.30.2



[PATCH 13/13] mtd/ubi/block: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Signed-off-by: Luis Chamberlain 
---
 drivers/mtd/ubi/block.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index e003b4b44ffa..062e6c2c45f5 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -447,12 +447,18 @@ int ubiblock_create(struct ubi_volume_info *vi)
list_add_tail(>list, _devices);
 
/* Must be the last step: anyone can call file ops from now on */
-   add_disk(dev->gd);
+   ret = add_disk(dev->gd);
+   if (ret)
+   goto out_destroy_wq;
+
dev_info(disk_to_dev(dev->gd), "created from ubi%d:%d(%s)",
 dev->ubi_num, dev->vol_id, vi->name);
mutex_unlock(_mutex);
return 0;
 
+out_destroy_wq:
+   list_del(>list);
+   destroy_workqueue(dev->wq);
 out_remove_minor:
idr_remove(_minor_idr, gd->first_minor);
 out_cleanup_disk:
-- 
2.30.2



[PATCH 09/13] z2ram: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling. Only the disk is cleaned up inside
z2ram_register_disk() as the caller deals with the rest.

Signed-off-by: Luis Chamberlain 
---
 drivers/block/z2ram.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 4eef218108c6..ccc52c935faf 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -318,6 +318,7 @@ static const struct blk_mq_ops z2_mq_ops = {
 static int z2ram_register_disk(int minor)
 {
struct gendisk *disk;
+   int err;
 
disk = blk_mq_alloc_disk(_set, NULL);
if (IS_ERR(disk))
@@ -333,8 +334,10 @@ static int z2ram_register_disk(int minor)
sprintf(disk->disk_name, "z2ram");
 
z2ram_gendisk[minor] = disk;
-   add_disk(disk);
-   return 0;
+   err = add_disk(disk);
+   if (err)
+   blk_cleanup_disk(disk);
+   return err;
 }
 
 static int __init z2_init(void)
-- 
2.30.2



[PATCH 06/13] nvdimm/blk: avoid calling del_gendisk() on early failures

2021-10-15 Thread Luis Chamberlain
If nd_integrity_init() fails we'd get del_gendisk() called,
but that's not correct as we should only call that if we're
done with device_add_disk(). Fix this by providing unwinding
prior to the devm call being registered and moving the devm
registration to the very end.

This should fix calling del_gendisk() if nd_integrity_init()
fails. I only spotted this issue through code inspection. It
does not fix any real world bug.

Signed-off-by: Luis Chamberlain 
---
 drivers/nvdimm/blk.c | 17 -
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 088d3dd6f6fa..591fa1f86f1e 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -240,6 +240,7 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
resource_size_t available_disk_size;
struct gendisk *disk;
u64 internal_nlba;
+   int rc;
 
internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk));
available_disk_size = internal_nlba * nsblk_sector_size(nsblk);
@@ -256,20 +257,26 @@ static int nsblk_attach_disk(struct nd_namespace_blk 
*nsblk)
blk_queue_logical_block_size(disk->queue, nsblk_sector_size(nsblk));
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
 
-   if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
-   return -ENOMEM;
-
if (nsblk_meta_size(nsblk)) {
-   int rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
+   rc = nd_integrity_init(disk, nsblk_meta_size(nsblk));
 
if (rc)
-   return rc;
+   goto out_before_devm_err;
}
 
set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
device_add_disk(dev, disk, NULL);
+
+   /* nd_blk_release_disk() is called if this fails */
+   if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
+   return -ENOMEM;
+
nvdimm_check_and_set_ro(disk);
return 0;
+
+out_before_devm_err:
+   blk_cleanup_disk(disk);
+   return rc;
 }
 
 static int nd_blk_probe(struct device *dev)
-- 
2.30.2



[PATCH 10/13] ps3disk: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Signed-off-by: Luis Chamberlain 
---
 drivers/block/ps3disk.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 8d51efbe045d..3054adf77460 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -467,9 +467,13 @@ static int ps3disk_probe(struct ps3_system_bus_device 
*_dev)
 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
 get_capacity(gendisk) >> 11);
 
-   device_add_disk(>sbd.core, gendisk, NULL);
-   return 0;
+   error = device_add_disk(>sbd.core, gendisk, NULL);
+   if (error)
+   goto fail_cleanup_disk;
 
+   return 0;
+fail_cleanup_disk:
+   blk_cleanup_disk(gendisk);
 fail_free_tag_set:
blk_mq_free_tag_set(>tag_set);
 fail_teardown:
-- 
2.30.2



[PATCH 03/13] nvdimm/btt: do not call del_gendisk() if not needed

2021-10-15 Thread Luis Chamberlain
We know we don't need del_gendisk() if we haven't added
the disk, so just skip it. This should fix a bug on older
kernels, as del_gendisk() became able to deal with
disks not added only recently, after the patch titled
"block: add flag for add_disk() completion notation".

Signed-off-by: Luis Chamberlain 
---
 drivers/nvdimm/btt.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 52de60b7adee..29cc7325e890 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1538,7 +1538,6 @@ static int btt_blk_init(struct btt *btt)
int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
 
if (rc) {
-   del_gendisk(btt->btt_disk);
blk_cleanup_disk(btt->btt_disk);
return rc;
}
-- 
2.30.2



[PATCH 02/13] nvme-multipath: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Since we now can tell for sure when a disk was added, move
setting the bit NVME_NSHEAD_DISK_LIVE only when we did
add the disk successfully.

Nothing to do here as the cleanup is done elsewhere. We take
care and use test_and_set_bit() because it protects against
two nvme paths simultaneously calling device_add_disk() on the
same namespace head.

Signed-off-by: Luis Chamberlain 
---
 drivers/nvme/host/multipath.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e8ccdd398f78..022837f7be41 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -496,13 +496,23 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct 
nvme_ns_head *head)
 static void nvme_mpath_set_live(struct nvme_ns *ns)
 {
struct nvme_ns_head *head = ns->head;
+   int rc;
 
if (!head->disk)
return;
 
+   /*
+* test_and_set_bit() is used because it is protecting against two nvme
+* paths simultaneously calling device_add_disk() on the same namespace
+* head.
+*/
if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, >flags)) {
-   device_add_disk(>subsys->dev, head->disk,
-   nvme_ns_id_attr_groups);
+   rc = device_add_disk(>subsys->dev, head->disk,
+nvme_ns_id_attr_groups);
+   if (rc) {
+   clear_bit(NVME_NSHEAD_DISK_LIVE, >flags);
+   return;
+   }
nvme_add_ns_head_cdev(head);
}
 
-- 
2.30.2



[PATCH 00/13] block: add_disk() error handling stragglers

2021-10-15 Thread Luis Chamberlain
This patch set consists of all the straggler drivers for which we
have no patch reviews done yet. I'd like to ask folks to please
consider chiming in, especially if you're the maintainer for the driver.
Additionally, if you can specify whether you'll take the patch in yourself or
if you want Jens to take it, that'd be great too.

Luis Chamberlain (13):
  block/brd: add error handling support for add_disk()
  nvme-multipath: add error handling support for add_disk()
  nvdimm/btt: do not call del_gendisk() if not needed
  nvdimm/btt: use goto error labels on btt_blk_init()
  nvdimm/btt: add error handling support for add_disk()
  nvdimm/blk: avoid calling del_gendisk() on early failures
  nvdimm/blk: add error handling support for add_disk()
  zram: add error handling support for add_disk()
  z2ram: add error handling support for add_disk()
  ps3disk: add error handling support for add_disk()
  ps3vram: add error handling support for add_disk()
  block/sunvdc: add error handling support for add_disk()
  mtd/ubi/block: add error handling support for add_disk()

 drivers/block/brd.c   |  9 +++--
 drivers/block/ps3disk.c   |  8 ++--
 drivers/block/ps3vram.c   |  7 ++-
 drivers/block/sunvdc.c| 14 +++---
 drivers/block/z2ram.c |  7 +--
 drivers/block/zram/zram_drv.c |  6 +-
 drivers/mtd/ubi/block.c   |  8 +++-
 drivers/nvdimm/blk.c  | 21 +++--
 drivers/nvdimm/btt.c  | 24 +++-
 drivers/nvme/host/multipath.c | 14 --
 10 files changed, 89 insertions(+), 29 deletions(-)

-- 
2.30.2



[PATCH 07/13] nvdimm/blk: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Since nvdimm/blk uses devm we just need to move the devm
registration towards the end. And in hindsight, that seems
to also provide a fix given del_gendisk() should not be
called unless the disk was already added via add_disk().
The probability of that issue happening is low though, like
OOM while calling devm_add_action(), so the fix is minor.

We manually unwind in case of add_disk() failure prior
to the devm registration.

Signed-off-by: Luis Chamberlain 
---
 drivers/nvdimm/blk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index 591fa1f86f1e..9f1eb41404ac 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -265,7 +265,9 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk)
}
 
set_capacity(disk, available_disk_size >> SECTOR_SHIFT);
-   device_add_disk(dev, disk, NULL);
+   rc = device_add_disk(dev, disk, NULL);
+   if (rc)
+   goto out_before_devm_err;
 
/* nd_blk_release_disk() is called if this fails */
if (devm_add_action_or_reset(dev, nd_blk_release_disk, disk))
-- 
2.30.2



[PATCH 01/13] block/brd: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

Signed-off-by: Luis Chamberlain 
---
 drivers/block/brd.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 530b31240203..6065f493876f 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -372,6 +372,7 @@ static int brd_alloc(int i)
struct brd_device *brd;
struct gendisk *disk;
char buf[DISK_NAME_LEN];
+   int err = -ENOMEM;
 
mutex_lock(_devices_mutex);
list_for_each_entry(brd, _devices, brd_list) {
@@ -422,16 +423,20 @@ static int brd_alloc(int i)
/* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
-   add_disk(disk);
+   err = add_disk(disk);
+   if (err)
+   goto out_cleanup_disk;
 
return 0;
 
+out_cleanup_disk:
+   blk_cleanup_disk(disk);
 out_free_dev:
mutex_lock(_devices_mutex);
list_del(>brd_list);
mutex_unlock(_devices_mutex);
kfree(brd);
-   return -ENOMEM;
+   return err;
 }
 
 static void brd_probe(dev_t dev)
-- 
2.30.2



[PATCH 04/13] nvdimm/btt: use goto error labels on btt_blk_init()

2021-10-15 Thread Luis Chamberlain
This will make it easier to share common error paths.

Signed-off-by: Luis Chamberlain 
---
 drivers/nvdimm/btt.c | 19 ---
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index 29cc7325e890..23ee8c005db5 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1520,10 +1520,11 @@ static int btt_blk_init(struct btt *btt)
 {
struct nd_btt *nd_btt = btt->nd_btt;
struct nd_namespace_common *ndns = nd_btt->ndns;
+   int rc = -ENOMEM;
 
btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE);
if (!btt->btt_disk)
-   return -ENOMEM;
+   goto out;
 
nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
btt->btt_disk->first_minor = 0;
@@ -1535,19 +1536,23 @@ static int btt_blk_init(struct btt *btt)
blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue);
 
if (btt_meta_size(btt)) {
-   int rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
-
-   if (rc) {
-   blk_cleanup_disk(btt->btt_disk);
-   return rc;
-   }
+   rc = nd_integrity_init(btt->btt_disk, btt_meta_size(btt));
+   if (rc)
+   goto out_cleanup_disk;
}
+
set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
device_add_disk(>nd_btt->dev, btt->btt_disk, NULL);
+
btt->nd_btt->size = btt->nlba * (u64)btt->sector_size;
nvdimm_check_and_set_ro(btt->btt_disk);
 
return 0;
+
+out_cleanup_disk:
+   blk_cleanup_disk(btt->btt_disk);
+out:
+   return rc;
 }
 
 static void btt_blk_cleanup(struct btt *btt)
-- 
2.30.2



[PATCH 12/13] block/sunvdc: add error handling support for add_disk()

2021-10-15 Thread Luis Chamberlain
We never checked for errors on add_disk() as this function
returned void. Now that this is fixed, use the shiny new
error handling.

We re-use the same free tag call, so we also add a label for
that as well.

Signed-off-by: Luis Chamberlain 
---
 drivers/block/sunvdc.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 4d4bb810c2ae..6f45a53f7cbf 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -826,8 +826,8 @@ static int probe_disk(struct vdc_port *port)
if (IS_ERR(g)) {
printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
   port->vio.name);
-   blk_mq_free_tag_set(>tag_set);
-   return PTR_ERR(g);
+   err = PTR_ERR(g);
+   goto out_free_tag;
}
 
port->disk = g;
@@ -879,9 +879,17 @@ static int probe_disk(struct vdc_port *port)
   port->vdisk_size, (port->vdisk_size >> (20 - 9)),
   port->vio.ver.major, port->vio.ver.minor);
 
-   device_add_disk(>vio.vdev->dev, g, NULL);
+   err = device_add_disk(>vio.vdev->dev, g, NULL);
+   if (err)
+   goto out_cleanup_disk;
 
return 0;
+
+out_cleanup_disk:
+   blk_cleanup_disk(g);
+out_free_tag:
+   blk_mq_free_tag_set(>tag_set);
+   return err;
 }
 
 static struct ldc_channel_config vdc_ldc_cfg = {
-- 
2.30.2



Re: [PATCH v6 00/11] PCI: Drop duplicated tracking of a pci_dev's bound driver

2021-10-15 Thread Andy Shevchenko
On Fri, Oct 15, 2021 at 7:46 PM Bjorn Helgaas  wrote:
> On Wed, Oct 13, 2021 at 04:23:09PM +0300, Andy Shevchenko wrote:

...

> so compared to Uwe's v6, I restored that section to the original code.
> My goal here was to make the patch as simple and easy to review as
> possible.

Thanks for elaboration. I have got it.

...

> You're right, this didn't make much sense in that patch.  I moved the
> line join to the previous patch, which unindented this section and
> made space for this to fit on one line.  Here's the revised commit:
>
>   
> https://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=34ab316d7287

Side remark: default without break or return is error prone (okay, to
some extent). Perhaps adding the return statement there will make
things robust and clean.

-- 
With Best Regards,
Andy Shevchenko


Re: [PATCH v6 00/11] PCI: Drop duplicated tracking of a pci_dev's bound driver

2021-10-15 Thread Bjorn Helgaas
On Wed, Oct 13, 2021 at 04:23:09PM +0300, Andy Shevchenko wrote:
> On Wed, Oct 13, 2021 at 06:33:56AM -0500, Bjorn Helgaas wrote:
> > On Wed, Oct 13, 2021 at 12:26:42PM +0300, Andy Shevchenko wrote:
> > > On Wed, Oct 13, 2021 at 2:33 AM Bjorn Helgaas  wrote:
> > > > On Mon, Oct 04, 2021 at 02:59:24PM +0200, Uwe Kleine-König wrote:

> > > > +   return drv && drv->resume ?
> > > > +   drv->resume(pci_dev) : 
> > > > pci_pm_reenable_device(pci_dev);
> > > 
> > > One line?
> > 
> > I don't think I touched that line.
> 
> Then why they are both in + section?

They're both in the + section of the interdiff because Uwe's v6 patch
looks like this:

  static int pci_legacy_resume(struct device *dev)
  {
  struct pci_dev *pci_dev = to_pci_dev(dev);
  -   return drv && drv->resume ?
  -   drv->resume(pci_dev) : 
pci_pm_reenable_device(pci_dev);
  +   if (pci_dev->dev.driver) {
  +   struct pci_driver *drv = to_pci_driver(pci_dev->dev.driver);
  +
  +   if (drv->resume)
  +   return drv->resume(pci_dev);
  +   }
  +
  +   return pci_pm_reenable_device(pci_dev);

and my revision looks like this:

   static int pci_legacy_resume(struct device *dev)
   {
  struct pci_dev *pci_dev = to_pci_dev(dev);
  -   struct pci_driver *drv = pci_dev->driver;
  +   struct pci_driver *drv = to_pci_driver(dev->driver);

so compared to Uwe's v6, I restored that section to the original code.
My goal here was to make the patch as simple and easy to review as
possible.

> > > > +   struct pci_driver *drv = to_pci_driver(dev->dev.driver);
> > > > const struct pci_error_handlers *err_handler =
> > > > -   dev->dev.driver ? 
> > > > to_pci_driver(dev->dev.driver)->err_handler : NULL;
> > > > +   drv ? drv->err_handler : NULL;
> > > 
> > > Isn't dev->driver == to_pci_driver(dev->dev.driver)?
> > 
> > Yes, I think so, but not sure what you're getting at here, can you
> > elaborate?
> 
> Getting pointer from another pointer seems waste of resources, why we
> can't simply
> 
>   struct pci_driver *drv = dev->driver;

I think this is in pci_dev_save_and_disable(), and "dev" here is a
struct pci_dev *.  We're removing the dev->driver member.  Let me know
if I'm still missing something.

> > > > -   "bad request in aer recovery "
> > > > -   "operation!\n");
> > > > +   "bad request in AER recovery 
> > > > operation!\n");

> > > Stray change? Or is it in a separate patch in your tree?
> > 
> > Could be skipped.  The string now fits on one line so I combined it to
> > make it more greppable.
> 
> This is inconsistency in your changes, in one case you are objecting of
> doing something close to the changed lines, in the other you are doing
> unrelated change.

You're right, this didn't make much sense in that patch.  I moved the
line join to the previous patch, which unindented this section and
made space for this to fit on one line.  Here's the revised commit:

  
https://git.kernel.org/cgit/linux/kernel/git/helgaas/pci.git/commit/?id=34ab316d7287



RE: [PATCH] soc: fsl: dpio: protect smp_processor_id when get processor id

2021-10-15 Thread Li, Meng


> -Original Message-
> From: Robin Murphy 
> Sent: Friday, October 15, 2021 9:40 PM
> To: Li, Meng ; roy.ple...@nxp.com;
> leoyang...@nxp.com; ruxandra.radule...@nxp.com; horia.gea...@nxp.com
> Cc: linux-ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-arm-
> ker...@lists.infradead.org
> Subject: Re: [PATCH] soc: fsl: dpio: protect smp_processor_id when get
> processor id
> 
> [Please note: This e-mail is from an EXTERNAL e-mail address]
> 
> On 2021-10-15 07:36, meng...@windriver.com wrote:
> > From: Meng Li 
> >
> > When enable debug kernel configs,there will be calltrace as below:
> >
> > BUG: using smp_processor_id() in preemptible [] code:
> > swapper/0/1 caller is debug_smp_processor_id+0x20/0x30
> > CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.63-yocto-standard #1
> > Hardware name: NXP Layerscape LX2160ARDB (DT) Call trace:
> >   dump_backtrace+0x0/0x1a0
> >   show_stack+0x24/0x30
> >   dump_stack+0xf0/0x13c
> >   check_preemption_disabled+0x100/0x110
> >   debug_smp_processor_id+0x20/0x30
> >   dpaa2_io_query_fq_count+0xdc/0x154
> >   dpaa2_eth_stop+0x144/0x314
> >   __dev_close_many+0xdc/0x160
> >   __dev_change_flags+0xe8/0x220
> >   dev_change_flags+0x30/0x70
> >   ic_close_devs+0x50/0x78
> >   ip_auto_config+0xed0/0xf10
> >   do_one_initcall+0xac/0x460
> >   kernel_init_freeable+0x30c/0x378
> >   kernel_init+0x20/0x128
> >   ret_from_fork+0x10/0x38
> >
> > Because smp_processor_id() should be invoked in preempt disable status.
> > So, add preempt_disable/enable() to protect smp_processor_id().
> 
> If preemption doesn't matter anyway, as the comment in the context implies,
> then it probably makes more sense just to use
> raw_smp_processor_id() instead.
> 

Thanks for your professional suggestion, I will use raw_smp_processor_id() and test.
If it works fine, I will send a v2 patch.

Thanks,
Limeng

> Robin.
> 
> > Fixes: c89105c9b390 ("staging: fsl-mc: Move DPIO from staging to
> > drivers/soc/fsl")
> > Cc: sta...@vger.kernel.org
> > Signed-off-by: Meng Li 
> > ---
> >   drivers/soc/fsl/dpio/dpio-service.c | 7 +--
> >   1 file changed, 5 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/soc/fsl/dpio/dpio-service.c
> > b/drivers/soc/fsl/dpio/dpio-service.c
> > index 19f47ea9dab0..afc3b89b0fc5 100644
> > --- a/drivers/soc/fsl/dpio/dpio-service.c
> > +++ b/drivers/soc/fsl/dpio/dpio-service.c
> > @@ -58,8 +58,11 @@ static inline struct dpaa2_io
> *service_select_by_cpu(struct dpaa2_io *d,
> >* If cpu == -1, choose the current cpu, with no guarantees about
> >* potentially being migrated away.
> >*/
> > - if (cpu < 0)
> > - cpu = smp_processor_id();
> > +if (cpu < 0) {
> > +preempt_disable();
> > +cpu = smp_processor_id();
> > +preempt_enable();
> > +}
> >
> >   /* If a specific cpu was requested, pick it up immediately */
> >   return dpio_by_cpu[cpu];
> >


Re: [PATCH 2/2] sched: Centralize SCHED_{SMT, MC, CLUSTER} definitions

2021-10-15 Thread Peter Zijlstra
On Fri, Oct 08, 2021 at 04:22:27PM +0100, Valentin Schneider wrote:

> So x86 has it default yes, and a lot of others (e.g. arm64) have it default
> no.
> 
> IMO you don't gain much by disabling them. SCHED_MC and SCHED_CLUSTER only
> control the presence of a sched_domain_topology_level - if it's useless it
> gets degenerated at domain build time. Some valid reasons for not using
> them is if the architecture defines its own topology table (e.g. powerpc
> has CACHE and MC levels which are not gated behind any CONFIG).
> 
> SCHED_SMT has an impact on code generated in sched/core.c, but that is also
> gated by a static key.
> 
> So I'd say having them default yes is sensible. I'd even say we should
> change the "If unsure say N here." to "Y".

Right, so I tend to agree (and also that we should fix that Kconfig help
text). But it would be very nice to have feedback from the affected arch
maintainers.



[PATCH] soc: fsl: dpio: protect smp_processor_id when get processor id

2021-10-15 Thread Meng . Li
From: Meng Li 

When debug kernel configs are enabled, there will be a calltrace as below:

BUG: using smp_processor_id() in preemptible [] code: swapper/0/1
caller is debug_smp_processor_id+0x20/0x30
CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.63-yocto-standard #1
Hardware name: NXP Layerscape LX2160ARDB (DT)
Call trace:
 dump_backtrace+0x0/0x1a0
 show_stack+0x24/0x30
 dump_stack+0xf0/0x13c
 check_preemption_disabled+0x100/0x110
 debug_smp_processor_id+0x20/0x30
 dpaa2_io_query_fq_count+0xdc/0x154
 dpaa2_eth_stop+0x144/0x314
 __dev_close_many+0xdc/0x160
 __dev_change_flags+0xe8/0x220
 dev_change_flags+0x30/0x70
 ic_close_devs+0x50/0x78
 ip_auto_config+0xed0/0xf10
 do_one_initcall+0xac/0x460
 kernel_init_freeable+0x30c/0x378
 kernel_init+0x20/0x128
 ret_from_fork+0x10/0x38

Because smp_processor_id() should be invoked with preemption disabled,
add preempt_disable/enable() to protect smp_processor_id().

Fixes: c89105c9b390 ("staging: fsl-mc: Move DPIO from staging to 
drivers/soc/fsl")
Cc: sta...@vger.kernel.org
Signed-off-by: Meng Li 
---
 drivers/soc/fsl/dpio/dpio-service.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/soc/fsl/dpio/dpio-service.c 
b/drivers/soc/fsl/dpio/dpio-service.c
index 19f47ea9dab0..afc3b89b0fc5 100644
--- a/drivers/soc/fsl/dpio/dpio-service.c
+++ b/drivers/soc/fsl/dpio/dpio-service.c
@@ -58,8 +58,11 @@ static inline struct dpaa2_io *service_select_by_cpu(struct 
dpaa2_io *d,
 * If cpu == -1, choose the current cpu, with no guarantees about
 * potentially being migrated away.
 */
-   if (cpu < 0)
-   cpu = smp_processor_id();
+if (cpu < 0) {
+preempt_disable();
+cpu = smp_processor_id();
+preempt_enable();
+}
 
/* If a specific cpu was requested, pick it up immediately */
return dpio_by_cpu[cpu];
-- 
2.17.1



Re: [PATCH net-next 11/12] ethernet: ibmveth: use ether_addr_to_u64()

2021-10-15 Thread Tyrel Datwyler
On 10/15/21 3:16 PM, Jakub Kicinski wrote:
> We'll want to make netdev->dev_addr const, remove the local
> helper which is missing a const qualifier on the argument
> and use ether_addr_to_u64().

LGTM. ibmveth_encode_mac_addr() is clearly code duplication of
ether_addr_to_u64() minus the const qualifier.

Reviewed-by: Tyrel Datwyler 

> 
> Similar story to mlx4.
> 
> Signed-off-by: Jakub Kicinski 
> ---
> CC: cforn...@linux.ibm.com
> CC: m...@ellerman.id.au
> CC: b...@kernel.crashing.org
> CC: pau...@samba.org
> CC: linuxppc-dev@lists.ozlabs.org
> ---
>  drivers/net/ethernet/ibm/ibmveth.c | 17 +++--
>  1 file changed, 3 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
> b/drivers/net/ethernet/ibm/ibmveth.c
> index 836617fb3f40..45ba40cf4d07 100644
> --- a/drivers/net/ethernet/ibm/ibmveth.c
> +++ b/drivers/net/ethernet/ibm/ibmveth.c
> @@ -483,17 +483,6 @@ static int ibmveth_register_logical_lan(struct 
> ibmveth_adapter *adapter,
>   return rc;
>  }
> 
> -static u64 ibmveth_encode_mac_addr(u8 *mac)
> -{
> - int i;
> - u64 encoded = 0;
> -
> - for (i = 0; i < ETH_ALEN; i++)
> - encoded = (encoded << 8) | mac[i];
> -
> - return encoded;
> -}
> -
>  static int ibmveth_open(struct net_device *netdev)
>  {
>   struct ibmveth_adapter *adapter = netdev_priv(netdev);
> @@ -553,7 +542,7 @@ static int ibmveth_open(struct net_device *netdev)
>   adapter->rx_queue.num_slots = rxq_entries;
>   adapter->rx_queue.toggle = 1;
> 
> - mac_address = ibmveth_encode_mac_addr(netdev->dev_addr);
> + mac_address = ether_addr_to_u64(netdev->dev_addr);
> 
>   rxq_desc.fields.flags_len = IBMVETH_BUF_VALID |
>   adapter->rx_queue.queue_len;
> @@ -1476,7 +1465,7 @@ static void ibmveth_set_multicast_list(struct 
> net_device *netdev)
>   netdev_for_each_mc_addr(ha, netdev) {
>   /* add the multicast address to the filter table */
>   u64 mcast_addr;
> - mcast_addr = ibmveth_encode_mac_addr(ha->addr);
> + mcast_addr = ether_addr_to_u64(ha->addr);
>   lpar_rc = h_multicast_ctrl(adapter->vdev->unit_address,
>  IbmVethMcastAddFilter,
>  mcast_addr);
> @@ -1606,7 +1595,7 @@ static int ibmveth_set_mac_addr(struct net_device *dev, 
> void *p)
>   if (!is_valid_ether_addr(addr->sa_data))
>   return -EADDRNOTAVAIL;
> 
> - mac_address = ibmveth_encode_mac_addr(addr->sa_data);
> + mac_address = ether_addr_to_u64(addr->sa_data);
>   rc = h_change_logical_lan_mac(adapter->vdev->unit_address, mac_address);
>   if (rc) {
>   netdev_err(adapter->netdev, "h_change_logical_lan_mac failed 
> with rc=%d\n", rc);
> 



[PATCH net-next 11/12] ethernet: ibmveth: use ether_addr_to_u64()

2021-10-15 Thread Jakub Kicinski
We'll want to make netdev->dev_addr const, remove the local
helper which is missing a const qualifier on the argument
and use ether_addr_to_u64().

Similar story to mlx4.

Signed-off-by: Jakub Kicinski 
---
CC: cforn...@linux.ibm.com
CC: m...@ellerman.id.au
CC: b...@kernel.crashing.org
CC: pau...@samba.org
CC: linuxppc-dev@lists.ozlabs.org
---
 drivers/net/ethernet/ibm/ibmveth.c | 17 +++--
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmveth.c 
b/drivers/net/ethernet/ibm/ibmveth.c
index 836617fb3f40..45ba40cf4d07 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -483,17 +483,6 @@ static int ibmveth_register_logical_lan(struct 
ibmveth_adapter *adapter,
return rc;
 }
 
-static u64 ibmveth_encode_mac_addr(u8 *mac)
-{
-   int i;
-   u64 encoded = 0;
-
-   for (i = 0; i < ETH_ALEN; i++)
-   encoded = (encoded << 8) | mac[i];
-
-   return encoded;
-}
-
 static int ibmveth_open(struct net_device *netdev)
 {
struct ibmveth_adapter *adapter = netdev_priv(netdev);
@@ -553,7 +542,7 @@ static int ibmveth_open(struct net_device *netdev)
adapter->rx_queue.num_slots = rxq_entries;
adapter->rx_queue.toggle = 1;
 
-   mac_address = ibmveth_encode_mac_addr(netdev->dev_addr);
+   mac_address = ether_addr_to_u64(netdev->dev_addr);
 
rxq_desc.fields.flags_len = IBMVETH_BUF_VALID |
adapter->rx_queue.queue_len;
@@ -1476,7 +1465,7 @@ static void ibmveth_set_multicast_list(struct net_device 
*netdev)
netdev_for_each_mc_addr(ha, netdev) {
/* add the multicast address to the filter table */
u64 mcast_addr;
-   mcast_addr = ibmveth_encode_mac_addr(ha->addr);
+   mcast_addr = ether_addr_to_u64(ha->addr);
lpar_rc = h_multicast_ctrl(adapter->vdev->unit_address,
   IbmVethMcastAddFilter,
   mcast_addr);
@@ -1606,7 +1595,7 @@ static int ibmveth_set_mac_addr(struct net_device *dev, 
void *p)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
 
-   mac_address = ibmveth_encode_mac_addr(addr->sa_data);
+   mac_address = ether_addr_to_u64(addr->sa_data);
rc = h_change_logical_lan_mac(adapter->vdev->unit_address, mac_address);
if (rc) {
netdev_err(adapter->netdev, "h_change_logical_lan_mac failed 
with rc=%d\n", rc);
-- 
2.31.1



Re: [PATCH v2 13/13] lkdtm: Add a test for function descriptors protection

2021-10-15 Thread Kees Cook
On Thu, Oct 14, 2021 at 07:50:02AM +0200, Christophe Leroy wrote:
> Add WRITE_OPD to check that you can't modify function
> descriptors.
> 
> Gives the following result when function descriptors are
> not protected:
> 
>   lkdtm: Performing direct entry WRITE_OPD
>   lkdtm: attempting bad 16 bytes write at c269b358
>   lkdtm: FAIL: survived bad write
>   lkdtm: do_nothing was hijacked!
> 
> Looks like a standard compiler barrier() is not enough to force
> GCC to use the modified function descriptor. Add a fake empty
> inline assembly to force GCC to reload the function descriptor.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/misc/lkdtm/core.c  |  1 +
>  drivers/misc/lkdtm/lkdtm.h |  1 +
>  drivers/misc/lkdtm/perms.c | 22 ++
>  3 files changed, 24 insertions(+)
> 
> diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
> index fe6fd34b8caf..de092aa03b5d 100644
> --- a/drivers/misc/lkdtm/core.c
> +++ b/drivers/misc/lkdtm/core.c
> @@ -148,6 +148,7 @@ static const struct crashtype crashtypes[] = {
>   CRASHTYPE(WRITE_RO),
>   CRASHTYPE(WRITE_RO_AFTER_INIT),
>   CRASHTYPE(WRITE_KERN),
> + CRASHTYPE(WRITE_OPD),
>   CRASHTYPE(REFCOUNT_INC_OVERFLOW),
>   CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
>   CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
> diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
> index c212a253edde..188bd0fd6575 100644
> --- a/drivers/misc/lkdtm/lkdtm.h
> +++ b/drivers/misc/lkdtm/lkdtm.h
> @@ -105,6 +105,7 @@ void __init lkdtm_perms_init(void);
>  void lkdtm_WRITE_RO(void);
>  void lkdtm_WRITE_RO_AFTER_INIT(void);
>  void lkdtm_WRITE_KERN(void);
> +void lkdtm_WRITE_OPD(void);
>  void lkdtm_EXEC_DATA(void);
>  void lkdtm_EXEC_STACK(void);
>  void lkdtm_EXEC_KMALLOC(void);
> diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> index 96b3ebfcb8ed..3870bc82d40d 100644
> --- a/drivers/misc/lkdtm/perms.c
> +++ b/drivers/misc/lkdtm/perms.c
> @@ -44,6 +44,11 @@ static noinline void do_overwritten(void)
>   return;
>  }
>  
> +static noinline void do_almost_nothing(void)
> +{
> + pr_info("do_nothing was hijacked!\n");
> +}
> +
>  static void *setup_function_descriptor(func_desc_t *fdesc, void *dst)
>  {
>   memcpy(fdesc, do_nothing, sizeof(*fdesc));
> @@ -143,6 +148,23 @@ void lkdtm_WRITE_KERN(void)
>   do_overwritten();
>  }
>  
> +void lkdtm_WRITE_OPD(void)
> +{
> + size_t size = sizeof(func_desc_t);
> + void (*func)(void) = do_nothing;
> +
> + if (!have_function_descriptors()) {
> + pr_info("Platform doesn't have function descriptors.\n");

This should be more explicit ('xfail'):

pr_info("XFAIL: platform doesn't use function descriptors.\n");

> + return;
> + }
> + pr_info("attempting bad %zu bytes write at %px\n", size, do_nothing);
> + memcpy(do_nothing, do_almost_nothing, size);
> + pr_err("FAIL: survived bad write\n");
> +
> + asm("" : "=m"(func));

Since this is a descriptor, I assume no icache flush is needed. Are
function descriptors strictly dcache? (Is anything besides just a
barrier needed?)

> + func();
> +}
> +
>  void lkdtm_EXEC_DATA(void)
>  {
>   execute_location(data_area, CODE_WRITE);
> -- 
> 2.31.1
> 

-- 
Kees Cook


Re: [PATCH v2 11/13] lkdtm: Fix lkdtm_EXEC_RODATA()

2021-10-15 Thread Kees Cook
On Thu, Oct 14, 2021 at 07:50:00AM +0200, Christophe Leroy wrote:
> Behind its location, lkdtm_EXEC_RODATA() executes
> lkdtm_rodata_do_nothing() which is a real function,
> not a copy of do_nothing().
> 
> So executes it directly instead of using execute_location().
> 
> This is necessary because following patch will fix execute_location()
> to use a copy of the function descriptor of do_nothing() and
> function descriptor of lkdtm_rodata_do_nothing() might be different.
> 
> And fix displayed addresses by dereferencing the function descriptors.
> 
> Signed-off-by: Christophe Leroy 

I still don't understand this -- it doesn't look needed at all given the
changes in patch 12. (i.e. everything is using
dereference_function_descriptor() now)

Can't this patch be dropped?

-Kees

> ---
>  drivers/misc/lkdtm/perms.c | 9 -
>  1 file changed, 8 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> index 035fcca441f0..5266dc28df6e 100644
> --- a/drivers/misc/lkdtm/perms.c
> +++ b/drivers/misc/lkdtm/perms.c
> @@ -153,7 +153,14 @@ void lkdtm_EXEC_VMALLOC(void)
>  
>  void lkdtm_EXEC_RODATA(void)
>  {
> - execute_location(lkdtm_rodata_do_nothing, CODE_AS_IS);
> + pr_info("attempting ok execution at %px\n",
> + dereference_function_descriptor(do_nothing));
> + do_nothing();
> +
> + pr_info("attempting bad execution at %px\n",
> + dereference_function_descriptor(lkdtm_rodata_do_nothing));
> + lkdtm_rodata_do_nothing();
> + pr_err("FAIL: func returned\n");
>  }
>  
>  void lkdtm_EXEC_USERSPACE(void)
> -- 
> 2.31.1
> 

-- 
Kees Cook


Re: [PATCH v2 12/13] lkdtm: Fix execute_[user]_location()

2021-10-15 Thread Kees Cook
On Thu, Oct 14, 2021 at 07:50:01AM +0200, Christophe Leroy wrote:
> execute_location() and execute_user_location() intent
> to copy do_nothing() text and execute it at a new location.
> However, at the time being it doesn't copy do_nothing() function
> but do_nothing() function descriptor which still points to the
> original text. So at the end it still executes do_nothing() at
> its original location allthough using a copied function descriptor.
> 
> So, fix that by really copying do_nothing() text and build a new
> function descriptor by copying do_nothing() function descriptor and
> updating the target address with the new location.
> 
> Also fix the displayed addresses by dereferencing do_nothing()
> function descriptor.
> 
> Signed-off-by: Christophe Leroy 
> ---
>  drivers/misc/lkdtm/perms.c | 25 +
>  include/asm-generic/sections.h |  5 +
>  2 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> index 5266dc28df6e..96b3ebfcb8ed 100644
> --- a/drivers/misc/lkdtm/perms.c
> +++ b/drivers/misc/lkdtm/perms.c
> @@ -44,19 +44,32 @@ static noinline void do_overwritten(void)
>   return;
>  }
>  
> +static void *setup_function_descriptor(func_desc_t *fdesc, void *dst)
> +{
> + memcpy(fdesc, do_nothing, sizeof(*fdesc));
> + fdesc->addr = (unsigned long)dst;
> + barrier();
> +
> + return fdesc;
> +}

How about collapsing the "have_function_descriptors()" check into
setup_function_descriptor()?

static void *setup_function_descriptor(func_desc_t *fdesc, void *dst)
{
if (__is_defined(HAVE_FUNCTION_DESCRIPTORS)) {
memcpy(fdesc, do_nothing, sizeof(*fdesc));
fdesc->addr = (unsigned long)dst;
barrier();
return fdesc;
} else {
return dst;
}
}

> +
>  static noinline void execute_location(void *dst, bool write)
>  {
>   void (*func)(void) = dst;
> + func_desc_t fdesc;
> + void *do_nothing_text = dereference_function_descriptor(do_nothing);
>  
> - pr_info("attempting ok execution at %px\n", do_nothing);
> + pr_info("attempting ok execution at %px\n", do_nothing_text);
>   do_nothing();
>  
>   if (write == CODE_WRITE) {
> - memcpy(dst, do_nothing, EXEC_SIZE);
> + memcpy(dst, do_nothing_text, EXEC_SIZE);
>   flush_icache_range((unsigned long)dst,
>  (unsigned long)dst + EXEC_SIZE);
>   }
>   pr_info("attempting bad execution at %px\n", func);
> + if (have_function_descriptors())
> + func = setup_function_descriptor(, dst);
>   func();
>   pr_err("FAIL: func returned\n");
>  }
> @@ -67,15 +80,19 @@ static void execute_user_location(void *dst)
>  
>   /* Intentionally crossing kernel/user memory boundary. */
>   void (*func)(void) = dst;
> + func_desc_t fdesc;
> + void *do_nothing_text = dereference_function_descriptor(do_nothing);
>  
> - pr_info("attempting ok execution at %px\n", do_nothing);
> + pr_info("attempting ok execution at %px\n", do_nothing_text);
>   do_nothing();
>  
> - copied = access_process_vm(current, (unsigned long)dst, do_nothing,
> + copied = access_process_vm(current, (unsigned long)dst, do_nothing_text,
>  EXEC_SIZE, FOLL_WRITE);
>   if (copied < EXEC_SIZE)
>   return;
>   pr_info("attempting bad execution at %px\n", func);
> + if (have_function_descriptors())
> + func = setup_function_descriptor(, dst);
>   func();
>   pr_err("FAIL: func returned\n");
>  }


> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index 76163883c6ff..d225318538bd 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -70,6 +70,11 @@ typedef struct {
>  } func_desc_t;
>  #endif
>  
> +static inline bool have_function_descriptors(void)
> +{
> + return __is_defined(HAVE_FUNCTION_DESCRIPTORS);
> +}
> +
>  /* random extra sections (if any).  Override
>   * in asm/sections.h */
>  #ifndef arch_is_kernel_text

This hunk seems like it should live in a separate patch.

-- 
Kees Cook


Re: [PATCH] ibmvscsi: use GFP_KERNEL with dma_alloc_coherent in initialize_event_pool

2021-10-15 Thread Tyrel Datwyler
On 10/14/21 9:36 PM, Michael Ellerman wrote:
> Tyrel Datwyler  writes:
>> Just stumbled upon this trivial little patch that looks to have gotten lost 
>> in
>> the shuffle. Seems it even got a reviewed-by from Brian [1].
>>
>> So, uh I guess after almost 3 years...ping?
> 
> It's marked "Changes Requested" here:
> 
>   
> https://patchwork.kernel.org/project/linux-scsi/patch/1547089149-20577-1-git-send-email-tyr...@linux.vnet.ibm.com/

Interesting

> 
> 
> If it isn't picked up in a few days you should probably do a resend so
> that it reappears in patchwork.

Fair enough

-Tyrel

> 
> cheers
> 



Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Steven Rostedt
On Fri, 15 Oct 2021 20:24:59 +0200
Peter Zijlstra  wrote:

> > @@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, 
> > arch_perf_out_copy_user)
> >  static inline int get_recursion_context(int *recursion)
> >  {
> > unsigned int pc = preempt_count();  
> 
> Although I think we can do without that ^ line as well :-)

Ah, I could have sworn I deleted it. Oh well, will make a proper patch set.

Thanks!

-- Steve


> 
> > -   unsigned char rctx = 0;


Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Peter Zijlstra
On Fri, Oct 15, 2021 at 02:20:33PM -0400, Steven Rostedt wrote:
> On Fri, 15 Oct 2021 20:04:29 +0200
> Peter Zijlstra  wrote:
> 
> > On Fri, Oct 15, 2021 at 01:58:06PM -0400, Steven Rostedt wrote:
> > > Something like this:  
> > 
> > I think having one copy of that in a header is better than having 3
> > copies. But yes, something along them lines.
> 
> I was just about to ask you about this patch ;-)

Much better :-)

> diff --git a/kernel/events/internal.h b/kernel/events/internal.h
> index 228801e20788..c91711f20cf8 100644
> --- a/kernel/events/internal.h
> +++ b/kernel/events/internal.h
> @@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, 
> arch_perf_out_copy_user)
>  static inline int get_recursion_context(int *recursion)
>  {
>   unsigned int pc = preempt_count();

Although I think we can do without that ^ line as well :-)

> - unsigned char rctx = 0;
> -
> - rctx += !!(pc & (NMI_MASK));
> - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
> - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
> + unsigned char rctx = interrupt_context_level();
>  
>   if (recursion[rctx])
>   return -1;


Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Steven Rostedt
On Fri, 15 Oct 2021 14:20:33 -0400
Steven Rostedt  wrote:

> > I think having one copy of that in a header is better than having 3
> > copies. But yes, something along them lines.  
> 
> I was just about to ask you about this patch ;-)

Except it doesn't build :-p (need to move the inlined function down a bit)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4d244e295e85..b32e3dabe28b 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -77,6 +77,27 @@
 /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include 
 
+/**
+ * interrupt_context_level - return interrupt context level
+ *
+ * Returns the current interrupt context level.
+ *  0 - normal context
+ *  1 - softirq context
+ *  2 - hardirq context
+ *  3 - NMI context
+ */
+static __always_inline unsigned char interrupt_context_level(void)
+{
+   unsigned long pc = preempt_count();
+   unsigned char level = 0;
+
+   level += !!(pc & (NMI_MASK));
+   level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+   level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+   return level;
+}
+
 #define nmi_count()(preempt_count() & NMI_MASK)
 #define hardirq_count()(preempt_count() & HARDIRQ_MASK)
 #ifdef CONFIG_PREEMPT_RT
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 41f5bfd9e93f..018a04381556 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -118,12 +118,7 @@ enum {
 
 static __always_inline int trace_get_context_bit(void)
 {
-   unsigned long pc = preempt_count();
-   unsigned char bit = 0;
-
-   bit += !!(pc & (NMI_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+   unsigned char bit = interrupt_context_level();
 
return TRACE_CTX_NORMAL - bit;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..c91711f20cf8 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, 
arch_perf_out_copy_user)
 static inline int get_recursion_context(int *recursion)
 {
unsigned int pc = preempt_count();
-   unsigned char rctx = 0;
-
-   rctx += !!(pc & (NMI_MASK));
-   rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-   rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+   unsigned char rctx = interrupt_context_level();
 
if (recursion[rctx])
return -1;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 15d4380006e3..f6520d0a4c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3167,12 +3167,7 @@ static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
unsigned int val = cpu_buffer->current_context;
-   unsigned long pc = preempt_count();
-   int bit = 0;
-
-   bit += !!(pc & (NMI_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+   int bit = interrupt_context_level();
 
bit = RB_CTX_NORMAL - bit;
 


Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Steven Rostedt
On Fri, 15 Oct 2021 20:04:29 +0200
Peter Zijlstra  wrote:

> On Fri, Oct 15, 2021 at 01:58:06PM -0400, Steven Rostedt wrote:
> > Something like this:  
> 
> I think having one copy of that in a header is better than having 3
> copies. But yes, something along them lines.

I was just about to ask you about this patch ;-)

diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 4d244e295e85..a6ae329aa654 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -74,6 +74,27 @@
  */
 #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 
+/**
+ * interrupt_context_level - return interrupt context level
+ *
+ * Returns the current interrupt context level.
+ *  0 - normal context
+ *  1 - softirq context
+ *  2 - hardirq context
+ *  3 - NMI context
+ */
+static __always_inline unsigned char interrupt_context_level(void)
+{
+   unsigned long pc = preempt_count();
+   unsigned char level = 0;
+
+   level += !!(pc & (NMI_MASK));
+   level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+   level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+   return level;
+}
+
 /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
 #include 
 
diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 41f5bfd9e93f..018a04381556 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -118,12 +118,7 @@ enum {
 
 static __always_inline int trace_get_context_bit(void)
 {
-   unsigned long pc = preempt_count();
-   unsigned char bit = 0;
-
-   bit += !!(pc & (NMI_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+   unsigned char bit = interrupt_context_level();
 
return TRACE_CTX_NORMAL - bit;
 }
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 228801e20788..c91711f20cf8 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -206,11 +206,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, 
arch_perf_out_copy_user)
 static inline int get_recursion_context(int *recursion)
 {
unsigned int pc = preempt_count();
-   unsigned char rctx = 0;
-
-   rctx += !!(pc & (NMI_MASK));
-   rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-   rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+   unsigned char rctx = interrupt_context_level();
 
if (recursion[rctx])
return -1;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 15d4380006e3..f6520d0a4c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3167,12 +3167,7 @@ static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
unsigned int val = cpu_buffer->current_context;
-   unsigned long pc = preempt_count();
-   int bit = 0;
-
-   bit += !!(pc & (NMI_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
-   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+   int bit = interrupt_context_level();
 
bit = RB_CTX_NORMAL - bit;
 


Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Peter Zijlstra
On Fri, Oct 15, 2021 at 01:58:06PM -0400, Steven Rostedt wrote:
> Something like this:

I think having one copy of that in a header is better than having 3
copies. But yes, something along them lines.


Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Steven Rostedt
On Fri, 15 Oct 2021 13:35:04 -0400
Steven Rostedt  wrote:

> On Fri, 15 Oct 2021 18:17:02 +0200
> Peter Zijlstra  wrote:
> 
> > On Fri, Oct 15, 2021 at 11:00:35AM -0400, Steven Rostedt wrote:  
> > > From: "Steven Rostedt (VMware)" 
> > > 
> > > While writing an email explaining the "bit = 0" logic for a discussion on 
> > >
> >   
> > >   bit = trace_get_context_bit() + start;
> > 
> > While there, you were also going to update that function to match/use
> > get_recursion_context(). Because your version is still branch hell.  
> 
> That would probably be a separate patch. This is just a fix to a design
> flaw, changing the context tests would be performance enhancement.
> 
> Thanks for the reminder (as it was on my todo list to update that code).

Something like this:

From: "Steven Rostedt (VMware)" 
Subject: [PATCH] tracing: Reuse logic from perf's get_recursion_context()

Instead of having branches that adds noise to the branch prediction, use
the addition logic to set the bit for the level of interrupt context that
the state is currently in. This copies the logic from perf's
get_recursion_context() function.

Link: 
https://lore.kernel.org/all/20211015161702.gf174...@worktop.programming.kicks-ass.net/

Suggested-by: Peter Zijlstra 
Signed-off-by: Steven Rostedt (VMware) 
---
 include/linux/trace_recursion.h | 11 ++-
 kernel/trace/ring_buffer.c  | 12 ++--
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 168fdf07419a..41f5bfd9e93f 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -119,12 +119,13 @@ enum {
 static __always_inline int trace_get_context_bit(void)
 {
unsigned long pc = preempt_count();
+   unsigned char bit = 0;
 
-   if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
-   return TRACE_CTX_NORMAL;
-   else
-   return pc & NMI_MASK ? TRACE_CTX_NMI :
-   pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ;
+   bit += !!(pc & (NMI_MASK));
+   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+   return TRACE_CTX_NORMAL - bit;
 }
 
 #ifdef CONFIG_FTRACE_RECORD_RECURSION
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c5a3fbf19617..15d4380006e3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3168,13 +3168,13 @@ trace_recursive_lock(struct ring_buffer_per_cpu 
*cpu_buffer)
 {
unsigned int val = cpu_buffer->current_context;
unsigned long pc = preempt_count();
-   int bit;
+   int bit = 0;
 
-   if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
-   bit = RB_CTX_NORMAL;
-   else
-   bit = pc & NMI_MASK ? RB_CTX_NMI :
-   pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+   bit += !!(pc & (NMI_MASK));
+   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+   bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
+
+   bit = RB_CTX_NORMAL - bit;
 
if (unlikely(val & (1 << (bit + cpu_buffer->nest {
/*
-- 
2.31.1



[PATCH v2] powerpc/smp: do not decrement idle task preempt count in CPU offline

2021-10-15 Thread Nathan Lynch
With PREEMPT_COUNT=y, when a CPU is offlined and then onlined again, we
get:

BUG: scheduling while atomic: swapper/1/0/0x
no locks held by swapper/1/0.
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0-rc2+ #100
Call Trace:
 dump_stack_lvl+0xac/0x108
 __schedule_bug+0xac/0xe0
 __schedule+0xcf8/0x10d0
 schedule_idle+0x3c/0x70
 do_idle+0x2d8/0x4a0
 cpu_startup_entry+0x38/0x40
 start_secondary+0x2ec/0x3a0
 start_secondary_prolog+0x10/0x14

This is because powerpc's arch_cpu_idle_dead() decrements the idle task's
preempt count, for reasons explained in commit a7c2bb8279d2 ("powerpc:
Re-enable preemption before cpu_die()"), specifically "start_secondary()
expects a preempt_count() of 0."

However, since commit 2c669ef6979c ("powerpc/preempt: Don't touch the idle
task's preempt_count during hotplug") and commit f1a0a376ca0c ("sched/core:
Initialize the idle task with preemption disabled"), that justification no
longer holds.

The idle task isn't supposed to re-enable preemption, so remove the
vestigial preempt_enable() from the CPU offline path.

Tested with pseries and powernv in qemu, and pseries on PowerVM.

Fixes: 2c669ef6979c ("powerpc/preempt: Don't touch the idle task's 
preempt_count during hotplug")
Signed-off-by: Nathan Lynch 
Reviewed-by: Valentin Schneider 
---

Notes:
Changes since v1:

- remove incorrect Fixes: tag, add Valentin's r-b.

 arch/powerpc/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9cc7d3dbf439..605bab448f84 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1730,8 +1730,6 @@ void __cpu_die(unsigned int cpu)
 
 void arch_cpu_idle_dead(void)
 {
-   sched_preempt_enable_no_resched();
-
/*
 * Disable on the down path. This will be re-enabled by
 * start_secondary() via start_secondary_resume() below
-- 
2.31.1



Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Steven Rostedt
On Fri, 15 Oct 2021 18:17:02 +0200
Peter Zijlstra  wrote:

> On Fri, Oct 15, 2021 at 11:00:35AM -0400, Steven Rostedt wrote:
> > From: "Steven Rostedt (VMware)" 
> > 
> > While writing an email explaining the "bit = 0" logic for a discussion on  
> 
> > bit = trace_get_context_bit() + start;  
> 
> While there, you were also going to update that function to match/use
> get_recursion_context(). Because your version is still branch hell.

That would probably be a separate patch. This is just a fix to a design
flaw, changing the context tests would be performance enhancement.

Thanks for the reminder (as it was on my todo list to update that code).

-- Steve


Re: [PATCH] powerpc/smp: do not decrement idle task preempt count in CPU offline

2021-10-15 Thread Nathan Lynch
Valentin Schneider  writes:

> On 15/10/21 09:55, Nathan Lynch wrote:
>> With PREEMPT_COUNT=y, when a CPU is offlined and then onlined again, we
>> get:
>>
>> BUG: scheduling while atomic: swapper/1/0/0x
>> no locks held by swapper/1/0.
>> CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0-rc2+ #100
>> Call Trace:
>>  dump_stack_lvl+0xac/0x108
>>  __schedule_bug+0xac/0xe0
>>  __schedule+0xcf8/0x10d0
>>  schedule_idle+0x3c/0x70
>>  do_idle+0x2d8/0x4a0
>>  cpu_startup_entry+0x38/0x40
>>  start_secondary+0x2ec/0x3a0
>>  start_secondary_prolog+0x10/0x14
>>
>> This is because powerpc's arch_cpu_idle_dead() decrements the idle task's
>> preempt count, for reasons explained in commit a7c2bb8279d2 ("powerpc:
>> Re-enable preemption before cpu_die()"), specifically "start_secondary()
>> expects a preempt_count() of 0."
>>
>> However, since commit 2c669ef6979c ("powerpc/preempt: Don't touch the idle
>> task's preempt_count during hotplug") and commit f1a0a376ca0c ("sched/core:
>> Initialize the idle task with preemption disabled"), that justification no
>> longer holds.
>>
>> The idle task isn't supposed to re-enable preemption, so remove the
>> vestigial preempt_enable() from the CPU offline path.
>>
>
> Humph, I got confused because 2c669ef6979c explicitly mentions hotplug,
> but that's the hotplug machinery which is already involved for bringing up
> the secondaries at boot time.
>
> IIUC your issue here is the preempt_count being messed up when
> hot-unplugging a CPU, which leads to fireworks during hotplug

That's right.

> (IOW I didn't
> test my last patch against hotplug - my bad!)
>
> Reviewed-by: Valentin Schneider 

No worries and thank you for reviewing.


>> Tested with pseries and powernv in qemu, and pseries on PowerVM.
>>
>> Fixes: 2c669ef6979c ("powerpc/preempt: Don't touch the idle task's 
>> preempt_count during hotplug")
>> Fixes: f1a0a376ca0c ("sched/core: Initialize the idle task with preemption 
>> disabled")
>
> I think only the first Fixes: is needed.

OK, I'll re-send with that changed as well as your r-b. Thanks.


Re: [PATCH 0/4] Update mpc5200 dts files to fix warnings

2021-10-15 Thread Rob Herring
On Wed, Oct 13, 2021 at 5:05 PM Anatolij Gustschin  wrote:
>
> This series fixes localbus, memory and pci node build warnings.
> It was tested with current linux-next on digsy_mtc and tqm5200
> boards.
>
> Anatolij Gustschin (4):
>   powerpc/5200: dts: add missing pci ranges
>   powerpc/5200: dts: fix pci ranges warnings
>   powerpc/5200: dts: fix memory node unit name
>   powerpc/5200: dts: fix localbus node warnings

For patches 1-3:

Reviewed-by: Rob Herring 


Re: [PATCH 4/4] powerpc/5200: dts: fix localbus node warnings

2021-10-15 Thread Rob Herring
On Wed, Oct 13, 2021 at 5:05 PM Anatolij Gustschin  wrote:
>
> Fix build warnings like:
> localbus:ranges: 'oneOf' conditional failed, one must be fixed
> ...
> Warning (unit_address_vs_reg): /localbus: node has a reg or ranges property, 
> but no unit name
> Warning (simple_bus_reg): /localbus/flash@0,0: simple-bus unit address format 
> error, expected "0"
>
> Signed-off-by: Anatolij Gustschin 
> ---
>  arch/powerpc/boot/dts/a3m071.dts| 12 +-
>  arch/powerpc/boot/dts/a4m072.dts| 20 -
>  arch/powerpc/boot/dts/charon.dts| 14 ++--
>  arch/powerpc/boot/dts/cm5200.dts|  7 --
>  arch/powerpc/boot/dts/digsy_mtc.dts | 16 --
>  arch/powerpc/boot/dts/lite5200.dts  |  4 ++--
>  arch/powerpc/boot/dts/lite5200b.dts |  6 +++--
>  arch/powerpc/boot/dts/media5200.dts | 20 +
>  arch/powerpc/boot/dts/motionpro.dts | 32 +++
>  arch/powerpc/boot/dts/mpc5200b.dtsi |  2 +-
>  arch/powerpc/boot/dts/mucmc52.dts   | 34 +++--
>  arch/powerpc/boot/dts/o2d.dts   | 10 +
>  arch/powerpc/boot/dts/o2d.dtsi  | 12 +-
>  arch/powerpc/boot/dts/o2d300.dts| 10 +
>  arch/powerpc/boot/dts/o2dnt2.dts| 10 +
>  arch/powerpc/boot/dts/o2i.dts   |  4 ++--
>  arch/powerpc/boot/dts/o2mnt.dts |  4 ++--
>  arch/powerpc/boot/dts/o3dnt.dts | 10 +
>  arch/powerpc/boot/dts/pcm030.dts|  2 +-
>  arch/powerpc/boot/dts/pcm032.dts| 26 --
>  arch/powerpc/boot/dts/tqm5200.dts   |  4 ++--
>  arch/powerpc/boot/dts/uc101.dts | 14 +++-
>  22 files changed, 151 insertions(+), 122 deletions(-)
>
> diff --git a/arch/powerpc/boot/dts/a3m071.dts 
> b/arch/powerpc/boot/dts/a3m071.dts
> index 034cfd8aa95b..14e59aaa0ba7 100644
> --- a/arch/powerpc/boot/dts/a3m071.dts
> +++ b/arch/powerpc/boot/dts/a3m071.dts
> @@ -87,15 +87,15 @@
> };
> };
>
> -   localbus {
> +   localbus@8000 {

That's not right, as 0x8000 doesn't exist in 'reg' or 'ranges'.
Without 'reg', the correct unit-address is 'fc00'.  Perhaps there
should be a 'reg' entry though.

> compatible = "fsl,mpc5200b-lpb","simple-bus";
> #address-cells = <2>;
> #size-cells = <1>;
> -   ranges = <0 0 0xfc00 0x0200
> - 3 0 0xe900 0x0008
> - 5 0 0xe800 0x0001>;
> +   ranges = <0 0 0xfc00 0x0200>,
> +<3 0 0xe900 0x0008>,
> +<5 0 0xe800 0x0001>;
>
> -   flash@0,0 {
> +   flash@0 {

This change is debatable. Normally, if we have chipselects, then
that's a separate field and a comma is appropriate. However, h/w with
chip selects require setup and aren't a simple-bus. I guess the
bootloader does the setup, so kind of a gray area.

The warning for this is off by default, so I'd leave it alone.

Rob


Re: [PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Peter Zijlstra
On Fri, Oct 15, 2021 at 11:00:35AM -0400, Steven Rostedt wrote:
> From: "Steven Rostedt (VMware)" 
> 
> While writing an email explaining the "bit = 0" logic for a discussion on

>   bit = trace_get_context_bit() + start;

While there, you were also going to update that function to match/use
get_recursion_context(). Because your version is still branch hell.


Re: [PATCH v2] powerpc/mpc512x: dts: fix PSC node warnings

2021-10-15 Thread Rob Herring
On Thu, Oct 14, 2021 at 5:42 PM Anatolij Gustschin  wrote:
>
> Rework PSC node description to fix build warnings like:
> mpc5121.dtsi:397.13-406.5: Warning (spi_bus_bridge): /soc@8000/psc@11400: 
> node name for SPI buses should be 'spi'
> mpc5121.dtsi:409.13-418.5: Warning (spi_bus_bridge): /soc@8000/psc@11500: 
> node name for SPI buses should be 'spi'
> mpc5121.dtsi:457.13-466.5: Warning (spi_bus_bridge): /soc@8000/psc@11900: 
> node name for SPI buses should be 'spi'

Okay, I now see the block supports either spi or serial modes. I would
handle this a bit differently that doesn't create a bunch of new .dtsi
files.

>
> Signed-off-by: Anatolij Gustschin 
> ---
> Changes in v2:
>  - extract PSC nodes to files which can be included
>separately and extended as needed
>
>  arch/powerpc/boot/dts/ac14xx.dts| 118 
>  arch/powerpc/boot/dts/mpc5121-psc0.dtsi |  16 +++
>  arch/powerpc/boot/dts/mpc5121-psc1.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc10.dtsi|  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc11.dtsi|  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc2.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc3.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc4-spi.dtsi |  17 +++
>  arch/powerpc/boot/dts/mpc5121-psc4.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc5-spi.dtsi |  17 +++
>  arch/powerpc/boot/dts/mpc5121-psc5.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc6.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc7.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc8.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121-psc9-spi.dtsi |  17 +++
>  arch/powerpc/boot/dts/mpc5121-psc9.dtsi |  15 ++
>  arch/powerpc/boot/dts/mpc5121.dtsi  | 148 +---
>  arch/powerpc/boot/dts/mpc5121ads.dts|  42 +++---
>  arch/powerpc/boot/dts/pdm360ng.dts  | 104 +++---
>  19 files changed, 371 insertions(+), 273 deletions(-)
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc0.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc1.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc10.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc11.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc2.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc3.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc4-spi.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc4.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc5-spi.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc5.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc6.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc7.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc8.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc9-spi.dtsi
>  create mode 100644 arch/powerpc/boot/dts/mpc5121-psc9.dtsi

[...]

> diff --git a/arch/powerpc/boot/dts/mpc5121.dtsi 
> b/arch/powerpc/boot/dts/mpc5121.dtsi
> index 3f66b91a8e3c..21674da8beb1 100644
> --- a/arch/powerpc/boot/dts/mpc5121.dtsi
> +++ b/arch/powerpc/boot/dts/mpc5121.dtsi
> @@ -87,7 +87,7 @@
> };
> };
>
> -   soc@8000 {
> +   soc: soc@8000 {
> compatible = "fsl,mpc5121-immr";
> #address-cells = <1>;
> #size-cells = <1>;
> @@ -343,152 +343,6 @@
> clock-names = "ipg";
> };
>
> -   /* 512x PSCs are not 52xx PSC compatible */
> -
> -   /* PSC0 */
> -   psc@11000 {

I would just put here 'serial@11000' and 'spi@11000' nodes with both
nodes set to disabled. Then the board dts just has to change status of
the the nodes it wants to enable (and add child nodes for spi).
Overlapping addresses are okay if nodes are disabled.

> -   compatible = "fsl,mpc5121-psc";
> -   reg = <0x11000 0x100>;
> -   interrupts = <40 0x8>;
> -   fsl,rx-fifo-size = <16>;
> -   fsl,tx-fifo-size = <16>;
> -   clocks = < MPC512x_CLK_PSC0>,
> -< MPC512x_CLK_PSC0_MCLK>;
> -   clock-names = "ipg", "mclk";
> -   };


[PATCH v1 11/11] powerpc/microwatt: Don't select the hash MMU code

2021-10-15 Thread Nicholas Piggin
Microwatt is radix-only, so it does not require hash MMU support.

This saves 20kB compressed dtbImage and 56kB vmlinux size.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/configs/microwatt_defconfig | 1 -
 arch/powerpc/platforms/microwatt/Kconfig | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/powerpc/configs/microwatt_defconfig 
b/arch/powerpc/configs/microwatt_defconfig
index 6e62966730d3..7c8eb29d8afe 100644
--- a/arch/powerpc/configs/microwatt_defconfig
+++ b/arch/powerpc/configs/microwatt_defconfig
@@ -27,7 +27,6 @@ CONFIG_PPC_MICROWATT=y
 # CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set
 CONFIG_CPU_FREQ=y
 CONFIG_HZ_100=y
-# CONFIG_PPC_MEM_KEYS is not set
 # CONFIG_SECCOMP is not set
 # CONFIG_MQ_IOSCHED_KYBER is not set
 # CONFIG_COREDUMP is not set
diff --git a/arch/powerpc/platforms/microwatt/Kconfig 
b/arch/powerpc/platforms/microwatt/Kconfig
index 823192e9d38a..5e320f49583a 100644
--- a/arch/powerpc/platforms/microwatt/Kconfig
+++ b/arch/powerpc/platforms/microwatt/Kconfig
@@ -5,7 +5,6 @@ config PPC_MICROWATT
select PPC_XICS
select PPC_ICS_NATIVE
select PPC_ICP_NATIVE
-   select PPC_HASH_MMU_NATIVE if PPC_64S_HASH_MMU
select PPC_UDBG_16550
select ARCH_RANDOM
help
-- 
2.23.0



[PATCH v1 10/11] powerpc/configs/microwatt: add POWER9_CPU

2021-10-15 Thread Nicholas Piggin
Microwatt implements a subset of ISA v3.0 (which is equivalent to
the POWER9_CPU option).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/configs/microwatt_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/microwatt_defconfig 
b/arch/powerpc/configs/microwatt_defconfig
index 9465209b8c5b..6e62966730d3 100644
--- a/arch/powerpc/configs/microwatt_defconfig
+++ b/arch/powerpc/configs/microwatt_defconfig
@@ -15,6 +15,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB_MERGE_DEFAULT is not set
 CONFIG_PPC64=y
+CONFIG_POWER9_CPU=y
 # CONFIG_PPC_KUEP is not set
 # CONFIG_PPC_KUAP is not set
 CONFIG_CPU_LITTLE_ENDIAN=y
-- 
2.23.0



[PATCH v1 09/11] powerpc/64s: Make hash MMU code build configurable

2021-10-15 Thread Nicholas Piggin
Introduce a new option CONFIG_PPC_64S_HASH_MMU which allows the 64s hash
MMU code to be compiled out if radix is selected and the minimum
supported CPU type is POWER9 or higher, and KVM is not selected.

This saves 128kB kernel image size (90kB text) on powernv_defconfig
minus KVM, 350kB on pseries_defconfig minus KVM, 40kB on a tiny config.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/book3s/64/mmu.h  | 22 ++-
 .../include/asm/book3s/64/tlbflush-hash.h |  7 ++
 arch/powerpc/include/asm/book3s/pgtable.h |  4 
 arch/powerpc/include/asm/mmu.h| 14 +---
 arch/powerpc/include/asm/mmu_context.h|  2 ++
 arch/powerpc/include/asm/paca.h   |  8 +++
 arch/powerpc/kernel/asm-offsets.c |  2 ++
 arch/powerpc/kernel/dt_cpu_ftrs.c |  8 ++-
 arch/powerpc/kernel/entry_64.S|  4 ++--
 arch/powerpc/kernel/exceptions-64s.S  | 16 ++
 arch/powerpc/kernel/mce.c |  2 +-
 arch/powerpc/kernel/mce_power.c   | 10 ++---
 arch/powerpc/kernel/paca.c| 18 ++-
 arch/powerpc/kernel/process.c | 13 ++-
 arch/powerpc/kernel/prom.c|  2 ++
 arch/powerpc/kernel/setup_64.c|  4 
 arch/powerpc/kexec/core_64.c  |  4 ++--
 arch/powerpc/kexec/ranges.c   |  4 
 arch/powerpc/kvm/Kconfig  |  1 +
 arch/powerpc/mm/book3s64/Makefile | 17 --
 arch/powerpc/mm/book3s64/hash_utils.c | 10 -
 .../{hash_hugetlbpage.c => hugetlbpage.c} |  6 +
 arch/powerpc/mm/book3s64/mmu_context.c| 16 ++
 arch/powerpc/mm/book3s64/pgtable.c| 12 ++
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  4 
 arch/powerpc/mm/copro_fault.c |  2 ++
 arch/powerpc/mm/pgtable.c | 10 ++---
 arch/powerpc/platforms/Kconfig.cputype| 21 +-
 arch/powerpc/platforms/cell/Kconfig   |  1 +
 arch/powerpc/platforms/maple/Kconfig  |  1 +
 arch/powerpc/platforms/microwatt/Kconfig  |  2 +-
 arch/powerpc/platforms/pasemi/Kconfig |  1 +
 arch/powerpc/platforms/powermac/Kconfig   |  1 +
 arch/powerpc/platforms/powernv/Kconfig|  2 +-
 arch/powerpc/platforms/powernv/idle.c |  2 ++
 arch/powerpc/platforms/powernv/setup.c|  2 ++
 arch/powerpc/platforms/pseries/lpar.c | 11 --
 arch/powerpc/platforms/pseries/lparcfg.c  |  2 +-
 arch/powerpc/platforms/pseries/mobility.c |  6 +
 arch/powerpc/platforms/pseries/ras.c  |  2 ++
 arch/powerpc/platforms/pseries/reconfig.c |  2 ++
 arch/powerpc/platforms/pseries/setup.c|  6 +++--
 arch/powerpc/xmon/xmon.c  |  8 +--
 44 files changed, 233 insertions(+), 60 deletions(-)
 rename arch/powerpc/mm/book3s64/{hash_hugetlbpage.c => hugetlbpage.c} (95%)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ba5b66189358..3c36511e6133 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -936,6 +936,7 @@ config PPC_MEM_KEYS
prompt "PowerPC Memory Protection Keys"
def_bool y
depends on PPC_BOOK3S_64
+   depends on PPC_64S_HASH_MMU
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
help
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index c02f42d1031e..857dc88b0043 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -98,7 +98,9 @@ typedef struct {
 * from EA and new context ids to build the new VAs.
 */
mm_context_id_t id;
+#ifdef CONFIG_PPC_64S_HASH_MMU
mm_context_id_t extended_id[TASK_SIZE_USER64/TASK_CONTEXT_SIZE];
+#endif
};
 
/* Number of bits in the mm_cpumask */
@@ -110,7 +112,9 @@ typedef struct {
/* Number of user space windows opened in process mm_context */
atomic_t vas_windows;
 
+#ifdef CONFIG_PPC_64S_HASH_MMU
struct hash_mm_context *hash_context;
+#endif
 
void __user *vdso;
/*
@@ -133,6 +137,7 @@ typedef struct {
 #endif
 } mm_context_t;
 
+#ifdef CONFIG_PPC_64S_HASH_MMU
 static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
 {
return ctx->hash_context->user_psize;
@@ -187,11 +192,22 @@ static inline struct subpage_prot_table 
*mm_ctx_subpage_prot(mm_context_t *ctx)
 }
 #endif
 
+#endif
+
 /*
  * The current system page and segment sizes
  */
-extern int mmu_linear_psize;
+#if defined(CONFIG_PPC_RADIX_MMU) && !defined(CONFIG_PPC_64S_HASH_MMU)
+#ifdef CONFIG_PPC_64K_PAGES
+#define mmu_virtual_psize MMU_PAGE_64K
+#else
+#define mmu_virtual_psize MMU_PAGE_4K
+#endif
+#else
 extern int 

[PATCH v1 08/11] powerpc/64s: Make flush_and_reload_slb a no-op when radix is enabled

2021-10-15 Thread Nicholas Piggin
The radix test can exclude slb_flush_all_realmode() from being called
because flush_and_reload_slb() is only expected to flush ERAT when
called by flush_erat(), which is only on pre-ISA v3.0 CPUs that do not
support radix.

This helps the later change to make hash support configurable to not
introduce runtime changes to radix mode behaviour.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce_power.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index c2f55fe7092d..cf5263b648fc 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -80,12 +80,12 @@ static bool mce_in_guest(void)
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void)
 {
-   /* Invalidate all SLBs */
-   slb_flush_all_realmode();
-
if (early_radix_enabled())
return;
 
+   /* Invalidate all SLBs */
+   slb_flush_all_realmode();
+
/*
 * This probably shouldn't happen, but it may be possible it's
 * called in early boot before SLB shadows are allocated.
-- 
2.23.0



[PATCH v1 07/11] powerpc/64s: move THP trace point creation out of hash specific file

2021-10-15 Thread Nicholas Piggin
In preparation for making hash MMU support configurable, move THP
trace point function definitions out of an otherwise hash specific
file.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/book3s64/Makefile   | 2 +-
 arch/powerpc/mm/book3s64/hash_pgtable.c | 1 -
 arch/powerpc/mm/book3s64/pgtable.c  | 1 +
 arch/powerpc/mm/book3s64/trace.c| 8 
 4 files changed, 10 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/mm/book3s64/trace.c

diff --git a/arch/powerpc/mm/book3s64/Makefile 
b/arch/powerpc/mm/book3s64/Makefile
index 319f4b7f3357..1579e18e098d 100644
--- a/arch/powerpc/mm/book3s64/Makefile
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -5,7 +5,7 @@ ccflags-y   := $(NO_MINIMAL_TOC)
 CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
 
 obj-y  += hash_pgtable.o hash_utils.o slb.o \
-  mmu_context.o pgtable.o hash_tlb.o
+  mmu_context.o pgtable.o hash_tlb.o trace.o
 obj-$(CONFIG_PPC_HASH_MMU_NATIVE)  += hash_native.o
 obj-$(CONFIG_PPC_RADIX_MMU)+= radix_pgtable.o radix_tlb.o
 obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c 
b/arch/powerpc/mm/book3s64/hash_pgtable.c
index ad5eff097d31..7ce8914992e3 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -16,7 +16,6 @@
 
 #include 
 
-#define CREATE_TRACE_POINTS
 #include 
 
 #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / 
TASK_CONTEXT_SIZE))
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 9e16c7b1a6c5..049843c8c875 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -28,6 +28,7 @@ unsigned long __pmd_frag_size_shift;
 EXPORT_SYMBOL(__pmd_frag_size_shift);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+
 /*
  * This is called when relaxing access to a hugepage. It's also called in the 
page
  * fault path when we don't hit any of the major fault cases, ie, a minor
diff --git a/arch/powerpc/mm/book3s64/trace.c b/arch/powerpc/mm/book3s64/trace.c
new file mode 100644
index ..b86e7b906257
--- /dev/null
+++ b/arch/powerpc/mm/book3s64/trace.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file is for defining trace points and trace related helpers.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define CREATE_TRACE_POINTS
+#include 
+#endif
-- 
2.23.0



[PATCH v1 06/11] powerpc/pseries: lparcfg don't include slb_size line in radix mode

2021-10-15 Thread Nicholas Piggin
This avoids a change in behaviour in the later patch making hash
support configurable. This is possibly a user interface change, so
the alternative would be a hard-coded slb_size=0 here.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/lparcfg.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/lparcfg.c 
b/arch/powerpc/platforms/pseries/lparcfg.c
index f71eac74ea92..3354c00914fa 100644
--- a/arch/powerpc/platforms/pseries/lparcfg.c
+++ b/arch/powerpc/platforms/pseries/lparcfg.c
@@ -532,7 +532,8 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v)
   lppaca_shared_proc(get_lppaca()));
 
 #ifdef CONFIG_PPC_BOOK3S_64
-   seq_printf(m, "slb_size=%d\n", mmu_slb_size);
+   if (!radix_enabled())
+   seq_printf(m, "slb_size=%d\n", mmu_slb_size);
 #endif
parse_em_data(m);
maxmem_data(m);
-- 
2.23.0



[PATCH v1 05/11] powerpc/pseries: move pseries_lpar_register_process_table() out from hash specific code

2021-10-15 Thread Nicholas Piggin
This reduces ifdefs in a later change making hash support configurable.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/lpar.c | 56 +--
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 3df6bdfea475..06d6a824c0dc 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -712,6 +712,34 @@ void vpa_init(int cpu)
 
 #ifdef CONFIG_PPC_BOOK3S_64
 
+static int pseries_lpar_register_process_table(unsigned long base,
+   unsigned long page_size, unsigned long table_size)
+{
+   long rc;
+   unsigned long flags = 0;
+
+   if (table_size)
+   flags |= PROC_TABLE_NEW;
+   if (radix_enabled()) {
+   flags |= PROC_TABLE_RADIX;
+   if (mmu_has_feature(MMU_FTR_GTSE))
+   flags |= PROC_TABLE_GTSE;
+   } else
+   flags |= PROC_TABLE_HPT_SLB;
+   for (;;) {
+   rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
+   page_size, table_size);
+   if (!H_IS_LONG_BUSY(rc))
+   break;
+   mdelay(get_longbusy_msecs(rc));
+   }
+   if (rc != H_SUCCESS) {
+   pr_err("Failed to register process table (rc=%ld)\n", rc);
+   BUG();
+   }
+   return rc;
+}
+
 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
 unsigned long vpn, unsigned long pa,
 unsigned long rflags, unsigned long vflags,
@@ -1680,34 +1708,6 @@ static int pseries_lpar_resize_hpt(unsigned long shift)
return 0;
 }
 
-static int pseries_lpar_register_process_table(unsigned long base,
-   unsigned long page_size, unsigned long table_size)
-{
-   long rc;
-   unsigned long flags = 0;
-
-   if (table_size)
-   flags |= PROC_TABLE_NEW;
-   if (radix_enabled()) {
-   flags |= PROC_TABLE_RADIX;
-   if (mmu_has_feature(MMU_FTR_GTSE))
-   flags |= PROC_TABLE_GTSE;
-   } else
-   flags |= PROC_TABLE_HPT_SLB;
-   for (;;) {
-   rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
-   page_size, table_size);
-   if (!H_IS_LONG_BUSY(rc))
-   break;
-   mdelay(get_longbusy_msecs(rc));
-   }
-   if (rc != H_SUCCESS) {
-   pr_err("Failed to register process table (rc=%ld)\n", rc);
-   BUG();
-   }
-   return rc;
-}
-
 void __init hpte_init_pseries(void)
 {
mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate;
-- 
2.23.0



[PATCH v1 04/11] powerpc/64s: Move and rename do_bad_slb_fault as it is not hash specific

2021-10-15 Thread Nicholas Piggin
slb.c is hash-specific SLB management, but do_bad_slb_fault deals with
segment interrupts that occur with radix MMU as well.
---
 arch/powerpc/include/asm/interrupt.h |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  4 ++--
 arch/powerpc/mm/book3s64/slb.c   | 16 
 arch/powerpc/mm/fault.c  | 17 +
 4 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index a1d238255f07..3487aab12229 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -564,7 +564,7 @@ DECLARE_INTERRUPT_HANDLER(kernel_bad_stack);
 
 /* slb.c */
 DECLARE_INTERRUPT_HANDLER_RAW(do_slb_fault);
-DECLARE_INTERRUPT_HANDLER(do_bad_slb_fault);
+DECLARE_INTERRUPT_HANDLER(do_bad_segment_interrupt);
 
 /* hash_utils.c */
 DECLARE_INTERRUPT_HANDLER_RAW(do_hash_fault);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index eaf1f72131a1..046c99e31d01 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1430,7 +1430,7 @@ MMU_FTR_SECTION_ELSE
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
std r3,RESULT(r1)
addir3,r1,STACK_FRAME_OVERHEAD
-   bl  do_bad_slb_fault
+   bl  do_bad_segment_interrupt
b   interrupt_return_srr
 
 
@@ -1510,7 +1510,7 @@ MMU_FTR_SECTION_ELSE
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
std r3,RESULT(r1)
addir3,r1,STACK_FRAME_OVERHEAD
-   bl  do_bad_slb_fault
+   bl  do_bad_segment_interrupt
b   interrupt_return_srr
 
 
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index f0037bcc47a0..31f4cef3adac 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -868,19 +868,3 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault)
return err;
}
 }
-
-DEFINE_INTERRUPT_HANDLER(do_bad_slb_fault)
-{
-   int err = regs->result;
-
-   if (err == -EFAULT) {
-   if (user_mode(regs))
-   _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
-   else
-   bad_page_fault(regs, SIGSEGV);
-   } else if (err == -EINVAL) {
-   unrecoverable_exception(regs);
-   } else {
-   BUG();
-   }
-}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index a8d0ce85d39a..53ddcae0ac9e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -620,4 +621,20 @@ DEFINE_INTERRUPT_HANDLER(do_bad_page_fault_segv)
 {
bad_page_fault(regs, SIGSEGV);
 }
+
+DEFINE_INTERRUPT_HANDLER(do_bad_segment_interrupt)
+{
+   int err = regs->result;
+
+   if (err == -EFAULT) {
+   if (user_mode(regs))
+   _exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
+   else
+   bad_page_fault(regs, SIGSEGV);
+   } else if (err == -EINVAL) {
+   unrecoverable_exception(regs);
+   } else {
+   BUG();
+   }
+}
 #endif
-- 
2.23.0



[PATCH v1 03/11] powerpc/pseries: Stop selecting PPC_HASH_MMU_NATIVE

2021-10-15 Thread Nicholas Piggin
The pseries platform does not use the native hash code but the PAPR
virtualised hash interfaces, so remove PPC_HASH_MMU_NATIVE.

This requires moving tlbiel code from hash_native.c to hash_utils.c.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   4 -
 arch/powerpc/mm/book3s64/hash_native.c| 104 --
 arch/powerpc/mm/book3s64/hash_utils.c | 104 ++
 arch/powerpc/platforms/pseries/Kconfig|   1 -
 4 files changed, 104 insertions(+), 109 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 215973b4cb26..d2e80f178b6d 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -14,7 +14,6 @@ enum {
TLB_INVAL_SCOPE_LPID = 1,   /* invalidate TLBs for current LPID */
 };
 
-#ifdef CONFIG_PPC_NATIVE
 static inline void tlbiel_all(void)
 {
/*
@@ -30,9 +29,6 @@ static inline void tlbiel_all(void)
else
hash__tlbiel_all(TLB_INVAL_SCOPE_GLOBAL);
 }
-#else
-static inline void tlbiel_all(void) { BUG(); }
-#endif
 
 static inline void tlbiel_all_lpid(bool radix)
 {
diff --git a/arch/powerpc/mm/book3s64/hash_native.c 
b/arch/powerpc/mm/book3s64/hash_native.c
index d8279bfe68ea..d2a320828c0b 100644
--- a/arch/powerpc/mm/book3s64/hash_native.c
+++ b/arch/powerpc/mm/book3s64/hash_native.c
@@ -43,110 +43,6 @@
 
 static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
-static inline void tlbiel_hash_set_isa206(unsigned int set, unsigned int is)
-{
-   unsigned long rb;
-
-   rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-
-   asm volatile("tlbiel %0" : : "r" (rb));
-}
-
-/*
- * tlbiel instruction for hash, set invalidation
- * i.e., r=1 and is=01 or is=10 or is=11
- */
-static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned 
int is,
-   unsigned int pid,
-   unsigned int ric, unsigned int prs)
-{
-   unsigned long rb;
-   unsigned long rs;
-   unsigned int r = 0; /* hash format */
-
-   rb = (set << PPC_BITLSHIFT(51)) | (is << PPC_BITLSHIFT(53));
-   rs = ((unsigned long)pid << PPC_BITLSHIFT(31));
-
-   asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4)
-: : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r)
-: "memory");
-}
-
-
-static void tlbiel_all_isa206(unsigned int num_sets, unsigned int is)
-{
-   unsigned int set;
-
-   asm volatile("ptesync": : :"memory");
-
-   for (set = 0; set < num_sets; set++)
-   tlbiel_hash_set_isa206(set, is);
-
-   ppc_after_tlbiel_barrier();
-}
-
-static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is)
-{
-   unsigned int set;
-
-   asm volatile("ptesync": : :"memory");
-
-   /*
-* Flush the partition table cache if this is HV mode.
-*/
-   if (early_cpu_has_feature(CPU_FTR_HVMODE))
-   tlbiel_hash_set_isa300(0, is, 0, 2, 0);
-
-   /*
-* Now invalidate the process table cache. UPRT=0 HPT modes (what
-* current hardware implements) do not use the process table, but
-* add the flushes anyway.
-*
-* From ISA v3.0B p. 1078:
-* The following forms are invalid.
-*  * PRS=1, R=0, and RIC!=2 (The only process-scoped
-*HPT caching is of the Process Table.)
-*/
-   tlbiel_hash_set_isa300(0, is, 0, 2, 1);
-
-   /*
-* Then flush the sets of the TLB proper. Hash mode uses
-* partition scoped TLB translations, which may be flushed
-* in !HV mode.
-*/
-   for (set = 0; set < num_sets; set++)
-   tlbiel_hash_set_isa300(set, is, 0, 0, 0);
-
-   ppc_after_tlbiel_barrier();
-
-   asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory");
-}
-
-void hash__tlbiel_all(unsigned int action)
-{
-   unsigned int is;
-
-   switch (action) {
-   case TLB_INVAL_SCOPE_GLOBAL:
-   is = 3;
-   break;
-   case TLB_INVAL_SCOPE_LPID:
-   is = 2;
-   break;
-   default:
-   BUG();
-   }
-
-   if (early_cpu_has_feature(CPU_FTR_ARCH_300))
-   tlbiel_all_isa300(POWER9_TLB_SETS_HASH, is);
-   else if (early_cpu_has_feature(CPU_FTR_ARCH_207S))
-   tlbiel_all_isa206(POWER8_TLB_SETS, is);
-   else if (early_cpu_has_feature(CPU_FTR_ARCH_206))
-   tlbiel_all_isa206(POWER7_TLB_SETS, is);
-   else
-   WARN(1, "%s called on pre-POWER7 CPU\n", __func__);
-}
-
 static inline unsigned long  ___tlbie(unsigned long vpn, int psize,
int apsize, int ssize)
 {
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index ebe3044711ce..ffc52ff0b3f0 

[PATCH v1 02/11] powerpc: Rename PPC_NATIVE to PPC_HASH_MMU_NATIVE

2021-10-15 Thread Nicholas Piggin
PPC_NATIVE now only controls the native HPT code, so rename it to be
more descriptive. Restrict it to Book3S only.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/book3s64/Makefile  | 2 +-
 arch/powerpc/mm/book3s64/hash_utils.c  | 2 +-
 arch/powerpc/platforms/52xx/Kconfig| 2 +-
 arch/powerpc/platforms/Kconfig | 4 ++--
 arch/powerpc/platforms/cell/Kconfig| 2 +-
 arch/powerpc/platforms/chrp/Kconfig| 2 +-
 arch/powerpc/platforms/embedded6xx/Kconfig | 2 +-
 arch/powerpc/platforms/maple/Kconfig   | 2 +-
 arch/powerpc/platforms/microwatt/Kconfig   | 2 +-
 arch/powerpc/platforms/pasemi/Kconfig  | 2 +-
 arch/powerpc/platforms/powermac/Kconfig| 2 +-
 arch/powerpc/platforms/powernv/Kconfig | 2 +-
 arch/powerpc/platforms/pseries/Kconfig | 2 +-
 13 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/Makefile 
b/arch/powerpc/mm/book3s64/Makefile
index 1b56d3af47d4..319f4b7f3357 100644
--- a/arch/powerpc/mm/book3s64/Makefile
+++ b/arch/powerpc/mm/book3s64/Makefile
@@ -6,7 +6,7 @@ CFLAGS_REMOVE_slb.o = $(CC_FLAGS_FTRACE)
 
 obj-y  += hash_pgtable.o hash_utils.o slb.o \
   mmu_context.o pgtable.o hash_tlb.o
-obj-$(CONFIG_PPC_NATIVE)   += hash_native.o
+obj-$(CONFIG_PPC_HASH_MMU_NATIVE)  += hash_native.o
 obj-$(CONFIG_PPC_RADIX_MMU)+= radix_pgtable.o radix_tlb.o
 obj-$(CONFIG_PPC_4K_PAGES) += hash_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)+= hash_64k.o
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index c145776d3ae5..ebe3044711ce 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1091,7 +1091,7 @@ void __init hash__early_init_mmu(void)
ps3_early_mm_init();
else if (firmware_has_feature(FW_FEATURE_LPAR))
hpte_init_pseries();
-   else if (IS_ENABLED(CONFIG_PPC_NATIVE))
+   else if (IS_ENABLED(CONFIG_PPC_HASH_MMU_NATIVE))
hpte_init_native();
 
if (!mmu_hash_ops.hpte_insert)
diff --git a/arch/powerpc/platforms/52xx/Kconfig 
b/arch/powerpc/platforms/52xx/Kconfig
index 99d60acc20c8..b72ed2950ca8 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -34,7 +34,7 @@ config PPC_EFIKA
bool "bPlan Efika 5k2. MPC5200B based computer"
depends on PPC_MPC52xx
select PPC_RTAS
-   select PPC_NATIVE
+   select PPC_HASH_MMU_NATIVE
 
 config PPC_LITE5200
bool "Freescale Lite5200 Eval Board"
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e02d29a9d12f..d41dad227de8 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -40,9 +40,9 @@ config EPAPR_PARAVIRT
 
  In case of doubt, say Y
 
-config PPC_NATIVE
+config PPC_HASH_MMU_NATIVE
bool
-   depends on PPC_BOOK3S_32 || PPC64
+   depends on PPC_BOOK3S
help
  Support for running natively on the hardware, i.e. without
  a hypervisor. This option is not user-selectable but should
diff --git a/arch/powerpc/platforms/cell/Kconfig 
b/arch/powerpc/platforms/cell/Kconfig
index cb70c5f25bc6..db4465c51b56 100644
--- a/arch/powerpc/platforms/cell/Kconfig
+++ b/arch/powerpc/platforms/cell/Kconfig
@@ -8,7 +8,7 @@ config PPC_CELL_COMMON
select PPC_DCR_MMIO
select PPC_INDIRECT_PIO
select PPC_INDIRECT_MMIO
-   select PPC_NATIVE
+   select PPC_HASH_MMU_NATIVE
select PPC_RTAS
select IRQ_EDGE_EOI_HANDLER
 
diff --git a/arch/powerpc/platforms/chrp/Kconfig 
b/arch/powerpc/platforms/chrp/Kconfig
index 9b5c5505718a..ff30ed579a39 100644
--- a/arch/powerpc/platforms/chrp/Kconfig
+++ b/arch/powerpc/platforms/chrp/Kconfig
@@ -11,6 +11,6 @@ config PPC_CHRP
select RTAS_ERROR_LOGGING
select PPC_MPC106
select PPC_UDBG_16550
-   select PPC_NATIVE
+   select PPC_HASH_MMU_NATIVE
select FORCE_PCI
default y
diff --git a/arch/powerpc/platforms/embedded6xx/Kconfig 
b/arch/powerpc/platforms/embedded6xx/Kconfig
index 4c6d703a4284..c54786f8461e 100644
--- a/arch/powerpc/platforms/embedded6xx/Kconfig
+++ b/arch/powerpc/platforms/embedded6xx/Kconfig
@@ -55,7 +55,7 @@ config MVME5100
select FORCE_PCI
select PPC_INDIRECT_PCI
select PPC_I8259
-   select PPC_NATIVE
+   select PPC_HASH_MMU_NATIVE
select PPC_UDBG_16550
help
  This option enables support for the Motorola (now Emerson) MVME5100
diff --git a/arch/powerpc/platforms/maple/Kconfig 
b/arch/powerpc/platforms/maple/Kconfig
index 86ae210bee9a..7fd84311ade5 100644
--- a/arch/powerpc/platforms/maple/Kconfig
+++ b/arch/powerpc/platforms/maple/Kconfig
@@ -9,7 +9,7 @@ config PPC_MAPLE
select GENERIC_TBSYNC
select PPC_UDBG_16550
select PPC_970_NAP
-   select PPC_NATIVE
+   select 

[PATCH v1 00/11] powerpc: Make hash MMU code build configurable

2021-10-15 Thread Nicholas Piggin
Now that there's a platform that can make good use of it, here's
a series that can prevent the hash MMU code being built for 64s
platforms that don't need it.

Thanks Christophe and Michael for reviews of the RFC, I hope I
got all the issues raised.

Since RFC:
- Split out large code movement from other changes.
- Used mmu ftr test constant folding rather than adding new constant
  true/false for radix_enabled().
- Restore tlbie trace point that had to be commented out in the
  previous.
- Avoid minor (probably unreachable) behaviour change in machine check
  handler when hash was not compiled.
- Fix microwatt updates so !HASH is not enforced.
- Rebase, build fixes.

Thanks,
Nick

Nicholas Piggin (11):
  powerpc: Remove unused FW_FEATURE_NATIVE references
  powerpc: Rename PPC_NATIVE to PPC_HASH_MMU_NATIVE
  powerpc/pseries: Stop selecting PPC_HASH_MMU_NATIVE
  powerpc/64s: Move and rename do_bad_slb_fault as it is not hash
specific
  powerpc/pseries: move pseries_lpar_register_process_table() out from
hash specific code
  powerpc/pseries: lparcfg don't include slb_size line in radix mode
  powerpc/64s: move THP trace point creation out of hash specific file
  powerpc/64s: Make flush_and_reload_slb a no-op when radix is enabled
  powerpc/64s: Make hash MMU code build configurable
  powerpc/configs/microwatt: add POWER9_CPU
  powerpc/microwatt: Don't select the hash MMU code

 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/configs/microwatt_defconfig  |   2 +-
 arch/powerpc/include/asm/book3s/64/mmu.h  |  22 +++-
 .../include/asm/book3s/64/tlbflush-hash.h |   7 ++
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   4 -
 arch/powerpc/include/asm/book3s/pgtable.h |   4 +
 arch/powerpc/include/asm/firmware.h   |   8 --
 arch/powerpc/include/asm/interrupt.h  |   2 +-
 arch/powerpc/include/asm/mmu.h|  14 ++-
 arch/powerpc/include/asm/mmu_context.h|   2 +
 arch/powerpc/include/asm/paca.h   |   8 ++
 arch/powerpc/kernel/asm-offsets.c |   2 +
 arch/powerpc/kernel/dt_cpu_ftrs.c |   8 +-
 arch/powerpc/kernel/entry_64.S|   4 +-
 arch/powerpc/kernel/exceptions-64s.S  |  20 ++-
 arch/powerpc/kernel/mce.c |   2 +-
 arch/powerpc/kernel/mce_power.c   |  16 ++-
 arch/powerpc/kernel/paca.c|  18 ++-
 arch/powerpc/kernel/process.c |  13 +-
 arch/powerpc/kernel/prom.c|   2 +
 arch/powerpc/kernel/setup_64.c|   4 +
 arch/powerpc/kexec/core_64.c  |   4 +-
 arch/powerpc/kexec/ranges.c   |   4 +
 arch/powerpc/kvm/Kconfig  |   1 +
 arch/powerpc/mm/book3s64/Makefile |  19 +--
 arch/powerpc/mm/book3s64/hash_native.c| 104 
 arch/powerpc/mm/book3s64/hash_pgtable.c   |   1 -
 arch/powerpc/mm/book3s64/hash_utils.c | 116 --
 .../{hash_hugetlbpage.c => hugetlbpage.c} |   6 +
 arch/powerpc/mm/book3s64/mmu_context.c|  16 +++
 arch/powerpc/mm/book3s64/pgtable.c|  13 ++
 arch/powerpc/mm/book3s64/radix_pgtable.c  |   4 +
 arch/powerpc/mm/book3s64/slb.c|  16 ---
 arch/powerpc/mm/book3s64/trace.c  |   8 ++
 arch/powerpc/mm/copro_fault.c |   2 +
 arch/powerpc/mm/fault.c   |  17 +++
 arch/powerpc/mm/pgtable.c |  10 +-
 arch/powerpc/platforms/52xx/Kconfig   |   2 +-
 arch/powerpc/platforms/Kconfig|   4 +-
 arch/powerpc/platforms/Kconfig.cputype|  21 +++-
 arch/powerpc/platforms/cell/Kconfig   |   3 +-
 arch/powerpc/platforms/chrp/Kconfig   |   2 +-
 arch/powerpc/platforms/embedded6xx/Kconfig|   2 +-
 arch/powerpc/platforms/maple/Kconfig  |   3 +-
 arch/powerpc/platforms/microwatt/Kconfig  |   1 -
 arch/powerpc/platforms/pasemi/Kconfig |   3 +-
 arch/powerpc/platforms/powermac/Kconfig   |   3 +-
 arch/powerpc/platforms/powernv/Kconfig|   2 +-
 arch/powerpc/platforms/powernv/idle.c |   2 +
 arch/powerpc/platforms/powernv/setup.c|   2 +
 arch/powerpc/platforms/pseries/Kconfig|   1 -
 arch/powerpc/platforms/pseries/lpar.c |  67 +-
 arch/powerpc/platforms/pseries/lparcfg.c  |   5 +-
 arch/powerpc/platforms/pseries/mobility.c |   6 +
 arch/powerpc/platforms/pseries/ras.c  |   2 +
 arch/powerpc/platforms/pseries/reconfig.c |   2 +
 arch/powerpc/platforms/pseries/setup.c|   6 +-
 arch/powerpc/xmon/xmon.c  |   8 +-
 58 files changed, 410 insertions(+), 241 deletions(-)
 rename arch/powerpc/mm/book3s64/{hash_hugetlbpage.c => hugetlbpage.c} (95%)
 create mode 100644 arch/powerpc/mm/book3s64/trace.c

-- 
2.23.0



[PATCH v1 01/11] powerpc: Remove unused FW_FEATURE_NATIVE references

2021-10-15 Thread Nicholas Piggin
FW_FEATURE_NATIVE_ALWAYS and FW_FEATURE_NATIVE_POSSIBLE are always
zero and never do anything. Remove them.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/firmware.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 97a3bd9ffeb9..9b702d2b80fb 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -80,8 +80,6 @@ enum {
FW_FEATURE_POWERNV_ALWAYS = 0,
FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1,
-   FW_FEATURE_NATIVE_POSSIBLE = 0,
-   FW_FEATURE_NATIVE_ALWAYS = 0,
FW_FEATURE_POSSIBLE =
 #ifdef CONFIG_PPC_PSERIES
FW_FEATURE_PSERIES_POSSIBLE |
@@ -91,9 +89,6 @@ enum {
 #endif
 #ifdef CONFIG_PPC_PS3
FW_FEATURE_PS3_POSSIBLE |
-#endif
-#ifdef CONFIG_PPC_NATIVE
-   FW_FEATURE_NATIVE_ALWAYS |
 #endif
0,
FW_FEATURE_ALWAYS =
@@ -105,9 +100,6 @@ enum {
 #endif
 #ifdef CONFIG_PPC_PS3
FW_FEATURE_PS3_ALWAYS &
-#endif
-#ifdef CONFIG_PPC_NATIVE
-   FW_FEATURE_NATIVE_ALWAYS &
 #endif
FW_FEATURE_POSSIBLE,
 
-- 
2.23.0



Re: [PATCH] powerpc/smp: do not decrement idle task preempt count in CPU offline

2021-10-15 Thread Valentin Schneider
On 15/10/21 09:55, Nathan Lynch wrote:
> With PREEMPT_COUNT=y, when a CPU is offlined and then onlined again, we
> get:
>
> BUG: scheduling while atomic: swapper/1/0/0x00000000
> no locks held by swapper/1/0.
> CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0-rc2+ #100
> Call Trace:
>  dump_stack_lvl+0xac/0x108
>  __schedule_bug+0xac/0xe0
>  __schedule+0xcf8/0x10d0
>  schedule_idle+0x3c/0x70
>  do_idle+0x2d8/0x4a0
>  cpu_startup_entry+0x38/0x40
>  start_secondary+0x2ec/0x3a0
>  start_secondary_prolog+0x10/0x14
>
> This is because powerpc's arch_cpu_idle_dead() decrements the idle task's
> preempt count, for reasons explained in commit a7c2bb8279d2 ("powerpc:
> Re-enable preemption before cpu_die()"), specifically "start_secondary()
> expects a preempt_count() of 0."
>
> However, since commit 2c669ef6979c ("powerpc/preempt: Don't touch the idle
> task's preempt_count during hotplug") and commit f1a0a376ca0c ("sched/core:
> Initialize the idle task with preemption disabled"), that justification no
> longer holds.
>
> The idle task isn't supposed to re-enable preemption, so remove the
> vestigial preempt_enable() from the CPU offline path.
>

Humph, I got confused because 2c669ef6979c explicitly mentions hotplug,
but that's the hotplug machinery which is already involved for bringing up
the secondaries at boot time.

IIUC your issue here is the preempt_count being messed up when
hot-unplugging a CPU, which leads to fireworks during hotplug (IOW I didn't
test my last patch against hotplug - my bad!)

Reviewed-by: Valentin Schneider 

> Tested with pseries and powernv in qemu, and pseries on PowerVM.
>
> Fixes: 2c669ef6979c ("powerpc/preempt: Don't touch the idle task's 
> preempt_count during hotplug")
> Fixes: f1a0a376ca0c ("sched/core: Initialize the idle task with preemption 
> disabled")

I think only the first Fixes: is needed.

> Signed-off-by: Nathan Lynch 
> ---
>  arch/powerpc/kernel/smp.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 9cc7d3dbf439..605bab448f84 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1730,8 +1730,6 @@ void __cpu_die(unsigned int cpu)
>
>  void arch_cpu_idle_dead(void)
>  {
> - sched_preempt_enable_no_resched();
> -
>   /*
>* Disable on the down path. This will be re-enabled by
>* start_secondary() via start_secondary_resume() below
> --
> 2.31.1


[PATCH] tracing: Have all levels of checks prevent recursion

2021-10-15 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

While writing an email explaining the "bit = 0" logic for a discussion on
making ftrace_test_recursion_trylock() disable preemption, I discovered a
path that makes the "not do the logic if bit is zero" unsafe.

The recursion logic is done in hot paths like the function tracer. Thus,
any code executed causes noticeable overhead. Thus, tricks are done to try
to limit the amount of code executed. This included the recursion testing
logic.

Having recursion testing is important, as there are many paths that can
end up in an infinite recursion cycle when tracing every function in the
kernel. Thus protection is needed to prevent that from happening.

Because it is OK to recurse due to different running context levels (e.g.
an interrupt preempts a trace, and then a trace occurs in the interrupt
handler), a set of bits are used to know which context one is in (normal,
softirq, irq and NMI). If a recursion occurs in the same level, it is
prevented*.

Then there are infrastructure levels of recursion as well. When more than
one callback is attached to the same function to trace, it calls a loop
function to iterate over all the callbacks. Both the callbacks and the
loop function have recursion protection. The callbacks use the
"ftrace_test_recursion_trylock()" which has a "function" set of context
bits to test, and the loop function calls the internal
trace_test_and_set_recursion() directly, with an "internal" set of bits.

If an architecture does not implement all the features supported by ftrace
then the callbacks are never called directly, and the loop function is
called instead, which will implement the features of ftrace.

Since both the loop function and the callbacks do recursion protection, it
was seemed unnecessary to do it in both locations. Thus, a trick was made
to have the internal set of recursion bits at a more significant bit
location than the function bits. Then, if any of the higher bits were set,
the logic of the function bits could be skipped, as any new recursion
would first have to go through the loop function.

This is true for architectures that do not support all the ftrace
features, because all functions being traced must first go through the
loop function before going to the callbacks. But this is not true for
architectures that support all the ftrace features. That's because the
loop function could be called due to two callbacks attached to the same
function, but then a recursion function inside the callback could be
called that does not share any other callback, and it will be called
directly.

i.e.

 traced_function_1: [ more than one callback tracing it ]
   call loop_func

 loop_func:
   trace_recursion set internal bit
   call callback

 callback:
   trace_recursion [ skipped because internal bit is set, return 0 ]
   call traced_function_2

 traced_function_2: [ only traced by above callback ]
   call callback

 callback:
   trace_recursion [ skipped because internal bit is set, return 0 ]
   call traced_function_2

 [ wash, rinse, repeat, BOOM! out of shampoo! ]

Thus, the "bit == 0 skip" trick is not safe, unless the loop function is
call for all functions.

Since we want to encourage architectures to implement all ftrace features,
having them slow down due to this extra logic may encourage the
maintainers to update to the latest ftrace features. And because this
logic is only safe for them, remove it completely.

 [*] There is on layer of recursion that is allowed, and that is to allow
 for the transition between interrupt context (normal -> softirq ->
 irq -> NMI), because a trace may occur before the context update is
 visible to the trace recursion logic.

Link: 
https://lore.kernel.org/all/609b565a-ed6e-a1da-f025-166691b5d...@linux.alibaba.com/

Cc: stable@vger.kernel.org
Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks")
Signed-off-by: Steven Rostedt (VMware) 
---
 include/linux/trace_recursion.h | 41 +
 1 file changed, 6 insertions(+), 35 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index a9f9c5714e65..168fdf07419a 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -16,23 +16,8 @@
  *  When function tracing occurs, the following steps are made:
  *   If arch does not support a ftrace feature:
  *call internal function (uses INTERNAL bits) which calls...
- *   If callback is registered to the "global" list, the list
- *function is called and recursion checks the GLOBAL bits.
- *then this function calls...
  *   The function callback, which can use the FTRACE bits to
  *check for recursion.
- *
- * Now if the arch does not support a feature, and it calls
- * the global list function which calls the ftrace callback
- * all three of these steps will do a recursion protection.
- * There's no reason to do one if the previous caller already
- * did. The recursion that we are 

[PATCH] powerpc/smp: do not decrement idle task preempt count in CPU offline

2021-10-15 Thread Nathan Lynch
With PREEMPT_COUNT=y, when a CPU is offlined and then onlined again, we
get:

BUG: scheduling while atomic: swapper/1/0/0x00000000
no locks held by swapper/1/0.
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.15.0-rc2+ #100
Call Trace:
 dump_stack_lvl+0xac/0x108
 __schedule_bug+0xac/0xe0
 __schedule+0xcf8/0x10d0
 schedule_idle+0x3c/0x70
 do_idle+0x2d8/0x4a0
 cpu_startup_entry+0x38/0x40
 start_secondary+0x2ec/0x3a0
 start_secondary_prolog+0x10/0x14

This is because powerpc's arch_cpu_idle_dead() decrements the idle task's
preempt count, for reasons explained in commit a7c2bb8279d2 ("powerpc:
Re-enable preemption before cpu_die()"), specifically "start_secondary()
expects a preempt_count() of 0."

However, since commit 2c669ef6979c ("powerpc/preempt: Don't touch the idle
task's preempt_count during hotplug") and commit f1a0a376ca0c ("sched/core:
Initialize the idle task with preemption disabled"), that justification no
longer holds.

The idle task isn't supposed to re-enable preemption, so remove the
vestigial preempt_enable() from the CPU offline path.

Tested with pseries and powernv in qemu, and pseries on PowerVM.

Fixes: 2c669ef6979c ("powerpc/preempt: Don't touch the idle task's 
preempt_count during hotplug")
Fixes: f1a0a376ca0c ("sched/core: Initialize the idle task with preemption 
disabled")
Signed-off-by: Nathan Lynch 
---
 arch/powerpc/kernel/smp.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 9cc7d3dbf439..605bab448f84 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1730,8 +1730,6 @@ void __cpu_die(unsigned int cpu)
 
 void arch_cpu_idle_dead(void)
 {
-   sched_preempt_enable_no_resched();
-
/*
 * Disable on the down path. This will be re-enabled by
 * start_secondary() via start_secondary_resume() below
-- 
2.31.1



[PATCH v2 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware

2021-10-15 Thread Naveen Naidu
An MMIO read from a PCI device that doesn't exist or doesn't respond
causes a PCI error.  There's no real data to return to satisfy the
CPU read, so most hardware fabricates ~0 data.

Use RESPONSE_IS_PCI_ERROR() to check the response we get when we read
data from hardware.

This helps unify PCI error response checking and make error checks
consistent and easier to find.

Compile tested only.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/dpc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index c556e7beafe3..561c44d9429c 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -79,7 +79,7 @@ static bool dpc_completed(struct pci_dev *pdev)
u16 status;
 
	pci_read_config_word(pdev, pdev->dpc_cap + PCI_EXP_DPC_STATUS, &status);
-	if ((status != 0xffff) && (status & PCI_EXP_DPC_STATUS_TRIGGER))
+	if ((!RESPONSE_IS_PCI_ERROR(&status)) && (status & 
PCI_EXP_DPC_STATUS_TRIGGER))
return false;
 
if (test_bit(PCI_DPC_RECOVERING, >priv_flags))
@@ -312,7 +312,7 @@ static irqreturn_t dpc_irq(int irq, void *context)
 
	pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
 
-	if (!(status & PCI_EXP_DPC_STATUS_INTERRUPT) || status == (u16)(~0))
+	if (!(status & PCI_EXP_DPC_STATUS_INTERRUPT) || 
RESPONSE_IS_PCI_ERROR(&status))
return IRQ_NONE;
 
pci_write_config_word(pdev, cap + PCI_EXP_DPC_STATUS,
-- 
2.25.1



Re: [PATCH v2 00/24] Unify PCI error response checking

2021-10-15 Thread Naveen Naidu
On 15/10, Naveen Naidu wrote:
> An MMIO read from a PCI device that doesn't exist or doesn't respond
> causes a PCI error.  There's no real data to return to satisfy the 
> CPU read, so most hardware fabricates ~0 data.
> 
> This patch series adds PCI_ERROR_RESPONSE definition and other helper
> definition SET_PCI_ERROR_RESPONSE and RESPONSE_IS_PCI_ERROR and uses it
> where appropriate to make these checks consistent and easier to find.
> 
> This helps unify PCI error response checking and make error check
> consistent and easier to find.
> 
> This series also ensures that the error response fabrication now happens
> in the PCI_OP_READ and PCI_USER_READ_CONFIG. This removes the
> responsibility from controller drivers to do the error response setting. 
> 
> Patch 1:
> - Adds the PCI_ERROR_RESPONSE and other related definitions
> - All other patches are dependent on this patch. This patch needs to
>   be applied first, before the others
> 
> Patch 2:
> - Error fabrication happens in PCI_OP_READ and PCI_USER_READ_CONFIG
>   whenever the data read via the controller driver fails.
> - This patch needs to be applied before, Patch 4/24 to Patch 15/24 are
>   applied.
> 
> Patch 3:
> - Uses SET_PCI_ERROR_RESPONSE() when device is not found and
>   RESPONSE_IS_PCI_ERROR() to check hardware read from the hardware.
> 
> Patch 4 - 15:
> - Removes redundant error fabrication that happens in controller 
>   drivers when the read from a PCI device fails.
> - These patches are dependent on Patch 2/24 of the series.
> - These can be applied in any order.
> 
> Patch 16 - 22:
> - Uses RESPONSE_IS_PCI_ERROR() to check the reads from hardware
> - Patches can be applied in any order.
> 
> Patch 23 - 24:
> - Edits the comments to include PCI_ERROR_RESPONSE along with
>   0xffffffff, so that it becomes easier to grep for faulty 
>   hardware reads.
> 
> Changelog
> =
> 
> v2:
> - Instead of using SET_PCI_ERROR_RESPONSE in all controller drivers
>   to fabricate error response, only use them in PCI_OP_READ and
>   PCI_USER_READ_CONFIG
> 
> Naveen Naidu (24):
>  [PATCH 1/24] PCI: Add PCI_ERROR_RESPONSE and it's related definitions
>  [PATCH 2/24] PCI: Set error response in config access defines when 
> ops->read() fails
>  [PATCH 3/24] PCI: Unify PCI error response checking
>  [PATCH 4/24] PCI: Remove redundant error fabrication when device read fails
>  [PATCH 5/24] PCI: thunder: Remove redundant error fabrication when device 
> read fails
>  [PATCH 6/24] PCI: iproc: Remove redundant error fabrication when device read 
> fails
>  [PATCH 7/24] PCI: mediatek: Remove redundant error fabrication when device 
> read fails
>  [PATCH 8/24] PCI: exynos: Remove redundant error fabrication when device 
> read fails
>  [PATCH 9/24] PCI: histb: Remove redundant error fabrication when device read 
> fails
>  [PATCH 10/24] PCI: kirin: Remove redundant error fabrication when device 
> read fails
>  [PATCH 11/24] PCI: aardvark: Remove redundant error fabrication when device 
> read fails
>  [PATCH 12/24] PCI: mvebu: Remove redundant error fabrication when device 
> read fails
>  [PATCH 13/24] PCI: altera: Remove redundant error fabrication when device 
> read fails
>  [PATCH 14/24] PCI: rcar: Remove redundant error fabrication when device read 
> fails
>  [PATCH 15/24] PCI: rockchip: Remove redundant error fabrication when device 
> read fails
>  [PATCH 16/24] PCI/ERR: Use RESPONSE_IS_PCI_ERROR() to check read from 
> hardware
>  [PATCH 17/24] PCI: vmd: Use RESPONSE_IS_PCI_ERROR() to check read from 
> hardware
>  [PATCH 18/24] PCI: pciehp: Use RESPONSE_IS_PCI_ERROR() to check read from 
> hardware
>  [PATCH 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from 
> hardware
>  [PATCH 20/24] PCI/PME: Use RESPONSE_IS_PCI_ERROR() to check read from 
> hardware
>  [PATCH 21/24] PCI: cpqphp: Use RESPONSE_IS_PCI_ERROR() to check read from 
> hardware
>  [PATCH 22/24] PCI: keystone: Use PCI_ERROR_RESPONSE to specify hardware error
>  [PATCH 23/24] PCI: hv: Use PCI_ERROR_RESPONSE to specify hardware read error
>  [PATCH 24/24] PCI: xgene: Use PCI_ERROR_RESPONSE to specify hardware error
> 
>  drivers/pci/access.c| 36 
>  drivers/pci/controller/dwc/pci-exynos.c |  4 +-
>  drivers/pci/controller/dwc/pci-keystone.c   |  4 +-
>  drivers/pci/controller/dwc/pcie-histb.c |  4 +-
>  drivers/pci/controller/dwc/pcie-kirin.c |  4 +-
>  drivers/pci/controller/pci-aardvark.c   | 10 +
>  drivers/pci/controller/pci-hyperv.c |  2 +-
>  drivers/pci/controller/pci-mvebu.c  |  8 +---
>  drivers/pci/controller/pci-thunder-ecam.c   | 46 +++--
>  drivers/pci/controller/pci-thunder-pem.c|  4 +-
>  drivers/pci/controller/pci-xgene.c  |  8 ++--
>  drivers/pci/controller/pcie-altera.c|  4 +-
>  drivers/pci/controller/pcie-iproc.c |  4 +-
>  

Re: [PATCH v3 1/2] ftrace: disable preemption between ftrace_test_recursion_trylock/unlock()

2021-10-15 Thread Petr Mladek
On Fri 2021-10-15 11:13:08, 王贇 wrote:
> 
> 
> On 2021/10/14 下午11:14, Petr Mladek wrote:
> [snip]
> >> -  return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, 
> >> TRACE_FTRACE_MAX);
> >> +  int bit;
> >> +
> >> +  bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, 
> >> TRACE_FTRACE_MAX);
> >> +  /*
> >> +   * The zero bit indicate we are nested
> >> +   * in another trylock(), which means the
> >> +   * preemption already disabled.
> >> +   */
> >> +  if (bit > 0)
> >> +  preempt_disable_notrace();
> > 
> > Is this safe? The preemption is disabled only when
> > trace_test_and_set_recursion() was called by 
> > ftrace_test_recursion_trylock().
> > 
> > We must either always disable the preemtion when bit >= 0.
> > Or we have to disable the preemtion already in
> > trace_test_and_set_recursion().
> 
> Internal calling of trace_test_and_set_recursion() will disable preemption
> on succeed, it should be safe.

trace_test_and_set_recursion() does _not_ disable preemtion!
It works only because all callers disable the preemption.

It means that the comment is wrong. It is not guarantted that the
preemption will be disabled. It works only by chance.


> We can also consider move the logical into trace_test_and_set_recursion()
> and trace_clear_recursion(), but I'm not very sure about that... ftrace
> internally already make sure preemption disabled

How? Because it calls trace_test_and_set_recursion() via the trylock() API?


> , what uncovered is those users who call API trylock/unlock, isn't
> it?

And this is exactly the problem. trace_test_and_set_recursion() is in
a public header. Anyone could use it. And if anyone uses it in the
future without the trylock() and does not disable the preemtion
explicitely then we are lost again.

And it is even more dangerous. The original code disabled the
preemtion on various layers. As a result, the preemtion was disabled
several times for sure. It means that the deeper layers were
always on the safe side.

With this patch, if the first trace_test_and_set_recursion() caller
does not disable preemtion then trylock() will not disable it either
and the entire code is procceed with preemtion enabled.


> > Finally, the comment confused me a lot. The difference between nesting and
> > recursion is far from clear. And the code is tricky liky like hell :-)
> > I propose to add some comments, see below for a proposal.
> The comments do confusing, I'll make it something like:
> 
> The zero bit indicate trace recursion happened, whatever
> the recursively call was made by ftrace handler or ftrace
> itself, the preemption already disabled.

I am sorry but it is still confusing. We need to find a better way
how to clearly explain the difference between the safe and
unsafe recursion.

My understanding is that the recursion is:

  + "unsafe" when the trace code recursively enters the same trace point.

  + "safe" when ftrace_test_recursion_trylock() is called recursivelly
while still processing the same trace entry.

> >> +
> >> +  return bit;
> >>  }
> >>  /**
> >> @@ -222,9 +233,13 @@ static __always_inline int 
> >> ftrace_test_recursion_trylock(unsigned long ip,
> >>   * @bit: The return of a successful ftrace_test_recursion_trylock()
> >>   *
> >>   * This is used at the end of a ftrace callback.
> >> + *
> >> + * Preemption will be enabled (if it was previously enabled).
> >>   */
> >>  static __always_inline void ftrace_test_recursion_unlock(int bit)
> >>  {
> >> +  if (bit)
> > 
> > This is not symetric with trylock(). It should be:
> > 
> > if (bit > 0)
> > 
> > Anyway, trace_clear_recursion() quiently ignores bit != 0
> 
> Yes, bit == 0 should not happen in here.

Yes, it "should" not happen. My point is that we could make the API
more safe. We could do the right thing when
ftrace_test_recursion_unlock() is called with negative @bit.
Ideally, we should also warn about the mis-use.


Anyway, let's wait for Steven. It seems that he found another problem
with the API that should be solved first. The fix will probably
also help to better understand the "safe" vs "unsafe" recursion.

Best Regards,
Petr


[PATCH v2 00/24] Unify PCI error response checking

2021-10-15 Thread Naveen Naidu
An MMIO read from a PCI device that doesn't exist or doesn't respond
causes a PCI error.  There's no real data to return to satisfy the 
CPU read, so most hardware fabricates ~0 data.

This patch series adds PCI_ERROR_RESPONSE definition and other helper
definition SET_PCI_ERROR_RESPONSE and RESPONSE_IS_PCI_ERROR and uses it
where appropriate to make these checks consistent and easier to find.

This helps unify PCI error response checking and make error check
consistent and easier to find.

This series also ensures that the error response fabrication now happens
in the PCI_OP_READ and PCI_USER_READ_CONFIG. This removes the
responsibility from controller drivers to do the error response setting. 

Patch 1:
- Adds the PCI_ERROR_RESPONSE and other related definitions
- All other patches are dependent on this patch. This patch needs to
  be applied first, before the others

Patch 2:
- Error fabrication happens in PCI_OP_READ and PCI_USER_READ_CONFIG
  whenever the data read via the controller driver fails.
- This patch needs to be applied before, Patch 4/24 to Patch 15/24 are
  applied.

Patch 3:
- Uses SET_PCI_ERROR_RESPONSE() when device is not found and
  RESPONSE_IS_PCI_ERROR() to check hardware read from the hardware.

Patch 4 - 15:
- Removes redundant error fabrication that happens in controller 
  drivers when the read from a PCI device fails.
- These patches are dependent on Patch 2/24 of the series.
- These can be applied in any order.

Patch 16 - 22:
- Uses RESPONSE_IS_PCI_ERROR() to check the reads from hardware
- Patches can be applied in any order.

Patch 23 - 24:
- Edits the comments to include PCI_ERROR_RESPONSE along with
  0xffffffff, so that it becomes easier to grep for faulty 
  hardware reads.

Changelog
=

v2:
- Instead of using SET_PCI_ERROR_RESPONSE in all controller drivers
  to fabricate error response, only use them in PCI_OP_READ and
  PCI_USER_READ_CONFIG

Naveen Naidu (24):
 [PATCH 1/24] PCI: Add PCI_ERROR_RESPONSE and it's related definitions
 [PATCH 2/24] PCI: Set error response in config access defines when ops->read() 
fails
 [PATCH 3/24] PCI: Unify PCI error response checking
 [PATCH 4/24] PCI: Remove redundant error fabrication when device read fails
 [PATCH 5/24] PCI: thunder: Remove redundant error fabrication when device read 
fails
 [PATCH 6/24] PCI: iproc: Remove redundant error fabrication when device read 
fails
 [PATCH 7/24] PCI: mediatek: Remove redundant error fabrication when device 
read fails
 [PATCH 8/24] PCI: exynos: Remove redundant error fabrication when device read 
fails
 [PATCH 9/24] PCI: histb: Remove redundant error fabrication when device read 
fails
 [PATCH 10/24] PCI: kirin: Remove redundant error fabrication when device read 
fails
 [PATCH 11/24] PCI: aardvark: Remove redundant error fabrication when device 
read fails
 [PATCH 12/24] PCI: mvebu: Remove redundant error fabrication when device read 
fails
 [PATCH 13/24] PCI: altera: Remove redundant error fabrication when device read 
fails
 [PATCH 14/24] PCI: rcar: Remove redundant error fabrication when device read 
fails
 [PATCH 15/24] PCI: rockchip: Remove redundant error fabrication when device 
read fails
 [PATCH 16/24] PCI/ERR: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 17/24] PCI: vmd: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 18/24] PCI: pciehp: Use RESPONSE_IS_PCI_ERROR() to check read from 
hardware
 [PATCH 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 20/24] PCI/PME: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 21/24] PCI: cpqphp: Use RESPONSE_IS_PCI_ERROR() to check read from 
hardware
 [PATCH 22/24] PCI: keystone: Use PCI_ERROR_RESPONSE to specify hardware error
 [PATCH 23/24] PCI: hv: Use PCI_ERROR_RESPONSE to specify hardware read error
 [PATCH 24/24] PCI: xgene: Use PCI_ERROR_RESPONSE to specify hardware error

 drivers/pci/access.c| 36 
 drivers/pci/controller/dwc/pci-exynos.c |  4 +-
 drivers/pci/controller/dwc/pci-keystone.c   |  4 +-
 drivers/pci/controller/dwc/pcie-histb.c |  4 +-
 drivers/pci/controller/dwc/pcie-kirin.c |  4 +-
 drivers/pci/controller/pci-aardvark.c   | 10 +
 drivers/pci/controller/pci-hyperv.c |  2 +-
 drivers/pci/controller/pci-mvebu.c  |  8 +---
 drivers/pci/controller/pci-thunder-ecam.c   | 46 +++--
 drivers/pci/controller/pci-thunder-pem.c|  4 +-
 drivers/pci/controller/pci-xgene.c  |  8 ++--
 drivers/pci/controller/pcie-altera.c|  4 +-
 drivers/pci/controller/pcie-iproc.c |  4 +-
 drivers/pci/controller/pcie-mediatek.c  | 11 +
 drivers/pci/controller/pcie-rcar-host.c |  4 +-
 drivers/pci/controller/pcie-rockchip-host.c |  4 +-
 drivers/pci/controller/vmd.c|  2 +-
 drivers/pci/hotplug/cpqphp_ctrl.c   |  4 +-
 

[PATCH v2 00/24] Unify PCI error response checking

2021-10-15 Thread Naveen Naidu
An MMIO read from a PCI device that doesn't exist or doesn't respond
causes a PCI error.  There's no real data to return to satisfy the 
CPU read, so most hardware fabricates ~0 data.

This patch series adds PCI_ERROR_RESPONSE definition and other helper
definition SET_PCI_ERROR_RESPONSE and RESPONSE_IS_PCI_ERROR and uses it
where appropriate to make these checks consistent and easier to find.

This helps unify PCI error response checking and make error check
consistent and easier to find.

This series also ensures that the error response fabrication now happens
in the PCI_OP_READ and PCI_USER_READ_CONFIG. This removes the
responsibility from controller drivers to do the error response setting. 

Patch 1:
- Adds the PCI_ERROR_RESPONSE and other related definitions
- All other patches are dependent on this patch. This patch needs to
  be applied first, before the others

Patch 2:
- Error fabrication happens in PCI_OP_READ and PCI_USER_READ_CONFIG
  whenever the data read via the controller driver fails.
- This patch needs to be applied before, Patch 4/24 to Patch 15/24 are
  applied.

Patch 3:
- Uses SET_PCI_ERROR_RESPONSE() when device is not found and
  RESPONSE_IS_PCI_ERROR() to check hardware read from the hardware.

Patch 4 - 15:
- Removes redundant error fabrication that happens in controller 
  drivers when the read from a PCI device fails.
- These patches are dependent on Patch 2/24 of the series.
- These can be applied in any order.

Patch 16 - 22:
- Uses RESPONSE_IS_PCI_ERROR() to check the reads from hardware
- Patches can be applied in any order.

Patch 23 - 24:
- Edits the comments to include PCI_ERROR_RESPONSE along with
  0xffffffff, so that it becomes easier to grep for faulty 
  hardware reads.

Changelog
=

v2:
- Instead of using SET_PCI_ERROR_RESPONSE in all controller drivers
  to fabricate error response, only use them in PCI_OP_READ and
  PCI_USER_READ_CONFIG

Naveen Naidu (24):
 [PATCH 1/24] PCI: Add PCI_ERROR_RESPONSE and it's related definitions
 [PATCH 2/24] PCI: Set error response in config access defines when ops->read() 
fails
 [PATCH 3/24] PCI: Unify PCI error response checking
 [PATCH 4/24] PCI: Remove redundant error fabrication when device read fails
 [PATCH 5/24] PCI: thunder: Remove redundant error fabrication when device read 
fails
 [PATCH 6/24] PCI: iproc: Remove redundant error fabrication when device read 
fails
 [PATCH 7/24] PCI: mediatek: Remove redundant error fabrication when device 
read fails
 [PATCH 8/24] PCI: exynos: Remove redundant error fabrication when device read 
fails
 [PATCH 9/24] PCI: histb: Remove redundant error fabrication when device read 
fails
 [PATCH 10/24] PCI: kirin: Remove redundant error fabrication when device read 
fails
 [PATCH 11/24] PCI: aardvark: Remove redundant error fabrication when device 
read fails
 [PATCH 12/24] PCI: mvebu: Remove redundant error fabrication when device read 
fails
 [PATCH 13/24] PCI: altera: Remove redundant error fabrication when device read 
fails
 [PATCH 14/24] PCI: rcar: Remove redundant error fabrication when device read 
fails
 [PATCH 15/24] PCI: rockchip: Remove redundant error fabrication when device 
read fails
 [PATCH 16/24] PCI/ERR: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 17/24] PCI: vmd: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 18/24] PCI: pciehp: Use RESPONSE_IS_PCI_ERROR() to check read from 
hardware
 [PATCH 19/24] PCI/DPC: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 20/24] PCI/PME: Use RESPONSE_IS_PCI_ERROR() to check read from hardware
 [PATCH 21/24] PCI: cpqphp: Use RESPONSE_IS_PCI_ERROR() to check read from 
hardware
 [PATCH 22/24] PCI: keystone: Use PCI_ERROR_RESPONSE to specify hardware error
 [PATCH 23/24] PCI: hv: Use PCI_ERROR_RESPONSE to specify hardware read error
 [PATCH 24/24] PCI: xgene: Use PCI_ERROR_RESPONSE to specify hardware error

 drivers/pci/access.c| 36 
 drivers/pci/controller/dwc/pci-exynos.c |  4 +-
 drivers/pci/controller/dwc/pci-keystone.c   |  4 +-
 drivers/pci/controller/dwc/pcie-histb.c |  4 +-
 drivers/pci/controller/dwc/pcie-kirin.c |  4 +-
 drivers/pci/controller/pci-aardvark.c   | 10 +
 drivers/pci/controller/pci-hyperv.c |  2 +-
 drivers/pci/controller/pci-mvebu.c  |  8 +---
 drivers/pci/controller/pci-thunder-ecam.c   | 46 +++--
 drivers/pci/controller/pci-thunder-pem.c|  4 +-
 drivers/pci/controller/pci-xgene.c  |  8 ++--
 drivers/pci/controller/pcie-altera.c|  4 +-
 drivers/pci/controller/pcie-iproc.c |  4 +-
 drivers/pci/controller/pcie-mediatek.c  | 11 +
 drivers/pci/controller/pcie-rcar-host.c |  4 +-
 drivers/pci/controller/pcie-rockchip-host.c |  4 +-
 drivers/pci/controller/vmd.c|  2 +-
 drivers/pci/hotplug/cpqphp_ctrl.c   |  4 +-
 

Re: [PATCH] soc: fsl: dpio: protect smp_processor_id when get processor id

2021-10-15 Thread Robin Murphy

On 2021-10-15 07:36, meng...@windriver.com wrote:

From: Meng Li 

When enable debug kernel configs,there will be calltrace as below:

BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1
caller is debug_smp_processor_id+0x20/0x30
CPU: 6 PID: 1 Comm: swapper/0 Not tainted 5.10.63-yocto-standard #1
Hardware name: NXP Layerscape LX2160ARDB (DT)
Call trace:
  dump_backtrace+0x0/0x1a0
  show_stack+0x24/0x30
  dump_stack+0xf0/0x13c
  check_preemption_disabled+0x100/0x110
  debug_smp_processor_id+0x20/0x30
  dpaa2_io_query_fq_count+0xdc/0x154
  dpaa2_eth_stop+0x144/0x314
  __dev_close_many+0xdc/0x160
  __dev_change_flags+0xe8/0x220
  dev_change_flags+0x30/0x70
  ic_close_devs+0x50/0x78
  ip_auto_config+0xed0/0xf10
  do_one_initcall+0xac/0x460
  kernel_init_freeable+0x30c/0x378
  kernel_init+0x20/0x128
  ret_from_fork+0x10/0x38

Because smp_processor_id() should be invoked in preempt disable status.
So, add preempt_disable/enable() to protect smp_processor_id().


If preemption doesn't matter anyway, as the comment in the context 
implies, then it probably makes more sense just to use 
raw_smp_processor_id() instead.


Robin.


Fixes: c89105c9b390 ("staging: fsl-mc: Move DPIO from staging to 
drivers/soc/fsl")
Cc: stable@vger.kernel.org
Signed-off-by: Meng Li 
---
  drivers/soc/fsl/dpio/dpio-service.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/soc/fsl/dpio/dpio-service.c 
b/drivers/soc/fsl/dpio/dpio-service.c
index 19f47ea9dab0..afc3b89b0fc5 100644
--- a/drivers/soc/fsl/dpio/dpio-service.c
+++ b/drivers/soc/fsl/dpio/dpio-service.c
@@ -58,8 +58,11 @@ static inline struct dpaa2_io *service_select_by_cpu(struct 
dpaa2_io *d,
 * If cpu == -1, choose the current cpu, with no guarantees about
 * potentially being migrated away.
 */
-   if (cpu < 0)
-   cpu = smp_processor_id();
+if (cpu < 0) {
+preempt_disable();
+cpu = smp_processor_id();
+preempt_enable();
+}
  
  	/* If a specific cpu was requested, pick it up immediately */

return dpio_by_cpu[cpu];



[PATCH 2/2] KVM: PPC: Book3S HV: Make idle_kvm_start_guest() return 0 if it went to guest

2021-10-15 Thread Michael Ellerman
We call idle_kvm_start_guest() from power7_offline() if the thread has
been requested to enter KVM. We pass it the SRR1 value that was returned
from power7_idle_insn() which tells us what sort of wakeup we're
processing.

Depending on the SRR1 value we pass in, the KVM code might enter the
guest, or it might return to us to do some host action if the wakeup
requires it.

If idle_kvm_start_guest() is able to handle the wakeup, and enter the
guest it is supposed to indicate that by returning a zero SRR1 value to
us.

That was the behaviour prior to commit 10d91611f426 ("powerpc/64s:
Reimplement book3s idle code in C"), however in that commit the
handling of SRR1 was reworked, and the zeroing behaviour was lost.

Returning from idle_kvm_start_guest() without zeroing the SRR1 value can
confuse the host offline code, causing the guest to crash and other
weirdness.

Fixes: 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
Cc: sta...@vger.kernel.org # v5.2+
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index ec57952b60b0..eb776d0c5d8e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -264,6 +264,7 @@ _GLOBAL(idle_kvm_start_guest)
stdur1, -SWITCH_FRAME_SIZE(r4)
// Switch to new frame on emergency stack
mr  r1, r4
+   std r3, 32(r1)  // Save SRR1 wakeup value
SAVE_NVGPRS(r1)
 
/*
@@ -315,6 +316,10 @@ _GLOBAL(idle_kvm_start_guest)
 
 kvm_secondary_got_guest:
 
+   // About to go to guest, clear saved SRR1
+   li  r0, 0
+   std r0, 32(r1)
+
/* Set HSTATE_DSCR(r13) to something sensible */
ld  r6, PACA_DSCR_DEFAULT(r13)
std r6, HSTATE_DSCR(r13)
@@ -394,8 +399,8 @@ _GLOBAL(idle_kvm_start_guest)
mfspr   r4, SPRN_LPCR
rlwimi  r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
mtspr   SPRN_LPCR, r4
-   /* set up r3 for return */
-   mfspr   r3,SPRN_SRR1
+   // Return SRR1 wakeup value, or 0 if we went into the guest
+   ld  r3, 32(r1)
REST_NVGPRS(r1)
ld  r1, 0(r1)   // Switch back to caller stack
ld  r0, 16(r1)  // Reload LR
-- 
2.25.1



[PATCH 1/2] KVM: PPC: Book3S HV: Fix stack handling in idle_kvm_start_guest()

2021-10-15 Thread Michael Ellerman
In commit 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in
C") kvm_start_guest() became idle_kvm_start_guest(). The old code
allocated a stack frame on the emergency stack, but didn't use the
frame to store anything, and also didn't store anything in its caller's
frame.

idle_kvm_start_guest() on the other hand is written more like a normal C
function, it creates a frame on entry, and also stores CR/LR into its
callers frame (per the ABI). The problem is that there is no caller
frame on the emergency stack.

The emergency stack for a given CPU is allocated with:

  paca_ptrs[i]->emergency_sp = alloc_stack(limit, i) + THREAD_SIZE;

So emergency_sp actually points to the first address above the emergency
stack allocation for a given CPU, we must not store above it without
first decrementing it to create a frame. This is different to the
regular kernel stack, paca->kstack, which is initialised to point at an
initial frame that is ready to use.

idle_kvm_start_guest() stores the backchain, CR and LR all of which
write outside the allocation for the emergency stack. It then creates a
stack frame and saves the non-volatile registers. Unfortunately the
frame it creates is not large enough to fit the non-volatiles, and so
the saving of the non-volatile registers also writes outside the
emergency stack allocation.

The end result is that we corrupt whatever is at 0-24 bytes, and 112-248
bytes above the emergency stack allocation.

In practice this has gone unnoticed because the memory immediately above
the emergency stack happens to be used for other stack allocations,
either another CPUs mc_emergency_sp or an IRQ stack. See the order of
calls to irqstack_early_init() and emergency_stack_init().

The low addresses of another stack are the top of that stack, and so are
only used if that stack is under extreme pressure, which essentially
never happens in practice - and if it did there's a high likelihood we'd
crash due to that stack overflowing.

Still, we shouldn't be corrupting someone else's stack, and it is purely
luck that we aren't corrupting something else.

To fix it we save CR/LR into the caller's frame using the existing r1 on
entry, we then create a SWITCH_FRAME_SIZE frame (which has space for
pt_regs) on the emergency stack with the backchain pointing to the
existing stack, and then finally we switch to the new frame on the
emergency stack.

Fixes: 10d91611f426 ("powerpc/64s: Reimplement book3s idle code in C")
Cc: sta...@vger.kernel.org # v5.2+
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 90484425a1e6..ec57952b60b0 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -255,13 +255,15 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
  * r3 contains the SRR1 wakeup value, SRR1 is trashed.
  */
 _GLOBAL(idle_kvm_start_guest)
-   ld  r4,PACAEMERGSP(r13)
mfcrr5
mflrr0
-   std r1,0(r4)
-   std r5,8(r4)
-   std r0,16(r4)
-   subir1,r4,STACK_FRAME_OVERHEAD
+   std r5, 8(r1)   // Save CR in caller's frame
+   std r0, 16(r1)  // Save LR in caller's frame
+   // Create frame on emergency stack
+   ld  r4, PACAEMERGSP(r13)
+   stdur1, -SWITCH_FRAME_SIZE(r4)
+   // Switch to new frame on emergency stack
+   mr  r1, r4
SAVE_NVGPRS(r1)
 
/*
@@ -395,10 +397,9 @@ _GLOBAL(idle_kvm_start_guest)
/* set up r3 for return */
mfspr   r3,SPRN_SRR1
REST_NVGPRS(r1)
-   addir1, r1, STACK_FRAME_OVERHEAD
-   ld  r0, 16(r1)
-   ld  r5, 8(r1)
-   ld  r1, 0(r1)
+   ld  r1, 0(r1)   // Switch back to caller stack
+   ld  r0, 16(r1)  // Reload LR
+   ld  r5, 8(r1)   // Reload CR
mtlrr0
mtcrr5
blr
-- 
2.25.1



Re: [PATCH v3 1/2] ftrace: disable preemption between ftrace_test_recursion_trylock/unlock()

2021-10-15 Thread Steven Rostedt
On Fri, 15 Oct 2021 17:12:26 +0800
王贇  wrote:

> Maybe take some example would be easier to understand...
> 
> Currently there are two way of using ftrace_test_recursion_trylock(),
> one with TRACE_FTRACE_XXX we mark as A, one with TRACE_LIST_XXX we mark
> as B, then:
> 
> A followed by B on same context got bit > 0
> B followed by A on any context got bit 0
> A followed by A on same context got bit > 0
> A followed by A followed by A on same context got bit -1
> B followed by B on same context got bit > 0
> B followed by B followed by B on same context got bit -1
> 
> If we get rid of the TRACE_TRANSITION_BIT which allowed recursion for
> onetime, then it would be:

We can't get rid of the transition bit. It's there to fix a bug. Or at
least until we finish the "noinst" issue which may be true now? I have to
go revisit it.

The reason for the transition bit, is because we were dropping function
traces, that people relied on being there. The problem is that the
recursion protection allows for nested context. That is, it will not detect
recursion if an interrupt triggers during a trace (while the recursion
lock is held) and then that interrupt does a trace. It is allowed to call
the ftrace_test_recursion_trylock() again.

But, what happens if the trace occurs after the interrupt triggers, but
before the preempt_count states that we are now in interrupt context? As
preempt_count is used by this code to determine what context we are in, if
a trace happens in this "transition" state, without the transition bit, a
recursion is detected (false positive) and the event is dropped. People are
then confused on how an interrupt happened, but the entry of the interrupt
never triggered (according to the trace).

The transition bit allows one level of recursion to handle this case.

The "noinst" work may also remove all locations that can be traced before
the context is updated. I need to see if that is now the case, and if it
is, we can remove the transition bit.

This is not the design issue I mentioned earlier. I'm still working on that
one.

-- Steve


> 
> A followed by B on same context got bit > 0
> B followed by A on any context got bit 0
> A followed by A on same context got bit -1
> B followed by B on same context got bit -1
> 
> So as long as no continuously AAA it's safe?



Re: [PATCH v11 2/3] tty: hvc: pass DMA capable memory to put_chars()

2021-10-15 Thread Xianting Tian

Hi Greg and experts

Is this version ok for you?  thanks

在 2021/10/15 上午10:46, Xianting Tian 写道:

As is well known, an hvc backend can register its operations with the hvc framework.
the operations contain put_chars(), get_chars() and so on.

Some hvc backend may do dma in its operations. eg, put_chars() of
virtio-console. But in the code of hvc framework, it may pass DMA
incapable memory to put_chars() under a specific configuration, which
is explained in commit c4baad5029(virtio-console: avoid DMA from stack):
1, c[] is on stack,
hvc_console_print():
 char c[N_OUTBUF] __ALIGNED__;
 cons_ops[index]->put_chars(vtermnos[index], c, i);
2, ch is on stack,
static void hvc_poll_put_char(,,char ch)
{
 struct tty_struct *tty = driver->ttys[0];
 struct hvc_struct *hp = tty->driver_data;
 int n;

 do {
 n = hp->ops->put_chars(hp->vtermno, , 1);
 } while (n <= 0);
}

Commit c4baad5029 is just the fix to avoid DMA from stack memory, which
is passed to virtio-console by hvc framework in above code. But I think
the fix is aggressive, it directly uses kmemdup() to alloc new buffer
from kmalloc area and do memcpy no matter the memory is in kmalloc area
or not. But most importantly, it should better be fixed in the hvc
framework, by changing it to never pass stack memory to the put_chars()
function in the first place. Otherwise, we still face the same issue if
a new hvc backend using dma added in the future.

In this patch, add 'char cons_outbuf[]' as part of 'struct hvc_struct',
so hp->cons_outbuf is no longer the stack memory, we can use it in above
cases safely. We also add lock to protect cons_outbuf instead of using
the global lock of hvc.

Introduce another array(cons_hvcs[]) for hvc pointers next to the
cons_ops[] and vtermnos[] arrays. With the array, we can easily find
hvc's cons_outbuf and its lock.

With the patch, we can revert the fix c4baad5029.

Signed-off-by: Xianting Tian 
Signed-off-by: Shile Zhang 
---
  drivers/tty/hvc/hvc_console.c | 36 ---
  drivers/tty/hvc/hvc_console.h | 21 +++-
  2 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/drivers/tty/hvc/hvc_console.c b/drivers/tty/hvc/hvc_console.c
index 5957ab728..11f2463a1 100644
--- a/drivers/tty/hvc/hvc_console.c
+++ b/drivers/tty/hvc/hvc_console.c
@@ -41,16 +41,6 @@
   */
  #define HVC_CLOSE_WAIT (HZ/100) /* 1/10 of a second */
  
-/*

- * These sizes are most efficient for vio, because they are the
- * native transfer size. We could make them selectable in the
- * future to better deal with backends that want other buffer sizes.
- */
-#define N_OUTBUF   16
-#define N_INBUF16
-
-#define __ALIGNED__ __attribute__((__aligned__(L1_CACHE_BYTES)))
-
  static struct tty_driver *hvc_driver;
  static struct task_struct *hvc_task;
  
@@ -142,6 +132,7 @@ static int hvc_flush(struct hvc_struct *hp)

  static const struct hv_ops *cons_ops[MAX_NR_HVC_CONSOLES];
  static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
{[0 ... MAX_NR_HVC_CONSOLES - 1] = -1};
+static struct hvc_struct *cons_hvcs[MAX_NR_HVC_CONSOLES];
  
  /*

   * Console APIs, NOT TTY.  These APIs are available immediately when
@@ -151,9 +142,11 @@ static uint32_t vtermnos[MAX_NR_HVC_CONSOLES] =
  static void hvc_console_print(struct console *co, const char *b,
  unsigned count)
  {
-   char c[N_OUTBUF] __ALIGNED__;
+   char *c;
unsigned i = 0, n = 0;
int r, donecr = 0, index = co->index;
+   unsigned long flags;
+   struct hvc_struct *hp;
  
  	/* Console access attempt outside of acceptable console range. */

if (index >= MAX_NR_HVC_CONSOLES)
@@ -163,6 +156,13 @@ static void hvc_console_print(struct console *co, const 
char *b,
if (vtermnos[index] == -1)
return;
  
+	hp = cons_hvcs[index];

+   if (!hp)
+   return;
+
+   c = hp->cons_outbuf;
+
+   spin_lock_irqsave(>cons_outbuf_lock, flags);
while (count > 0 || i > 0) {
if (count > 0 && i < sizeof(c)) {
if (b[n] == '\n' && !donecr) {
@@ -191,6 +191,7 @@ static void hvc_console_print(struct console *co, const 
char *b,
}
}
}
+   spin_unlock_irqrestore(>cons_outbuf_lock, flags);
hvc_console_flush(cons_ops[index], vtermnos[index]);
  }
  
@@ -878,9 +879,13 @@ static void hvc_poll_put_char(struct tty_driver *driver, int line, char ch)

struct tty_struct *tty = driver->ttys[0];
struct hvc_struct *hp = tty->driver_data;
int n;
+   unsigned long flags;
  
  	do {

-   n = hp->ops->put_chars(hp->vtermno, , 1);
+   spin_lock_irqsave(>cons_outbuf_lock, flags);
+   hp->cons_outbuf[0] = ch;
+   n = hp->ops->put_chars(hp->vtermno, >cons_outbuf[0], 1);
+   spin_unlock_irqrestore(>cons_outbuf_lock, 

Re: [PATCH 2/2] kbuild: use more subdir- for visiting subdirectories while cleaning

2021-10-15 Thread Michael Ellerman
Masahiro Yamada  writes:
>
...
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index aa6808e70647..b61d8be3c226 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -411,9 +409,6 @@ install:
>   sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \
>   System.map "$(INSTALL_PATH)"
>  
> -archclean:
> - $(Q)$(MAKE) $(clean)=$(boot)
> -
>  ifeq ($(KBUILD_EXTMOD),)
>  # We need to generate vdso-offsets.h before compiling certain files in 
> kernel/.
>  # In order to do that, we should use the archprepare target, but we can't 
> since

Seems to work.

Acked-by: Michael Ellerman  (powerpc)

cheers


Re: [PATCH v2 06/13] asm-generic: Use HAVE_FUNCTION_DESCRIPTORS to define associated stubs

2021-10-15 Thread Nicholas Piggin
Excerpts from Nicholas Piggin's message of October 15, 2021 6:02 pm:
> Excerpts from Christophe Leroy's message of October 15, 2021 4:24 pm:
>> 
>> 
>> Le 15/10/2021 à 08:16, Nicholas Piggin a écrit :
>>> Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:
 Replace HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR by
 HAVE_FUNCTION_DESCRIPTORS and use it instead of
 'dereference_function_descriptor' macro to know
 whether an arch has function descriptors.

 To limit churn in one of the following patches, use
 an #ifdef/#else construct with empty first part
 instead of an #ifndef in asm-generic/sections.h
>>> 
>>> Is it worth putting this into Kconfig if you're going to
>>> change it? In any case
>> 
>> That was what I wanted to do in the begining but how can I do that in 
>> Kconfig ?
>> 
>> #ifdef __powerpc64__
>> #if defined(_CALL_ELF) && _CALL_ELF == 2
>> #define PPC64_ELF_ABI_v2
>> #else
>> #define PPC64_ELF_ABI_v1
>> #endif
>> #endif /* __powerpc64__ */
>> 
>> #ifdef PPC64_ELF_ABI_v1
>> #define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
> 
> We have ELFv2 ABI / function descriptors iff big-endian so you could 
> just select based on that.

Of course that should read ELFv1. To be clearer: BE is ELFv1 ABI and
LE is ELFv2 ABI.

Thanks,
Nick


Re: [PATCH v2] powerpc/s64: Clarify that radix lacks DEBUG_PAGEALLOC

2021-10-15 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 13/10/2021 à 23:34, Joel Stanley a écrit :
>> The page_alloc.c code will call into __kernel_map_pages when
>> DEBUG_PAGEALLOC is configured and enabled.
>> 
>> As the implementation assumes hash, this should crash spectacularly if
>> not for a bit of luck in __kernel_map_pages. In this function
>> linear_map_hash_count is always zero, the for loop exits without doing
>> any damage.
>> 
>> There are no other platforms that determine if they support
>> debug_pagealloc at runtime. Instead of adding code to mm/page_alloc.c to
>> do that, this change turns the map/unmap into a noop when in radix
>> mode and prints a warning once.
>> 
>> Signed-off-by: Joel Stanley 
>> ---
>> v2: Put __kernel_map_pages in pgtable.h
>> 
>>   arch/powerpc/include/asm/book3s/64/hash.h|  2 ++
>>   arch/powerpc/include/asm/book3s/64/pgtable.h | 11 +++
>>   arch/powerpc/include/asm/book3s/64/radix.h   |  3 +++
>>   arch/powerpc/mm/book3s64/hash_utils.c|  2 +-
>>   arch/powerpc/mm/book3s64/radix_pgtable.c |  7 +++
>>   5 files changed, 24 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
>> b/arch/powerpc/include/asm/book3s/64/hash.h
>> index d959b0195ad9..674fe0e890dc 100644
>> --- a/arch/powerpc/include/asm/book3s/64/hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
>> @@ -255,6 +255,8 @@ int hash__create_section_mapping(unsigned long start, 
>> unsigned long end,
>>   int nid, pgprot_t prot);
>>   int hash__remove_section_mapping(unsigned long start, unsigned long end);
>>   
>> +void hash__kernel_map_pages(struct page *page, int numpages, int enable);
>> +
>>   #endif /* !__ASSEMBLY__ */
>>   #endif /* __KERNEL__ */
>>   #endif /* _ASM_POWERPC_BOOK3S_64_HASH_H */
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
>> b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 5d34a8646f08..265661ded238 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -1101,6 +1101,17 @@ static inline void vmemmap_remove_mapping(unsigned 
>> long start,
>>   }
>>   #endif
>>   
>> +#ifdef CONFIG_DEBUG_PAGEALLOC
>> +static inline void __kernel_map_pages(struct page *page, int numpages, int 
>> enable)
>> +{
>> +if (radix_enabled()) {
>> +radix__kernel_map_pages(page, numpages, enable);
>> +return;
>> +}
>> +hash__kernel_map_pages(page, numpages, enable);
>
> I'd have prefered something like below
>
>   if (radix_enabled())
>   radix__kernel_map_pages(page, numpages, enable);
>   else
>   hash__kernel_map_pages(page, numpages, enable);

I did that when applying.

cheers


[PATCH v1 8/8] powerpc/fsl_booke: Enable STRICT_KERNEL_RWX

2021-10-15 Thread Christophe Leroy
Enable STRICT_KERNEL_RWX on fsl_booke.

For that, we need additional TLBCAMs dedicated to linear mapping,
based on the alignment of _sinittext.

By default, up to 768 Mbytes of memory are mapped.
It uses 3 TLBCAMs of size 256 Mbytes.

With a data alignment of 16, we need up to 9 TLBCAMs:
  16/16/16/16/64/64/64/256/256

With a data alignment of 4, we need up to 12 TLBCAMs:
  4/4/4/4/16/16/16/64/64/64/256/256

With a data alignment of 1, we need up to 15 TLBCAMs:
  1/1/1/1/4/4/4/16/16/16/64/64/64/256/256

By default, set a 16 Mbytes alignment as a compromise between memory
usage and number of TLBCAMs. This can be adjusted manually when needed.

For the time being, it doesn't work when the base is randomised.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6b9f523882c5..939a47642a9c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -139,6 +139,7 @@ config PPC
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE 
&& PPC_BOOK3S_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX   if (PPC_BOOK3S || PPC_8xx || 
40x) && !HIBERNATION
+   select ARCH_HAS_STRICT_KERNEL_RWX   if FSL_BOOKE && !HIBERNATION && 
!RANDOMIZE_BASE
select ARCH_HAS_STRICT_MODULE_RWX   if ARCH_HAS_STRICT_KERNEL_RWX 
&& !PPC_BOOK3S_32
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
@@ -778,7 +779,8 @@ config DATA_SHIFT_BOOL
bool "Set custom data alignment"
depends on ADVANCED_OPTIONS
depends on STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE
-   depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && 
!STRICT_KERNEL_RWX)
+   depends on PPC_BOOK3S_32 || (PPC_8xx && !PIN_TLB_DATA && 
!STRICT_KERNEL_RWX) || \
+  FSL_BOOKE
help
  This option allows you to set the kernel data alignment. When
  RAM is mapped by blocks, the alignment needs to fit the size and
@@ -791,11 +793,13 @@ config DATA_SHIFT
default 24 if STRICT_KERNEL_RWX && PPC64
range 17 28 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && 
PPC_BOOK3S_32
range 19 23 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && 
PPC_8xx
+   range 20 24 if (STRICT_KERNEL_RWX || DEBUG_PAGEALLOC || KFENCE) && 
PPC_FSL_BOOKE
default 22 if STRICT_KERNEL_RWX && PPC_BOOK3S_32
default 18 if (DEBUG_PAGEALLOC || KFENCE) && PPC_BOOK3S_32
default 23 if STRICT_KERNEL_RWX && PPC_8xx
default 23 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx && PIN_TLB_DATA
default 19 if (DEBUG_PAGEALLOC || KFENCE) && PPC_8xx
+   default 24 if STRICT_KERNEL_RWX && FSL_BOOKE
default PPC_PAGE_SHIFT
help
  On Book3S 32 (603+), DBATs are used to map kernel text and rodata RO.
@@ -1123,7 +1127,10 @@ config LOWMEM_CAM_NUM_BOOL
 config LOWMEM_CAM_NUM
depends on FSL_BOOKE
int "Number of CAMs to use to map low memory" if LOWMEM_CAM_NUM_BOOL
-   default 3
+   default 3 if !STRICT_KERNEL_RWX
+   default 9 if DATA_SHIFT >= 24
+   default 12 if DATA_SHIFT >= 22
+   default 15
 
 config DYNAMIC_MEMSTART
bool "Enable page aligned dynamic load address for kernel"
-- 
2.31.1



[PATCH v1 5/8] powerpc/fsl_booke: Tell map_mem_in_cams() if init is done

2021-10-15 Thread Christophe Leroy
In order to be able to call map_mem_in_cams() once more
after init for STRICT_KERNEL_RWX, add an argument.

For now, map_mem_in_cams() is always called only during init.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/mmu_decl.h   |  2 +-
 arch/powerpc/mm/nohash/fsl_book3e.c  | 12 ++--
 arch/powerpc/mm/nohash/kaslr_booke.c |  2 +-
 arch/powerpc/mm/nohash/tlb.c |  4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index dd1cabc2ea0f..e13a3c0caa02 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -126,7 +126,7 @@ unsigned long mmu_mapin_ram(unsigned long base, unsigned 
long top);
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
 extern unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx,
-bool dryrun);
+bool dryrun, bool init);
 extern unsigned long calc_cam_sz(unsigned long ram, unsigned long virt,
 phys_addr_t phys);
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c 
b/arch/powerpc/mm/nohash/fsl_book3e.c
index fdf1029e62f0..375b2b8238c1 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -167,7 +167,7 @@ unsigned long calc_cam_sz(unsigned long ram, unsigned long 
virt,
 
 static unsigned long map_mem_in_cams_addr(phys_addr_t phys, unsigned long virt,
unsigned long ram, int max_cam_idx,
-   bool dryrun)
+   bool dryrun, bool init)
 {
int i;
unsigned long amount_mapped = 0;
@@ -202,12 +202,12 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t 
phys, unsigned long virt,
return amount_mapped;
 }
 
-unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun)
+unsigned long map_mem_in_cams(unsigned long ram, int max_cam_idx, bool dryrun, 
bool init)
 {
unsigned long virt = PAGE_OFFSET;
phys_addr_t phys = memstart_addr;
 
-   return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun);
+   return map_mem_in_cams_addr(phys, virt, ram, max_cam_idx, dryrun, init);
 }
 
 #ifdef CONFIG_PPC32
@@ -248,7 +248,7 @@ void __init adjust_total_lowmem(void)
ram = min((phys_addr_t)__max_low_memory, (phys_addr_t)total_lowmem);
 
i = switch_to_as1();
-   __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false);
+   __max_low_memory = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, false, 
true);
restore_to_as0(i, 0, 0, 1);
 
pr_info("Memory CAM mapping: ");
@@ -319,11 +319,11 @@ notrace void __init relocate_init(u64 dt_ptr, phys_addr_t 
start)
/* map a 64M area for the second relocation */
if (memstart_addr > start)
map_mem_in_cams(0x400, CONFIG_LOWMEM_CAM_NUM,
-   false);
+   false, true);
else
map_mem_in_cams_addr(start, PAGE_OFFSET + offset,
0x400, CONFIG_LOWMEM_CAM_NUM,
-   false);
+   false, true);
restore_to_as0(n, offset, __va(dt_ptr), 1);
/* We should never reach here */
panic("Relocation error");
diff --git a/arch/powerpc/mm/nohash/kaslr_booke.c 
b/arch/powerpc/mm/nohash/kaslr_booke.c
index 4c74e8a5482b..8fc49b1b4a91 100644
--- a/arch/powerpc/mm/nohash/kaslr_booke.c
+++ b/arch/powerpc/mm/nohash/kaslr_booke.c
@@ -314,7 +314,7 @@ static unsigned long __init kaslr_choose_location(void 
*dt_ptr, phys_addr_t size
pr_warn("KASLR: No safe seed for randomizing the kernel 
base.\n");
 
ram = min_t(phys_addr_t, __max_low_memory, size);
-   ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true);
+   ram = map_mem_in_cams(ram, CONFIG_LOWMEM_CAM_NUM, true, false);
linear_sz = min_t(unsigned long, ram, SZ_512M);
 
/* If the linear size is smaller than 64M, do not randmize */
diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c
index 5872f69141d5..fc195b9f524b 100644
--- a/arch/powerpc/mm/nohash/tlb.c
+++ b/arch/powerpc/mm/nohash/tlb.c
@@ -643,7 +643,7 @@ static void early_init_this_mmu(void)
 
if (map)
linear_map_top = map_mem_in_cams(linear_map_top,
-num_cams, false);
+num_cams, true, true);
}
 #endif
 
@@ -764,7 +764,7 @@ void setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
num_cams = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) / 4;
 
linear_sz = map_mem_in_cams(first_memblock_size, num_cams,
- 

[PATCH v1 7/8] powerpc/fsl_booke: Update of TLBCAMs after init

2021-10-15 Thread Christophe Leroy
After init, set readonly memory as ROX and set readwrite
memory as RWX, if STRICT_KERNEL_RWX is enabled.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/mmu_decl.h  |  2 +-
 arch/powerpc/mm/nohash/fsl_book3e.c | 32 +
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index e13a3c0caa02..0dd4c18f8363 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -168,7 +168,7 @@ static inline phys_addr_t v_block_mapped(unsigned long va) 
{ return 0; }
 static inline unsigned long p_block_mapped(phys_addr_t pa) { return 0; }
 #endif
 
-#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx)
+#if defined(CONFIG_PPC_BOOK3S_32) || defined(CONFIG_PPC_8xx) || 
defined(CONFIG_PPC_FSL_BOOK3E)
 void mmu_mark_initmem_nx(void);
 void mmu_mark_rodata_ro(void);
 #else
diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c 
b/arch/powerpc/mm/nohash/fsl_book3e.c
index c1bc11f46344..978e0bcdfa2c 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -181,7 +181,7 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, 
unsigned long virt,
/* Calculate CAM values */
for (i = 0; boundary && i < max_cam_idx; i++) {
unsigned long cam_sz;
-   pgprot_t prot = PAGE_KERNEL_X;
+   pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL_ROX;
 
cam_sz = calc_cam_sz(boundary, virt, phys);
if (!dryrun)
@@ -194,7 +194,7 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t phys, 
unsigned long virt,
}
for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
unsigned long cam_sz;
-   pgprot_t prot = PAGE_KERNEL_X;
+   pgprot_t prot = init ? PAGE_KERNEL_X : PAGE_KERNEL;
 
cam_sz = calc_cam_sz(ram, virt, phys);
if (!dryrun)
@@ -209,8 +209,13 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t 
phys, unsigned long virt,
if (dryrun)
return amount_mapped;
 
-   loadcam_multi(0, i, max_cam_idx);
-   tlbcam_index = i;
+   if (init) {
+   loadcam_multi(0, i, max_cam_idx);
+   tlbcam_index = i;
+   } else {
+   loadcam_multi(0, i, 0);
+   WARN_ON(i > tlbcam_index);
+   }
 
 #ifdef CONFIG_PPC64
get_paca()->tcd.esel_next = i;
@@ -279,6 +284,25 @@ void __init adjust_total_lowmem(void)
memblock_set_current_limit(memstart_addr + __max_low_memory);
 }
 
+#ifdef CONFIG_STRICT_KERNEL_RWX
+void mmu_mark_rodata_ro(void)
+{
+   /* Everything is done in mmu_mark_initmem_nx() */
+}
+#endif
+
+void mmu_mark_initmem_nx(void)
+{
+   unsigned long remapped;
+
+   if (!strict_kernel_rwx_enabled())
+   return;
+
+   remapped = map_mem_in_cams(__max_low_memory, CONFIG_LOWMEM_CAM_NUM, 
false, false);
+
+   WARN_ON(__max_low_memory != remapped);
+}
+
 void setup_initial_memory_limit(phys_addr_t first_memblock_base,
phys_addr_t first_memblock_size)
 {
-- 
2.31.1



[PATCH v1 1/8] powerpc/booke: Disable STRICT_KERNEL_RWX, DEBUG_PAGEALLOC and KFENCE

2021-10-15 Thread Christophe Leroy
fsl_booke and 44x are not able to map kernel linear memory with
pages, so they can't support DEBUG_PAGEALLOC and KFENCE, and
STRICT_KERNEL_RWX is also a problem for now.

Enable those only on book3s (both 32 and 64 except KFENCE), 8xx and 40x.

Fixes: 88df6e90fa97 ("[POWERPC] DEBUG_PAGEALLOC for 32-bit")
Fixes: 95902e6c8864 ("powerpc/mm: Implement STRICT_KERNEL_RWX on PPC32")
Fixes: 90cbac0e995d ("powerpc: Enable KFENCE for PPC32")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index ba5b66189358..6b9f523882c5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,7 +138,7 @@ config PPC
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE 
&& PPC_BOOK3S_64
select ARCH_HAS_SET_MEMORY
-   select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!HIBERNATION)
+   select ARCH_HAS_STRICT_KERNEL_RWX   if (PPC_BOOK3S || PPC_8xx || 
40x) && !HIBERNATION
select ARCH_HAS_STRICT_MODULE_RWX   if ARCH_HAS_STRICT_KERNEL_RWX 
&& !PPC_BOOK3S_32
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
@@ -150,7 +150,7 @@ config PPC
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_STACKWALK
select ARCH_SUPPORTS_ATOMIC_RMW
-   select ARCH_SUPPORTS_DEBUG_PAGEALLOCif PPC32 || PPC_BOOK3S_64
+   select ARCH_SUPPORTS_DEBUG_PAGEALLOCif PPC_BOOK3S || PPC_8xx || 40x
select ARCH_USE_BUILTIN_BSWAP
select ARCH_USE_CMPXCHG_LOCKREF if PPC64
select ARCH_USE_MEMTEST
@@ -190,7 +190,7 @@ config PPC
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN  if PPC32 && PPC_PAGE_SHIFT <= 14
select HAVE_ARCH_KASAN_VMALLOC  if PPC32 && PPC_PAGE_SHIFT <= 14
-   select HAVE_ARCH_KFENCE if PPC32
+   select HAVE_ARCH_KFENCE if PPC_BOOK3S_32 || PPC_8xx || 
40x
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if COMPAT
-- 
2.31.1



[PATCH v1 4/8] powerpc/fsl_booke: Enable reloading of TLBCAM without switching to AS1

2021-10-15 Thread Christophe Leroy
Avoid switching to AS1 when reloading TLBCAM after init for
STRICT_KERNEL_RWX.

When we setup AS1 we expect the entire accessible memory to be mapped
through one entry, this is not the case anymore at the end of init.

We are not changing the size of TLBCAMs, only flags, so no need to
switch to AS1.

So change loadcam_multi() to not switch to AS1 when the given
temporary tlb entry in 0.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/tlb_low.S | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S
index 5add4a51e51f..dd39074de9af 100644
--- a/arch/powerpc/mm/nohash/tlb_low.S
+++ b/arch/powerpc/mm/nohash/tlb_low.S
@@ -369,7 +369,7 @@ _GLOBAL(_tlbivax_bcast)
  * extern void loadcam_entry(unsigned int index)
  *
  * Load TLBCAM[index] entry in to the L2 CAM MMU
- * Must preserve r7, r8, r9, r10 and r11
+ * Must preserve r7, r8, r9, r10, r11, r12
  */
 _GLOBAL(loadcam_entry)
mflrr5
@@ -401,7 +401,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS)
  *
  * r3 = first entry to write
  * r4 = number of entries to write
- * r5 = temporary tlb entry
+ * r5 = temporary tlb entry (0 means no switch to AS1)
  */
 _GLOBAL(loadcam_multi)
mflrr8
@@ -409,6 +409,8 @@ _GLOBAL(loadcam_multi)
mfmsr   r11
andi.   r11,r11,MSR_IS
bne 10f
+   mr. r12, r5
+   beq 10f
 
/*
 * Set up temporary TLB entry that is the same as what we're
@@ -446,6 +448,8 @@ _GLOBAL(loadcam_multi)
/* Don't return to AS=0 if we were in AS=1 at function start */
andi.   r11,r11,MSR_IS
bne 3f
+   cmpwi   r12, 0
+   beq 3f
 
/* Return to AS=0 and clear the temporary entry */
mfmsr   r6
-- 
2.31.1



[PATCH v1 2/8] powerpc/fsl_booke: Rename fsl_booke.c to fsl_book3e.c

2021-10-15 Thread Christophe Leroy
We have a myriad of CONFIG symbols around different variants
of BOOKEs, which would be worth tidying up one day.

But at least, make file names and CONFIG option match:

We have CONFIG_FSL_BOOKE and CONFIG_PPC_FSL_BOOK3E.

fsl_booke.c is selected by and only by CONFIG_PPC_FSL_BOOK3E.
So rename it to fsl_book3e to reduce confusion.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/Makefile  | 4 ++--
 arch/powerpc/mm/nohash/{fsl_booke.c => fsl_book3e.c} | 0
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename arch/powerpc/mm/nohash/{fsl_booke.c => fsl_book3e.c} (100%)

diff --git a/arch/powerpc/mm/nohash/Makefile b/arch/powerpc/mm/nohash/Makefile
index 0424f6ce5bd8..b1f630d423d8 100644
--- a/arch/powerpc/mm/nohash/Makefile
+++ b/arch/powerpc/mm/nohash/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_PPC_BOOK3E_64) += tlb_low_64e.o 
book3e_pgtable.o
 obj-$(CONFIG_40x)  += 40x.o
 obj-$(CONFIG_44x)  += 44x.o
 obj-$(CONFIG_PPC_8xx)  += 8xx.o
-obj-$(CONFIG_PPC_FSL_BOOK3E)   += fsl_booke.o
+obj-$(CONFIG_PPC_FSL_BOOK3E)   += fsl_book3e.o
 obj-$(CONFIG_RANDOMIZE_BASE)   += kaslr_booke.o
 ifdef CONFIG_HUGETLB_PAGE
 obj-$(CONFIG_PPC_FSL_BOOK3E)   += book3e_hugetlbpage.o
@@ -16,4 +16,4 @@ endif
 # Disable kcov instrumentation on sensitive code
 # This is necessary for booting with kcov enabled on book3e machines
 KCOV_INSTRUMENT_tlb.o := n
-KCOV_INSTRUMENT_fsl_booke.o := n
+KCOV_INSTRUMENT_fsl_book3e.o := n
diff --git a/arch/powerpc/mm/nohash/fsl_booke.c 
b/arch/powerpc/mm/nohash/fsl_book3e.c
similarity index 100%
rename from arch/powerpc/mm/nohash/fsl_booke.c
rename to arch/powerpc/mm/nohash/fsl_book3e.c
-- 
2.31.1



[PATCH v1 6/8] powerpc/fsl_booke: Allocate separate TLBCAMs for readonly memory

2021-10-15 Thread Christophe Leroy
Reorganise TLBCAM allocation so that when STRICT_KERNEL_RWX is
enabled, TLBCAMs are allocated such that readonly memory uses
different TLBCAMs.

This results in an allocation looking like:

Memory CAM mapping: 4/4/4/1/1/1/1/16/16/16/64/64/64/256/256 Mb, residual: 256Mb

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/fsl_book3e.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c 
b/arch/powerpc/mm/nohash/fsl_book3e.c
index 375b2b8238c1..c1bc11f46344 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -171,15 +171,34 @@ static unsigned long map_mem_in_cams_addr(phys_addr_t 
phys, unsigned long virt,
 {
int i;
unsigned long amount_mapped = 0;
+   unsigned long boundary;
+
+   if (strict_kernel_rwx_enabled())
+   boundary = (unsigned long)(_sinittext - _stext);
+   else
+   boundary = ram;
 
/* Calculate CAM values */
-   for (i = 0; ram && i < max_cam_idx; i++) {
+   for (i = 0; boundary && i < max_cam_idx; i++) {
+   unsigned long cam_sz;
+   pgprot_t prot = PAGE_KERNEL_X;
+
+   cam_sz = calc_cam_sz(boundary, virt, phys);
+   if (!dryrun)
+   settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
+
+   boundary -= cam_sz;
+   amount_mapped += cam_sz;
+   virt += cam_sz;
+   phys += cam_sz;
+   }
+   for (ram -= amount_mapped; ram && i < max_cam_idx; i++) {
unsigned long cam_sz;
+   pgprot_t prot = PAGE_KERNEL_X;
 
cam_sz = calc_cam_sz(ram, virt, phys);
if (!dryrun)
-   settlbcam(i, virt, phys, cam_sz,
- pgprot_val(PAGE_KERNEL_X), 0);
+   settlbcam(i, virt, phys, cam_sz, pgprot_val(prot), 0);
 
ram -= cam_sz;
amount_mapped += cam_sz;
-- 
2.31.1



[PATCH v1 3/8] powerpc/fsl_booke: Take exec flag into account when setting TLBCAMs

2021-10-15 Thread Christophe Leroy
Don't force MAS3_SX and MAS3_UX at all time. Take into account the
exec flag.

While at it, fix a couple of nearby style problems (indentation with
spaces and unnecessary parentheses) to improve readability.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/nohash/fsl_book3e.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/nohash/fsl_book3e.c 
b/arch/powerpc/mm/nohash/fsl_book3e.c
index 03dacbe940e5..fdf1029e62f0 100644
--- a/arch/powerpc/mm/nohash/fsl_book3e.c
+++ b/arch/powerpc/mm/nohash/fsl_book3e.c
@@ -122,15 +122,17 @@ static void settlbcam(int index, unsigned long virt, 
phys_addr_t phys,
TLBCAM[index].MAS2 |= (flags & _PAGE_GUARDED) ? MAS2_G : 0;
TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0;
 
-   TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SX | MAS3_SR;
-   TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0);
+   TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR;
+   TLBCAM[index].MAS3 |= (flags & _PAGE_BAP_SX) ? MAS3_SX : 0;
+   TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0;
if (mmu_has_feature(MMU_FTR_BIG_PHYS))
TLBCAM[index].MAS7 = (u64)phys >> 32;
 
/* Below is unlikely -- only for large user pages or similar */
if (pte_user(__pte(flags))) {
-  TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
-  TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
+   TLBCAM[index].MAS3 |= MAS3_UR;
+   TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0;
+   TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0;
}
 
tlbcam_addrs[index].start = virt;
-- 
2.31.1



Re: [PATCH v3 1/2] ftrace: disable preemption between ftrace_test_recursion_trylock/unlock()

2021-10-15 Thread 王贇



On 2021/10/15 下午3:28, Petr Mladek wrote:
> On Fri 2021-10-15 11:13:08, 王贇 wrote:
>>
>>
>> On 2021/10/14 下午11:14, Petr Mladek wrote:
>> [snip]
 -  return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, 
 TRACE_FTRACE_MAX);
 +  int bit;
 +
 +  bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, 
 TRACE_FTRACE_MAX);
 +  /*
 +   * The zero bit indicate we are nested
 +   * in another trylock(), which means the
 +   * preemption already disabled.
 +   */
 +  if (bit > 0)
 +  preempt_disable_notrace();
>>>
>>> Is this safe? The preemption is disabled only when
>>> trace_test_and_set_recursion() was called by 
>>> ftrace_test_recursion_trylock().
>>>
>>> We must either always disable the preemtion when bit >= 0.
>>> Or we have to disable the preemtion already in
>>> trace_test_and_set_recursion().
>>
>> Internal calling of trace_test_and_set_recursion() will disable preemption
>> on succeed, it should be safe.
> 
> trace_test_and_set_recursion() does _not_ disable preemtion!
> It works only because all callers disable the preemption.

Yup.

> 
> It means that the comment is wrong. It is not guarantted that the
> preemption will be disabled. It works only by chance.
> 
> 
>> We can also consider move the logical into trace_test_and_set_recursion()
>> and trace_clear_recursion(), but I'm not very sure about that... ftrace
>> internally already make sure preemption disabled
> 
> How? Because it calls trace_test_and_set_recursion() via the trylock() API?

I mean currently all the direct caller of trace_test_and_set_recursion()
have disabled the preemption as you mentioned above, but yes if anyone later
write some kernel code to call trace_test_and_set_recursion() without
disabling preemption after, then promise broken.

> 
> 
>> , what uncovered is those users who call API trylock/unlock, isn't
>> it?
> 
> And this is exactly the problem. trace_test_and_set_recursion() is in
> a public header. Anyone could use it. And if anyone uses it in the
> future without the trylock() and does not disable the preemtion
> explicitely then we are lost again.
> 
> And it is even more dangerous. The original code disabled the
> preemtion on various layers. As a result, the preemtion was disabled
> several times for sure. It means that the deeper layers were
> always on the safe side.
> 
> With this patch, if the first trace_test_and_set_recursion() caller
> does not disable preemtion then trylock() will not disable it either
> and the entire code is procceed with preemtion enabled.

Yes, what confusing me at first is that I think people who call
trace_test_and_set_recursion() without trylock() can only be a
ftrace hacker, not a user, but in case if anyone can use it without
respect to preemption stuff, then I think the logical should be inside
trace_test_and_set_recursion() rather than trylock().

> 
> 
>>> Finally, the comment confused me a lot. The difference between nesting and
>>> recursion is far from clear. And the code is tricky liky like hell :-)
>>> I propose to add some comments, see below for a proposal.
>> The comments do confusing, I'll make it something like:
>>
>> The zero bit indicate trace recursion happened, whatever
>> the recursively call was made by ftrace handler or ftrace
>> itself, the preemption already disabled.
> 
> I am sorry but it is still confusing. We need to find a better way
> how to clearly explain the difference between the safe and
> unsafe recursion.
> 
> My understanding is that the recursion is:
> 
>   + "unsafe" when the trace code recursively enters the same trace point.
> 
>   + "safe" when ftrace_test_recursion_trylock() is called recursivelly
> while still processing the same trace entry.

Maybe take some example would be easier to understand...

Currently there are two way of using ftrace_test_recursion_trylock(),
one with TRACE_FTRACE_XXX we mark as A, one with TRACE_LIST_XXX we mark
as B, then:

A followed by B on same context got bit > 0
B followed by A on any context got bit 0
A followed by A on same context got bit > 0
A followed by A followed by A on same context got bit -1
B followed by B on same context got bit > 0
B followed by B followed by B on same context got bit -1

If we get rid of the TRACE_TRANSITION_BIT which allowed recursion for
onetime, then it would be:

A followed by B on same context got bit > 0
B followed by A on any context got bit 0
A followed by A on same context got bit -1
B followed by B on same context got bit -1

So as long as no continuously AAA it's safe?

> 
 +
 +  return bit;
  }
  /**
 @@ -222,9 +233,13 @@ static __always_inline int 
 ftrace_test_recursion_trylock(unsigned long ip,
   * @bit: The return of a successful ftrace_test_recursion_trylock()
   *
   * This is used at the end of a ftrace callback.
 + *
 + * Preemption will be enabled (if it was previously enabled).
   */
  static 

Re: [PATCH v2 06/13] asm-generic: Use HAVE_FUNCTION_DESCRIPTORS to define associated stubs

2021-10-15 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of October 15, 2021 4:24 pm:
> 
> 
> Le 15/10/2021 à 08:16, Nicholas Piggin a écrit :
>> Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:
>>> Replace HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR by
>>> HAVE_FUNCTION_DESCRIPTORS and use it instead of
>>> 'dereference_function_descriptor' macro to know
>>> whether an arch has function descriptors.
>>>
>>> To limit churn in one of the following patches, use
>>> an #ifdef/#else construct with empty first part
>>> instead of an #ifndef in asm-generic/sections.h
>> 
>> Is it worth putting this into Kconfig if you're going to
>> change it? In any case
> 
> That was what I wanted to do in the begining but how can I do that in 
> Kconfig ?
> 
> #ifdef __powerpc64__
> #if defined(_CALL_ELF) && _CALL_ELF == 2
> #define PPC64_ELF_ABI_v2
> #else
> #define PPC64_ELF_ABI_v1
> #endif
> #endif /* __powerpc64__ */
> 
> #ifdef PPC64_ELF_ABI_v1
> #define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1

We have ELFv2 ABI / function descriptors iff big-endian so you could 
just select based on that.

I have a patch that makes the ABI version configurable which cleans
some of this up a bit, but that can be rebased on your series if we
ever merge it. Maybe just add BUILD_BUG_ONs in the above ifdef block
to ensure CONFIG_HAVE_FUNCTION_DESCRIPTORS was set the right way, so
I don't forget.

Thanks,
Nick


Re: [PATCH] powerpc/mce: check if event info is valid

2021-10-15 Thread Nicholas Piggin
Excerpts from Michael Ellerman's message of October 7, 2021 10:09 pm:
> Ganesh  writes:
>> On 8/6/21 6:53 PM, Ganesh Goudar wrote:
>>
>>> Check if the event info is valid before printing the
>>> event information. When a fwnmi enabled nested kvm guest
>>> hits a machine check exception L0 and L2 would generate
>>> machine check event info, But L1 would not generate any
>>> machine check event info as it won't go through 0x200
>>> vector and prints some unwanted message.
>>>
>>> To fix this, 'in_use' variable in machine check event info is
>>> no more in use, rename it to 'valid' and check if the event
>>> information is valid before logging the event information.
>>>
>>> without this patch L1 would print following message for
>>> exceptions encountered in L2, as event structure will be
>>> empty in L1.
>>>
>>> "Machine Check Exception, Unknown event version 0".
>>>
>>> Signed-off-by: Ganesh Goudar 
>>> ---
>>
>> Hi mpe, Any comments on this patch.
> 
> The variable rename is a bit of a distraction.
> 
> But ignoring that, how do we end up processing a machine_check_event
> that has in_use = 0?
> 
> You don't give much detail on what call path you're talking about. I
> guess we're coming in via the calls in the KVM code?
> 
> In the definition of kvm_vcpu_arch we have:
> 
>   struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
> 
> And you said we're not going via 0x200 in L1. But so shouldn't we be
> teaching the KVM code not to use mce_evt when trap is not 0x200?

I'm not sure we want the MCE to skip the L1 either. It should match the 
L0 hypervisor behaviour as closely as reasonably possible.

We might have to teach the KVM pseries path to do something about the
0x200 before the common HV guest exit handler (which is where the L1
message comes from).

Thanks,
Nick


[PATCH 2/2] powerpc/eeh: Use a goto for recovery failures

2021-10-15 Thread Daniel Axtens
From: Oliver O'Halloran 

The EEH recovery logic in eeh_handle_normal_event() has some pretty strange
flow control. If we remove all the actual recovery logic we're left with
the following skeleton:

if (result != PCI_ERS_RESULT_DISCONNECT) {
...
}

if (result != PCI_ERS_RESULT_DISCONNECT) {
...
}

if (result == PCI_ERS_RESULT_NONE) {
...
}

if (result == PCI_ERS_RESULT_CAN_RECOVER) {
...
}

if (result == PCI_ERS_RESULT_CAN_RECOVER) {
...
}

if (result == PCI_ERS_RESULT_NEED_RESET) {
...
}

if ((result == PCI_ERS_RESULT_RECOVERED) ||
(result == PCI_ERS_RESULT_NONE)) {
...
goto out;
}

/*
 * unsuccessful recovery / PCI_ERS_RESULT_DISCONNECTED
 * handling is here.
 */
...

out:
...

Most of the "if () { ... }" blocks above change "result" to
PCI_ERS_RESULT_DISCONNECTED if an error occurs in that recovery step. This
makes the control flow a bit confusing since it breaks the early-exit
pattern that is generally used in Linux. In any case we end up handling the
error in the final else block so why not just jump there directly? Doing so
also allows us to de-indent a bunch of code.

No functional changes.

Cc: Mahesh Salgaonkar 
Signed-off-by: Oliver O'Halloran 
[dja: rebase on top of linux-next + my preceding refactor,
  move clearing the EEH_DEV_NO_HANDLER bit above the first goto so that
  it is always clear in the error handler code as it was before.]
Signed-off-by: Daniel Axtens 
---
 arch/powerpc/kernel/eeh_driver.c | 93 
 1 file changed, 45 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index cb3ac555c944..b8d5805c446a 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -905,18 +905,19 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
}
 #endif /* CONFIG_STACKTRACE */
 
+   eeh_for_each_pe(pe, tmp_pe)
+   eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+   edev->mode &= ~EEH_DEV_NO_HANDLER;
+
eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
if (pe->freeze_count > eeh_max_freezes) {
pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour 
and has been permanently disabled.\n",
   pe->phb->global_number, pe->addr,
   pe->freeze_count);
-   result = PCI_ERS_RESULT_DISCONNECT;
-   }
 
-   eeh_for_each_pe(pe, tmp_pe)
-   eeh_pe_for_each_dev(tmp_pe, edev, tmp)
-   edev->mode &= ~EEH_DEV_NO_HANDLER;
+   goto recover_failed;
+   }
 
/* Walk the various device drivers attached to this slot through
 * a reset sequence, giving each an opportunity to do what it needs
@@ -928,39 +929,38 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 * the error. Override the result if necessary to have partially
 * hotplug for this case.
 */
-   if (result != PCI_ERS_RESULT_DISCONNECT) {
-   pr_warn("EEH: This PCI device has failed %d times in the last 
hour and will be permanently disabled after %d failures.\n",
-   pe->freeze_count, eeh_max_freezes);
-   pr_info("EEH: Notify device drivers to shutdown\n");
-   eeh_set_channel_state(pe, pci_channel_io_frozen);
-   eeh_set_irq_state(pe, false);
-   eeh_pe_report("error_detected(IO frozen)", pe,
- eeh_report_error, );
-   if ((pe->type & EEH_PE_PHB) &&
-   result != PCI_ERS_RESULT_NONE &&
-   result != PCI_ERS_RESULT_NEED_RESET)
-   result = PCI_ERS_RESULT_NEED_RESET;
-   }
+   pr_warn("EEH: This PCI device has failed %d times in the last hour and 
will be permanently disabled after %d failures.\n",
+   pe->freeze_count, eeh_max_freezes);
+   pr_info("EEH: Notify device drivers to shutdown\n");
+   eeh_set_channel_state(pe, pci_channel_io_frozen);
+   eeh_set_irq_state(pe, false);
+   eeh_pe_report("error_detected(IO frozen)", pe,
+ eeh_report_error, );
+   if (result == PCI_ERS_RESULT_DISCONNECT)
+   goto recover_failed;
+
+   /*
+* Error logged on a PHB are always fences which need a full
+* PHB reset to clear so force that to happen.
+*/
+   if ((pe->type & EEH_PE_PHB) && result != PCI_ERS_RESULT_NONE)
+   result = PCI_ERS_RESULT_NEED_RESET;
 
/* Get the current PCI slot state. This can take a long time,
 * sometimes over 300 seconds for certain systems.
 */
-   if (result != 

[PATCH 1/2] eeh: Small refactor of eeh_handle_normal_event

2021-10-15 Thread Daniel Axtens
The control flow of eeh_handle_normal_event is a bit tricky.

Break out one of the error handling paths - rather than be in
an else block, we'll make it part of the regular body of the
function and put a 'goto out;' in the true limb of the if.

Signed-off-by: Daniel Axtens 

---

This doesn't really make things that much simpler to comprehend
by itself but it makes the next patch a bit cleaner.
---
 arch/powerpc/kernel/eeh_driver.c | 69 
 1 file changed, 35 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 3eff6a4888e7..cb3ac555c944 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -1054,45 +1054,46 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
}
 
pr_info("EEH: Recovery successful.\n");
-   } else  {
-   /*
-* About 90% of all real-life EEH failures in the field
-* are due to poorly seated PCI cards. Only 10% or so are
-* due to actual, failed cards.
-*/
-   pr_err("EEH: Unable to recover from failure from 
PHB#%x-PE#%x.\n"
-  "Please try reseating or replacing it\n",
-   pe->phb->global_number, pe->addr);
+   goto out;
+   }
 
-   eeh_slot_error_detail(pe, EEH_LOG_PERM);
+   /*
+* About 90% of all real-life EEH failures in the field
+* are due to poorly seated PCI cards. Only 10% or so are
+* due to actual, failed cards.
+*/
+   pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
+   "Please try reseating or replacing it\n",
+   pe->phb->global_number, pe->addr);
 
-   /* Notify all devices that they're about to go down. */
-   eeh_set_channel_state(pe, pci_channel_io_perm_failure);
-   eeh_set_irq_state(pe, false);
-   eeh_pe_report("error_detected(permanent failure)", pe,
- eeh_report_failure, NULL);
+   eeh_slot_error_detail(pe, EEH_LOG_PERM);
 
-   /* Mark the PE to be removed permanently */
-   eeh_pe_state_mark(pe, EEH_PE_REMOVED);
+   /* Notify all devices that they're about to go down. */
+   eeh_set_channel_state(pe, pci_channel_io_perm_failure);
+   eeh_set_irq_state(pe, false);
+   eeh_pe_report("error_detected(permanent failure)", pe,
+ eeh_report_failure, NULL);
 
-   /*
-* Shut down the device drivers for good. We mark
-* all removed devices correctly to avoid access
-* the their PCI config any more.
-*/
-   if (pe->type & EEH_PE_VF) {
-   eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
-   eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
-   } else {
-   eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
-   eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+   /* Mark the PE to be removed permanently */
+   eeh_pe_state_mark(pe, EEH_PE_REMOVED);
 
-   pci_lock_rescan_remove();
-   pci_hp_remove_devices(bus);
-   pci_unlock_rescan_remove();
-   /* The passed PE should no longer be used */
-   return;
-   }
+   /*
+* Shut down the device drivers for good. We mark
+* all removed devices correctly to avoid access
+* the their PCI config any more.
+*/
+   if (pe->type & EEH_PE_VF) {
+   eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+   eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+   } else {
+   eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
+   eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+
+   pci_lock_rescan_remove();
+   pci_hp_remove_devices(bus);
+   pci_unlock_rescan_remove();
+   /* The passed PE should no longer be used */
+   return;
}
 
 out:
-- 
2.25.1



Re: [PATCH v2 08/13] asm-generic: Refactor dereference_[kernel]_function_descriptor()

2021-10-15 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:
> dereference_function_descriptor() and
> dereference_kernel_function_descriptor() are identical on the
> three architectures implementing them.
> 
> Make them common and put them out-of-line in kernel/extable.c
> which is one of the users and has similar type of functions.

We should be moving more stuff out of extable.c (including all the
kernel address tests). lib/kimage.c or kelf.c or something. 

It could be after your series though.

> 
> Reviewed-by: Kees Cook 
> Reviewed-by: Arnd Bergmann 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/ia64/include/asm/sections.h| 19 ---
>  arch/parisc/include/asm/sections.h  |  9 -
>  arch/parisc/kernel/process.c| 21 -
>  arch/powerpc/include/asm/sections.h | 23 ---
>  include/asm-generic/sections.h  |  2 ++
>  kernel/extable.c| 23 ++-
>  6 files changed, 24 insertions(+), 73 deletions(-)
> 
> diff --git a/arch/ia64/include/asm/sections.h 
> b/arch/ia64/include/asm/sections.h
> index 1aaed8882294..96c9bb500c34 100644
> --- a/arch/ia64/include/asm/sections.h
> +++ b/arch/ia64/include/asm/sections.h
> @@ -31,23 +31,4 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], 
> __end_gate_brl_fsys_b
>  extern char __start_unwind[], __end_unwind[];
>  extern char __start_ivt_text[], __end_ivt_text[];
>  
> -#undef dereference_function_descriptor
> -static inline void *dereference_function_descriptor(void *ptr)
> -{
> - struct fdesc *desc = ptr;
> - void *p;
> -
> - if (!get_kernel_nofault(p, (void *)>addr))
> - ptr = p;
> - return ptr;
> -}
> -
> -#undef dereference_kernel_function_descriptor
> -static inline void *dereference_kernel_function_descriptor(void *ptr)
> -{
> - if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
> - return ptr;
> - return dereference_function_descriptor(ptr);
> -}
> -
>  #endif /* _ASM_IA64_SECTIONS_H */
> diff --git a/arch/parisc/include/asm/sections.h 
> b/arch/parisc/include/asm/sections.h
> index 37b34b357cb5..6b1fe22baaf5 100644
> --- a/arch/parisc/include/asm/sections.h
> +++ b/arch/parisc/include/asm/sections.h
> @@ -13,13 +13,4 @@ typedef Elf64_Fdesc func_desc_t;
>  
>  extern char __alt_instructions[], __alt_instructions_end[];
>  
> -#ifdef CONFIG_64BIT
> -
> -#undef dereference_function_descriptor
> -void *dereference_function_descriptor(void *);
> -
> -#undef dereference_kernel_function_descriptor
> -void *dereference_kernel_function_descriptor(void *);
> -#endif
> -
>  #endif
> diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
> index 38ec4ae81239..7382576b52a8 100644
> --- a/arch/parisc/kernel/process.c
> +++ b/arch/parisc/kernel/process.c
> @@ -266,27 +266,6 @@ get_wchan(struct task_struct *p)
>   return 0;
>  }
>  
> -#ifdef CONFIG_64BIT
> -void *dereference_function_descriptor(void *ptr)
> -{
> - Elf64_Fdesc *desc = ptr;
> - void *p;
> -
> - if (!get_kernel_nofault(p, (void *)>addr))
> - ptr = p;
> - return ptr;
> -}
> -
> -void *dereference_kernel_function_descriptor(void *ptr)
> -{
> - if (ptr < (void *)__start_opd ||
> - ptr >= (void *)__end_opd)
> - return ptr;
> -
> - return dereference_function_descriptor(ptr);
> -}
> -#endif
> -
>  static inline unsigned long brk_rnd(void)
>  {
>   return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT;
> diff --git a/arch/powerpc/include/asm/sections.h 
> b/arch/powerpc/include/asm/sections.h
> index 1322d7b2f1a3..fbfe1957edbe 100644
> --- a/arch/powerpc/include/asm/sections.h
> +++ b/arch/powerpc/include/asm/sections.h
> @@ -72,29 +72,6 @@ static inline int overlaps_kernel_text(unsigned long 
> start, unsigned long end)
>   (unsigned long)_stext < end;
>  }
>  
> -#ifdef PPC64_ELF_ABI_v1
> -
> -#undef dereference_function_descriptor
> -static inline void *dereference_function_descriptor(void *ptr)
> -{
> - struct ppc64_opd_entry *desc = ptr;
> - void *p;
> -
> - if (!get_kernel_nofault(p, (void *)>addr))
> - ptr = p;
> - return ptr;
> -}
> -
> -#undef dereference_kernel_function_descriptor
> -static inline void *dereference_kernel_function_descriptor(void *ptr)
> -{
> - if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
> - return ptr;
> -
> - return dereference_function_descriptor(ptr);
> -}
> -#endif /* PPC64_ELF_ABI_v1 */
> -
>  #endif
>  
>  #endif /* __KERNEL__ */
> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index cbec7d5f1678..76163883c6ff 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -60,6 +60,8 @@ extern __visible const void __nosave_begin, __nosave_end;
>  
>  /* Function descriptor handling (if any).  Override in asm/sections.h */
>  #ifdef HAVE_FUNCTION_DESCRIPTORS
> +void 

[PATCH] macintosh: ams: replace snprintf in show functions with sysfs_emit

2021-10-15 Thread Qing Wang
show() must not use snprintf() when formatting the value to be
returned to user space.

Fix the following coccicheck warning:
drivers/macintosh/ams/ams-core.c:53: WARNING: use scnprintf or sprintf.

Use sysfs_emit instead of scnprintf or sprintf makes more sense.

Signed-off-by: Qing Wang 
---
 drivers/macintosh/ams/ams-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/ams/ams-core.c b/drivers/macintosh/ams/ams-core.c
index 01eeb23..877e8cb 100644
--- a/drivers/macintosh/ams/ams-core.c
+++ b/drivers/macintosh/ams/ams-core.c
@@ -50,7 +50,7 @@ static ssize_t ams_show_current(struct device *dev,
ams_sensors(, , );
mutex_unlock(_info.lock);
 
-   return snprintf(buf, PAGE_SIZE, "%d %d %d\n", x, y, z);
+   return sysfs_emit(buf, "%d %d %d\n", x, y, z);
 }
 
 static DEVICE_ATTR(current, S_IRUGO, ams_show_current, NULL);
-- 
2.7.4



Re: [PATCH v2 06/13] asm-generic: Use HAVE_FUNCTION_DESCRIPTORS to define associated stubs

2021-10-15 Thread Christophe Leroy




Le 15/10/2021 à 08:16, Nicholas Piggin a écrit :

Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:

Replace HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR by
HAVE_FUNCTION_DESCRIPTORS and use it instead of
'dereference_function_descriptor' macro to know
whether an arch has function descriptors.

To limit churn in one of the following patches, use
an #ifdef/#else construct with empty first part
instead of an #ifndef in asm-generic/sections.h


Is it worth putting this into Kconfig if you're going to
change it? In any case


That was what I wanted to do in the begining but how can I do that in 
Kconfig ?


#ifdef __powerpc64__
#if defined(_CALL_ELF) && _CALL_ELF == 2
#define PPC64_ELF_ABI_v2
#else
#define PPC64_ELF_ABI_v1
#endif
#endif /* __powerpc64__ */

#ifdef PPC64_ELF_ABI_v1
#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1

Christophe



Reviewed-by: Nicholas Piggin 



Reviewed-by: Kees Cook 
Signed-off-by: Christophe Leroy 
---
  arch/ia64/include/asm/sections.h| 5 +++--
  arch/parisc/include/asm/sections.h  | 6 --
  arch/powerpc/include/asm/sections.h | 6 --
  include/asm-generic/sections.h  | 3 ++-
  include/linux/kallsyms.h| 2 +-
  5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h
index 35f24e52149a..6e55e545bf02 100644
--- a/arch/ia64/include/asm/sections.h
+++ b/arch/ia64/include/asm/sections.h
@@ -9,6 +9,9 @@
  
  #include 

  #include 
+
+#define HAVE_FUNCTION_DESCRIPTORS 1
+
  #include 
  
  extern char __phys_per_cpu_start[];

@@ -27,8 +30,6 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], 
__end_gate_brl_fsys_b
  extern char __start_unwind[], __end_unwind[];
  extern char __start_ivt_text[], __end_ivt_text[];
  
-#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1

-
  #undef dereference_function_descriptor
  static inline void *dereference_function_descriptor(void *ptr)
  {
diff --git a/arch/parisc/include/asm/sections.h 
b/arch/parisc/include/asm/sections.h
index bb52aea0cb21..85149a89ff3e 100644
--- a/arch/parisc/include/asm/sections.h
+++ b/arch/parisc/include/asm/sections.h
@@ -2,6 +2,10 @@
  #ifndef _PARISC_SECTIONS_H
  #define _PARISC_SECTIONS_H
  
+#ifdef CONFIG_64BIT

+#define HAVE_FUNCTION_DESCRIPTORS 1
+#endif
+
  /* nothing to see, move along */
  #include 
  
@@ -9,8 +13,6 @@ extern char __alt_instructions[], __alt_instructions_end[];
  
  #ifdef CONFIG_64BIT
  
-#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1

-
  #undef dereference_function_descriptor
  void *dereference_function_descriptor(void *);
  
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h

index 32e7035863ac..bba97b8c38cf 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -8,6 +8,10 @@
  
  #define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
  
+#ifdef PPC64_ELF_ABI_v1

+#define HAVE_FUNCTION_DESCRIPTORS 1
+#endif
+
  #include 
  
  extern bool init_mem_is_free;

@@ -69,8 +73,6 @@ static inline int overlaps_kernel_text(unsigned long start, 
unsigned long end)
  
  #ifdef PPC64_ELF_ABI_v1
  
-#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1

-
  #undef dereference_function_descriptor
  static inline void *dereference_function_descriptor(void *ptr)
  {
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index d16302d3eb59..b677e926e6b3 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -59,7 +59,8 @@ extern char __noinstr_text_start[], __noinstr_text_end[];
  extern __visible const void __nosave_begin, __nosave_end;
  
  /* Function descriptor handling (if any).  Override in asm/sections.h */

-#ifndef dereference_function_descriptor
+#ifdef HAVE_FUNCTION_DESCRIPTORS
+#else
  #define dereference_function_descriptor(p) ((void *)(p))
  #define dereference_kernel_function_descriptor(p) ((void *)(p))
  #endif
diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index a1d6fc82d7f0..9f277baeb559 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -57,7 +57,7 @@ static inline int is_ksym_addr(unsigned long addr)
  
  static inline void *dereference_symbol_descriptor(void *ptr)
  {
-#ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR
+#ifdef HAVE_FUNCTION_DESCRIPTORS
struct module *mod;
  
  	ptr = dereference_kernel_function_descriptor(ptr);

--
2.31.1





Re: [PATCH v2 06/13] asm-generic: Use HAVE_FUNCTION_DESCRIPTORS to define associated stubs

2021-10-15 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:
> Replace HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR by
> HAVE_FUNCTION_DESCRIPTORS and use it instead of
> 'dereference_function_descriptor' macro to know
> whether an arch has function descriptors.
> 
> To limit churn in one of the following patches, use
> an #ifdef/#else construct with empty first part
> instead of an #ifndef in asm-generic/sections.h

Is it worth putting this into Kconfig if you're going to
change it? In any case

Reviewed-by: Nicholas Piggin 

> 
> Reviewed-by: Kees Cook 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/ia64/include/asm/sections.h| 5 +++--
>  arch/parisc/include/asm/sections.h  | 6 --
>  arch/powerpc/include/asm/sections.h | 6 --
>  include/asm-generic/sections.h  | 3 ++-
>  include/linux/kallsyms.h| 2 +-
>  5 files changed, 14 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/ia64/include/asm/sections.h 
> b/arch/ia64/include/asm/sections.h
> index 35f24e52149a..6e55e545bf02 100644
> --- a/arch/ia64/include/asm/sections.h
> +++ b/arch/ia64/include/asm/sections.h
> @@ -9,6 +9,9 @@
>  
>  #include <linux/elf.h>
>  #include <linux/uaccess.h>
> +
> +#define HAVE_FUNCTION_DESCRIPTORS 1
> +
>  #include <asm-generic/sections.h>
>  
>  extern char __phys_per_cpu_start[];
> @@ -27,8 +30,6 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], 
> __end_gate_brl_fsys_b
>  extern char __start_unwind[], __end_unwind[];
>  extern char __start_ivt_text[], __end_ivt_text[];
>  
> -#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
> -
>  #undef dereference_function_descriptor
>  static inline void *dereference_function_descriptor(void *ptr)
>  {
> diff --git a/arch/parisc/include/asm/sections.h 
> b/arch/parisc/include/asm/sections.h
> index bb52aea0cb21..85149a89ff3e 100644
> --- a/arch/parisc/include/asm/sections.h
> +++ b/arch/parisc/include/asm/sections.h
> @@ -2,6 +2,10 @@
>  #ifndef _PARISC_SECTIONS_H
>  #define _PARISC_SECTIONS_H
>  
> +#ifdef CONFIG_64BIT
> +#define HAVE_FUNCTION_DESCRIPTORS 1
> +#endif
> +
>  /* nothing to see, move along */
>  #include <asm-generic/sections.h>
>  
> @@ -9,8 +13,6 @@ extern char __alt_instructions[], __alt_instructions_end[];
>  
>  #ifdef CONFIG_64BIT
>  
> -#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
> -
>  #undef dereference_function_descriptor
>  void *dereference_function_descriptor(void *);
>  
> diff --git a/arch/powerpc/include/asm/sections.h 
> b/arch/powerpc/include/asm/sections.h
> index 32e7035863ac..bba97b8c38cf 100644
> --- a/arch/powerpc/include/asm/sections.h
> +++ b/arch/powerpc/include/asm/sections.h
> @@ -8,6 +8,10 @@
>  
>  #define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
>  
> +#ifdef PPC64_ELF_ABI_v1
> +#define HAVE_FUNCTION_DESCRIPTORS 1
> +#endif
> +
>  #include <asm-generic/sections.h>
>  
>  extern bool init_mem_is_free;
> @@ -69,8 +73,6 @@ static inline int overlaps_kernel_text(unsigned long start, 
> unsigned long end)
>  
>  #ifdef PPC64_ELF_ABI_v1
>  
> -#define HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR 1
> -
>  #undef dereference_function_descriptor
>  static inline void *dereference_function_descriptor(void *ptr)
>  {
> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index d16302d3eb59..b677e926e6b3 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -59,7 +59,8 @@ extern char __noinstr_text_start[], __noinstr_text_end[];
>  extern __visible const void __nosave_begin, __nosave_end;
>  
>  /* Function descriptor handling (if any).  Override in asm/sections.h */
> -#ifndef dereference_function_descriptor
> +#ifdef HAVE_FUNCTION_DESCRIPTORS
> +#else
>  #define dereference_function_descriptor(p) ((void *)(p))
>  #define dereference_kernel_function_descriptor(p) ((void *)(p))
>  #endif
> diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
> index a1d6fc82d7f0..9f277baeb559 100644
> --- a/include/linux/kallsyms.h
> +++ b/include/linux/kallsyms.h
> @@ -57,7 +57,7 @@ static inline int is_ksym_addr(unsigned long addr)
>  
>  static inline void *dereference_symbol_descriptor(void *ptr)
>  {
> -#ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR
> +#ifdef HAVE_FUNCTION_DESCRIPTORS
>   struct module *mod;
>  
>   ptr = dereference_kernel_function_descriptor(ptr);
> -- 
> 2.31.1
> 
> 
> 


Re: [PATCH v2 03/13] powerpc: Remove func_descr_t

2021-10-15 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of October 15, 2021 3:19 pm:
> 
> 
> Le 15/10/2021 à 00:17, Daniel Axtens a écrit :
>> Christophe Leroy  writes:
>> 
>>> 'func_descr_t' is redundant with 'struct ppc64_opd_entry'
>> 
>> So, if I understand the overall direction of the series, you're
>> consolidating powerpc around one single type for function descriptors,
>> and then you're creating a generic typedef so that generic code can
>> always do ((func_desc_t)x)->addr to get the address of a function out of
>> a function descriptor regardless of arch. (And regardless of whether the
>> arch uses function descriptors or not.)
> 
> An architecture not using function descriptors won't do much with 
> ((func_desc_t *)x)->addr. This is just done to allow building stuff 
> regardless.
> 
> I prefer something like
> 
>   if (have_function_descriptors())
>   addr = (func_desc_t *)ptr)->addr;
>   else
>   addr = ptr;

If you make a generic data type for architectures without function 
descriptors as such

typedef struct func_desc {
char addr[0];
} func_desc_t;

Then you can do that with no if. The downside is your addr has to be 
char * and it's maybe not helpful to be so "clever".

>>   - why pick ppc64_opd_entry over func_descr_t?
> 
> Good question. At the begining it was because it was in UAPI headers, 
> and also because it was the one used in our 
> dereference_function_descriptor().
> 
> But at the end maybe that's not the more logical choice. I need to look 
> a bit more.

I would prefer the func_descr_t (with 'toc' and 'env') if you're going 
to change it.

Thanks,
Nick


Re: [PATCH v2 02/13] powerpc: Rename 'funcaddr' to 'addr' in 'struct ppc64_opd_entry'

2021-10-15 Thread Nicholas Piggin
Excerpts from Christophe Leroy's message of October 14, 2021 3:49 pm:
> There are three architectures with function descriptors, try to
> have common names for the address they contain in order to
> refactor some functions into generic functions later.
> 
> powerpc has 'funcaddr'
> ia64 has 'ip'
> parisc has 'addr'
> 
> Vote for 'addr' and update 'struct ppc64_opd_entry' accordingly.

It is the "address of the entry point of the function" according to 
powerpc ELF spec, so addr seems fine.

Reviewed-by: Nicholas Piggin 

> 
> Reviewed-by: Kees Cook 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/elf.h  | 2 +-
>  arch/powerpc/include/asm/sections.h | 2 +-
>  arch/powerpc/kernel/module_64.c | 6 +++---
>  3 files changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
> index a4406714c060..bb0f278f9ed4 100644
> --- a/arch/powerpc/include/asm/elf.h
> +++ b/arch/powerpc/include/asm/elf.h
> @@ -178,7 +178,7 @@ void relocate(unsigned long final_address);
>  
>  /* There's actually a third entry here, but it's unused */
>  struct ppc64_opd_entry {
> - unsigned long funcaddr;
> + unsigned long addr;
>   unsigned long r2;
>  };
>  
> diff --git a/arch/powerpc/include/asm/sections.h 
> b/arch/powerpc/include/asm/sections.h
> index 6e4af4492a14..32e7035863ac 100644
> --- a/arch/powerpc/include/asm/sections.h
> +++ b/arch/powerpc/include/asm/sections.h
> @@ -77,7 +77,7 @@ static inline void *dereference_function_descriptor(void 
> *ptr)
>   struct ppc64_opd_entry *desc = ptr;
>   void *p;
>  
> - if (!get_kernel_nofault(p, (void *)>funcaddr))
> + if (!get_kernel_nofault(p, (void *)>addr))
>   ptr = p;
>   return ptr;
>  }
> diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
> index 6baa676e7cb6..82908c9be627 100644
> --- a/arch/powerpc/kernel/module_64.c
> +++ b/arch/powerpc/kernel/module_64.c
> @@ -72,11 +72,11 @@ static func_desc_t func_desc(unsigned long addr)
>  }
>  static unsigned long func_addr(unsigned long addr)
>  {
> - return func_desc(addr).funcaddr;
> + return func_desc(addr).addr;
>  }
>  static unsigned long stub_func_addr(func_desc_t func)
>  {
> - return func.funcaddr;
> + return func.addr;
>  }
>  static unsigned int local_entry_offset(const Elf64_Sym *sym)
>  {
> @@ -187,7 +187,7 @@ static int relacmp(const void *_x, const void *_y)
>  static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
>   const Elf64_Shdr *sechdrs)
>  {
> - /* One extra reloc so it's always 0-funcaddr terminated */
> + /* One extra reloc so it's always 0-addr terminated */
>   unsigned long relocs = 1;
>   unsigned i;
>  
> -- 
> 2.31.1
> 
> 
>