On Sun, Mar 8, 2026 at 11:30 PM Sungwoo Kim <[email protected]> wrote:
>
> The numa_node can be < 0 since NUMA_NO_NODE = -1. However,
> struct blk_mq_hw_ctx{} defines numa_node as unsigned int. As a result,
> numa_node is set to UINT_MAX for NUMA_NO_NODE in blk_mq_alloc_hctx().

The node argument to blk_mq_alloc_hctx() comes from
blk_mq_alloc_and_init_hctx(), which is called by
__blk_mq_realloc_hw_ctxs() with int node = blk_mq_get_hctx_node(set, i).
node = NUMA_NO_NODE would suggest that blk_mq_hw_queue_to_node()
doesn't find any CPU affinitized to the queue. Is that even possible?

>
> Later, nvme_setup_descriptor_pools() accesses
> descriptor_pools[numa_node]. Due to the above, it tries to access
> descriptor_pools[UINT_MAX]. The address is garbage but accessible
> because it is canonical and still within the slab memory range.
> Therefore, no page fault occurs, and KASAN cannot detect this since it
> is beyond the redzones.
>
> Subsequently, normal I/O calls dma_pool_alloc() with the garbage pool
> address. pool->next_block contains a wild pointer, causing a general
> protection fault (GPF).
>
> To fix this, this patch changes the type of numa_node to int and adds
> a check for NUMA_NO_NODE.
>
> Log:
>
> Oops: general protection fault, probably for non-canonical address 
> 0xe9803b040854d02c: 0000 [#1] SMP KASAN PTI
> KASAN: maybe wild-memory-access in range 
> [0x4c01f82042a68160-0x4c01f82042a68167][FEMU] Err: I/O cmd failed: opcode=0x2 
> status=0x4002
> CPU: 0 UID: 0 PID: 112363 Comm: systemd-udevd Not tainted 6.19.0-dirty #10 
> PREEMPT(voluntary)
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> RIP: 0010:pool_block_pop mm/dmapool.c:187 [inline]
> RIP: 0010:dma_pool_alloc+0x110/0x990 mm/dmapool.c:417
> Code: 00 0f 85 a4 07 00 00 4c 8b 63 58 4d 85 e4 0f 84 12 01 00 00 e8 41 1d 93 
> ff 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <80> 3c 02 00 0f 85 a7 
> 07 00 00 49 8b 04 24 48 8d 7b 68 48 89 fa 48
> RSP: 0018:ffffc90002b9efd0 EFLAGS: 00010003
> RAX: dffffc0000000000 RBX: ffff888005466800 RCX: ffffffff94faab7f
> RDX: 09803f040854d02c RSI: 6c9b26c9b26c9b27 RDI: ffff88800c725ea0
> RBP: ffffc90002b9f060 R08: 0000000000000001 R09: 0000000000000001
> R10: 0000000000000003 R11: 0000000000000000 R12: 4c01f82042a68164
> R13: ffff888005466800 R14: 0000000000000820 R15: ffff888007b29000
> FS:  00007f2abc4ff8c0(0000) GS:ffff8880d1ff7000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000056360eb89000 CR3: 000000000a480000 CR4: 00000000000006f0
> Call Trace:
>  <TASK>
>  nvme_pci_setup_data_prp drivers/nvme/host/pci.c:906 [inline]
>  nvme_map_data drivers/nvme/host/pci.c:1114 [inline]
>  nvme_prep_rq.part.0+0x17d3/0x3c90 drivers/nvme/host/pci.c:1243
>  nvme_prep_rq drivers/nvme/host/pci.c:1239 [inline]
>  nvme_prep_rq_batch drivers/nvme/host/pci.c:1321 [inline]
>  nvme_queue_rqs+0x37b/0x8a0 drivers/nvme/host/pci.c:1336
>  __blk_mq_flush_list block/blk-mq.c:2848 [inline]
>  __blk_mq_flush_list+0xaa/0xe0 block/blk-mq.c:2844
>  blk_mq_dispatch_queue_requests+0x4f5/0x990 block/blk-mq.c:2893
>  blk_mq_flush_plug_list+0x232/0x650 block/blk-mq.c:2981
>  __blk_flush_plug+0x2c3/0x510 block/blk-core.c:1225
>  blk_finish_plug block/blk-core.c:1252 [inline]
>  blk_finish_plug+0x64/0xc0 block/blk-core.c:1249
>  read_pages+0x6bd/0x9d0 mm/readahead.c:176
>  page_cache_ra_unbounded+0x659/0x950 mm/readahead.c:269
>  do_page_cache_ra mm/readahead.c:332 [inline]
>  force_page_cache_ra+0x282/0x3a0 mm/readahead.c:361
>  page_cache_sync_ra+0x201/0xbf0 mm/readahead.c:579
>  filemap_get_pages+0x3be/0x1990 mm/filemap.c:2690
>  filemap_read+0x3ea/0xdf0 mm/filemap.c:2800
>  blkdev_read_iter+0x1b8/0x520 block/fops.c:856
>  new_sync_read fs/read_write.c:491 [inline]
>  vfs_read+0x90f/0xd80 fs/read_write.c:572
>  ksys_read+0x14e/0x280 fs/read_write.c:715
>  __do_sys_read fs/read_write.c:724 [inline]
>  __se_sys_read fs/read_write.c:722 [inline]
>  __x64_sys_read+0x7b/0xc0 fs/read_write.c:722
>  x64_sys_call+0x17ec/0x21b0 arch/x86/include/generated/asm/syscalls_64.h:1
>  do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
>  do_syscall_64+0x8b/0x1200 arch/x86/entry/syscall_64.c:94
>  entry_SYSCALL_64_after_hwframe+0x76/0x7e
> RIP: 0033:0x7f2abc7b204e
> Code: 0f 1f 40 00 48 8b 15 79 af 00 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb 
> ba 0f 1f 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 
> 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28
> RSP: 002b:00007fff07113cb8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
> RAX: ffffffffffffffda RBX: 000056360eb6a528 RCX: 00007f2abc7b204e
> RDX: 0000000000040000 RSI: 000056360eb6a538 RDI: 000000000000000f
> RBP: 000056360e8d23d0 R08: 000056360eb6a510 R09: 00007f2abc79abe0
> R10: 0000000000040050 R11: 0000000000000246 R12: 000000003ff80000
> R13: 0000000000040000 R14: 000056360eb6a510 R15: 000056360e8d2420
>  </TASK>
> Modules linked in:
>
> Fixes: 320ae51feed5 ("blk-mq: new multi-queue block IO queueing mechanism")
> Fixes: d977506f8863 ("nvme-pci: make PRP list DMA pools per-NUMA-node")
> Acked-by: Chao Shi <[email protected]>
> Acked-by: Weidong Zhu <[email protected]>
> Acked-by: Dave Tian <[email protected]>
> Signed-off-by: Sungwoo Kim <[email protected]>
> ---
>  block/bsg-lib.c                   |  2 +-
>  drivers/block/mtip32xx/mtip32xx.c |  2 +-
>  drivers/block/nbd.c               |  2 +-
>  drivers/md/dm-rq.c                |  2 +-
>  drivers/mmc/core/queue.c          |  2 +-
>  drivers/mtd/ubi/block.c           |  2 +-
>  drivers/nvme/host/apple.c         |  2 +-
>  drivers/nvme/host/fc.c            |  2 +-
>  drivers/nvme/host/pci.c           | 11 ++++++++---
>  drivers/nvme/host/rdma.c          |  2 +-
>  drivers/nvme/host/tcp.c           |  2 +-
>  drivers/nvme/target/loop.c        |  2 +-
>  drivers/scsi/scsi_lib.c           |  2 +-
>  include/linux/blk-mq.h            |  4 ++--
>  14 files changed, 22 insertions(+), 17 deletions(-)
>
> diff --git a/block/bsg-lib.c b/block/bsg-lib.c
> index 9ceb5d0832f5..e93b1018a346 100644
> --- a/block/bsg-lib.c
> +++ b/block/bsg-lib.c
> @@ -299,7 +299,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx 
> *hctx,
>
>  /* called right after the request is allocated for the request_queue */
>  static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
> -                      unsigned int hctx_idx, unsigned int numa_node)
> +                      unsigned int hctx_idx, int numa_node)
>  {
>         struct bsg_job *job = blk_mq_rq_to_pdu(req);
>
> diff --git a/drivers/block/mtip32xx/mtip32xx.c 
> b/drivers/block/mtip32xx/mtip32xx.c
> index 567192e371a8..8aedba9b5690 100644
> --- a/drivers/block/mtip32xx/mtip32xx.c
> +++ b/drivers/block/mtip32xx/mtip32xx.c
> @@ -3340,7 +3340,7 @@ static void mtip_free_cmd(struct blk_mq_tag_set *set, 
> struct request *rq,
>  }
>
>  static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
> -                        unsigned int hctx_idx, unsigned int numa_node)
> +                        unsigned int hctx_idx, int numa_node)
>  {
>         struct driver_data *dd = set->driver_data;
>         struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index f6c33b21f69e..e1fac1c0c4cd 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -1888,7 +1888,7 @@ static void nbd_dbg_close(void)
>  #endif
>
>  static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
> -                           unsigned int hctx_idx, unsigned int numa_node)
> +                           unsigned int hctx_idx, int numa_node)
>  {
>         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
>         cmd->nbd = set->driver_data;
> diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
> index a6ca92049c10..b687a209256b 100644
> --- a/drivers/md/dm-rq.c
> +++ b/drivers/md/dm-rq.c
> @@ -455,7 +455,7 @@ static void dm_start_request(struct mapped_device *md, 
> struct request *orig)
>  }
>
>  static int dm_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
> -                             unsigned int hctx_idx, unsigned int numa_node)
> +                             unsigned int hctx_idx, int numa_node)
>  {
>         struct mapped_device *md = set->driver_data;
>         struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
> diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
> index 284856c8f655..06cb29190a88 100644
> --- a/drivers/mmc/core/queue.c
> +++ b/drivers/mmc/core/queue.c
> @@ -203,7 +203,7 @@ static unsigned short mmc_get_max_segments(struct 
> mmc_host *host)
>  }
>
>  static int mmc_mq_init_request(struct blk_mq_tag_set *set, struct request 
> *req,
> -                              unsigned int hctx_idx, unsigned int numa_node)
> +                              unsigned int hctx_idx, int numa_node)
>  {
>         struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req);
>         struct mmc_queue *mq = set->driver_data;
> diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
> index b53fd147fa65..1c0bd2b36637 100644
> --- a/drivers/mtd/ubi/block.c
> +++ b/drivers/mtd/ubi/block.c
> @@ -312,7 +312,7 @@ static blk_status_t ubiblock_queue_rq(struct 
> blk_mq_hw_ctx *hctx,
>
>  static int ubiblock_init_request(struct blk_mq_tag_set *set,
>                 struct request *req, unsigned int hctx_idx,
> -               unsigned int numa_node)
> +               int numa_node)
>  {
>         struct ubiblock_pdu *pdu = blk_mq_rq_to_pdu(req);
>
> diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
> index ed61b97fde59..50ff5e9a168d 100644
> --- a/drivers/nvme/host/apple.c
> +++ b/drivers/nvme/host/apple.c
> @@ -819,7 +819,7 @@ static int apple_nvme_init_hctx(struct blk_mq_hw_ctx 
> *hctx, void *data,
>
>  static int apple_nvme_init_request(struct blk_mq_tag_set *set,
>                                    struct request *req, unsigned int hctx_idx,
> -                                  unsigned int numa_node)
> +                                  int numa_node)
>  {
>         struct apple_nvme_queue *q = set->driver_data;
>         struct apple_nvme *anv = queue_to_apple_nvme(q);
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 6948de3f438a..64d0c5d7613a 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2109,7 +2109,7 @@ __nvme_fc_init_request(struct nvme_fc_ctrl *ctrl,
>
>  static int
>  nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
> -               unsigned int hctx_idx, unsigned int numa_node)
> +               unsigned int hctx_idx, int numa_node)
>  {
>         struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data);
>         struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq);
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 3c83076a57e5..a5f12fc7655d 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -443,11 +443,16 @@ static bool nvme_dbbuf_update_and_check_event(u16 
> value, __le32 *dbbuf_db,
>  }
>
>  static struct nvme_descriptor_pools *
> -nvme_setup_descriptor_pools(struct nvme_dev *dev, unsigned numa_node)
> +nvme_setup_descriptor_pools(struct nvme_dev *dev, int numa_node)
>  {
> -       struct nvme_descriptor_pools *pools = 
> &dev->descriptor_pools[numa_node];
> +       struct nvme_descriptor_pools *pools;
>         size_t small_align = NVME_SMALL_POOL_SIZE;
>
> +       if (numa_node == NUMA_NO_NODE)
> +               pools = &dev->descriptor_pools[numa_node_id()];
> +       else
> +               pools = &dev->descriptor_pools[numa_node];

Simpler: normalize numa_node first, then index as before:

	if (numa_node == NUMA_NO_NODE)
		numa_node = numa_node_id();
	pools = &dev->descriptor_pools[numa_node];

> +
>         if (pools->small)
>                 return pools; /* already initialized */
>
> @@ -516,7 +521,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, 
> void *data,
>
>  static int nvme_pci_init_request(struct blk_mq_tag_set *set,
>                 struct request *req, unsigned int hctx_idx,
> -               unsigned int numa_node)
> +               int numa_node)
>  {
>         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 35c0822edb2d..c2514ef94028 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -292,7 +292,7 @@ static void nvme_rdma_exit_request(struct blk_mq_tag_set 
> *set,
>
>  static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
>                 struct request *rq, unsigned int hctx_idx,
> -               unsigned int numa_node)
> +               int numa_node)
>  {
>         struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
>         struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 69cb04406b47..385eef98081b 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -547,7 +547,7 @@ static void nvme_tcp_exit_request(struct blk_mq_tag_set 
> *set,
>
>  static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
>                 struct request *rq, unsigned int hctx_idx,
> -               unsigned int numa_node)
> +               int numa_node)
>  {
>         struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
>         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
> index fc8e7c9ad858..72a8ea70eae7 100644
> --- a/drivers/nvme/target/loop.c
> +++ b/drivers/nvme/target/loop.c
> @@ -202,7 +202,7 @@ static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl,
>
>  static int nvme_loop_init_request(struct blk_mq_tag_set *set,
>                 struct request *req, unsigned int hctx_idx,
> -               unsigned int numa_node)
> +               int numa_node)
>  {
>         struct nvme_loop_ctrl *ctrl = to_loop_ctrl(set->driver_data);
>         struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 4a902c9dfd8b..8958ad31ed2a 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1948,7 +1948,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx 
> *hctx,
>  }
>
>  static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request 
> *rq,
> -                               unsigned int hctx_idx, unsigned int numa_node)
> +                               unsigned int hctx_idx, int numa_node)
>  {
>         struct Scsi_Host *shost = set->driver_data;
>         struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index cae9e857aea4..1a5a3786522c 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -426,7 +426,7 @@ struct blk_mq_hw_ctx {
>         struct blk_mq_tags      *sched_tags;
>
>         /** @numa_node: NUMA node the storage adapter has been connected to. 
> */
> -       unsigned int            numa_node;
> +       int             numa_node;
>         /** @queue_num: Index of this hardware queue. */
>         unsigned int            queue_num;
>
> @@ -651,7 +651,7 @@ struct blk_mq_ops {
>          * flush request.
>          */
>         int (*init_request)(struct blk_mq_tag_set *set, struct request *,
> -                           unsigned int, unsigned int);
> +                           unsigned int, int);

Pre-existing, but naming these integer arguments (e.g. hctx_idx and
numa_node, as the driver implementations do) would be helpful for
documentation.

Best,
Caleb

>         /**
>          * @exit_request: Ditto for exit/teardown.
>          */
> --
> 2.47.3
>
>

Reply via email to