On 5/18/2026 2:59 PM, Pavel Begunkov wrote: >> FYI, I really want SGL support before this gets merged, but ignoring that >> for now: > > I was hoping to let the Samsung guys send a follow-up they already have, > but I'll ask them about taking it into this patch set.
I had done patches on top of v3 adding SGL support and PRP list reuse optimization for the dmabuf path. Branch: https://github.com/SamsungDS/linux/commits/rw-dmabuf-v3-nvme-opt/ Also pasting the SGL patch here for quick reference: Subject: [PATCH 1/2] nvme-pci: add sgl support for dmabuf path Handle dmabuf-backed requests through the SGL setup path too. Use the cached dmabuf sg_table and keep PRP fallback where allowed. Signed-off-by: Anuj Gupta <[email protected]> --- drivers/nvme/host/pci.c | 194 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 0a49c94dd675..31e37ab8769b 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1270,6 +1270,14 @@ static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_DATA_DESC << 4; } +static void nvme_pci_sgl_set_data_addr(struct nvme_sgl_desc *sge, + dma_addr_t addr, u32 len) +{ + sge->addr = cpu_to_le64(addr); + sge->length = cpu_to_le32(len); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; +} + static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, dma_addr_t dma_addr, int entries) { @@ -1321,6 +1329,176 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req, return iter->status; } +static unsigned int nvme_pci_dmabuf_sgl_nents(struct request *req, + dma_addr_t *first_dma, u32 *first_len) +{ + struct bio *bio = req->bio; + struct nvme_dmabuf_map *map; + struct scatterlist *sg; + unsigned long tmp; + size_t offset = bio->bi_iter.bi_bvec_done; + size_t remaining = blk_rq_payload_bytes(req); + dma_addr_t last_end = 0; + unsigned int nents = 0; + dma_addr_t dma = 0; + u32 len = 0; + bool have = false; + + map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base); + + for_each_sgtable_dma_sg(map->sgt, sg, tmp) { + size_t sg_len = sg_dma_len(sg); + dma_addr_t addr = sg_dma_address(sg); + + if (!remaining) + break; + if (offset >= sg_len) { + offset -= sg_len; + 
continue; + } + + addr += offset; + sg_len -= offset; + offset = 0; + + while (sg_len && remaining) { + u32 chunk = min_t(size_t, remaining, sg_len); + + if (!have || last_end != addr) { + nents++; + if (nents == 1) { + dma = addr; + len = chunk; + } + } else if (nents == 1) { + len += chunk; + } + + have = true; + last_end = addr + chunk; + addr += chunk; + sg_len -= chunk; + remaining -= chunk; + } + } + + if (unlikely(remaining)) + return 0; + + *first_dma = dma; + *first_len = len; + return nents; +} + +static unsigned int nvme_pci_dmabuf_avg_seg_size(struct request *req) +{ + dma_addr_t first_dma; + u32 first_len; + unsigned int nseg; + + nseg = nvme_pci_dmabuf_sgl_nents(req, &first_dma, &first_len); + if (!nseg) + return 0; + return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg); +} + +static blk_status_t nvme_rq_setup_dmabuf_sgl(struct request *req, + struct nvme_queue *nvmeq) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct bio *bio = req->bio; + struct nvme_dmabuf_map *map; + size_t length = blk_rq_payload_bytes(req); + struct nvme_sgl_desc *sg_list = NULL; + dma_addr_t sgl_dma = 0, first_dma, last_end = 0; + unsigned int entries, mapped = 0; + unsigned long tmp; + struct scatterlist *sg; + size_t offset, remaining; + u32 first_len; + bool have = false; + + map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base); + + entries = nvme_pci_dmabuf_sgl_nents(req, &first_dma, &first_len); + if (!entries) + return BLK_STS_IOERR; + if (entries > NVME_MAX_SEGS) + return BLK_STS_AGAIN; + + iod->cmd.common.flags = NVME_CMD_SGL_METABUF; + iod->total_len = length; + + nvme_sync_dma(nvmeq->dev, req, false); + + if (entries == 1) { + nvme_pci_sgl_set_data_addr(&iod->cmd.common.dptr.sgl, first_dma, + first_len); + return BLK_STS_OK; + } + + if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) + iod->flags |= IOD_SMALL_DESCRIPTOR; + + sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC, + &sgl_dma); + if (!sg_list) + return 
BLK_STS_RESOURCE; + iod->descriptors[iod->nr_descriptors++] = sg_list; + + offset = bio->bi_iter.bi_bvec_done; + remaining = length; + + for_each_sgtable_dma_sg(map->sgt, sg, tmp) { + size_t sg_len = sg_dma_len(sg); + dma_addr_t addr = sg_dma_address(sg); + + if (!remaining) + break; + if (offset >= sg_len) { + offset -= sg_len; + continue; + } + + addr += offset; + sg_len -= offset; + offset = 0; + + while (sg_len && remaining) { + u32 chunk = min_t(size_t, remaining, sg_len); + + if (have && last_end == addr) { + u32 old = le32_to_cpu(sg_list[mapped - 1].length); + + sg_list[mapped - 1].length = + cpu_to_le32(old + chunk); + } else { + if (WARN_ON_ONCE(mapped == entries)) + goto err_free; + nvme_pci_sgl_set_data_addr(&sg_list[mapped++], + addr, chunk); + } + + have = true; + last_end = addr + chunk; + addr += chunk; + sg_len -= chunk; + remaining -= chunk; + } + } + + if (unlikely(remaining)) + goto err_free; + + nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); + return BLK_STS_OK; + +err_free: + iod->nr_descriptors--; + dma_pool_free(nvme_dma_pool(nvmeq, iod), sg_list, sgl_dma); + return BLK_STS_IOERR; +} + static blk_status_t nvme_pci_setup_data_simple(struct request *req, enum nvme_use_sgl use_sgl) { @@ -1369,8 +1547,22 @@ static blk_status_t nvme_map_data(struct request *req) struct blk_dma_iter iter; blk_status_t ret; - if (nvme_rq_is_dmabuf_attached(req)) + if (nvme_rq_is_dmabuf_attached(req)) { + if (use_sgl == SGL_FORCED) { + ret = nvme_rq_setup_dmabuf_sgl(req, nvmeq); + /* Regular path doesn't fall back if SGLs are forced. */ + return ret == BLK_STS_AGAIN ? BLK_STS_IOERR : ret; + } + + if (use_sgl == SGL_SUPPORTED && sgl_threshold && + nvme_pci_dmabuf_avg_seg_size(req) >= sgl_threshold) { + ret = nvme_rq_setup_dmabuf_sgl(req, nvmeq); + if (ret != BLK_STS_AGAIN) + return ret; + } + return nvme_rq_setup_dmabuf_map(req, nvmeq); + } /* * Try to skip the DMA iterator for single segment requests, as that -- 2.43.0
