[PATCH 12/12] block: add special APIs for run-time disabling of discard and friends

2024-05-28 Thread Christoph Hellwig
A few drivers optimistically try to support discard, write zeroes and
secure erase and disable the features from the I/O completion handler
if the hardware can't support them.  This disable can't be done using
the atomic queue limits API because the I/O completion handlers can't
take sleeping locks or freeze the queue.  Keep the existing clearing
of the relevant field to zero, but replace the old blk_queue_max_*
APIs with new disable APIs that force the value to 0.
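The include/linux/blkdev.h side of the change is only visible in the diffstat
here; the new helpers presumably reduce to small wrappers that force the
relevant limit to zero, along these lines (a sketch only, not the exact hunk):

static inline void blk_queue_disable_discard(struct request_queue *q)
{
        q->limits.max_discard_sectors = 0;
}

static inline void blk_queue_disable_secure_erase(struct request_queue *q)
{
        q->limits.max_secure_erase_sectors = 0;
}

static inline void blk_queue_disable_write_zeroes(struct request_queue *q)
{
        q->limits.max_write_zeroes_sectors = 0;
}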

Signed-off-by: Christoph Hellwig 
---
 arch/um/drivers/ubd_kern.c   |  5 ++---
 block/blk-settings.c | 41 
 drivers/block/xen-blkfront.c |  4 ++--
 drivers/scsi/sd.c|  4 ++--
 include/linux/blkdev.h   | 28 ++--
 5 files changed, 28 insertions(+), 54 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index a79a3b7c33a647..7eae1519300fbd 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -475,10 +475,9 @@ static void ubd_handler(void)
struct request_queue *q = io_req->req->q;
 
if (req_op(io_req->req) == REQ_OP_DISCARD)
-   blk_queue_max_discard_sectors(q, 0);
+   blk_queue_disable_discard(q);
if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES)
-   blk_queue_max_write_zeroes_sectors(q,
-   0);
+   blk_queue_disable_write_zeroes(q);
}
blk_mq_end_request(io_req->req, io_req->error);
kfree(io_req);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0b038729608f4b..996f247fc98e80 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -293,47 +293,6 @@ int queue_limits_set(struct request_queue *q, struct 
queue_limits *lim)
 }
 EXPORT_SYMBOL_GPL(queue_limits_set);
 
-/**
- * blk_queue_max_discard_sectors - set max sectors for a single discard
- * @q:  the request queue for the device
- * @max_discard_sectors: maximum number of sectors to discard
- **/
-void blk_queue_max_discard_sectors(struct request_queue *q,
-   unsigned int max_discard_sectors)
-{
-   struct queue_limits *lim = &q->limits;
-
-   lim->max_hw_discard_sectors = max_discard_sectors;
-   lim->max_discard_sectors =
-   min(max_discard_sectors, lim->max_user_discard_sectors);
-}
-EXPORT_SYMBOL(blk_queue_max_discard_sectors);
-
-/**
- * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
- * @q:  the request queue for the device
- * @max_sectors: maximum number of sectors to secure_erase
- **/
-void blk_queue_max_secure_erase_sectors(struct request_queue *q,
-   unsigned int max_sectors)
-{
-   q->limits.max_secure_erase_sectors = max_sectors;
-}
-EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors);
-
-/**
- * blk_queue_max_write_zeroes_sectors - set max sectors for a single
- *  write zeroes
- * @q:  the request queue for the device
- * @max_write_zeroes_sectors: maximum number of sectors to write per command
- **/
-void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
-   unsigned int max_write_zeroes_sectors)
-{
-   q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors;
-}
-EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
-
 void disk_update_readahead(struct gendisk *disk)
 {
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index fd7c0ff2139cee..9b4ec3e4908cce 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1605,8 +1605,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
blkif_req(req)->error = BLK_STS_NOTSUPP;
info->feature_discard = 0;
info->feature_secdiscard = 0;
-   blk_queue_max_discard_sectors(rq, 0);
-   blk_queue_max_secure_erase_sectors(rq, 0);
+   blk_queue_disable_discard(rq);
+   blk_queue_disable_secure_erase(rq);
}
break;
case BLKIF_OP_FLUSH_DISKCACHE:
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 03e67936b27928..56fd523b3987a5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -837,7 +837,7 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd 
*scmd,
 static void sd_disable_discard(struct scsi_disk *sdkp)
 {
sdkp->provisioning_mode = SD_LBP_DISABLE;
-   blk_queue_max_discard_sectors(sdkp->disk->queue, 0);
+   blk_queue_disable_discard(sdkp->disk->queue);

[PATCH 10/12] sr: convert to the atomic queue limits API

2024-05-28 Thread Christoph Hellwig
Assign all queue limits through a local queue_limits variable and
queue_limits_commit_update so that we can't race updating them from
multiple places, and freeze the queue when updating them so that
in-progress I/O submissions don't see half-updated limits.

Also use the chance to clean up variable names to standard ones.
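The conversion follows the usual pattern for atomic limit updates; roughly (a
sketch, with q standing for the sr request queue and logical_block_size for
whatever limit changed):

        struct queue_limits lim;
        int err;

        lim = queue_limits_start_update(q);
        lim.logical_block_size = sector_size;   /* adjust the changed limits */
        blk_mq_freeze_queue(q);
        err = queue_limits_commit_update(q, &lim);
        blk_mq_unfreeze_queue(q);
        return err;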

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sr.c | 42 +-
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 7ab000942b97fc..3f491019103e0c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -111,7 +111,7 @@ static struct lock_class_key sr_bio_compl_lkclass;
 static int sr_open(struct cdrom_device_info *, int);
 static void sr_release(struct cdrom_device_info *);
 
-static void get_sectorsize(struct scsi_cd *);
+static int get_sectorsize(struct scsi_cd *);
 static int get_capabilities(struct scsi_cd *);
 
 static unsigned int sr_check_events(struct cdrom_device_info *cdi,
@@ -473,15 +473,15 @@ static blk_status_t sr_init_command(struct scsi_cmnd 
*SCpnt)
return BLK_STS_IOERR;
 }
 
-static void sr_revalidate_disk(struct scsi_cd *cd)
+static int sr_revalidate_disk(struct scsi_cd *cd)
 {
struct scsi_sense_hdr sshdr;
 
/* if the unit is not ready, nothing more to do */
if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
-   return;
+   return 0;
sr_cd_check(&cd->cdi);
-   get_sectorsize(cd);
+   return get_sectorsize(cd);
 }
 
 static int sr_block_open(struct gendisk *disk, blk_mode_t mode)
@@ -494,13 +494,16 @@ static int sr_block_open(struct gendisk *disk, blk_mode_t 
mode)
return -ENXIO;
 
scsi_autopm_get_device(sdev);
-   if (disk_check_media_change(disk))
-   sr_revalidate_disk(cd);
+   if (disk_check_media_change(disk)) {
+   ret = sr_revalidate_disk(cd);
+   if (ret)
+   goto out;
+   }
 
mutex_lock(&cd->lock);
ret = cdrom_open(&cd->cdi, mode);
mutex_unlock(&cd->lock);
-
+out:
scsi_autopm_put_device(sdev);
if (ret)
scsi_device_put(cd->device);
@@ -685,7 +688,9 @@ static int sr_probe(struct device *dev)
blk_pm_runtime_init(sdev->request_queue, dev);
 
dev_set_drvdata(dev, cd);
-   sr_revalidate_disk(cd);
+   error = sr_revalidate_disk(cd);
+   if (error)
+   goto unregister_cdrom;
 
error = device_add_disk(&sdev->sdev_gendev, disk, NULL);
if (error)
@@ -714,13 +719,14 @@ static int sr_probe(struct device *dev)
 }
 
 
-static void get_sectorsize(struct scsi_cd *cd)
+static int get_sectorsize(struct scsi_cd *cd)
 {
+   struct request_queue *q = cd->device->request_queue;
static const u8 cmd[10] = { READ_CAPACITY };
unsigned char buffer[8] = { };
-   int the_result;
+   struct queue_limits lim;
+   int err;
int sector_size;
-   struct request_queue *queue;
struct scsi_failure failure_defs[] = {
{
.result = SCMD_FAILURE_RESULT_ANY,
@@ -736,10 +742,10 @@ static void get_sectorsize(struct scsi_cd *cd)
};
 
/* Do the command and wait.. */
-   the_result = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer,
+   err = scsi_execute_cmd(cd->device, cmd, REQ_OP_DRV_IN, buffer,
  sizeof(buffer), SR_TIMEOUT, MAX_RETRIES,
  &exec_args);
-   if (the_result) {
+   if (err) {
cd->capacity = 0x1f;
sector_size = 2048; /* A guess, just in case */
} else {
@@ -789,10 +795,12 @@ static void get_sectorsize(struct scsi_cd *cd)
set_capacity(cd->disk, cd->capacity);
}
 
-   queue = cd->device->request_queue;
-   blk_queue_logical_block_size(queue, sector_size);
-
-   return;
+   lim = queue_limits_start_update(q);
+   lim.logical_block_size = sector_size;
+   blk_mq_freeze_queue(q);
+   err = queue_limits_commit_update(q, &lim);
+   blk_mq_unfreeze_queue(q);
+   return err;
 }
 
 static int get_capabilities(struct scsi_cd *cd)
-- 
2.43.0




[PATCH 11/12] block: remove unused queue limits API

2024-05-28 Thread Christoph Hellwig
Remove all APIs that are unused now that sd and sr have been converted
to the atomic queue limits API.

Signed-off-by: Christoph Hellwig 
---
 block/blk-settings.c   | 190 -
 include/linux/blkdev.h |  12 ---
 2 files changed, 202 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index a49abdb3554834..0b038729608f4b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -293,24 +293,6 @@ int queue_limits_set(struct request_queue *q, struct 
queue_limits *lim)
 }
 EXPORT_SYMBOL_GPL(queue_limits_set);
 
-/**
- * blk_queue_chunk_sectors - set size of the chunk for this queue
- * @q:  the request queue for the device
- * @chunk_sectors:  chunk sectors in the usual 512b unit
- *
- * Description:
- *If a driver doesn't want IOs to cross a given chunk size, it can set
- *this limit and prevent merging across chunks. Note that the block layer
- *must accept a page worth of data at any offset. So if the crossing of
- *chunks is a hard limitation in the driver, it must still be prepared
- *to split single page bios.
- **/
-void blk_queue_chunk_sectors(struct request_queue *q, unsigned int 
chunk_sectors)
-{
-   q->limits.chunk_sectors = chunk_sectors;
-}
-EXPORT_SYMBOL(blk_queue_chunk_sectors);
-
 /**
  * blk_queue_max_discard_sectors - set max sectors for a single discard
  * @q:  the request queue for the device
@@ -352,139 +334,6 @@ void blk_queue_max_write_zeroes_sectors(struct 
request_queue *q,
 }
 EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
 
-/**
- * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
- * @q:  the request queue for the device
- * @max_zone_append_sectors: maximum number of sectors to write per command
- *
- * Sets the maximum number of sectors allowed for zone append commands. If
- * Specifying 0 for @max_zone_append_sectors indicates that the queue does
- * not natively support zone append operations and that the block layer must
- * emulate these operations using regular writes.
- **/
-void blk_queue_max_zone_append_sectors(struct request_queue *q,
-   unsigned int max_zone_append_sectors)
-{
-   unsigned int max_sectors = 0;
-
-   if (WARN_ON(!blk_queue_is_zoned(q)))
-   return;
-
-   if (max_zone_append_sectors) {
-   max_sectors = min(q->limits.max_hw_sectors,
- max_zone_append_sectors);
-   max_sectors = min(q->limits.chunk_sectors, max_sectors);
-
-   /*
-* Signal eventual driver bugs resulting in the max_zone_append
-* sectors limit being 0 due to the chunk_sectors limit (zone
-* size) not set or the max_hw_sectors limit not set.
-*/
-   WARN_ON_ONCE(!max_sectors);
-   }
-
-   q->limits.max_zone_append_sectors = max_sectors;
-}
-EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
-
-/**
- * blk_queue_logical_block_size - set logical block size for the queue
- * @q:  the request queue for the device
- * @size:  the logical block size, in bytes
- *
- * Description:
- *   This should be set to the lowest possible block size that the
- *   storage device can address.  The default of 512 covers most
- *   hardware.
- **/
-void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
-{
-   struct queue_limits *limits = >limits;
-
-   limits->logical_block_size = size;
-
-   if (limits->discard_granularity < limits->logical_block_size)
-   limits->discard_granularity = limits->logical_block_size;
-
-   if (limits->physical_block_size < size)
-   limits->physical_block_size = size;
-
-   if (limits->io_min < limits->physical_block_size)
-   limits->io_min = limits->physical_block_size;
-
-   limits->max_hw_sectors =
-   round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT);
-   limits->max_sectors =
-   round_down(limits->max_sectors, size >> SECTOR_SHIFT);
-}
-EXPORT_SYMBOL(blk_queue_logical_block_size);
-
-/**
- * blk_queue_physical_block_size - set physical block size for the queue
- * @q:  the request queue for the device
- * @size:  the physical block size, in bytes
- *
- * Description:
- *   This should be set to the lowest possible sector size that the
- *   hardware can operate on without reverting to read-modify-write
- *   operations.
- */
-void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
-{
-   q->limits.physical_block_size = size;
-
-   if (q->limits.physical_block_size < q->limits.logical_block_size)
-   q->limits.physical_block_size = q->limits.logical_block_size;
-
-   if (q->limits.discard_granularity < q->limits.physical_block_size)
-   q->limits.discard_granularity = q->limits.physical_block_size;

[PATCH 08/12] sd: cleanup zoned queue limits initialization

2024-05-28 Thread Christoph Hellwig
Consolidate setting zone-related queue limits in sd_zbc_read_zones
instead of splitting them between sd_zbc_revalidate_zones and
sd_zbc_read_zones, and move the early_zone_information initialization
in sd_zbc_read_zones above setting up the queue limits.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd_zbc.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 806036e48abeda..1c24c844e8d178 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -565,12 +565,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
sdkp->zone_info.zone_blocks = zone_blocks;
sdkp->zone_info.nr_zones = nr_zones;
 
-   blk_queue_chunk_sectors(q,
-   logical_to_sectors(sdkp->device, zone_blocks));
-
-   /* Enable block layer zone append emulation */
-   blk_queue_max_zone_append_sectors(q, 0);
-
flags = memalloc_noio_save();
ret = blk_revalidate_disk_zones(disk);
memalloc_noio_restore(flags);
@@ -625,6 +619,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 
buf[SD_BUF_SIZE])
if (ret != 0)
goto err;
 
+   nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
+   sdkp->early_zone_info.nr_zones = nr_zones;
+   sdkp->early_zone_info.zone_blocks = zone_blocks;
+
/* The drive satisfies the kernel restrictions: set it up */
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
if (sdkp->zones_max_open == U32_MAX)
@@ -632,10 +630,10 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 
buf[SD_BUF_SIZE])
else
disk_set_max_open_zones(disk, sdkp->zones_max_open);
disk_set_max_active_zones(disk, 0);
-   nr_zones = round_up(sdkp->capacity, zone_blocks) >> ilog2(zone_blocks);
-
-   sdkp->early_zone_info.nr_zones = nr_zones;
-   sdkp->early_zone_info.zone_blocks = zone_blocks;
+   blk_queue_chunk_sectors(q,
+   logical_to_sectors(sdkp->device, zone_blocks));
+   /* Enable block layer zone append emulation */
+   blk_queue_max_zone_append_sectors(q, 0);
 
return 0;
 
-- 
2.43.0




[PATCH 06/12] sd: simplify the disable case in sd_config_discard

2024-05-28 Thread Christoph Hellwig
Fall through to the main call to blk_queue_max_discard_sectors given that
max_blocks has been initialized to zero above instead of duplicating the
call.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 09ffe9d826aeac..437743e3a0d37d 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -844,8 +844,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, 
unsigned int mode)
 
case SD_LBP_FULL:
case SD_LBP_DISABLE:
-   blk_queue_max_discard_sectors(q, 0);
-   return;
+   break;
 
case SD_LBP_UNMAP:
max_blocks = min_not_zero(sdkp->max_unmap_blocks,
-- 
2.43.0




[PATCH 01/12] ubd: untangle discard vs write zeroes not supported handling

2024-05-28 Thread Christoph Hellwig
Discard and Write Zeroes are different operations and are implemented
by different fallocate opcodes for ubd.  If one fails, the other one
can still work, and vice versa.

Split the code to disable the operations in ubd_handler to only
disable the operation that actually failed.

Fixes: 50109b5a03b4 ("um: Add support for DISCARD in the UBD Driver")
Signed-off-by: Christoph Hellwig 
---
 arch/um/drivers/ubd_kern.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index ef805eaa9e013d..a79a3b7c33a647 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -471,9 +471,14 @@ static void ubd_handler(void)
for (count = 0; count < n/sizeof(struct io_thread_req *); 
count++) {
struct io_thread_req *io_req = (*irq_req_buffer)[count];
 
-   if ((io_req->error == BLK_STS_NOTSUPP) && (req_op(io_req->req) == REQ_OP_DISCARD)) {
-   blk_queue_max_discard_sectors(io_req->req->q, 0);
-   blk_queue_max_write_zeroes_sectors(io_req->req->q, 0);
+   if (io_req->error == BLK_STS_NOTSUPP) {
+   struct request_queue *q = io_req->req->q;
+
+   if (req_op(io_req->req) == REQ_OP_DISCARD)
+   blk_queue_max_discard_sectors(q, 0);
+   if (req_op(io_req->req) == REQ_OP_WRITE_ZEROES)
+   blk_queue_max_write_zeroes_sectors(q,
+   0);
}
blk_mq_end_request(io_req->req, io_req->error);
kfree(io_req);
-- 
2.43.0




[PATCH 04/12] sd: add a sd_disable_discard helper

2024-05-28 Thread Christoph Hellwig
Add a helper to disable discard when it is not supported and use it
instead of sd_config_discard in the I/O completion handler.  This avoids
touching more fields than required in the I/O completion handler and
prepares for converting sd to use the atomic queue limits API.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 15d0035048d902..a8838381823254 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -821,6 +821,12 @@ static unsigned char sd_setup_protect_cmnd(struct 
scsi_cmnd *scmd,
return protect;
 }
 
+static void sd_disable_discard(struct scsi_disk *sdkp)
+{
+   sdkp->provisioning_mode = SD_LBP_DISABLE;
+   blk_queue_max_discard_sectors(sdkp->disk->queue, 0);
+}
+
 static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 {
struct request_queue *q = sdkp->disk->queue;
@@ -2245,12 +2251,12 @@ static int sd_done(struct scsi_cmnd *SCpnt)
case 0x24:  /* INVALID FIELD IN CDB */
switch (SCpnt->cmnd[0]) {
case UNMAP:
-   sd_config_discard(sdkp, SD_LBP_DISABLE);
+   sd_disable_discard(sdkp);
break;
case WRITE_SAME_16:
case WRITE_SAME:
if (SCpnt->cmnd[1] & 8) { /* UNMAP */
-   sd_config_discard(sdkp, SD_LBP_DISABLE);
+   sd_disable_discard(sdkp);
} else {
sdkp->device->no_write_same = 1;
sd_config_write_same(sdkp);
-- 
2.43.0




[PATCH 09/12] sd: convert to the atomic queue limits API

2024-05-28 Thread Christoph Hellwig
Assign all queue limits through a local queue_limits variable and
queue_limits_commit_update so that we can't race updating them from
multiple places, and freeze the queue when updating them so that
in-progress I/O submissions don't see half-updated limits.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 126 --
 drivers/scsi/sd.h |   6 +-
 drivers/scsi/sd_zbc.c |  15 ++---
 3 files changed, 84 insertions(+), 63 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 2d08b69154b995..03e67936b27928 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -101,12 +101,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC);
 
 #define SD_MINORS  16
 
-static void sd_config_discard(struct scsi_disk *, unsigned int);
-static void sd_config_write_same(struct scsi_disk *);
+static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim,
+   unsigned int mode);
+static void sd_config_write_same(struct scsi_disk *sdkp,
+   struct queue_limits *lim);
 static int  sd_revalidate_disk(struct gendisk *);
 static void sd_unlock_native_capacity(struct gendisk *disk);
 static void sd_shutdown(struct device *);
-static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
 static void scsi_disk_release(struct device *cdev);
 
 static DEFINE_IDA(sd_index_ida);
@@ -456,7 +457,8 @@ provisioning_mode_store(struct device *dev, struct 
device_attribute *attr,
 {
struct scsi_disk *sdkp = to_scsi_disk(dev);
struct scsi_device *sdp = sdkp->device;
-   int mode;
+   struct queue_limits lim;
+   int mode, err;
 
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -472,8 +474,13 @@ provisioning_mode_store(struct device *dev, struct 
device_attribute *attr,
if (mode < 0)
return -EINVAL;
 
-   sd_config_discard(sdkp, mode);
-
+   lim = queue_limits_start_update(sdkp->disk->queue);
+   sd_config_discard(sdkp, &lim, mode);
+   blk_mq_freeze_queue(sdkp->disk->queue);
+   err = queue_limits_commit_update(sdkp->disk->queue, &lim);
+   blk_mq_unfreeze_queue(sdkp->disk->queue);
+   if (err)
+   return err;
return count;
 }
 static DEVICE_ATTR_RW(provisioning_mode);
@@ -556,6 +563,7 @@ max_write_same_blocks_store(struct device *dev, struct 
device_attribute *attr,
 {
struct scsi_disk *sdkp = to_scsi_disk(dev);
struct scsi_device *sdp = sdkp->device;
+   struct queue_limits lim;
unsigned long max;
int err;
 
@@ -577,8 +585,13 @@ max_write_same_blocks_store(struct device *dev, struct 
device_attribute *attr,
sdkp->max_ws_blocks = max;
}
 
-   sd_config_write_same(sdkp);
-
+   lim = queue_limits_start_update(sdkp->disk->queue);
+   sd_config_write_same(sdkp, &lim);
+   blk_mq_freeze_queue(sdkp->disk->queue);
+   err = queue_limits_commit_update(sdkp->disk->queue, &lim);
+   blk_mq_unfreeze_queue(sdkp->disk->queue);
+   if (err)
+   return err;
return count;
 }
 static DEVICE_ATTR_RW(max_write_same_blocks);
@@ -827,17 +840,15 @@ static void sd_disable_discard(struct scsi_disk *sdkp)
blk_queue_max_discard_sectors(sdkp->disk->queue, 0);
 }
 
-static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
+static void sd_config_discard(struct scsi_disk *sdkp, struct queue_limits *lim,
+   unsigned int mode)
 {
-   struct request_queue *q = sdkp->disk->queue;
unsigned int logical_block_size = sdkp->device->sector_size;
unsigned int max_blocks = 0;
 
-   q->limits.discard_alignment =
-   sdkp->unmap_alignment * logical_block_size;
-   q->limits.discard_granularity =
-   max(sdkp->physical_block_size,
-   sdkp->unmap_granularity * logical_block_size);
+   lim->discard_alignment = sdkp->unmap_alignment * logical_block_size;
+   lim->discard_granularity = max(sdkp->physical_block_size,
+   sdkp->unmap_granularity * logical_block_size);
sdkp->provisioning_mode = mode;
 
switch (mode) {
@@ -875,7 +886,8 @@ static void sd_config_discard(struct scsi_disk *sdkp, 
unsigned int mode)
break;
}
 
-   blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 
9));
+   lim->max_hw_discard_sectors = max_blocks *
+   (logical_block_size >> SECTOR_SHIFT);
 }
 
 static void *sd_set_special_bvec(struct request *rq, unsigned int data_len)
@@ -1010,9 +1022,9 @@ static void sd_disable_write_same(struct scsi_disk *sdkp)
blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0);
 }
 
-static void sd_config_write_same(struct scsi_disk *sdkp)
+static void sd_config_write_same(struct scsi_disk *sdkp,
+   struct queue_limits *lim)

convert the SCSI ULDs to the atomic queue limits API

2024-05-28 Thread Christoph Hellwig
Hi all,

this series converts the SCSI upper level drivers to the atomic queue
limits API.

The first patch is a bug fix for ubd that later patches depend on and
might be worth picking up for 6.10.

The second patch changes the max_sectors calculation to take the optimal
I/O size into account so that sd, nbd and rbd don't have to mess with
the user max_sector value.  I'd love to see a careful review from the
nbd and rbd maintainers for this one!

The following patches clean up a few loose ends in the sd driver, and
then convert sd and sr to the atomic queue limits API.  The final
patches remove the now unused block APIs, and convert a few to be
specific to their now more narrow use case.

The patches are against Jens' block-6.10 tree.  Due to the amount of
block layer changes in here, and others that will depend on it, it
would be good if this could eventually be merged through the block
tree, or at least a shared branch between the SCSI and block trees.

Diffstat:
 arch/um/drivers/ubd_kern.c   |   10 +
 block/blk-settings.c |  238 +--
 drivers/block/nbd.c  |2 
 drivers/block/rbd.c  |1 
 drivers/block/xen-blkfront.c |4 
 drivers/scsi/sd.c|  218 ---
 drivers/scsi/sd.h|6 -
 drivers/scsi/sd_zbc.c|   27 ++--
 drivers/scsi/sr.c|   42 ---
 include/linux/blkdev.h   |   40 +++
 10 files changed, 196 insertions(+), 392 deletions(-)



[PATCH 07/12] sd: factor out a sd_discard_mode helper

2024-05-28 Thread Christoph Hellwig
Split the logic to pick the right discard mode into a little helper
to prepare for further changes.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 37 -
 1 file changed, 20 insertions(+), 17 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 437743e3a0d37d..2d08b69154b995 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3201,6 +3201,25 @@ static void sd_read_app_tag_own(struct scsi_disk *sdkp, 
unsigned char *buffer)
return;
 }
 
+static unsigned int sd_discard_mode(struct scsi_disk *sdkp)
+{
+   if (!sdkp->lbpvpd) {
+   /* LBP VPD page not provided */
+   if (sdkp->max_unmap_blocks)
+   return SD_LBP_UNMAP;
+   return SD_LBP_WS16;
+   }
+
+   /* LBP VPD page tells us what to use */
+   if (sdkp->lbpu && sdkp->max_unmap_blocks)
+   return SD_LBP_UNMAP;
+   if (sdkp->lbpws)
+   return SD_LBP_WS16;
+   if (sdkp->lbpws10)
+   return SD_LBP_WS10;
+   return SD_LBP_DISABLE;
+}
+
 /**
  * sd_read_block_limits - Query disk device for preferred I/O sizes.
  * @sdkp: disk to query
@@ -3239,23 +3258,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp)
sdkp->unmap_alignment =
get_unaligned_be32(&vpd->data[32]) & ~(1 << 31);
 
-   if (!sdkp->lbpvpd) { /* LBP VPD page not provided */
-
-   if (sdkp->max_unmap_blocks)
-   sd_config_discard(sdkp, SD_LBP_UNMAP);
-   else
-   sd_config_discard(sdkp, SD_LBP_WS16);
-
-   } else {/* LBP VPD page tells us what to use */
-   if (sdkp->lbpu && sdkp->max_unmap_blocks)
-   sd_config_discard(sdkp, SD_LBP_UNMAP);
-   else if (sdkp->lbpws)
-   sd_config_discard(sdkp, SD_LBP_WS16);
-   else if (sdkp->lbpws10)
-   sd_config_discard(sdkp, SD_LBP_WS10);
-   else
-   sd_config_discard(sdkp, SD_LBP_DISABLE);
-   }
+   sd_config_discard(sdkp, sd_discard_mode(sdkp));
}
 
  out:
-- 
2.43.0




[PATCH 02/12] block: take io_opt and io_min into account for max_sectors

2024-05-28 Thread Christoph Hellwig
The soft max_sectors limit is normally capped by the hardware limits and
an arbitrary upper limit enforced by the kernel, but can be modified by
the user.  A few drivers want to increase this limit (nbd, rbd) or
adjust it up or down based on hardware capabilities (sd).

Change blk_validate_limits to default max_sectors to the optimal I/O
size, or to the preferred minimal I/O size if that is larger than the
kernel default and no optimal I/O size is provided, based on the logic
in the SD driver.

This keeps the existing kernel default for drivers that do not provide
an io_opt or very big io_min value, but picks a much more useful
default for those that provide these hints, and allows removing the
hacks to set the user max_sectors limit in nbd, rbd and sd.

Note that rbd picks a different value for the optimal I/O size vs the
user max_sectors value, so this is a bit of a behavior change that
could use careful review from people familiar with rbd.
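The resulting default can be summarised by the following helper, which mirrors
the blk_validate_limits() hunk below (an illustration only, not the kernel
function itself; values are in 512-byte sectors except io_opt and io_min,
which are in bytes):

/* Illustration of the new max_sectors default; not the actual kernel code. */
static unsigned int default_max_sectors(unsigned int max_hw_sectors,
                unsigned int max_user_sectors, unsigned int io_opt,
                unsigned int io_min)
{
        if (max_user_sectors)
                return min(max_hw_sectors, max_user_sectors);
        if (io_opt)
                return min(max_hw_sectors, io_opt >> SECTOR_SHIFT);
        if (io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT))
                return min(max_hw_sectors, io_min >> SECTOR_SHIFT);
        return min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}

For example, a device reporting io_opt = 4 MiB with max_hw_sectors = 16384 now
gets max_sectors = 8192 instead of the generic kernel cap.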

Signed-off-by: Christoph Hellwig 
---
 block/blk-settings.c |  7 +++
 drivers/block/nbd.c  |  2 +-
 drivers/block/rbd.c  |  1 -
 drivers/scsi/sd.c| 29 +
 4 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index effeb9a639bb45..a49abdb3554834 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -153,6 +153,13 @@ static int blk_validate_limits(struct queue_limits *lim)
if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
return -EINVAL;
lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
+   } else if (lim->io_opt) {
+   lim->max_sectors =
+   min(max_hw_sectors, lim->io_opt >> SECTOR_SHIFT);
+   } else if (lim->io_min &&
+  lim->io_min > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) {
+   lim->max_sectors =
+   min(max_hw_sectors, lim->io_min >> SECTOR_SHIFT);
} else {
lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 22a79a62cc4eab..ad887d614d5b3f 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1808,7 +1808,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned 
int refs)
 {
struct queue_limits lim = {
.max_hw_sectors = 65536,
-   .max_user_sectors   = 256,
+   .io_opt = 256 << SECTOR_SHIFT,
.max_segments   = USHRT_MAX,
.max_segment_size   = UINT_MAX,
};
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 26ff5cd2bf0abc..05096172f334a3 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4954,7 +4954,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
struct queue_limits lim = {
.max_hw_sectors = objset_bytes >> SECTOR_SHIFT,
-   .max_user_sectors   = objset_bytes >> SECTOR_SHIFT,
.io_min = rbd_dev->opts->alloc_size,
.io_opt = rbd_dev->opts->alloc_size,
.max_segments   = USHRT_MAX,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index f6c822c9cbd2d3..3dff9150ce11e2 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3593,7 +3593,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
struct request_queue *q = sdkp->disk->queue;
sector_t old_capacity = sdkp->capacity;
unsigned char *buffer;
-   unsigned int dev_max, rw_max;
+   unsigned int dev_max;
 
SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
  "sd_revalidate_disk\n"));
@@ -3675,34 +3675,15 @@ static int sd_revalidate_disk(struct gendisk *disk)
else
blk_queue_io_min(sdkp->disk->queue, 0);
 
-   if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
-   q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
-   rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
-   } else {
-   q->limits.io_opt = 0;
-   rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
- (sector_t)BLK_DEF_MAX_SECTORS_CAP);
-   }
-
/*
 * Limit default to SCSI host optimal sector limit if set. There may be
 * an impact on performance for when the size of a request exceeds this
 * host limit.
 */
-   rw_max = min_not_zero(rw_max, sdp->host->opt_sectors);
-
-   /* Do not exceed controller limit */
-   rw_max = min(rw_max, queue_max_hw_sectors(q));
-
-   /*
-* Only update max_s

[PATCH 05/12] sd: add a sd_disable_write_same helper

2024-05-28 Thread Christoph Hellwig
Add a helper to disable WRITE SAME when it is not supported and use it
instead of sd_config_write_same in the I/O completion handler.  This
avoids touching more fields than required in the I/O completion handler
and prepares for converting sd to use the atomic queue limits API.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index a8838381823254..09ffe9d826aeac 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1004,6 +1004,13 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct 
scsi_cmnd *cmd)
return sd_setup_write_same10_cmnd(cmd, false);
 }
 
+static void sd_disable_write_same(struct scsi_disk *sdkp)
+{
+   sdkp->device->no_write_same = 1;
+   sdkp->max_ws_blocks = 0;
+   blk_queue_max_write_zeroes_sectors(sdkp->disk->queue, 0);
+}
+
 static void sd_config_write_same(struct scsi_disk *sdkp)
 {
struct request_queue *q = sdkp->disk->queue;
@@ -2258,8 +2265,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
if (SCpnt->cmnd[1] & 8) { /* UNMAP */
sd_disable_discard(sdkp);
} else {
-   sdkp->device->no_write_same = 1;
-   sd_config_write_same(sdkp);
+   sd_disable_write_same(sdkp);
req->rq_flags |= RQF_QUIET;
}
break;
-- 
2.43.0




[PATCH 03/12] sd: simplify the ZBC case in provisioning_mode_store

2024-05-28 Thread Christoph Hellwig
Don't reset the discard settings to no-op over and over when a user
writes to the provisioning attribute as that is already the default
mode for ZBC devices.  In hindsight we should have made writing to
the attribute fail for ZBC devices, but the code has probably been
around for far too long to change this now.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3dff9150ce11e2..15d0035048d902 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -461,14 +461,13 @@ provisioning_mode_store(struct device *dev, struct 
device_attribute *attr,
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
 
-   if (sd_is_zoned(sdkp)) {
-   sd_config_discard(sdkp, SD_LBP_DISABLE);
-   return count;
-   }
-
if (sdp->type != TYPE_DISK)
return -EINVAL;
 
+   /* ignore the provisioning mode for ZBC devices */
+   if (sd_is_zoned(sdkp))
+   return count;
+
mode = sysfs_match_string(lbp_mode, buf);
if (mode < 0)
return -EINVAL;
-- 
2.43.0




Re: [PATCH 2/2] iommu/dma: Fix zero'ing of bounce buffer padding used by untrusted devices

2024-05-06 Thread Christoph Hellwig
On Sun, Apr 07, 2024 at 09:11:42PM -0700, mhkelle...@gmail.com wrote:
> I've wondered if this code for zero'ing the pre- and post-padding
> should go in swiotlb_tbl_map_single(). The bounce buffer proper is
> already being initialized there. But swiotlb_tbl_map_single()
> would need to test for an untrusted device (or have a "zero the
> padding" flag passed in as part of the "attrs" argument), which
> adds complexity. Thoughts?

If we want to go down that route it should be the latter.  I'm
not sure if it is an improvement, but we'd have to implement it
to see if it does.

> The commit ID of Petr's patch is X'ed out above because Petr's patch
> hasn't gone into Linus' tree yet. We can add the real commit ID once
> this patch is ready to go in.

I've fixed that up and committed the series.

Thanks a lot!




Re: [PATCH v4 bpf-next 2/2] mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages().

2024-03-06 Thread Christoph Hellwig
I'd still prefer to hide the vm_area, but for now:

Reviewed-by: Christoph Hellwig 



Re: [PATCH v2 bpf-next 3/3] mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages().

2024-02-29 Thread Christoph Hellwig
On Tue, Feb 27, 2024 at 05:31:28PM -0800, Alexei Starovoitov wrote:
> What would it look like with a cookie?
> A static inline wrapper around get_vm_area() that returns area->addr ?
> And the start address of vmap range will be such a cookie?

Hmm, just making the kernel virtual address the cookie actually
sounds pretty neat indeed even if I did not have that in mind.

> I guess I don't understand the motivation to hide 'struct vm_struct *'.

The prime reason is that then people will try to start random APIs that
work on it.  But let's give it a try without the wrappers and see how
things go.




Re: [PATCH v2 bpf-next 3/3] mm: Introduce VM_SPARSE kind and vm_area_[un]map_pages().

2024-02-27 Thread Christoph Hellwig
> privately-managed pages into a sparse vm area with the following steps:
> 
>   area = get_vm_area(area_size, VM_SPARSE);  // at bpf prog verification time
>   vm_area_map_pages(area, kaddr, 1, page);   // on demand
> // it will return an error if kaddr is out of range
>   vm_area_unmap_pages(area, kaddr, 1);
>   free_vm_area(area);// after bpf prog is unloaded

I'm still wondering if this should just use an opaque cookie instead
of exposing the vm_area.  But otherwise this mostly looks fine to me.

> + if (addr < (unsigned long)area->addr || (void *)end > area->addr + 
> area->size)
> + return -ERANGE;

This check is duplicated so many times that it really begs for a helper.
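Something like this, say (the name is made up):

/* Possible helper for the duplicated bounds check. */
static bool vm_sparse_range_ok(const struct vm_struct *area,
                               unsigned long addr, unsigned long end)
{
        return addr >= (unsigned long)area->addr &&
               end <= (unsigned long)area->addr + area->size;
}

so that the callers reduce to "if (!vm_sparse_range_ok(area, addr, end))
return -ERANGE;".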

> +int vm_area_unmap_pages(struct vm_struct *area, unsigned long addr, unsigned 
> int count)
> +{
> + unsigned long size = ((unsigned long)count) * PAGE_SIZE;
> + unsigned long end = addr + size;
> +
> + if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
> + return -EINVAL;
> + if (addr < (unsigned long)area->addr || (void *)end > area->addr + 
> area->size)
> + return -ERANGE;
> +
> + vunmap_range(addr, end);
> + return 0;

Does it make much sense to have an error return here vs just debug
checks?  It's not like the caller can do much if it violates these
basic invariants.
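I.e. roughly (a sketch of the suggestion, not the posted code):

void vm_area_unmap_pages(struct vm_struct *area, unsigned long addr,
                         unsigned int count)
{
        unsigned long size = (unsigned long)count * PAGE_SIZE;
        unsigned long end = addr + size;

        if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
                return;
        if (WARN_ON_ONCE(addr < (unsigned long)area->addr ||
                         (void *)end > area->addr + area->size))
                return;

        vunmap_range(addr, end);
}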



Re: convert xen-blkfront to atomic queue limit updates v2

2024-02-27 Thread Christoph Hellwig
Jens,

can you pick this up with the Xen maintainer review in place?




Re: [PATCH v2 bpf-next 2/3] mm, xen: Separate xen use cases from ioremap.

2024-02-26 Thread Christoph Hellwig
On Fri, Feb 23, 2024 at 03:57:27PM -0800, Alexei Starovoitov wrote:
> From: Alexei Starovoitov 
> 
> xen grant table and xenbus ring are not ioremap the way arch specific code is 
> using it,
> so let's add VM_XEN flag to separate them from VM_IOREMAP users.
> xen will not and should not be calling ioremap_page_range() on that range.
> /proc/vmallocinfo will print such region as "xen" instead of "ioremap" as 
> well.

Splitting this out is a good idea, but XEN seems a bit too generic a
name.  Probably GRANT_TABLE or XEN_GRANT_TABLE if that isn't
too long would be better.  Maybe the Xen maintainers have an idea.

Also more overlong commit message lines here, I'm not going to complain
on the third patch if they show up again :)




Re: [PATCH v2 bpf-next 1/3] mm: Enforce VM_IOREMAP flag and range in ioremap_page_range.

2024-02-26 Thread Christoph Hellwig
On Fri, Feb 23, 2024 at 03:57:26PM -0800, Alexei Starovoitov wrote:
> From: Alexei Starovoitov 
> 
> There are various users of get_vm_area() + ioremap_page_range() APIs.
> Enforce that get_vm_area() was requested as VM_IOREMAP type and range passed 
> to
> ioremap_page_range() matches created vm_area to avoid accidentally ioremap-ing
> into wrong address range.

Nit: overly long lines in the commit message here.

Otherwise looks good:

Reviewed-by: Christoph Hellwig 



[PATCH 4/4] xen-blkfront: atomically update queue limits

2024-02-21 Thread Christoph Hellwig
Pass the initial queue limits to blk_mq_alloc_disk and use the
blkif_set_queue_limits API to update the limits on reconnect.

Signed-off-by: Christoph Hellwig 
Acked-by: Roger Pau Monné 
---
 drivers/block/xen-blkfront.c | 41 
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 7664638a0abbfa..fd7c0ff2139cee 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -941,37 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = {
.complete = blkif_complete_rq,
 };
 
-static void blkif_set_queue_limits(struct blkfront_info *info)
+static void blkif_set_queue_limits(const struct blkfront_info *info,
+   struct queue_limits *lim)
 {
-   struct request_queue *rq = info->rq;
unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
-   blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
-
if (info->feature_discard) {
-   blk_queue_max_discard_sectors(rq, UINT_MAX);
+   lim->max_hw_discard_sectors = UINT_MAX;
if (info->discard_granularity)
-   rq->limits.discard_granularity = 
info->discard_granularity;
-   rq->limits.discard_alignment = info->discard_alignment;
+   lim->discard_granularity = info->discard_granularity;
+   lim->discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
-   blk_queue_max_secure_erase_sectors(rq, UINT_MAX);
+   lim->max_secure_erase_sectors = UINT_MAX;
}
 
/* Hard sector size and max sectors impersonate the equiv. hardware. */
-   blk_queue_logical_block_size(rq, info->sector_size);
-   blk_queue_physical_block_size(rq, info->physical_sector_size);
-   blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
+   lim->logical_block_size = info->sector_size;
+   lim->physical_block_size = info->physical_sector_size;
+   lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512;
 
/* Each segment in a request is up to an aligned page in size. */
-   blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
-   blk_queue_max_segment_size(rq, PAGE_SIZE);
+   lim->seg_boundary_mask = PAGE_SIZE - 1;
+   lim->max_segment_size = PAGE_SIZE;
 
/* Ensure a merged request will fit in a single I/O ring slot. */
-   blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
+   lim->max_segments = segments / GRANTS_PER_PSEG;
 
/* Make sure buffer addresses are sector-aligned. */
-   blk_queue_dma_alignment(rq, 511);
+   lim->dma_alignment = 511;
 }
 
 static const char *flush_info(struct blkfront_info *info)
@@ -1068,6 +1066,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info, u16 sector_size,
unsigned int physical_sector_size)
 {
+   struct queue_limits lim = {};
struct gendisk *gd;
int nr_minors = 1;
int err;
@@ -1134,11 +1133,13 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (err)
goto out_release_minors;
 
-   gd = blk_mq_alloc_disk(&info->tag_set, NULL, info);
+   blkif_set_queue_limits(info, &lim);
+   gd = blk_mq_alloc_disk(&info->tag_set, &lim, info);
if (IS_ERR(gd)) {
err = PTR_ERR(gd);
goto out_free_tag_set;
}
+   blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue);
 
strcpy(gd->disk_name, DEV_NAME);
ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
@@ -1160,7 +1161,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->gd = gd;
info->sector_size = sector_size;
info->physical_sector_size = physical_sector_size;
-   blkif_set_queue_limits(info);
 
xlvbd_flush(info);
 
@@ -2004,14 +2004,19 @@ static int blkfront_probe(struct xenbus_device *dev,
 
 static int blkif_recover(struct blkfront_info *info)
 {
+   struct queue_limits lim;
unsigned int r_index;
struct request *req, *n;
int rc;
struct bio *bio;
struct blkfront_ring_info *rinfo;
 
+   lim = queue_limits_start_update(info->rq);
blkfront_gather_backend_features(info);
-   blkif_set_queue_limits(info);
+   blkif_set_queue_limits(info, &lim);
+   rc = queue_limits_commit_update(info->rq, &lim);
+   if (rc)
+   return rc;
 
for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo);
-- 
2.39.2




[PATCH 2/4] xen-blkfront: rely on the default discard granularity

2024-02-21 Thread Christoph Hellwig
The block layer now sets the discard granularity to the physical
block size by default.  Take advantage of that in xen-blkfront and only
set the discard granularity if explicitly specified.

Signed-off-by: Christoph Hellwig 
Acked-by: Roger Pau Monné 
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index f78167cd5a6333..1258f24b285500 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -951,8 +951,8 @@ static void blkif_set_queue_limits(struct blkfront_info 
*info)
 
if (info->feature_discard) {
blk_queue_max_discard_sectors(rq, UINT_MAX);
-   rq->limits.discard_granularity = info->discard_granularity ?:
-info->physical_sector_size;
+   if (info->discard_granularity)
+   rq->limits.discard_granularity = 
info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
blk_queue_max_secure_erase_sectors(rq, UINT_MAX);
-- 
2.39.2




convert xen-blkfront to atomic queue limit updates v2

2024-02-21 Thread Christoph Hellwig
Hi all,

this series converts xen-blkfront to the new atomic queue limits update
API in the block tree.  I don't have a Xen setup so this is compile
tested only.

Changes since v1:
 - constify the info argument to blkif_set_queue_limits
 - remove a spurious word from a commit message

Diffstat:
 xen-blkfront.c |   53 +++--
 1 file changed, 27 insertions(+), 26 deletions(-)



[PATCH 3/4] xen-blkfront: don't redundantly set max_segments in blkif_recover

2024-02-21 Thread Christoph Hellwig
blkif_set_queue_limits already sets the max_segments limit, so don't do
it a second time.  Also remove a comment about a long-fixed bug in
blk_mq_update_nr_hw_queues.

Signed-off-by: Christoph Hellwig 
Acked-by: Roger Pau Monné 
---
 drivers/block/xen-blkfront.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 1258f24b285500..7664638a0abbfa 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2008,14 +2008,10 @@ static int blkif_recover(struct blkfront_info *info)
struct request *req, *n;
int rc;
struct bio *bio;
-   unsigned int segs;
struct blkfront_ring_info *rinfo;
 
blkfront_gather_backend_features(info);
-   /* Reset limits changed by blk_mq_update_nr_hw_queues(). */
blkif_set_queue_limits(info);
-   segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
-   blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
 
for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo);
@@ -2035,7 +2031,9 @@ static int blkif_recover(struct blkfront_info *info)
list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */
list_del_init(>queuelist);
-   BUG_ON(req->nr_phys_segments > segs);
+   BUG_ON(req->nr_phys_segments >
+  (info->max_indirect_segments ? :
+   BLKIF_MAX_SEGMENTS_PER_REQUEST));
blk_mq_requeue_request(req, false);
}
blk_mq_start_stopped_hw_queues(info->rq, true);
-- 
2.39.2




[PATCH 1/4] xen-blkfront: set max_discard/secure erase limits to UINT_MAX

2024-02-21 Thread Christoph Hellwig
Currently xen-blkfront sets the max discard limit to the capacity of
the device, which is suboptimal when the capacity changes.  Just set
it to UINT_MAX, which has the same effect and is simpler.

Signed-off-by: Christoph Hellwig 
Acked-by: Roger Pau Monné 
---
 drivers/block/xen-blkfront.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 4cc2884e748463..f78167cd5a6333 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -944,20 +944,18 @@ static const struct blk_mq_ops blkfront_mq_ops = {
 static void blkif_set_queue_limits(struct blkfront_info *info)
 {
struct request_queue *rq = info->rq;
-   struct gendisk *gd = info->gd;
unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
 
if (info->feature_discard) {
-   blk_queue_max_discard_sectors(rq, get_capacity(gd));
+   blk_queue_max_discard_sectors(rq, UINT_MAX);
rq->limits.discard_granularity = info->discard_granularity ?:
 info->physical_sector_size;
rq->limits.discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
-   blk_queue_max_secure_erase_sectors(rq,
-  get_capacity(gd));
+   blk_queue_max_secure_erase_sectors(rq, UINT_MAX);
}
 
/* Hard sector size and max sectors impersonate the equiv. hardware. */
-- 
2.39.2




Re: [PATCH 4/4] xen-blkfront: atomically update queue limits

2024-02-20 Thread Christoph Hellwig
On Tue, Feb 20, 2024 at 01:35:07PM +0100, Roger Pau Monné wrote:
> On Tue, Feb 20, 2024 at 09:49:35AM +0100, Christoph Hellwig wrote:
> > Pass the initial queue limits to blk_mq_alloc_disk and use the
> > blkif_set_queue_limits API to update the limits on reconnect.
> 
> Allocating queue_limits on the stack might be a bit risky, as I fear
> this struct is likely to grow?

It might grow a little bit, but it's not actually that large, especially
in a simple probe context that isn't in memory reclaim or similar.

> > Signed-off-by: Christoph Hellwig 
> 
> Acked-by: Roger Pau Monné 
> 
> Just one addition while you are already modifying a line.
> 
> > ---
> >  drivers/block/xen-blkfront.c | 41 
> >  1 file changed, 23 insertions(+), 18 deletions(-)
> > 
> > diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> > index 7664638a0abbfa..b77707ca2c5aa6 100644
> > --- a/drivers/block/xen-blkfront.c
> > +++ b/drivers/block/xen-blkfront.c
> > @@ -941,37 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = {
> > .complete = blkif_complete_rq,
> >  };
> >  
> > -static void blkif_set_queue_limits(struct blkfront_info *info)
> > +static void blkif_set_queue_limits(struct blkfront_info *info,
> 
> While there, could you also constify info?

Sure.



Re: [PATCH 1/4] xen-blkfront: set max_discard/secure erase limits to UINT_MAX

2024-02-20 Thread Christoph Hellwig
On Tue, Feb 20, 2024 at 12:39:59PM +0100, Roger Pau Monné wrote:
> On Tue, Feb 20, 2024 at 09:49:32AM +0100, Christoph Hellwig wrote:
> > Currently xen-blkfront set the max discard limit to the capacity of
> > the device, which is suboptimal when the capacity changes.  Just set
> > it to UINT_MAX, which has the same effect except and is simpler.
> 
> Extra 'except' in the line above?

Yes, thanks.



[PATCH 4/4] xen-blkfront: atomically update queue limits

2024-02-20 Thread Christoph Hellwig
Pass the initial queue limits to blk_mq_alloc_disk and use the
blkif_set_queue_limits API to update the limits on reconnect.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 41 
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 7664638a0abbfa..b77707ca2c5aa6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -941,37 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = {
.complete = blkif_complete_rq,
 };
 
-static void blkif_set_queue_limits(struct blkfront_info *info)
+static void blkif_set_queue_limits(struct blkfront_info *info,
+   struct queue_limits *lim)
 {
-   struct request_queue *rq = info->rq;
unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
-   blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
-
if (info->feature_discard) {
-   blk_queue_max_discard_sectors(rq, UINT_MAX);
+   lim->max_hw_discard_sectors = UINT_MAX;
if (info->discard_granularity)
-   rq->limits.discard_granularity = 
info->discard_granularity;
-   rq->limits.discard_alignment = info->discard_alignment;
+   lim->discard_granularity = info->discard_granularity;
+   lim->discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
-   blk_queue_max_secure_erase_sectors(rq, UINT_MAX);
+   lim->max_secure_erase_sectors = UINT_MAX;
}
 
/* Hard sector size and max sectors impersonate the equiv. hardware. */
-   blk_queue_logical_block_size(rq, info->sector_size);
-   blk_queue_physical_block_size(rq, info->physical_sector_size);
-   blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
+   lim->logical_block_size = info->sector_size;
+   lim->physical_block_size = info->physical_sector_size;
+   lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512;
 
/* Each segment in a request is up to an aligned page in size. */
-   blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
-   blk_queue_max_segment_size(rq, PAGE_SIZE);
+   lim->seg_boundary_mask = PAGE_SIZE - 1;
+   lim->max_segment_size = PAGE_SIZE;
 
/* Ensure a merged request will fit in a single I/O ring slot. */
-   blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
+   lim->max_segments = segments / GRANTS_PER_PSEG;
 
/* Make sure buffer addresses are sector-aligned. */
-   blk_queue_dma_alignment(rq, 511);
+   lim->dma_alignment = 511;
 }
 
 static const char *flush_info(struct blkfront_info *info)
@@ -1068,6 +1066,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info, u16 sector_size,
unsigned int physical_sector_size)
 {
+   struct queue_limits lim = {};
struct gendisk *gd;
int nr_minors = 1;
int err;
@@ -1134,11 +1133,13 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (err)
goto out_release_minors;
 
-   gd = blk_mq_alloc_disk(&info->tag_set, NULL, info);
+   blkif_set_queue_limits(info, &lim);
+   gd = blk_mq_alloc_disk(&info->tag_set, &lim, info);
if (IS_ERR(gd)) {
err = PTR_ERR(gd);
goto out_free_tag_set;
}
+   blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue);
 
strcpy(gd->disk_name, DEV_NAME);
ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
@@ -1160,7 +1161,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->gd = gd;
info->sector_size = sector_size;
info->physical_sector_size = physical_sector_size;
-   blkif_set_queue_limits(info);
 
xlvbd_flush(info);
 
@@ -2004,14 +2004,19 @@ static int blkfront_probe(struct xenbus_device *dev,
 
 static int blkif_recover(struct blkfront_info *info)
 {
+   struct queue_limits lim;
unsigned int r_index;
struct request *req, *n;
int rc;
struct bio *bio;
struct blkfront_ring_info *rinfo;
 
+   lim = queue_limits_start_update(info->rq);
blkfront_gather_backend_features(info);
-   blkif_set_queue_limits(info);
+   blkif_set_queue_limits(info, &lim);
+   rc = queue_limits_commit_update(info->rq, &lim);
+   if (rc)
+   return rc;
 
for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo);
-- 
2.39.2




[PATCH 3/4] xen-blkfront: don't redundantly set max_segments in blkif_recover

2024-02-20 Thread Christoph Hellwig
blkif_set_queue_limits already sets the max_segments limit, so don't do
it a second time.  Also remove a comment about a long-fixed bug in
blk_mq_update_nr_hw_queues.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 1258f24b285500..7664638a0abbfa 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2008,14 +2008,10 @@ static int blkif_recover(struct blkfront_info *info)
struct request *req, *n;
int rc;
struct bio *bio;
-   unsigned int segs;
struct blkfront_ring_info *rinfo;
 
blkfront_gather_backend_features(info);
-   /* Reset limits changed by blk_mq_update_nr_hw_queues(). */
blkif_set_queue_limits(info);
-   segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
-   blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
 
for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo);
@@ -2035,7 +2031,9 @@ static int blkif_recover(struct blkfront_info *info)
list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */
list_del_init(>queuelist);
-   BUG_ON(req->nr_phys_segments > segs);
+   BUG_ON(req->nr_phys_segments >
+  (info->max_indirect_segments ? :
+   BLKIF_MAX_SEGMENTS_PER_REQUEST));
blk_mq_requeue_request(req, false);
}
blk_mq_start_stopped_hw_queues(info->rq, true);
-- 
2.39.2




[PATCH 1/4] xen-blkfront: set max_discard/secure erase limits to UINT_MAX

2024-02-20 Thread Christoph Hellwig
Currently xen-blkfront set the max discard limit to the capacity of
the device, which is suboptimal when the capacity changes.  Just set
it to UINT_MAX, which has the same effect except and is simpler.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 4cc2884e748463..f78167cd5a6333 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -944,20 +944,18 @@ static const struct blk_mq_ops blkfront_mq_ops = {
 static void blkif_set_queue_limits(struct blkfront_info *info)
 {
struct request_queue *rq = info->rq;
-   struct gendisk *gd = info->gd;
unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST;
 
blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
 
if (info->feature_discard) {
-   blk_queue_max_discard_sectors(rq, get_capacity(gd));
+   blk_queue_max_discard_sectors(rq, UINT_MAX);
rq->limits.discard_granularity = info->discard_granularity ?:
 info->physical_sector_size;
rq->limits.discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
-   blk_queue_max_secure_erase_sectors(rq,
-  get_capacity(gd));
+   blk_queue_max_secure_erase_sectors(rq, UINT_MAX);
}
 
/* Hard sector size and max sectors impersonate the equiv. hardware. */
-- 
2.39.2




[PATCH 2/4] xen-blkfront: rely on the default discard granularity

2024-02-20 Thread Christoph Hellwig
The block layer now sets the discard granularity to the physical
block size by default.  Take advantage of that in xen-blkfront and only
set the discard granularity if explicitly specified.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index f78167cd5a6333..1258f24b285500 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -951,8 +951,8 @@ static void blkif_set_queue_limits(struct blkfront_info 
*info)
 
if (info->feature_discard) {
blk_queue_max_discard_sectors(rq, UINT_MAX);
-   rq->limits.discard_granularity = info->discard_granularity ?:
-info->physical_sector_size;
+   if (info->discard_granularity)
+   rq->limits.discard_granularity = 
info->discard_granularity;
rq->limits.discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
blk_queue_max_secure_erase_sectors(rq, UINT_MAX);
-- 
2.39.2




convert xen-blkfront to atomic queue limit updates

2024-02-20 Thread Christoph Hellwig
Hi all,

this series converts xen-blkfront to the new atomic queue limits update
API in the block tree.  I don't have a Xen setup so this is compile
tested only.

Diffstat:
 xen-blkfront.c |   53 +++--
 1 file changed, 27 insertions(+), 26 deletions(-)



Re: [PATCH RFC v3 for-6.8/block 04/17] mtd: block2mtd: use bdev apis

2024-01-04 Thread Christoph Hellwig
On Thu, Jan 04, 2024 at 12:28:55PM +0100, Jan Kara wrote:
> What do you think? Because when we are working with the folios it is rather
> natural to use their mapping for dirty balancing?

The real problem is that block2mtd pokes way too deep into block
internals.

I think the saviour here is Christian's series to replace the bdev handle
with a struct file, which will allow using the normal file write path
here and getting rid of the entire layering violation.




Re: [PATCH RFC v3 for-6.8/block 02/17] xen/blkback: use bdev api in xen_update_blkif_status()

2024-01-04 Thread Christoph Hellwig
On Thu, Jan 04, 2024 at 12:06:31PM +0100, Jan Kara wrote:
> This function uses invalidate_inode_pages2() while invalidate_bdev() ends
> up using mapping_try_invalidate() and there are subtle behavioral
> differences between these two (for example invalidate_inode_pages2() tries
> to clean dirty pages using the ->launder_folio method). So I think you'll
> need helper like invalidate_bdev2() for this.

That assumes that the existing code actually does this intentionally,
which seems doubtful.  But the change in behavior does need to be
documented and explained.




Re: [PATCH RFC v2 for-6.8/block 15/18] buffer: add a new helper to read sb block

2023-12-12 Thread Christoph Hellwig
On Mon, Dec 11, 2023 at 10:07:53PM +0800, Yu Kuai wrote:
> +static __always_inline int buffer_uptodate_or_error(struct buffer_head *bh)
> +{
> + /*
> +  * If the buffer has the write error flag, data was failed to write
> +  * out in the block. In this case, set buffer uptodate to prevent
> +  * reading old data.
> +  */
> + if (buffer_write_io_error(bh))
> + set_buffer_uptodate(bh);
> + return buffer_uptodate(bh);
> +}

So - at the risk that this blows up into a lot of nasty work: why do we
even clear the uptodate flag on write errors?  Doing so makes no sense
to me, as the data isn't any less uptodate just because we failed to
write it..




Re: [PATCH RFC v2 for-6.8/block 17/18] ext4: remove block_device_ejected()

2023-12-12 Thread Christoph Hellwig
Looks good:

Reviewed-by: Christoph Hellwig 



Re: [PATCH RFC v2 for-6.8/block 01/18] block: add some bdev apis

2023-12-12 Thread Christoph Hellwig
> +void invalidate_bdev_range(struct block_device *bdev, pgoff_t start,
> +pgoff_t end)
> +{
> + invalidate_mapping_pages(bdev->bd_inode->i_mapping, start, end);
> +}
> +EXPORT_SYMBOL_GPL(invalidate_bdev_range);

Can we have kerneldoc comments for the new helpers please?

> +struct folio *__bdev_get_folio(struct block_device *bdev, loff_t pos,
> +fgf_t fgp_flags, gfp_t gfp)
> +{
> + return __filemap_get_folio(bdev->bd_inode->i_mapping, pos >> PAGE_SHIFT,
> +fgp_flags, gfp);
> +}
> +EXPORT_SYMBOL_GPL(__bdev_get_folio);

It's a bit silly to have a __-prefixed API without a version that
doesn't have the prefix, so I'd prefer to drop it.  Unless willy has
a good argument for keeping it the same as the filemap API.




Re: [PATCH RFC v2 for-6.8/block 01/18] block: add some bdev apis

2023-12-12 Thread Christoph Hellwig
On Mon, Dec 11, 2023 at 05:52:17PM +0100, Jan Kara wrote:
> > +void bdev_associated_mapping(struct block_device *bdev,
> > +struct address_space *mapping)
> > +{
> > +   mapping->host = bdev->bd_inode;
> > +}
> 
> Here I'm not sure - is the helper really a win? It seems a bit obscure to
> me. This initialization of another mapping for a bdev looks really special.

If we want to hide bd_inode we'll need something like this helper, even
if I don't particularly like it either.

But it might be a good idea to move it out of this series and into the
follow-on that removes bd_inode, as it's rather pointless without that
context.



Re: [PATCH -next RFC 01/14] block: add some bdev apis

2023-12-06 Thread Christoph Hellwig
On Wed, Dec 06, 2023 at 12:50:38PM -0500, Theodore Ts'o wrote:
> This was added because pulling a mounted a USB thumb drive (or a HDD
> drops off the SATA bus) while the file system is mounted and actively
> in use, would result in a kernel OOPS.  If that's no longer true,
> that's great, but it would be good to test to make sure this is the
> case

And, surprise, surprise - that didn't just affect ext4.  So I ended
up fixing this properly in the block layer.

> If we really want to remove it, I'd suggest doing this as a separate
> commit, so that after we see syzbot reports, or users complaining
> about kernel crashes, we can revert the removal if necessary.

Yes, this should of course be a separate, well documented commit.




Re: [PATCH -next RFC 02/14] xen/blkback: use bdev api in xen_update_blkif_status()

2023-12-05 Thread Christoph Hellwig
On Wed, Dec 06, 2023 at 02:56:05PM +0800, Yu Kuai wrote:
> > > - invalidate_inode_pages2(
> > > - blkif->vbd.bdev_handle->bdev->bd_inode->i_mapping);
> > > + invalidate_bdev(blkif->vbd.bdev_handle->bdev);
> > 
> > blkbak is a bdev exported.   I don't think it should ever call
> > invalidate_inode_pages2, through a wrapper or not.
> 
> I'm not sure about this. I'm not familiar with xen/blkback, but I saw
> that xen-blkback will open a bdev from xen_vbd_create(), hence this
> looks like a dm/md for me, hence it sounds reasonable to sync +
> invalidate the opened bdev while initialization. Please kindly correct
> me if I'm wrong.

I guess we have enough precedent for this, so the switchover here
isn't wrong.  But all this invalidating of the bdev cache seems to
be asking for trouble.




Re: [PATCH -next RFC 01/14] block: add some bdev apis

2023-12-05 Thread Christoph Hellwig
On Wed, Dec 06, 2023 at 02:50:56PM +0800, Yu Kuai wrote:
> I'm a litter confused, so there are 3 use cases:
> 1) use GFP_USER, default gfp from bdev_alloc.
> 2) use GFP_KERNEL
> 3) use GFP_NOFS
> 
> I understand that you're suggesting memalloc_nofs_save() to distinguish
> 2 and 3, but how can I distinguish 1?

You shouldn't.  Diverging from the default flags except for clearing
the FS or IO flags is simply a bug.  Note that things like block2mtd
should probably also ensure a noio allocation if they aren't doing that
yet.

> >   - use memalloc_nofs_save in extet instead of using
> > mapping_gfp_constraint to clear it from the mapping flags
> >   - remove __ext4_sb_bread_gfp and just have buffer.c helper that does
> > the right thing (either by changing the calling conventions of an
> > existing one, or adding a new one).
> 
> Thanks for the suggestions, but I'm not sure how to do this yet, I must
> read more ext4 code.

the nofs save part should be trivial.  You can just skip the rest for
now as it's not needed for this patch series.




Re: [PATCH -next RFC 01/14] block: add some bdev apis

2023-12-05 Thread Christoph Hellwig
> +void invalidate_bdev_range(struct block_device *bdev, pgoff_t start,
> +pgoff_t end)
> +{
> + invalidate_mapping_pages(bdev->bd_inode->i_mapping, start, end);
> +}
> +EXPORT_SYMBOL_GPL(invalidate_bdev_range);

All these could probably use kerneldoc comments.

For this one I really don't like it existing at all, but we'll have to
discuss that in the btrfs patch.

> +loff_t bdev_size(struct block_device *bdev)
> +{
> + loff_t size;
> +
> + spin_lock(>bd_size_lock);
> + size = i_size_read(bdev->bd_inode);
> + spin_unlock(>bd_size_lock);
> +
> + return size;
> +}
> +EXPORT_SYMBOL_GPL(bdev_size);

No need for this one.  The callers can simply use bdev_nr_bytes.

> +struct folio *bdev_read_folio(struct block_device *bdev, pgoff_t index)
> +{
> + return read_mapping_folio(bdev->bd_inode->i_mapping, index, NULL);
> +}
> +EXPORT_SYMBOL_GPL(bdev_read_folio);
> +
> +struct folio *bdev_read_folio_gfp(struct block_device *bdev, pgoff_t index,
> +   gfp_t gfp)
> +{
> + return mapping_read_folio_gfp(bdev->bd_inode->i_mapping, index, gfp);
> +}
> +EXPORT_SYMBOL_GPL(bdev_read_folio_gfp);

I think we can just drop bdev_read_folio_gfp. Half of the callers simply
pass GFP_KERNEL, and the other half passes GFP_NOFS and could just use
memalloc_nofs_save().

> +void bdev_balance_dirty_pages_ratelimited(struct block_device *bdev)
> +{
> + return balance_dirty_pages_ratelimited(bdev->bd_inode->i_mapping);
> +}
> +EXPORT_SYMBOL_GPL(bdev_balance_dirty_pages_ratelimited);

Hmm, this is just used for block2mtd, and feels a little too low-level
to me, as block2mtd really should be using the normal file read/write
APIs.  I guess we'll have to live with it for now if we want to expedite
killing off bd_inode.

> +void bdev_correlate_mapping(struct block_device *bdev,
> + struct address_space *mapping)
> +{
> + mapping->host = bdev->bd_inode;
> +}
> +EXPORT_SYMBOL_GPL(bdev_correlate_mapping);

Maybe associated instead of correlate?  Either way this basically
fully exposes the bdev inode again :(

> +gfp_t bdev_gfp_constraint(struct block_device *bdev, gfp_t gfp)
> +{
> + return mapping_gfp_constraint(bdev->bd_inode->i_mapping, gfp);
> +}
> +EXPORT_SYMBOL_GPL(bdev_gfp_constraint);

The right fix here is to:

 - use memalloc_nofs_save in ext4 instead of using
   mapping_gfp_constraint to clear it from the mapping flags (a sketch
   follows below)
 - remove __ext4_sb_bread_gfp and just have a buffer.c helper that does
   the right thing (either by changing the calling conventions of an
   existing one, or adding a new one).
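
A minimal sketch of the first point, assuming a caller that currently
clears __GFP_FS via the mapping flags; the function below and its use of
the bdev_read_folio() helper from this series are purely illustrative:

#include <linux/pagemap.h>
#include <linux/sched/mm.h>

/* Sketch: scope the NOFS constraint instead of masking the mapping gfp. */
static int example_read_under_nofs(struct block_device *bdev, pgoff_t index)
{
	unsigned int flags = memalloc_nofs_save();
	struct folio *folio = bdev_read_folio(bdev, index);

	memalloc_nofs_restore(flags);
	if (IS_ERR(folio))
		return PTR_ERR(folio);
	folio_put(folio);
	return 0;
}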

> +/*
> + * The del_gendisk() function uninitializes the disk-specific data
> + * structures, including the bdi structure, without telling anyone
> + * else.  Once this happens, any attempt to call mark_buffer_dirty()
> + * (for example, by ext4_commit_super), will cause a kernel OOPS.
> + * This is a kludge to prevent these oops until we can put in a proper
> + * hook in del_gendisk() to inform the VFS and file system layers.
> + */
> +int bdev_ejected(struct block_device *bdev)
> +{
> + struct backing_dev_info *bdi = inode_to_bdi(bdev->bd_inode);
> +
> + return bdi->dev == NULL;
> +}
> +EXPORT_SYMBOL_GPL(bdev_ejected);

And this code in ext4 should just go away entirely.  The bdi has been
guaranteed to be valid for a live bdev for years.

> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1119,6 +1119,7 @@ void bio_add_folio_nofail(struct bio *bio, struct folio 
> *folio, size_t len,
>   WARN_ON_ONCE(off > UINT_MAX);
>   __bio_add_page(bio, >page, len, off);
>  }
> +EXPORT_SYMBOL_GPL(bio_add_folio_nofail);

How is this related?  The export is fine, but it really should be a
separate, well-documented commit.

>  
> +static inline u8 block_bits(struct block_device *bdev)
> +{
> + return bdev->bd_inode->i_blkbits;
> +}

Not sure we should need this.  i_blkbits comes from the blocksize
the fs set, so the fs should have other ways to get at it.



Re: [PATCH -next RFC 02/14] xen/blkback: use bdev api in xen_update_blkif_status()

2023-12-05 Thread Christoph Hellwig
On Tue, Dec 05, 2023 at 08:37:16PM +0800, Yu Kuai wrote:
> diff --git a/drivers/block/xen-blkback/xenbus.c 
> b/drivers/block/xen-blkback/xenbus.c
> index e34219ea2b05..e645afa4af57 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -104,8 +104,7 @@ static void xen_update_blkif_status(struct xen_blkif 
> *blkif)
>   xenbus_dev_error(blkif->be->dev, err, "block flush");
>   return;
>   }
> - invalidate_inode_pages2(
> - blkif->vbd.bdev_handle->bdev->bd_inode->i_mapping);
> + invalidate_bdev(blkif->vbd.bdev_handle->bdev);

blkback is a bdev exporter.  I don't think it should ever call
invalidate_inode_pages2, through a wrapper or not.




Re: [PATCH -next RFC 00/14] block: don't access bd_inode directly from other modules

2023-12-05 Thread Christoph Hellwig
On Tue, Dec 05, 2023 at 08:37:14PM +0800, Yu Kuai wrote:
> From: Yu Kuai 
> 
> Patch 1 add some bdev apis, then follow up patches will use these apis
> to avoid access bd_inode directly, and hopefully the field bd_inode can
> be removed eventually(after figure out a way for fs/buffer.c).

What tree is this against?  It fails to apply to either Jens'
for-6.8/block or Linus tree in the very first patch.




Re: [PATCH block/for-next v2 01/16] block: add a new helper to get inode from block_device

2023-11-27 Thread Christoph Hellwig
On Tue, Nov 28, 2023 at 09:35:56AM +0800, Yu Kuai wrote:
> Thanks for the advice! In case I'm understanding correctly, do you mean
> that all other fs/drivers that is using pages versions can safely switch
> to folio versions now?

If you never allocate a high-order folio, pages are identical to folios.
So yes, we can do folio based interfaces only, and also use that as
an opportunity to convert over the callers.

> By the way, my orginal idea was trying to add a new field 'bd_flags'
> in block_devcie, and then add a new bit so that bio_check_ro() will
> only warn once for each partition. Now that this patchset will be quite
> complex, I'll add a new bool field 'bd_ro_warned' to fix the above
> problem first, and then add 'bd_flags' once this patchset is done.

Yes, please do a minimal version if you can find space where the
rmw cycles don't cause damage to neighbouring fields.  Or just leave
the current set of warnings in if it's too hard.




Re: [PATCH block/for-next v2 01/16] block: add a new helper to get inode from block_device

2023-11-27 Thread Christoph Hellwig
On Mon, Nov 27, 2023 at 09:07:22PM +0800, Yu Kuai wrote:
> 1) Is't okay to add a new helper to pass in bdev for following apis?


For some we already have them (e.g. bdev_nr_bytes to read the bdev
size), for some we need to add them.  The big thing that seems to
stick out is the page cache API, and I think that is where we need to
define maintainable APIs for file systems and others to use the
block device page cache.  Probably only in folio versions and not
page ones if we're touching the code anyway.
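
As a concrete illustration of the folio-only direction, the kind of
wrapper meant here is what a later posting of this work adds anyway;
shown here only as a sketch:

/* Sketch: a folio-only accessor so callers never touch bdev->bd_inode. */
struct folio *bdev_read_folio(struct block_device *bdev, pgoff_t index)
{
	return read_mapping_folio(bdev->bd_inode->i_mapping, index, NULL);
}
EXPORT_SYMBOL_GPL(bdev_read_folio);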

> 2) For the file fs/buffer.c, there are some special usage like
> following that I don't think it's good to add a helper:
> 
> spin_lock(_inode->i_mapping->private_lock);
> 
> Is't okay to move following apis from fs/buffer.c directly to
> block/bdev.c?
> 
> __find_get_block
> bdev_getblk

I'm not sure moving is a good idea, but we might end up needing
some kind of low-level access from buffer.c, be that special
helpers, a separate header or something else.  Let's sort out
the rest of the kernel first.




Re: [PATCH block/for-next v2 01/16] block: add a new helper to get inode from block_device

2023-11-26 Thread Christoph Hellwig
On Mon, Nov 27, 2023 at 02:21:01PM +0800, Yu Kuai wrote:
> From: Yu Kuai 
> 
> block_devcie is allocated from bdev_alloc() by bdev_alloc_inode(), and
> currently block_device contains a pointer that point to the address of
> inode, while such inode is allocated together:

This is going the wrong way.  Nothing outside of core block layer code
should ever directly use the bdev inode.  We've been rather sloppy
and added a lot of direct references to it, but they really need to
go away and be replaced with well defined high level operations on
struct block_device.  Once that is done we can remove the bd_inode
pointer, but replacing it with something that pokes even more deeply
into bdev internals is a bad idea.



Re: [PATCH v2] swiotlb-xen: provide the "max_mapping_size" method

2023-11-07 Thread Christoph Hellwig
Looks good:

Reviewed-by: Christoph Hellwig 



Re: [PATCH v7 9/9] swiotlb: search the software IO TLB only if the device makes use of it

2023-09-08 Thread Christoph Hellwig
On Thu, Sep 07, 2023 at 01:12:23PM +0200, Petr Tesařík wrote:
> Hi all,
> 
> sorry for my late reply; I've been away from my work setup for a
> month...

Please take a look at:

https://lore.kernel.org/linux-iommu/20230905064441.127588-1-...@lst.de/T/#u




Re: [PATCH v7 9/9] swiotlb: search the software IO TLB only if the device makes use of it

2023-08-31 Thread Christoph Hellwig
On Wed, Aug 09, 2023 at 03:20:43PM -0600, Jonathan Corbet wrote:
> > spin_unlock_irqrestore(>dma_io_tlb_lock, flags);
> >  
> > -   /* Pairs with smp_rmb() in swiotlb_find_pool(). */
> > -   smp_wmb();
> >  found:
> > +   dev->dma_uses_io_tlb = true;
> > +   /* Pairs with smp_rmb() in is_swiotlb_buffer() */
> > +   smp_wmb();
> > +
> 
> ...and here you set it if swiotlb is used.
> 
> But, as far as I can tell, you don't actually *use* this field anywhere.
> What am I missing?

It's very much unused.  Petr, I guess you wanted to use this in
is_swiotlb_buffer to avoid the lookup unless required.  Can you send
a follow up?
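
The follow-up being asked for would look roughly like this (a sketch
only; the field and helper names are taken from this series, the barrier
placement is an assumption):

/* Sketch: consult the per-device flag before searching any pools. */
static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
{
	/* Pairs with the smp_wmb() issued after setting dma_uses_io_tlb. */
	smp_rmb();
	return READ_ONCE(dev->dma_uses_io_tlb) &&
		swiotlb_find_pool(dev, paddr);
}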



Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle

2023-08-28 Thread Christoph Hellwig
On Sat, Aug 26, 2023 at 03:28:52AM +0100, Al Viro wrote:
> I mean, look at claim_swapfile() for example:
> p->bdev = blkdev_get_by_dev(inode->i_rdev,
>FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
> if (IS_ERR(p->bdev)) {
> error = PTR_ERR(p->bdev);
> p->bdev = NULL;
> return error;
> }
> p->old_block_size = block_size(p->bdev);
> error = set_blocksize(p->bdev, PAGE_SIZE);
> if (error < 0)
> return error;
> we already have the file opened, and we keep it opened all the way until
> the swapoff(2); here we have noticed that it's a block device and we
>   * open the fucker again (by device number), this time claiming
> it with our swap_info_struct as holder, to be closed at swapoff(2) time
> (just before we close the file)

Note that some drivers look at FMODE_EXCL/BLK_OPEN_EXCL in ->open.
These are probably bogus and maybe we want to kill them, but that will
need an audit first.

> BTW, what happens if two threads call ioctl(fd, BLKBSZSET, )
> for the same descriptor that happens to have been opened O_EXCL?
> Without O_EXCL they would've been unable to claim the sucker at the same
> time - the holder we are using is the address of a function argument,
> i.e. something that points to kernel stack of the caller.  Those would
> conflict and we either get set_blocksize() calls fully serialized, or
> one of the callers would eat -EBUSY.  Not so in "opened with O_EXCL"
> case - they can very well overlap and IIRC set_blocksize() does *not*
> expect that kind of crap...  It's all under CAP_SYS_ADMIN, so it's not
> as if it was a meaningful security hole anyway, but it does look fishy.

The user gets to keep the pieces..  BLKBSZSET is kinda bogus anyway
as the soft blocksize only matters for buffer_head-like I/O, and
there only for file systems.  No idea why anyone would set it manually.



Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle

2023-08-28 Thread Christoph Hellwig
On Fri, Aug 25, 2023 at 03:47:56PM +0200, Jan Kara wrote:
> I can see the appeal of not having to introduce the new bdev_handle type
> and just using struct file which unifies in-kernel and userspace block
> device opens. But I can see downsides too - the last fput() happening from
> task work makes me a bit nervous whether it will not break something
> somewhere with exclusive bdev opens. Getting from struct file to bdev is
> somewhat harder but I guess a helper like F_BDEV() would solve that just
> fine.
> 
> So besides my last fput() worry about I think this could work and would be
> probably a bit nicer than what I have. But before going and redoing the whole
> series let me gather some more feedback so that we don't go back and forth.
> Christoph, Christian, Jens, any opinion?

I did think about the file a bit.  The fact that we'd need something
like an anon_file for the by_dev open was always a huge turn off for
me, but maybe my concern is overblown.  Having a struct file would
actually be really useful for a bunch of users.




Re: [PATCH v2 0/29] block: Make blkdev_get_by_*() return handle

2023-08-11 Thread Christoph Hellwig
Except for a mostly cosmetic nitpick this looks good to me:

Acked-by: Christoph Hellwig 

That's not exactly the deep review I'd like to do, but as I'm about to
head out for vacation that's probably as good as it gets.



Re: [PATCH v7 0/9] Allow dynamic allocation of software IO TLB bounce buffers

2023-08-01 Thread Christoph Hellwig
Thanks,

I've applied this to a new swiotlb-dynamic branch that I'll pull into
the dma-mapping for-next tree.




Re: [PATCH v6 0/9] Allow dynamic allocation of software IO TLB bounce buffers

2023-07-31 Thread Christoph Hellwig
I was just going to apply this, but patch 1 seems to have a non-trivial
conflict with the is_swiotlb_active removal in pci-dma.c.  Can you resend
against the current dma-mapping for-next tree?



Re: [PATCH 01/32] block: Provide blkdev_get_handle_* functions

2023-07-31 Thread Christoph Hellwig
On Mon, Jul 31, 2023 at 12:50:34PM +0200, Jan Kara wrote:
> I think the bdev_handle name is fine for the struct. After all it is
> equivalent of an open handle for the block device so IMHO bdev_handle
> captures that better than bdev_ctx.

Agreed.



Re: [PATCH v4 0/8] Allow dynamic allocation of software IO TLB bounce buffers

2023-07-20 Thread Christoph Hellwig
On Thu, Jul 20, 2023 at 10:13:20AM +0200, Petr Tesařík wrote:
> Fine with me. I removed it after all my testing showed no performance
> impact as long as the size of the initial SWIOTLB is kept at the
> default value (and sufficient for the workload), but it's OK for me if
> dynamic SWIOTLB allocations are off by default.
> 
> OTOH I'd like to make it a boot-time option rather than build-time
> option. Would that be OK for you?

I'd really like the config option to not even build the code.  But
a boot time option sounds very useful in addition to that.
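
The boot-time part could be as small as this (sketch only; the
"swiotlb_dynamic" parameter name is made up for illustration and is not
an existing option):

static bool swiotlb_dynamic_enabled __ro_after_init;

/* Sketch: opt into run-time SWIOTLB growth only when explicitly asked. */
static int __init setup_swiotlb_dynamic(char *str)
{
	swiotlb_dynamic_enabled = true;
	return 0;
}
early_param("swiotlb_dynamic", setup_swiotlb_dynamic);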



Re: [PATCH v4 8/8] swiotlb: search the software IO TLB only if a device makes use of it

2023-07-20 Thread Christoph Hellwig
On Thu, Jul 20, 2023 at 10:02:38AM +0200, Petr Tesařík wrote:
> On Thu, 20 Jul 2023 08:47:44 +0200
> Christoph Hellwig  wrote:
> 
> > Any reason this can't just do a list_empty_careful on the list
> > instead of adding yet another field that grows struct device?
> 
> On which list?

dev->dma_io_tlb_mem->pools?

> 
> The dma_io_tlb_pools list only contains transient pools, but a device
> may use bounce buffers from a regular pool.

Oh, true.

> The dma_io_tlb_mem.pools list will always be non-empty, unless the
> system runs without SWIOTLB.
> 
> On a system which does have a SWIOTLB, the flag allows to differentiate
> between devices that actually use bounce buffers and devices that do
> not (e.g. because they do not have any addressing limitations).

Ok.



Re: [PATCH v4 2/8] swiotlb: add documentation and rename swiotlb_do_find_slots()

2023-07-20 Thread Christoph Hellwig
On Thu, Jul 20, 2023 at 09:56:09AM +0200, Petr Tesařík wrote:
> On Thu, 20 Jul 2023 08:38:19 +0200
> Christoph Hellwig  wrote:
> 
> > On Thu, Jul 13, 2023 at 05:23:13PM +0200, Petr Tesarik wrote:
> > > From: Petr Tesarik 
> > > 
> > > Add some kernel-doc comments and move the existing documentation of struct
> > > io_tlb_slot to its correct location. The latter was forgotten in commit
> > > 942a8186eb445 ("swiotlb: move struct io_tlb_slot to swiotlb.c").
> > > 
> > > Use the opportunity to give swiotlb_do_find_slots() a more descriptive
> > > name, which makes it clear how it differs from swiotlb_find_slots().  
> > 
> > Please keep the swiotlb_ prefix.  Otherwise this looks good to me.
> 
> Will do. Out of curiosity, why does it matter for a static (file-local)
> function?

Because it makes looking at stack traces much easier.



Re: [PATCH v4 0/8] Allow dynamic allocation of software IO TLB bounce buffers

2023-07-20 Thread Christoph Hellwig
Just to add a highlevel comment here after I feel like I need a little
more time to review the guts.

I'm still pretty concerned about the extra list that needs to be
consulted in is_swiotlb_buffer, but I can't really think of
anything better.  Maybe an xarray has better cache characteristics,
but that one requires even more allocations in the low-level dma map
path.

One thing I'd like to see for the next version is to make the
new growing code a config option at least for now.  It is a pretty
big change of the existing swiotlb behavior, and I want people to opt
into it consciously.  Maybe we can drop the option again after a few
years once everything has settled.



Re: [PATCH v4 8/8] swiotlb: search the software IO TLB only if a device makes use of it

2023-07-20 Thread Christoph Hellwig
Any reason this can't just do a list_empty_careful on the list
instead of adding yet another field that grows struct device?



Re: [PATCH v4 2/8] swiotlb: add documentation and rename swiotlb_do_find_slots()

2023-07-20 Thread Christoph Hellwig
On Thu, Jul 13, 2023 at 05:23:13PM +0200, Petr Tesarik wrote:
> From: Petr Tesarik 
> 
> Add some kernel-doc comments and move the existing documentation of struct
> io_tlb_slot to its correct location. The latter was forgotten in commit
> 942a8186eb445 ("swiotlb: move struct io_tlb_slot to swiotlb.c").
> 
> Use the opportunity to give swiotlb_do_find_slots() a more descriptive
> name, which makes it clear how it differs from swiotlb_find_slots().

Please keep the swiotlb_ prefix.  Otherwise this looks good to me.




Re: [PATCH v4 1/8] swiotlb: make io_tlb_default_mem local to swiotlb.c

2023-07-20 Thread Christoph Hellwig
On Thu, Jul 13, 2023 at 05:23:12PM +0200, Petr Tesarik wrote:
> From: Petr Tesarik 
> 
> SWIOTLB implementation details should not be exposed to the rest of the
> kernel. This will allow to make changes to the implementation without
> modifying non-swiotlb code.
> 
> To avoid breaking existing users, provide helper functions for the few
> required fields.
> 
> As a bonus, using a helper function to initialize struct device allows to
> get rid of an #ifdef in driver core.
> 
> Signed-off-by: Petr Tesarik 
> ---
>  arch/arm/xen/mm.c  |  2 +-
>  arch/mips/pci/pci-octeon.c |  2 +-
>  arch/x86/kernel/pci-dma.c  |  2 +-
>  drivers/base/core.c|  4 +---
>  drivers/xen/swiotlb-xen.c  |  2 +-
>  include/linux/swiotlb.h| 25 +++-
>  kernel/dma/swiotlb.c   | 39 +-
>  7 files changed, 67 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
> index 3d826c0b5fee..0f32c14eb786 100644
> --- a/arch/arm/xen/mm.c
> +++ b/arch/arm/xen/mm.c
> @@ -125,7 +125,7 @@ static int __init xen_mm_init(void)
>   return 0;
>  
>   /* we can work with the default swiotlb */
> - if (!io_tlb_default_mem.nslabs) {
> + if (!is_swiotlb_allocated()) {
>   rc = swiotlb_init_late(swiotlb_size_or_default(),
>  xen_swiotlb_gfp(), NULL);
>   if (rc < 0)

I'd much rather move the already-initialized check into
swiotlb_init_late, which is a much cleaner interface.
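
I.e. something like this untested sketch, keeping the current
swiotlb_init_late() signature and eliding the unchanged body:

int swiotlb_init_late(size_t size, gfp_t gfp_mask,
		int (*remap)(void *tlb, unsigned long nslabs))
{
	/* Already set up at boot or by an earlier call?  Nothing to do. */
	if (io_tlb_default_mem.nslabs)
		return 0;

	/* ... existing late allocation and remap logic unchanged ... */
}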

>   /* we can work with the default swiotlb */
> - if (!io_tlb_default_mem.nslabs) {
> + if (!is_swiotlb_allocated()) {
>   int rc = swiotlb_init_late(swiotlb_size_or_default(),
>  GFP_KERNEL, xen_swiotlb_fixup);
>   if (rc < 0)

.. and would take care of this one as well.

> +bool is_swiotlb_allocated(void)
> +{
> + return !!io_tlb_default_mem.nslabs;

Nit: no need for the !!, we can rely on the implicit promotion to
bool.  But with the suggestion above the need for this helper
should go away anyway.



Re: [PATCH 01/32] block: Provide blkdev_get_handle_* functions

2023-07-07 Thread Christoph Hellwig
On Thu, Jul 06, 2023 at 06:14:33PM +0200, Jan Kara wrote:
> > struct bdev_handle *bdev_open_by_path(dev_t dev, blk_mode_t mode,
> > void *holder, const struct blk_holder_ops *hops);
> > void bdev_release(struct bdev_handle *handle);
> 
> I'd maybe use bdev_close() instead of bdev_release() but otherwise I like
> the new naming.

We're using release everywhere else, but if Jens is fine with that I
can live with close.



Re: [PATCH 01/32] block: Provide blkdev_get_handle_* functions

2023-07-06 Thread Christoph Hellwig
On Tue, Jul 04, 2023 at 02:21:28PM +0200, Jan Kara wrote:
> Create struct bdev_handle that contains all parameters that need to be
> passed to blkdev_put() and provide blkdev_get_handle_* functions that
> return this structure instead of plain bdev pointer. This will
> eventually allow us to pass one more argument to blkdev_put() without
> too much hassle.

Can we use the opportunity to come up with better names?  blkdev_get_*
was always a rather horrible naming convention for something that
ends up calling into ->open.

What about:

struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops);
struct bdev_handle *bdev_open_by_path(dev_t dev, blk_mode_t mode,
void *holder, const struct blk_holder_ops *hops);
void bdev_release(struct bdev_handle *handle);

?



Re: [PATCH RFC 0/32] block: Make blkdev_get_by_*() return handle

2023-07-06 Thread Christoph Hellwig
On Tue, Jul 04, 2023 at 02:21:27PM +0200, Jan Kara wrote:
> Hello,
> 
> this patch series implements the idea of blkdev_get_by_*() calls returning
> bdev_handle which is then passed to blkdev_put() [1]. This makes the get
> and put calls for bdevs more obviously matching and allows us to propagate
> context from get to put without having to modify all the users (again!).
> In particular I need to propagate used open flags to blkdev_put() to be able
> count writeable opens and add support for blocking writes to mounted block
> devices. I'll send that series separately.
> 
> The series is based on Linus' tree as of yesterday + two bcache fixes which 
> are
> in the block tree. Patches have passed some basic testing, I plan to test more
> users once we agree this is the right way to go.

Can you post a link to a git branch for this and the follow up series?
Especially with a fairly unstable base it's kinda hard to look at the
result otherwise.



Re: [PATCH v3 1/7] swiotlb: make io_tlb_default_mem local to swiotlb.c

2023-06-27 Thread Christoph Hellwig
On Tue, Jun 27, 2023 at 01:30:06PM +0200, Petr Tesařík wrote:
> Xen is the only user of an "is SWIOTLB present" interface. IIUC Xen
> needs bounce buffers for the PCI frontend driver, but if there is no
> other reason to have a SWIOTLB, the system does not set up one at boot.

Please take a look at my "unexport swiotlb_active v2" series that
unfortunately missed the 6.5 merge window waiting for reviews.



Re: unexport swiotlb_active v2

2023-06-19 Thread Christoph Hellwig
Any comments?  I'd really like to finish this off this merge window..



[PATCH 3/3] swiotlb: unexport is_swiotlb_active

2023-06-12 Thread Christoph Hellwig
Drivers have no business looking at dma-mapping or swiotlb internals.

Signed-off-by: Christoph Hellwig 
---
 kernel/dma/swiotlb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 775f7bb10ab184..1891faa3a6952e 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -932,7 +932,6 @@ bool is_swiotlb_active(struct device *dev)
 
return mem && mem->nslabs;
 }
-EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
 
-- 
2.39.2




[PATCH 1/3] xen/pci: add flag for PCI passthrough being possible

2023-06-12 Thread Christoph Hellwig
From: Juergen Gross 

When running as a Xen PV guests passed through PCI devices only have a
chance to work if the Xen supplied memory map has some PCI space
reserved.

Add a flag xen_pv_pci_possible which will be set in early boot in case
the memory map has at least one area with the type E820_TYPE_RESERVED.

Signed-off-by: Juergen Gross 
Signed-off-by: Christoph Hellwig 
---
 arch/x86/xen/setup.c | 6 ++
 include/xen/xen.h| 6 ++
 2 files changed, 12 insertions(+)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index c2be3efb2ba0fa..716f76c4141651 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -43,6 +43,9 @@ struct xen_memory_region 
xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* Memory map would allow PCI passthrough. */
+bool xen_pv_pci_possible;
+
 /* E820 map used during setting up memory. */
 static struct e820_table xen_e820_table __initdata;
 
@@ -804,6 +807,9 @@ char * __init xen_memory_setup(void)
chunk_size = size;
type = xen_e820_table.entries[i].type;
 
+   if (type == E820_TYPE_RESERVED)
+   xen_pv_pci_possible = true;
+
if (type == E820_TYPE_RAM) {
if (addr < mem_end) {
chunk_size = min(size, mem_end - addr);
diff --git a/include/xen/xen.h b/include/xen/xen.h
index 0efeb652f9b8fb..5eb0a974a11e7e 100644
--- a/include/xen/xen.h
+++ b/include/xen/xen.h
@@ -29,6 +29,12 @@ extern bool xen_pvh;
 
 extern uint32_t xen_start_flags;
 
+#ifdef CONFIG_XEN_PV
+extern bool xen_pv_pci_possible;
+#else
+#define xen_pv_pci_possible0
+#endif
+
 #include 
 extern struct hvm_start_info pvh_start_info;
 
-- 
2.39.2




unexport swiotlb_active v2

2023-06-12 Thread Christoph Hellwig
Hi all,

this little series removes the last swiotlb API exposed to modules.

Changes since v1:
 - add a patch from Juergen to export if the e820 table indicates Xen PV
   PCI is enabled
 - slightly reorganize the logic to check if swiotlb is needed for
   Xen/x86
 - drop the already merged nouveau patch

Diffstat:
 arch/x86/include/asm/xen/swiotlb-xen.h |6 --
 arch/x86/kernel/pci-dma.c  |   29 +++--
 arch/x86/xen/setup.c   |6 ++
 drivers/pci/xen-pcifront.c |6 --
 include/xen/xen.h  |6 ++
 kernel/dma/swiotlb.c   |1 -
 6 files changed, 19 insertions(+), 35 deletions(-)



[PATCH 2/3] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-06-12 Thread Christoph Hellwig
Remove the dangerous late initialization of xen-swiotlb in
pci_xen_swiotlb_init_late and instead just always initialize
xen-swiotlb in the boot code if CONFIG_XEN_PCIDEV_FRONTEND is
enabled and Xen PV PCI is possible.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/include/asm/xen/swiotlb-xen.h |  6 --
 arch/x86/kernel/pci-dma.c  | 29 +++---
 drivers/pci/xen-pcifront.c |  6 --
 3 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h 
b/arch/x86/include/asm/xen/swiotlb-xen.h
index 77a2d19cc9909e..abde0f44df57dc 100644
--- a/arch/x86/include/asm/xen/swiotlb-xen.h
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -2,12 +2,6 @@
 #ifndef _ASM_X86_SWIOTLB_XEN_H
 #define _ASM_X86_SWIOTLB_XEN_H
 
-#ifdef CONFIG_SWIOTLB_XEN
-extern int pci_xen_swiotlb_init_late(void);
-#else
-static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; }
-#endif
-
 int xen_swiotlb_fixup(void *buf, unsigned long nslabs);
 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
unsigned int address_bits,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index de6be0a3965ee4..f323d83e40a70b 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -72,9 +72,15 @@ static inline void __init pci_swiotlb_detect(void)
 #endif /* CONFIG_SWIOTLB */
 
 #ifdef CONFIG_SWIOTLB_XEN
+static bool xen_swiotlb_enabled(void)
+{
+   return xen_initial_domain() || x86_swiotlb_enable ||
+   (IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) && xen_pv_pci_possible);
+}
+
 static void __init pci_xen_swiotlb_init(void)
 {
-   if (!xen_initial_domain() && !x86_swiotlb_enable)
+   if (!xen_swiotlb_enabled())
return;
x86_swiotlb_enable = true;
x86_swiotlb_flags |= SWIOTLB_ANY;
@@ -83,27 +89,6 @@ static void __init pci_xen_swiotlb_init(void)
if (IS_ENABLED(CONFIG_PCI))
pci_request_acs();
 }
-
-int pci_xen_swiotlb_init_late(void)
-{
-   if (dma_ops == _swiotlb_dma_ops)
-   return 0;
-
-   /* we can work with the default swiotlb */
-   if (!io_tlb_default_mem.nslabs) {
-   int rc = swiotlb_init_late(swiotlb_size_or_default(),
-  GFP_KERNEL, xen_swiotlb_fixup);
-   if (rc < 0)
-   return rc;
-   }
-
-   /* XXX: this switches the dma ops under live devices! */
-   dma_ops = _swiotlb_dma_ops;
-   if (IS_ENABLED(CONFIG_PCI))
-   pci_request_acs();
-   return 0;
-}
-EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
 #else
 static inline void __init pci_xen_swiotlb_init(void)
 {
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 83c0ab50676dff..11636634ae512f 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include 
@@ -669,11 +668,6 @@ static int pcifront_connect_and_init_dma(struct 
pcifront_device *pdev)
 
spin_unlock(_dev_lock);
 
-   if (!err && !is_swiotlb_active(>xdev->dev)) {
-   err = pci_xen_swiotlb_init_late();
-   if (err)
-   dev_err(>xdev->dev, "Could not setup SWIOTLB!\n");
-   }
return err;
 }
 
-- 
2.39.2




Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-06-12 Thread Christoph Hellwig
Thank you.  I'll queue it up as a separate patch.




Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-06-12 Thread Christoph Hellwig
On Fri, Jun 09, 2023 at 05:38:28PM +0200, Juergen Gross wrote:
>>> guest started with e820_host=1 even if no PCI passthrough was planned.
>>> But this should be rather rare (at least I hope so).
>>
>> So is this an ACK for the patch and can we go ahead with it?
>
> As long as above mentioned check of the E820 map is done, yes.
>
> If you want I can send a diff to be folded into your patch on Monday.

Yes, that would be great!




Re: [PATCH 3/4] Add strict version of vsscanf()

2023-06-10 Thread Christoph Hellwig
This needs a real commit message explaining what the strict version
does and why, and why we can't just make the normal version more
strict.




Re: [PATCH 2/4] vsscanf(): Return -ERANGE on integer overflow

2023-06-10 Thread Christoph Hellwig
[Adding Richard and Linus as they're having another overflow checking
discussion and we should probably merge those]

On Fri, Jun 09, 2023 at 10:57:57PM -0400, Demi Marie Obenour wrote:
> Userspace sets errno to ERANGE, but the kernel can't do that.

That seems like a very sparse commit log, and it is also kinda beside
the point - the kernel always returns errors in-line and not through
errno.  I think you need to document here why we want to do the
overflow checking (not that I doubt it, but it really needs to be
in the commit message).

Leaving the rest of the quote here for the new arrivals.

> 
> Signed-off-by: Demi Marie Obenour 
> ---
>  include/linux/limits.h  |  1 +
>  include/linux/mfd/wl1273-core.h |  3 --
>  include/vdso/limits.h   |  3 ++
>  lib/vsprintf.c  | 80 -
>  4 files changed, 63 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/limits.h b/include/linux/limits.h
> index 
> f6bcc936901071f496e3e85bb6e1d93905b12e32..8f7fd85b41fb46e6992d9e5912da00424119227a
>  100644
> --- a/include/linux/limits.h
> +++ b/include/linux/limits.h
> @@ -8,6 +8,7 @@
>  
>  #define SIZE_MAX (~(size_t)0)
>  #define SSIZE_MAX((ssize_t)(SIZE_MAX >> 1))
> +#define SSIZE_MIN(-SSIZE_MAX - 1)
>  #define PHYS_ADDR_MAX(~(phys_addr_t)0)
>  
>  #define U8_MAX   ((u8)~0U)
> diff --git a/include/linux/mfd/wl1273-core.h b/include/linux/mfd/wl1273-core.h
> index 
> c28cf76d5c31ee1c94a9319a2e2d318bf00283a6..b81a229135ed9f756c749122a8341816031c8311
>  100644
> --- a/include/linux/mfd/wl1273-core.h
> +++ b/include/linux/mfd/wl1273-core.h
> @@ -204,9 +204,6 @@
>WL1273_IS2_TRI_OPT | \
>WL1273_IS2_RATE_48K)
>  
> -#define SCHAR_MIN (-128)
> -#define SCHAR_MAX 127
> -
>  #define WL1273_FR_EVENT  BIT(0)
>  #define WL1273_BL_EVENT  BIT(1)
>  #define WL1273_RDS_EVENT BIT(2)
> diff --git a/include/vdso/limits.h b/include/vdso/limits.h
> index 
> 0197888ad0e00b2f853d3f25ffa764f61cca7385..0cad0a2490e5efc194d874025eb3e3b846a5c7b4
>  100644
> --- a/include/vdso/limits.h
> +++ b/include/vdso/limits.h
> @@ -2,6 +2,9 @@
>  #ifndef __VDSO_LIMITS_H
>  #define __VDSO_LIMITS_H
>  
> +#define UCHAR_MAX((unsigned char)~0U)
> +#define SCHAR_MAX((signed char)(UCHAR_MAX >> 1))
> +#define SCHAR_MIN((signed char)(-SCHAR_MAX - 1))
>  #define USHRT_MAX((unsigned short)~0U)
>  #define SHRT_MAX ((short)(USHRT_MAX >> 1))
>  #define SHRT_MIN ((short)(-SHRT_MAX - 1))
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index 
> a60d348efb276d66ca07fe464883408df7fdab97..9846d2385f5b9e8f3945a5664d81047e97cf10d5
>  100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -59,7 +59,7 @@
>  bool no_hash_pointers __ro_after_init;
>  EXPORT_SYMBOL_GPL(no_hash_pointers);
>  
> -static noinline unsigned long long simple_strntoull(const char *startp, 
> size_t max_chars, char **endp, unsigned int base)
> +static noinline unsigned long long simple_strntoull(const char *startp, 
> size_t max_chars, char **endp, unsigned int base, bool *overflow)
>  {
>   const char *cp;
>   unsigned long long result = 0ULL;
> @@ -71,6 +71,8 @@ static noinline unsigned long long simple_strntoull(const 
> char *startp, size_t m
>   if (prefix_chars < max_chars) {
>   rv = _parse_integer_limit(cp, base, , max_chars - 
> prefix_chars);
>   /* FIXME */
> + if (overflow)
> + *overflow = !!(rv & KSTRTOX_OVERFLOW);
>   cp += (rv & ~KSTRTOX_OVERFLOW);
>   } else {
>   /* Field too short for prefix + digit, skip over without 
> converting */
> @@ -94,7 +96,7 @@ static noinline unsigned long long simple_strntoull(const 
> char *startp, size_t m
>  noinline
>  unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int 
> base)
>  {
> - return simple_strntoull(cp, INT_MAX, endp, base);
> + return simple_strntoull(cp, INT_MAX, endp, base, NULL);
>  }
>  EXPORT_SYMBOL(simple_strtoull);
>  
> @@ -130,18 +132,22 @@ long simple_strtol(const char *cp, char **endp, 
> unsigned int base)
>  EXPORT_SYMBOL(simple_strtol);
>  
>  static long long simple_strntoll(const char *cp, size_t max_chars, char 
> **endp,
> -  unsigned int base)
> +  unsigned int base, bool *overflow)
>  {
> + unsigned long long minand;
> + bool negate;
> +
>   /*
>* simple_strntoull() safely handles receiving max_chars==0 in the
>* case cp[0] == '-' && max_chars == 1.
>* If max_chars == 0 we can drop through and pass it to 
> simple_strntoull()
>* and the content of *cp is irrelevant.
>*/
> - if (*cp == '-' && max_chars > 0)
> - return -simple_strntoull(cp + 1, max_chars - 1, endp, base);
> -
> - return simple_strntoull(cp, max_chars, endp, base);
> + 

Re: [PATCH 1/4] Rip out simple_strtoll()

2023-06-10 Thread Christoph Hellwig
I'd maybe say remove instead of "rip out", but otherwise this looks
good:

Reviewed-by: Christoph Hellwig 




[PATCH, RFC] swiotlb-xen: fix dma to physical address translation for cache operations

2023-06-07 Thread Christoph Hellwig
All other places in swiotlb-xen go from PFN to BFN and then call
phys_to_dma on the result, or vice versa, but the reverse mapping used
for cache maintenance skips the BFN to PFN mapping.
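
For reference, the reverse translation is expected to look roughly like
the existing helpers sketched below (not verbatim; only the shape of the
BFN to PFN step matters here):

/* Sketch: DMA address -> bus address (BFN space) -> PFN-based physaddr. */
static phys_addr_t sketch_xen_dma_to_phys(struct device *dev, dma_addr_t dma_addr)
{
	phys_addr_t baddr = dma_to_phys(dev, dma_addr);
	unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));

	return ((phys_addr_t)xen_pfn << XEN_PAGE_SHIFT) |
		(baddr & ~XEN_PAGE_MASK);
}

A bare dma_to_phys(dev, dev_addr), as in the lines removed below, stops
after the first step and so feeds a bus address to pfn_valid().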

[Note: only found by code inspection, please review very carefully!]

Signed-off-by: Christoph Hellwig 
---
 drivers/xen/swiotlb-xen.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 67aa74d201627d..e4620303138b4d 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -234,7 +234,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, 
struct page *page,
 
 done:
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr))))
+   if (pfn_valid(PFN_DOWN(phys)))
arch_sync_dma_for_device(phys, size, dir);
else
xen_dma_sync_for_device(dev, dev_addr, size, dir);
@@ -258,7 +258,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, 
dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE);
 
if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
+   if (pfn_valid(PFN_DOWN(paddr)))
arch_sync_dma_for_cpu(paddr, size, dir);
else
xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
@@ -276,7 +276,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, 
dma_addr_t dma_addr,
phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr);
 
if (!dev_is_dma_coherent(dev)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+   if (pfn_valid(PFN_DOWN(paddr)))
arch_sync_dma_for_cpu(paddr, size, dir);
else
xen_dma_sync_for_cpu(dev, dma_addr, size, dir);
@@ -296,7 +296,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, 
dma_addr_t dma_addr,
swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
if (!dev_is_dma_coherent(dev)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+   if (pfn_valid(PFN_DOWN(paddr)))
arch_sync_dma_for_device(paddr, size, dir);
else
xen_dma_sync_for_device(dev, dma_addr, size, dir);
-- 
2.39.2




Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-06-07 Thread Christoph Hellwig
On Mon, May 22, 2023 at 10:37:09AM +0200, Juergen Gross wrote:
> In normal cases PCI passthrough in PV guests requires to start the guest
> with e820_host=1. So it should be rather easy to limit allocating the
> 64MB in PV guests to the cases where the memory map has non-RAM regions
> especially in the first 1MB of the memory.
>
> This will cover even hotplug cases. The only case not covered would be a
> guest started with e820_host=1 even if no PCI passthrough was planned.
> But this should be rather rare (at least I hope so).

So is this an ACK for the patch and can we go ahead with it?

(I'd still like to merge swiotlb-xen into swiotlb eventually, but it's
probably not going to happen this merge window)



Re: [PATCH 3/4] drm/nouveau: stop using is_swiotlb_active

2023-06-07 Thread Christoph Hellwig
On Thu, May 18, 2023 at 04:30:49PM -0400, Lyude Paul wrote:
> Reviewed-by: Lyude Paul 
> 
> Thanks for getting to this!

I've tentatively queued this up in the dma-mapping for-next tree.
Let me know if you'd prefer it to go through the nouveau tree.



Re: [PATCH 2/2] xen-blkback: Inform userspace that device has been opened

2023-06-07 Thread Christoph Hellwig
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -3,6 +3,20 @@
>  Copyright (C) 2005 Rusty Russell 
>  Copyright (C) 2005 XenSource Ltd
>  
> +In addition to the Xenstore nodes required by the Xen block device
> +specification, this implementation of blkback uses a new Xenstore
> +node: "opened".  blkback sets "opened" to "0" before the hotplug script
> +is called.  Once the device node has been opened, blkback sets "opened"
> +to "1".

This is a really odd comment style, and a really strange place for it.
To me it feels like this should just be a file in Documentation as it
relates to how to use the driver, and doesn't really explain the code.



Re: [PATCH 1/2] xen-blkback: Implement diskseq checks

2023-06-07 Thread Christoph Hellwig
On Thu, Jun 01, 2023 at 05:48:22PM -0400, Demi Marie Obenour wrote:
> + if (diskseq) {
> + struct gendisk *disk = bdev->bd_disk;
> +
> + if (unlikely(disk == NULL)) {
> + pr_err("%s: device %08x has no gendisk\n",
> +__func__, vbd->pdevice);
> + xen_vbd_free(vbd);
> + return -EFAULT;
> + }

bdev->bd_disk is never NULL.

> + diskseq_str = xenbus_read(XBT_NIL, dev->nodename, "diskseq", 
> _len);

Please avoid the overly long line.

> + if (IS_ERR(diskseq_str)) {
> + int err = PTR_ERR(diskseq_str);
> + diskseq_str = NULL;
> +
> + /*
> +  * If this does not exist, it means legacy userspace that does 
> not

.. even more so in comments.

> +  * support diskseq.
> +  */
> + if (unlikely(!XENBUS_EXIST_ERR(err))) {
> + xenbus_dev_fatal(dev, err, "reading diskseq");
> + return;
> + }
> + diskseq = 0;
> + } else if (diskseq_len <= 0) {
> + xenbus_dev_fatal(dev, -EFAULT, "diskseq must not be empty");
> + goto fail;
> + } else if (diskseq_len > 16) {

No need for an else after a return.

> + xenbus_dev_fatal(dev, -ERANGE, "diskseq too long: got %d but 
> limit is 16",
> +  diskseq_len);
> + goto fail;
> + } else if (diskseq_str[0] == '0') {
> + xenbus_dev_fatal(dev, -ERANGE, "diskseq must not start with 
> '0'");
> + goto fail;
> + } else {
> + char *diskseq_end;
> + diskseq = simple_strtoull(diskseq_str, _end, 16);
> + if (diskseq_end != diskseq_str + diskseq_len) {
> + xenbus_dev_fatal(dev, -EINVAL, "invalid diskseq");
> + goto fail;
> + }
> + kfree(diskseq_str);
> + diskseq_str = NULL;
> + }

And I suspect the code will be a lot easier to follow if you move
the diskseq validation into a separate helper.
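
A sketch of what such a helper could look like; the name, return
convention and exact checks below are illustrative, derived only from
the quoted code:

/* Sketch: parse and validate the optional "diskseq" xenstore node. */
static int xen_vbd_parse_diskseq(struct xenbus_device *dev, u64 *diskseq)
{
	unsigned int len;
	char *str, *end;

	str = xenbus_read(XBT_NIL, dev->nodename, "diskseq", &len);
	if (IS_ERR(str)) {
		int err = PTR_ERR(str);

		/* Legacy userspace that does not know about diskseq. */
		if (XENBUS_EXIST_ERR(err)) {
			*diskseq = 0;
			return 0;
		}
		return err;
	}

	if (!len || len > 16 || str[0] == '0') {
		kfree(str);
		return -EINVAL;
	}

	*diskseq = simple_strtoull(str, &end, 16);
	if (end != str + len) {
		kfree(str);
		return -EINVAL;
	}

	kfree(str);
	return 0;
}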



Re: [dm-devel] [PATCH v2 00/16] Diskseq support in loop, device-mapper, and blkback

2023-05-31 Thread Christoph Hellwig
On Tue, May 30, 2023 at 04:31:00PM -0400, Demi Marie Obenour wrote:
> This work aims to allow userspace to create and destroy block devices
> in a race-free way, and to allow them to be exposed to other Xen VMs via
> blkback without races.
> 
> Changes since v1:
> 
> - Several device-mapper fixes added.

Let's get these reviewed by the DM maintainers independently.  This
series is mixing up way too many things.



Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-05-20 Thread Christoph Hellwig
On Fri, May 19, 2023 at 02:58:57PM +0200, Christoph Hellwig wrote:
> On Fri, May 19, 2023 at 01:49:46PM +0100, Andrew Cooper wrote:
> > > The alternative would be to finally merge swiotlb-xen into swiotlb, in
> > > which case we might be able to do this later.  Let me see what I can
> > > do there.
> > 
> > If that is an option, it would be great to reduce the special-cashing.
> 
> I think it's doable, and I've been wanting it for a while.  I just
> need motivated testers, but it seems like I just found at least two :)

So looking at swiotlb-xen, it does these odd things where it takes a value
generated originally by xen_phys_to_dma, then only does a dma_to_phys
to go back and call pfn_valid on the result.  Does this make sense, or
is it wrong and just works by accident?  I.e. is the patch below correct?


diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 67aa74d201627d..3396c5766f0dd8 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -90,9 +90,7 @@ static inline int range_straddles_page_boundary(phys_addr_t 
p, size_t size)
 
 static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)
 {
-   unsigned long bfn = XEN_PFN_DOWN(dma_to_phys(dev, dma_addr));
-   unsigned long xen_pfn = bfn_to_local_pfn(bfn);
-   phys_addr_t paddr = (phys_addr_t)xen_pfn << XEN_PAGE_SHIFT;
+   phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr);
 
/* If the address is outside our domain, it CAN
 * have the same virtual address as another address
@@ -234,7 +232,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, 
struct page *page,
 
 done:
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr))))
+   if (pfn_valid(PFN_DOWN(phys)))
arch_sync_dma_for_device(phys, size, dir);
else
xen_dma_sync_for_device(dev, dev_addr, size, dir);
@@ -258,7 +256,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, 
dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE);
 
if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr))))
+   if (pfn_valid(PFN_DOWN(paddr)))
arch_sync_dma_for_cpu(paddr, size, dir);
else
xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir);
@@ -276,7 +274,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, 
dma_addr_t dma_addr,
phys_addr_t paddr = xen_dma_to_phys(dev, dma_addr);
 
if (!dev_is_dma_coherent(dev)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+   if (pfn_valid(PFN_DOWN(paddr)))
arch_sync_dma_for_cpu(paddr, size, dir);
else
xen_dma_sync_for_cpu(dev, dma_addr, size, dir);
@@ -296,7 +294,7 @@ xen_swiotlb_sync_single_for_device(struct device *dev, 
dma_addr_t dma_addr,
swiotlb_sync_single_for_device(dev, paddr, size, dir);
 
if (!dev_is_dma_coherent(dev)) {
-   if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr))))
+   if (pfn_valid(PFN_DOWN(paddr)))
arch_sync_dma_for_device(paddr, size, dir);
else
xen_dma_sync_for_device(dev, dma_addr, size, dir);



Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-05-19 Thread Christoph Hellwig
On Fri, May 19, 2023 at 01:49:46PM +0100, Andrew Cooper wrote:
> > The alternative would be to finally merge swiotlb-xen into swiotlb, in
> > which case we might be able to do this later.  Let me see what I can
> > do there.
> 
> If that is an option, it would be great to reduce the special-cashing.

I think it's doable, and I've been wanting it for a while.  I just
need motivated testers, but it seems like I just found at least two :)



Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-05-19 Thread Christoph Hellwig
On Fri, May 19, 2023 at 12:10:26PM +0200, Marek Marczykowski-Górecki wrote:
> While I would say PCI passthrough is not very common for PV guests, can
> the decision about xen-swiotlb be delayed until you can enumerate
> xenstore to check if there are any PCI devices connected (and not
> allocate xen-swiotlb by default if there are none)? This would
> still not cover the hotplug case (in which case, you'd need to force it
> with a cmdline), but at least you wouldn't loose much memory just
> because one of your VMs may use PCI passthrough (so, you have it enabled
> in your kernel).

How early can we query xenstore?  We'd need to do this before setting
up DMA for any device.

The alternative would be to finally merge swiotlb-xen into swiotlb, in
which case we might be able to do this later.  Let me see what I can
do there.



Re: [PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabling

2023-05-18 Thread Christoph Hellwig
On Thu, May 18, 2023 at 08:18:39PM +0200, Marek Marczykowski-Górecki wrote:
> On Thu, May 18, 2023 at 03:42:51PM +0200, Christoph Hellwig wrote:
> > Remove the dangerous late initialization of xen-swiotlb in
> > pci_xen_swiotlb_init_late and instead just always initialize
> > xen-swiotlb in the boot code if CONFIG_XEN_PCIDEV_FRONTEND is enabled.
> > 
> > Signed-off-by: Christoph Hellwig 
> 
> Doesn't it mean all the PV guests will basically waste 64MB of RAM
> by default each if they don't really have PCI devices?

If CONFIG_XEN_PCIDEV_FRONTEND is enabled, and the kernel isn't booted
with swiotlb=noforce, yes.




[PATCH 1/4] x86: move a check out of pci_xen_swiotlb_init

2023-05-18 Thread Christoph Hellwig
Move the exact checks for when to initialize the Xen swiotlb code out
of pci_xen_swiotlb_init and into the caller, so that it uses readable
positive checks rather than negative ones that would get even more
confusing with another addition.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/kernel/pci-dma.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index de6be0a3965ee4..f887b08ac5ffe4 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -74,8 +74,6 @@ static inline void __init pci_swiotlb_detect(void)
 #ifdef CONFIG_SWIOTLB_XEN
 static void __init pci_xen_swiotlb_init(void)
 {
-   if (!xen_initial_domain() && !x86_swiotlb_enable)
-   return;
x86_swiotlb_enable = true;
x86_swiotlb_flags |= SWIOTLB_ANY;
swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup);
@@ -113,7 +111,8 @@ static inline void __init pci_xen_swiotlb_init(void)
 void __init pci_iommu_alloc(void)
 {
if (xen_pv_domain()) {
-   pci_xen_swiotlb_init();
+   if (xen_initial_domain() || x86_swiotlb_enable)
+   pci_xen_swiotlb_init();
return;
}
pci_swiotlb_detect();
-- 
2.39.2




[PATCH 3/4] drm/nouveau: stop using is_swiotlb_active

2023-05-18 Thread Christoph Hellwig
Drivers have no business looking into dma-mapping internals and checking
what backend is used.  Unfortunately the DRM core is still broken and
tries to do plain page allocations instead of using DMA API allocators
by default, and uses various bandaids for when to use dma_alloc_coherent.

Switch nouveau to use the same (broken) scheme as amdgpu and radeon
to remove the last driver user of is_swiotlb_active.

Signed-off-by: Christoph Hellwig 
---
 drivers/gpu/drm/nouveau/nouveau_ttm.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c 
b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index 1469a88910e45d..486f39f31a38df 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -24,9 +24,9 @@
  */
 
 #include 
-#include 
 
 #include 
+#include 
 
 #include "nouveau_drv.h"
 #include "nouveau_gem.h"
@@ -265,7 +265,6 @@ nouveau_ttm_init(struct nouveau_drm *drm)
struct nvkm_pci *pci = device->pci;
struct nvif_mmu *mmu = >client.mmu;
struct drm_device *dev = drm->dev;
-   bool need_swiotlb = false;
int typei, ret;
 
ret = nouveau_ttm_init_host(drm, 0);
@@ -300,13 +299,10 @@ nouveau_ttm_init(struct nouveau_drm *drm)
drm->agp.cma = pci->agp.cma;
}
 
-#if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
-   need_swiotlb = is_swiotlb_active(dev->dev);
-#endif
-
ret = ttm_device_init(>ttm.bdev, _bo_driver, drm->dev->dev,
  dev->anon_inode->i_mapping,
- dev->vma_offset_manager, need_swiotlb,
+ dev->vma_offset_manager,
+ drm_need_swiotlb(drm->client.mmu.dmabits),
  drm->client.mmu.dmabits <= 32);
if (ret) {
NV_ERROR(drm, "error initialising bo driver, %d\n", ret);
-- 
2.39.2




unexport swiotlb_active

2023-05-18 Thread Christoph Hellwig
Hi all,

this little series removes the last swiotlb API exposed to modules.

Diffstat:
 arch/x86/include/asm/xen/swiotlb-xen.h |6 --
 arch/x86/kernel/pci-dma.c  |   28 
 drivers/gpu/drm/nouveau/nouveau_ttm.c  |   10 +++---
 drivers/pci/xen-pcifront.c |6 --
 kernel/dma/swiotlb.c   |1 -
 5 files changed, 7 insertions(+), 44 deletions(-)



[PATCH 2/4] x86: always initialize xen-swiotlb when xen-pcifront is enabled

2023-05-18 Thread Christoph Hellwig
Remove the dangerous late initialization of xen-swiotlb in
pci_xen_swiotlb_init_late and instead just always initialize
xen-swiotlb in the boot code if CONFIG_XEN_PCIDEV_FRONTEND is enabled.

Signed-off-by: Christoph Hellwig 
---
 arch/x86/include/asm/xen/swiotlb-xen.h |  6 --
 arch/x86/kernel/pci-dma.c  | 25 +++--
 drivers/pci/xen-pcifront.c |  6 --
 3 files changed, 3 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h 
b/arch/x86/include/asm/xen/swiotlb-xen.h
index 77a2d19cc9909e..abde0f44df57dc 100644
--- a/arch/x86/include/asm/xen/swiotlb-xen.h
+++ b/arch/x86/include/asm/xen/swiotlb-xen.h
@@ -2,12 +2,6 @@
 #ifndef _ASM_X86_SWIOTLB_XEN_H
 #define _ASM_X86_SWIOTLB_XEN_H
 
-#ifdef CONFIG_SWIOTLB_XEN
-extern int pci_xen_swiotlb_init_late(void);
-#else
-static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; }
-#endif
-
 int xen_swiotlb_fixup(void *buf, unsigned long nslabs);
 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
unsigned int address_bits,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index f887b08ac5ffe4..c4a7ead9eb674e 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -81,27 +81,6 @@ static void __init pci_xen_swiotlb_init(void)
if (IS_ENABLED(CONFIG_PCI))
pci_request_acs();
 }
-
-int pci_xen_swiotlb_init_late(void)
-{
-   if (dma_ops == &xen_swiotlb_dma_ops)
-   return 0;
-
-   /* we can work with the default swiotlb */
-   if (!io_tlb_default_mem.nslabs) {
-   int rc = swiotlb_init_late(swiotlb_size_or_default(),
-  GFP_KERNEL, xen_swiotlb_fixup);
-   if (rc < 0)
-   return rc;
-   }
-
-   /* XXX: this switches the dma ops under live devices! */
-   dma_ops = &xen_swiotlb_dma_ops;
-   if (IS_ENABLED(CONFIG_PCI))
-   pci_request_acs();
-   return 0;
-}
-EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late);
 #else
 static inline void __init pci_xen_swiotlb_init(void)
 {
@@ -111,7 +90,9 @@ static inline void __init pci_xen_swiotlb_init(void)
 void __init pci_iommu_alloc(void)
 {
if (xen_pv_domain()) {
-   if (xen_initial_domain() || x86_swiotlb_enable)
+   if (xen_initial_domain() ||
+   IS_ENABLED(CONFIG_XEN_PCIDEV_FRONTEND) ||
+   x86_swiotlb_enable)
pci_xen_swiotlb_init();
return;
}
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 83c0ab50676dff..11636634ae512f 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include 
@@ -669,11 +668,6 @@ static int pcifront_connect_and_init_dma(struct 
pcifront_device *pdev)
 
	spin_unlock(&pcifront_dev_lock);
 
-   if (!err && !is_swiotlb_active(&pdev->xdev->dev)) {
-   err = pci_xen_swiotlb_init_late();
-   if (err)
-   dev_err(&pdev->xdev->dev, "Could not setup SWIOTLB!\n");
-   }
return err;
 }
 
-- 
2.39.2




[PATCH 4/4] swiotlb: unexport is_swiotlb_active

2023-05-18 Thread Christoph Hellwig
Drivers have no business looking at dma-mapping or swiotlb internals.

Signed-off-by: Christoph Hellwig 
---
 kernel/dma/swiotlb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index af2e304c672c43..9f1fd28264a067 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -921,7 +921,6 @@ bool is_swiotlb_active(struct device *dev)
 
return mem && mem->nslabs;
 }
-EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
 
-- 
2.39.2




Re: Report in downstream Debian: mpt3sas broken with xen dom0 with update to 5.10.149 in 5.10.y.

2022-10-24 Thread Christoph Hellwig
On Mon, Oct 24, 2022 at 05:28:05PM +, Andrew Cooper wrote:
> I don't know exactly how this translates to Linux internals, but most
> devices are fine and it's routinely the mpt2/3sas drivers which
> encounter problems.  It would be lovely if we could get to the bottom of
> this for once and for all.

So to summarize my two mails:  I think the use of dma_get_required_mask
in mpt3sas is wrong, and the dma_get_required_mask return value from
xen-swiotlb is also wrong.  Fixing either one should fix this problem,
and I think we should fix both.



Re: Report in downstream Debian: mpt3sas broken with xen dom0 with update to 5.10.149 in 5.10.y.

2022-10-24 Thread Christoph Hellwig
On Mon, Oct 24, 2022 at 05:26:44PM +0530, Sreekanth Reddy wrote:
> This issue is getting observed after having the below patch changes,
> 2b9aba0c5d58e141e32bb1bb4c7cd91d19f075b8 scsi: mpt3sas: Fix return
> value check of dma_get_required_mask()

Looking at this commit it seems odd.  dma_get_required_mask() should
only be used as an optimization for hardware that actually benefits
from a lower DMA mask.  That means either classic PCI, where 64-bit
addressing requires DAC cycles, or firmware interfaces like aic7xxx
that need additional overhead for larger masks.  I don't think either
is the case for mpt3sas, so I think (in addition to fixing up the Xen
required mask) mpt3sas should do something like:

diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c 
b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 4e981ccaac4163..295942a8989780 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -2992,8 +2992,7 @@ _base_config_dma_addressing(struct MPT3SAS_ADAPTER *ioc, 
struct pci_dev *pdev)
struct sysinfo s;
u64 coherent_dma_mask, dma_mask;
 
-   if (ioc->is_mcpu_endpoint || sizeof(dma_addr_t) == 4 ||
-   dma_get_required_mask(&pdev->dev) <= DMA_BIT_MASK(32)) {
+   if (ioc->is_mcpu_endpoint) {
ioc->dma_mask = 32;
coherent_dma_mask = dma_mask = DMA_BIT_MASK(32);
/* Set 63 bit DMA mask for all SAS3 and SAS35 controllers */



Re: Report in downstream Debian: mpt3sas broken with xen dom0 with update to 5.10.149 in 5.10.y.

2022-10-24 Thread Christoph Hellwig
On Mon, Oct 24, 2022 at 03:20:43PM +0200, Juergen Gross wrote:
> Dom0 is (normally) a PV domain, so the physical memory can be still above
> 4 GB even with dom0_mem set to 4GB.

Which means that we need to ensure the DMA ops for Xen-PV (which are
always xen-swiotlb, I think?) return DMA_BIT_MASK(64) or whatever
the highest possible address is.
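
A minimal sketch of the xen-swiotlb side, assuming the generic
->get_required_mask hook in struct dma_map_ops is the right place to
plug this in (the function name is made up, this is not the actual fix):

	/*
	 * Machine addresses visible to devices are unrelated to the guest
	 * pseudo-physical range, so report the widest possible mask.
	 */
	static u64 xen_swiotlb_get_required_mask(struct device *dev)
	{
		return DMA_BIT_MASK(64);
	}

	/* wired up in xen_swiotlb_dma_ops: */
	.get_required_mask = xen_swiotlb_get_required_mask,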



Re: i915 "GPU HANG", bisected to a2daa27c0c61 "swiotlb: simplify swiotlb_max_segment"

2022-10-18 Thread Christoph Hellwig
On Tue, Oct 18, 2022 at 04:53:50PM +0200, Juergen Gross wrote:
>> If the IS_ENABLED is not needed, I'm all for dropping it.
>> But unless I misread the code, on arm/arm64 even PV guests are 1:1
>> mapped so that all Linux physically contiguous memory also is Xen
>> contiguous, so we don't need the hack.
>
> There are no PV guests on arm/arm64.

Ok, that's the part I was missing.  In that case we should be fine
without the IS_ENABLED indeed.



Re: i915 "GPU HANG", bisected to a2daa27c0c61 "swiotlb: simplify swiotlb_max_segment"

2022-10-18 Thread Christoph Hellwig
On Tue, Oct 18, 2022 at 04:21:43PM +0200, Jan Beulich wrote:
> Leaving the "i915 abuses" part aside (because I can't tell what exactly the
> abuse is), but assuming that "can't cope with bounce buffering" means they
> don't actually use the allocated buffers, I'd suggest this:

Except for one odd place, i915 never uses dma_alloc_* but always allocates
memory itself and then maps it, yet treats it as if it were a
dma_alloc_coherent allocation, that is, it never does ownership changes.
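
For reference, a driver that maps its own pages with dma_map_sg() and
then touches them from the CPU again is supposed to do ownership
transfers roughly like this (a generic sketch of the DMA API contract,
not i915 code; dev and sgt are placeholders):

	/* hand ownership to the device before it DMAs to/from the pages */
	dma_sync_sg_for_device(dev, sgt->sgl, sgt->nents, DMA_BIDIRECTIONAL);
	/* ... device access ... */
	/* take ownership back before the CPU reads or writes the pages */
	dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->nents, DMA_BIDIRECTIONAL);

Skipping these only happens to work as long as no bounce buffering or
cache maintenance is involved.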

> I've dropped the TDX related remark because I don't think it's meaningful
> for PV guests.

This remark is for TDX in general, not Xen related.  With TDX and other
confidential computing schemes, all DMA must be bounce buffered, and
all drivers skipping dma_sync* calls are broken.

> Otoh I've left the "abuses ignores" word sequence as is, no
> matter that it reads odd to me. Plus, as hinted at before, I'm not
> convinced the IS_ENABLED() use is actually necessary or warranted here.

If the IS_ENABLED is not needed, I'm all for dropping it.
But unless I misread the code, on arm/arm64 even PV guests are 1:1
mapped so that all Linux physically contiguous memory also is Xen
contiguous, so we don't need the hack.



Re: i915 "GPU HANG", bisected to a2daa27c0c61 "swiotlb: simplify swiotlb_max_segment"

2022-10-18 Thread Christoph Hellwig
On Tue, Oct 18, 2022 at 10:57:37AM +0200, Jan Beulich wrote:
> Shouldn't this then be xen_pv_domain() that you use here, and - if you
> really want IS_ENABLED() in addition - CONFIG_XEN_PV?

I'll need help from people who understand Xen better than me to figure
out what the exact conditions (and maybe also the comments) should be.



Re: i915 "GPU HANG", bisected to a2daa27c0c61 "swiotlb: simplify swiotlb_max_segment"

2022-10-18 Thread Christoph Hellwig
On Tue, Oct 18, 2022 at 05:52:16AM +0200, Marek Marczykowski-Górecki wrote:
> not only) when using IGD in Xen PV dom0. After not very long time Xorg
> crashes, and dmesg contain messages like this:
> 
> i915 :00:02.0: [drm] GPU HANG: ecode 7:1:01fffbfe, in Xorg [5337]
> i915 :00:02.0: [drm] Resetting rcs0 for stopped heartbeat on rcs0
> i915 :00:02.0: [drm] Xorg[5337] context reset due to GPU hang



> I tried reverting just this commit on top of 6.0.x, but the context
> changed significantly in subsequent commits, so after trying reverting
> it together with 3 or 4 more commits I gave up.
> 
> What may be an important detail, the system heavily uses cross-VM shared
> memory (gntdev) to map window contents from VMs. This is Qubes OS, and
> it uses Xen 4.14.

Can you try the patch below?

---
From 26fe4749750f1bf843666ca777e297279994e33a Mon Sep 17 00:00:00 2001
From: Robert Beckett 
Date: Tue, 26 Jul 2022 16:39:35 +0100
Subject: drm/i915: stop abusing swiotlb_max_segment

Calling swiotlb functions directly is nowadays considered harmful. See
https://lore.kernel.org/intel-gfx/20220711082614.ga29...@lst.de/

Replace swiotlb_max_segment() calls with dma_max_mapping_size().
In i915_gem_object_get_pages_internal(), no longer consider max_segment
only when CONFIG_SWIOTLB is enabled.  There can be other (IOMMU-related)
causes of specific max segment sizes.
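
The per-device helper presumably boils down to something like the
following (sketch only; the authoritative change is the
i915_sg_segment_size() hunk in i915_scatterlist.h and may differ in
detail):

	static inline unsigned int i915_sg_segment_size(struct device *dev)
	{
		size_t max = min_t(size_t, UINT_MAX, dma_max_mapping_size(dev));

		return round_down(max, PAGE_SIZE);
	}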

Signed-off-by: Robert Beckett 
Signed-off-by: Christoph Hellwig 
[hch: added the Xen hack]
---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c | 19 +++--
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c|  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm.c  |  4 +--
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c  |  2 +-
 drivers/gpu/drm/i915/i915_scatterlist.h  | 30 +++-
 5 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c 
b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index c698f95af15fe..629acb403a2c9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -6,7 +6,6 @@
 
 #include 
 #include 
-#include 
 
 #include "i915_drv.h"
 #include "i915_gem.h"
@@ -38,22 +37,12 @@ static int i915_gem_object_get_pages_internal(struct 
drm_i915_gem_object *obj)
struct scatterlist *sg;
unsigned int sg_page_sizes;
unsigned int npages;
-   int max_order;
+   int max_order = MAX_ORDER;
+   unsigned int max_segment;
gfp_t gfp;
 
-   max_order = MAX_ORDER;
-#ifdef CONFIG_SWIOTLB
-   if (is_swiotlb_active(obj->base.dev->dev)) {
-   unsigned int max_segment;
-
-   max_segment = swiotlb_max_segment();
-   if (max_segment) {
-   max_segment = max_t(unsigned int, max_segment,
-   PAGE_SIZE) >> PAGE_SHIFT;
-   max_order = min(max_order, ilog2(max_segment));
-   }
-   }
-#endif
+   max_segment = i915_sg_segment_size(i915->drm.dev) >> PAGE_SHIFT;
+   max_order = min(max_order, get_order(max_segment));
 
gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_RECLAIMABLE;
if (IS_I965GM(i915) || IS_I965G(i915)) {
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c 
b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index f42ca1179f373..11125c32dd35d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -194,7 +194,7 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj)
struct intel_memory_region *mem = obj->mm.region;
struct address_space *mapping = obj->base.filp->f_mapping;
const unsigned long page_count = obj->base.size / PAGE_SIZE;
-   unsigned int max_segment = i915_sg_segment_size();
+   unsigned int max_segment = i915_sg_segment_size(i915->drm.dev);
struct sg_table *st;
struct sgt_iter sgt_iter;
struct page *page;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
index e3fc38dd5db04..de5d0a7241027 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
@@ -189,7 +189,7 @@ static int i915_ttm_tt_shmem_populate(struct ttm_device 
*bdev,
struct drm_i915_private *i915 = container_of(bdev, typeof(*i915), bdev);
struct intel_memory_region *mr = i915->mm.regions[INTEL_MEMORY_SYSTEM];
struct i915_ttm_tt *i915_tt = container_of(ttm, typeof(*i915_tt), ttm);
-   const unsigned int max_segment = i915_sg_segment_size();
+   const unsigned int max_segment = i915_sg_segment_size(i915->drm.dev);
const size_t size = (size_t)ttm->num_pages << PAGE_SHIFT;
struct file *filp = i915_tt->filp;
struct sgt_iter sgt_iter;
@@ -538,7 +538,7 @@ static struct i915_refct_sgt *i915_ttm_tt_get

Re: [PATCH RFC v2 1/1] swiotlb: split buffer into 32-bit default and 64-bit extra zones

2022-09-20 Thread Christoph Hellwig
On Sat, Aug 20, 2022 at 12:42:50AM -0700, Dongli Zhang wrote:
> Hello,
> 
> I used to send out RFC v1 to introduce an extra io_tlb_mem (created with
> SWIOTLB_ANY) in addition to the default io_tlb_mem (32-bit).  The
> dev->dma_io_tlb_mem is set to either default or the extra io_tlb_mem,
> depending on dma mask. However, that is not good for setting
> dev->dma_io_tlb_mem at swiotlb layer transparently as suggested by
> Christoph Hellwig.
> 
> https://lore.kernel.org/all/20220609005553.30954-1-dongli.zh...@oracle.com/
> 
> Therefore, this is another RFC v2 implementation following a different
> direction. The core ideas are:
> 
> 1. The swiotlb is split into two zones, io_tlb_mem->zone[0] (32-bit) and
> io_tlb_mem->zone[1] (64-bit).
> 
> struct io_tlb_mem {
>   struct io_tlb_zone zone[SWIOTLB_NR];
>   struct dentry *debugfs;
>   bool late_alloc;
>   bool force_bounce;
>   bool for_alloc;
>   bool has_extra;
> };
> 
> struct io_tlb_zone {
>   phys_addr_t start;
>   phys_addr_t end;
>   void *vaddr;
>   unsigned long nslabs;
>   unsigned long used;
>   unsigned int nareas;
>   unsigned int area_nslabs;
>   struct io_tlb_area *areas;
>   struct io_tlb_slot *slots;
> };
> 
> 2. By default, only io_tlb_mem->zone[0] is available. The
> io_tlb_mem->zone[1] is allocated conditionally if:
> 
> - the "swiotlb=" is configured to allocate extra buffer, and
> - the SWIOTLB_EXTRA is set in the flag (this is to make sure arch(s) other
>   than x86/sev/xen will not enable it until it is fully tested by each
>   arch, e.g., mips/powerpc). Currently it is enabled for x86 and xen.
> 
> 3. During swiotlb map, whether zone[0] (32-bit) or zone[1] (64-bit
> SWIOTLB_ANY)
> is used depends on min_not_zero(*dev->dma_mask, dev->bus_dma_limit).
> 
> To test the RFC v2, here is the QEMU command line.
> 
> qemu-system-x86_64 -smp 8 -m 32G -enable-kvm -vnc :5 -hda disk.img \
> -kernel path-to-linux/arch/x86_64/boot/bzImage \
> -append "root=/dev/sda1 init=/sbin/init text console=ttyS0 loglevel=7 
> swiotlb=32768,4194304,force" \
> -net nic -net user,hostfwd=tcp::5025-:22 \
> -device nvme,drive=nvme01,serial=helloworld -drive 
> file=test.qcow2,if=none,id=nvme01 \
> -serial stdio
> 
> There is below in syslog. The extra 8GB buffer is allocated.
> 
> [0.152251] software IO TLB: area num 8.
> ... ...
> [3.706088] PCI-DMA: Using software bounce buffering for IO (SWIOTLB)
> [3.707334] software IO TLB: mapped default [mem 
> 0xbbfd7000-0xbffd7000] (64MB)
> [3.708585] software IO TLB: mapped extra [mem 
> 0x00061cc0-0x00081cc0] (8192MB)
> 
> After the FIO is triggered over NVMe, the 64-bit buffer is used.
> 
> $ cat /sys/kernel/debug/swiotlb/io_tlb_nslabs_extra
> 4194304
> $ cat /sys/kernel/debug/swiotlb/io_tlb_used_extra
> 327552
> 
> Would you mind helping if this is the right direction to go?
> 
> Thank you very much!
> 
> Cc: Konrad Wilk 
> Cc: Joe Jin 
> Signed-off-by: Dongli Zhang 
> ---
>  arch/arm/xen/mm.c  |   2 +-
>  arch/mips/pci/pci-octeon.c |   5 +-
>  arch/x86/include/asm/xen/swiotlb-xen.h |   2 +-
>  arch/x86/kernel/pci-dma.c  |   6 +-
>  drivers/xen/swiotlb-xen.c  |  18 +-
>  include/linux/swiotlb.h|  73 +++--

> diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c
> index 3d826c0..4edfa42 100644
> --- a/arch/arm/xen/mm.c
> +++ b/arch/arm/xen/mm.c
> @@ -125,7 +125,7 @@ static int __init xen_mm_init(void)
>   return 0;
>  
>   /* we can work with the default swiotlb */
> - if (!io_tlb_default_mem.nslabs) {
> + if (!io_tlb_default_mem.zone[SWIOTLB_DF].nslabs) {
>   rc = swiotlb_init_late(swiotlb_size_or_default(),
>  xen_swiotlb_gfp(), NULL);
>   if (rc < 0)

The first thing we need before doing anything about multiple default
pools is to keep all the knowledge of these details hidden inside
swiotlb.c.

For swiotlb_init_late that seems easy, as we can just move the check
into it.
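
Something like this (sketch only, not a tested patch) would let callers
such as the arch/arm/xen/mm.c hunk quoted above stop peeking at
io_tlb_default_mem:

	int swiotlb_init_late(size_t size, gfp_t gfp_mask,
			int (*remap)(void *tlb, unsigned long nslabs))
	{
		/* nothing to do if the default pool was already set up */
		if (io_tlb_default_mem.nslabs)
			return 0;

		/* ... existing late allocation path ... */
	}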

> diff --git a/arch/mips/pci/pci-octeon.c b/arch/mips/pci/pci-octeon.c
> index e457a18..0bf0859 100644
> --- a/arch/mips/pci/pci-octeon.c
> +++ b/arch/mips/pci/pci-octeon.c
> @@ -654,6 +654,9 @@ static int __init octeon_pci_setup(void)
>   octeon_pci_mem_resource.end =
>   octeon_pci_mem_resource.start + (1ul << 30);
>   } else {
> + struct io_tlb_mem *mem = &io_tlb_default_mem;
> + struct io_tlb_zone *zone = &
