Re: [PATCH v2] mm: z3fold: deprecate CONFIG_Z3FOLD

2024-09-16 Thread Christoph Hellwig
I'd still prefer to just kill it right away, but as the second best
option I'm ok with this:

Acked-by: Christoph Hellwig 




Re: clearly mark DMA_OPS support as an architecture feature v2

2024-09-03 Thread Christoph Hellwig
I've pulled this into the dma-mapping for-next tree, although I'd
love to see one of the vdpa maintainers look over patch 1.  I'm
pretty sure it's correct, but a confirmation would be good.




Re: [PATCH 2/2] dma-mapping: clearly mark DMA ops as an architecture feature

2024-08-28 Thread Christoph Hellwig
On Wed, Aug 28, 2024 at 08:21:14AM +0200, Andreas Larsson wrote:
> On 2024-08-28 08:10, Christoph Hellwig wrote:
> > --- a/drivers/xen/Kconfig
> > +++ b/drivers/xen/Kconfig
> > @@ -177,8 +177,8 @@ config XEN_GRANT_DMA_ALLOC
> >  
> >  config SWIOTLB_XEN
> > def_bool y
> > +   depends on ARCH_DMA_OPS
> 
> Rename to ARCH_HAS_DMA_OPS in v2 is missing here

Thanks,

I've fixed this up locally now.



[PATCH 2/2] dma-mapping: clearly mark DMA ops as an architecture feature

2024-08-27 Thread Christoph Hellwig
DMA ops are a helper for architectures and not for drivers to override
the DMA implementation.

Unfortunately driver authors keep ignoring this.  Make the fact more
clear by renaming the symbol to ARCH_HAS_DMA_OPS and having the two drivers
overriding their dma_ops depend on that.  These drivers should probably be
marked broken, but we can give them a bit of a grace period for that.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Thomas Gleixner 
Acked-by: Sakari Ailus  # for IPU6
Acked-by: Robin Murphy 
---
 arch/Kconfig | 9 +
 arch/alpha/Kconfig   | 2 +-
 arch/arm/Kconfig | 2 +-
 arch/arm64/Kconfig   | 1 +
 arch/mips/Kconfig| 2 +-
 arch/parisc/Kconfig  | 2 +-
 arch/powerpc/Kconfig | 2 +-
 arch/s390/Kconfig| 2 +-
 arch/sparc/Kconfig   | 2 +-
 arch/x86/Kconfig | 2 +-
 drivers/macintosh/macio_asic.c   | 4 ++--
 drivers/media/pci/intel/ipu6/Kconfig | 7 ++-
 drivers/vdpa/Kconfig | 7 ++-
 drivers/xen/Kconfig  | 4 ++--
 include/linux/device.h   | 2 +-
 include/linux/dma-map-ops.h  | 6 +++---
 kernel/dma/Kconfig   | 9 ++---
 kernel/dma/Makefile  | 2 +-
 18 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd22..61c4ec04878754 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -17,6 +17,15 @@ config CPU_MITIGATIONS
def_bool y
 endif
 
+#
+# Selected by architectures that need custom DMA operations for e.g. legacy
+# IOMMUs not handled by dma-iommu.  Drivers must never select this symbol.
+#
+config ARCH_HAS_DMA_OPS
+   depends on HAS_DMA
+   select DMA_OPS_HELPERS
+   bool
+
 menu "General architecture-dependent options"
 
 config ARCH_HAS_SUBPAGE_FAULTS
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 50ff06d5b799c9..109a4cddcd1389 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -4,12 +4,12 @@ config ALPHA
default y
select ARCH_32BIT_USTAT_F_TINODE
select ARCH_HAS_CURRENT_STACK_POINTER
+   select ARCH_HAS_DMA_OPS if PCI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_NO_PREEMPT
select ARCH_NO_SG_CHAIN
select ARCH_USE_CMPXCHG_LOCKREF
-   select DMA_OPS if PCI
select FORCE_PCI
select PCI_DOMAINS if PCI
select PCI_SYSCALL if PCI
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 54b2bb817a7fc0..f5f7995a2f8f59 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -10,6 +10,7 @@ config ARM
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DMA_ALLOC if MMU
+   select ARCH_HAS_DMA_OPS
select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
@@ -54,7 +55,6 @@ config ARM
select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
select DMA_DECLARE_COHERENT
select DMA_GLOBAL_POOL if !MMU
-   select DMA_OPS
select DMA_NONCOHERENT_MMAP if MMU
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca670..40940cbde435a3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -24,6 +24,7 @@ config ARM64
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
+   select ARCH_HAS_DMA_OPS if XEN
select ARCH_HAS_DMA_PREP_COHERENT
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_FAST_MULTIPLIER
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 60077e57693563..023ad33a7e945d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -8,6 +8,7 @@ config MIPS
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_CURRENT_STACK_POINTER if !CC_IS_CLANG || CLANG_VERSION 
>= 14
select ARCH_HAS_DEBUG_VIRTUAL if !64BIT
+   select ARCH_HAS_DMA_OPS if MACH_JAZZ
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KCOV
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE if !EVA
@@ -393,7 +394,6 @@ config MACH_JAZZ
select ARC_PROMLIB
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
-   select DMA_OPS
select FW_ARC
select FW_ARC32
select ARCH_MAY_HAVE_PC_FDC
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index b0a2ac3ba91610..859835a0692c24 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -10,6 +10,7 @@ config PARISC
select ARCH_WANT_FRAME_POINTERS
select ARCH_HAS_CPU_CACHE_ALIASING
select ARCH_HAS_DMA_ALLOC if PA11
+   select ARCH_HAS_DMA_OPS
select ARCH_HAS_ELF_RANDOMIZE
 

[PATCH 1/2] vdpa_sim: don't select DMA_OPS

2024-08-27 Thread Christoph Hellwig
vdpa_sim has been fixed to not override the dma_map_ops in commit
6c3d329e6486 ("vdpa_sim: get rid of DMA ops"), so don't select the
symbol and don't depend on HAS_DMA.

Signed-off-by: Christoph Hellwig 
---
 drivers/vdpa/Kconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 5265d09fc1c409..b08de3b7706109 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -11,8 +11,7 @@ if VDPA
 
 config VDPA_SIM
tristate "vDPA device simulator core"
-   depends on RUNTIME_TESTING_MENU && HAS_DMA
-   select DMA_OPS
+   depends on RUNTIME_TESTING_MENU
select VHOST_RING
select IOMMU_IOVA
help
-- 
2.43.0




clearly mark DMA_OPS support as an architecture feature v2

2024-08-27 Thread Christoph Hellwig
Hi all,

we've had long-standing problems where drivers try to hook into the
DMA_OPS mechanisms to override them for something that is not DMA, or
to introduce additional dispatching.

Now that we are not using DMA_OPS support for dma-iommu and can build
kernels without DMA_OPS support on many common setups this becomes even
more problematic.

This series renames the option to ARCH_HAS_DMA_OPS and adds very explicit
comment to not use it in drivers.  The ipu6 and vdpa_sim/user drivers
that abuse the mechanism are made to depend on the option instead of
selecting it with a big comment, but I expect this to be fixed rather
sooner than later (I know the ipu6 maintainers are on it based on a
previous discussion).

Changes since v1:
 - s/ARCH_DMA_OPS/ARCH_HAS_DMA_OPS/g
 - spelling fixes
 - vdpa_sim actually doesn't need dma ops these days, add a prep patch
   to remove the dependency



Re: [PATCH v2] powerpc: warn on emulation of dcbz instruction in kernel mode

2024-08-27 Thread Christoph Hellwig
On Sat, Aug 24, 2024 at 12:17:57PM -0500, Segher Boessenkool wrote:
> > Are these functions also used on DMA coherent memory ?
> 
> Most won't show up high on most profiles, heh.  Which you already
> can see from the problem not being attacked yet: if it was so obviously
> a problem, some people would have wanted to do something about it :-)

Most drivers try to avoid coherent allocations in the fast path if
they can.  Another good option for Christian's problem would be
to switch the dmaengine driver to use dma_alloc_pages - it doesn't
actually need uncached memory as far as I can tell, dma_alloc_coherent is
just the only API we had to allocate guaranteed DMAable memory for most
of Linux's existence.




Re: clearly mark DMA_OPS support as an architecture feature

2024-08-25 Thread Christoph Hellwig
On Mon, Aug 26, 2024 at 02:27:27PM +0800, Jason Wang wrote:
> Actually I meant, we can extend the virtio_config_ops to allow mapping
> ops there, then simulator and VDUSE can hook the map ops there.

From a quick glance that feels like the right layer of abstraction,
although the config part of the name feels wrong at that point.




clearly mark DMA_OPS support as an architecture feature

2024-08-23 Thread Christoph Hellwig
Hi all,

we've had long-standing problems where drivers try to hook into the
DMA_OPS mechanisms to override them for something that is not DMA, or
to introduce additional dispatching.

Now that we are not using DMA_OPS support for dma-iommu and can build
kernels without DMA_OPS support on many common setups this becomes even
more problematic.

This series renames the option to ARCH_DMA_OPS and adds very explicit
comment to not use it in drivers.  The ipu6 and vdpa_sim/user drivers
that abuse the mechanism are made to depend on the option instead of
selecting it with a big comment, but I expect this to be fixed rather
sooner than later (I know the ipu6 maintainers are on it based on a
previous discussion).



[PATCH] dma-mapping: clearly mark DMA ops as an architecture feature

2024-08-23 Thread Christoph Hellwig
DMA ops are a helper for architectures and not for drivers to override
the DMA implementation.  Unfortunately driver authors keep ignoring
this.  Make this more clear by renaming the symbol to ARCH_DMA_OPS,
have the three drivers overriding it depend on that.  They should
probably also be marked broken, but we can give them a bit of a grace
period for that.

Signed-off-by: Christoph Hellwig 
---
 arch/Kconfig |  9 +
 arch/alpha/Kconfig   |  2 +-
 arch/arm/Kconfig |  2 +-
 arch/arm64/Kconfig   |  1 +
 arch/mips/Kconfig|  2 +-
 arch/parisc/Kconfig  |  2 +-
 arch/powerpc/Kconfig |  2 +-
 arch/s390/Kconfig|  2 +-
 arch/sparc/Kconfig   |  2 +-
 arch/x86/Kconfig |  2 +-
 drivers/macintosh/macio_asic.c   |  4 ++--
 drivers/media/pci/intel/ipu6/Kconfig |  7 ++-
 drivers/vdpa/Kconfig | 14 --
 drivers/xen/Kconfig  |  4 ++--
 include/linux/device.h   |  2 +-
 include/linux/dma-map-ops.h  |  6 +++---
 kernel/dma/Kconfig   |  9 ++---
 kernel/dma/Makefile  |  2 +-
 18 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 975dd22a2dbd22..6abd0f1c1d833e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -17,6 +17,15 @@ config CPU_MITIGATIONS
def_bool y
 endif
 
+#
+# Selected by architectures that need custom DMA operations for e.g. legacy
+# IOMMUs not handled by dma-iommu.  Drivers must never select this symbol.
+#
+config ARCH_DMA_OPS
+   depends on HAS_DMA
+   select DMA_OPS_HELPERS
+   bool
+
 menu "General architecture-dependent options"
 
 config ARCH_HAS_SUBPAGE_FAULTS
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 50ff06d5b799c9..c6d716d8bdd095 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -3,13 +3,13 @@ config ALPHA
bool
default y
select ARCH_32BIT_USTAT_F_TINODE
+   select ARCH_DMA_OPS if PCI
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_NO_PREEMPT
select ARCH_NO_SG_CHAIN
select ARCH_USE_CMPXCHG_LOCKREF
-   select DMA_OPS if PCI
select FORCE_PCI
select PCI_DOMAINS if PCI
select PCI_SYSCALL if PCI
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 54b2bb817a7fc0..a823fd14d65987 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -10,6 +10,7 @@ config ARM
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DMA_ALLOC if MMU
+   select ARCH_DMA_OPS
select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
@@ -54,7 +55,6 @@ config ARM
select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
select DMA_DECLARE_COHERENT
select DMA_GLOBAL_POOL if !MMU
-   select DMA_OPS
select DMA_NONCOHERENT_MMAP if MMU
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a2f8ff354ca670..ce1650ceb5b596 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,6 +15,7 @@ config ARM64
select ARCH_BINFMT_ELF_EXTRA_PHDRS
select ARCH_BINFMT_ELF_STATE
select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE
+   select ARCH_DMA_OPS if XEN
select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
select ARCH_ENABLE_MEMORY_HOTPLUG
select ARCH_ENABLE_MEMORY_HOTREMOVE
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 60077e57693563..3b5a1aef1e9bc0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -391,9 +391,9 @@ config MACH_JAZZ
bool "Jazz family of machines"
select ARC_MEMORY
select ARC_PROMLIB
+   select ARCH_DMA_OPS
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
-   select DMA_OPS
select FW_ARC
select FW_ARC32
select ARCH_MAY_HAVE_PC_FDC
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index b0a2ac3ba91610..c77f9de3e8cc1b 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -3,6 +3,7 @@ config PARISC
def_bool y
select ALTERNATE_USER_ADDRESS_SPACE
select ARCH_32BIT_OFF_T if !64BIT
+   select ARCH_DMA_OPS
select ARCH_MIGHT_HAVE_PC_PARPORT
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
@@ -23,7 +24,6 @@ config PARISC
select ARCH_HAS_CACHE_LINE_SIZE
select ARCH_HAS_DEBUG_VM_PGTABLE
select HAVE_RELIABLE_STACKTRACE
-   select DMA_OPS
select RTC_CLASS
select RTC_DRV_GENERIC
select INIT_ALL_POSSIBLE
diff --git a

Re: [PATCH v2] powerpc: warn on emulation of dcbz instruction in kernel mode

2024-08-23 Thread Christoph Hellwig
On Fri, Aug 23, 2024 at 08:06:00AM -0500, Segher Boessenkool wrote:
> What does "uncached memory" even mean here?  Literally it would be
> I=1 memory (uncachEABLE memory), but more likely you want M=0 memory
> here ("non-memory memory", "not well-behaved memory", MMIO often).

Regular kernel memory vmapped with pgprot_noncached().

> If memset() is expected to be used with M=0, you cannot do any serious
> optimisations to it at all.  If memset() is expected to be used with I=1
> it should use a separate code path for it, probably the caller should
> make the distinction.

DMA coherent memory which uses uncached memory for platforms that
do not provide hardware dma coherence can end up just about anywhere
in the kernel.  We could use special routines for a few places in
the DMA subsystem, but there might be plenty of others.




Re: [PATCH v2] powerpc: warn on emulation of dcbz instruction in kernel mode

2024-08-23 Thread Christoph Hellwig
dma_alloc_from_pool is the allocator used when the caller can't
sleep, and as that is reusing memory it really has to call memset
or a memset-like function on the already uncached memory
unfortunately.  The dma engine operation that is doing this allocation
is documented as not being able to sleep, which is a bit unfortunate
as the storage driver above it could sleep just fine.

Adding the dmaengine maintainer and list if there is a way to pass a gfp
flag or some other indicator that the implementations can sleep,
which would avoid the need to use the not very scalable dma pool,
and thus also the need to memset on uncached memory.




Re: [PATCH v2] powerpc: warn on emulation of dcbz instruction in kernel mode

2024-08-22 Thread Christoph Hellwig
On Thu, Aug 22, 2024 at 06:39:33AM +, LEROY Christophe wrote:
> powerpc has a magic instruction 'dcbz' which clears a full cacheline in 
> one go. It is far more efficient than a loop to store zeros, and since 
> 2015 memset(0) has been implemented with that instruction (commit 
> 5b2a32e80634 ("powerpc/32: memset(0): use cacheable_memzero"))
> 
> But that instruction generates an alignment exception when used on 
> non-cached memory (whether it is RAM or not doesn't matter). It is then 
> emulated by the kernel but it of course leads to a serious performance 
> degradation, hence the warning added by commit cbe654c77961 ("powerpc: 
> warn on emulation of dcbz instruction in kernel mode"). Until now it 
> helped identify and fix use of memset() on IO memory.
> 
> But if memset() is expected to be used with non-cached RAM, then I don't 
> know what to do. Any suggestion ?

I'd suggest two things:

 1) remove the warning.  The use case is perfectly valid and everything
using uncached memory is already slow, so people will just have to
deal with it.  Maybe offer a trace point instead if people care about
it.
 2) figure out a way to avoid this case in the dma-coherent allocator,
which is probably the only case where it happens frequently
(a few drivers also zero or re-zero coherent memory, but most of the
 time that is cargo cult programming and not actually needed)

For 2 I can think of two options:

 a) provide an arch hook for zeroing the dma memory that defaults to
memset, but which powerpc can override
 b) figure out a way to clear the memory before marking it uncached
if we can

a) is obviously easier to verify, but b) is probably going to give
   way better performance.

Below is an untested implementation of b) for dma-direct, I just need to
find out if there is any architecture that requires the memory to be
zeroed after it has been remapped.  The iommu drivers might also
need similar treatment.

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4480a3cd92e087..66e94b32ab0081 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -275,6 +275,9 @@ void *dma_direct_alloc(struct device *dev, size_t size,
if (force_dma_unencrypted(dev))
prot = pgprot_decrypted(prot);
 
+   if (!PageHighMem(page))
+   memset(page_address(page), 0, size);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
 
@@ -283,14 +286,15 @@ void *dma_direct_alloc(struct device *dev, size_t size,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
+   if (PageHighMem(page))
+   memset(ret, 0, size);
} else {
ret = page_address(page);
if (dma_set_decrypted(dev, ret, size))
goto out_leak_pages;
+   memset(ret, 0, size);
}
 
-   memset(ret, 0, size);
-
if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);



Re: [PATCH v2] powerpc: warn on emulation of dcbz instruction in kernel mode

2024-08-21 Thread Christoph Hellwig
On Thu, Aug 22, 2024 at 05:25:10AM +, LEROY Christophe wrote:
> > and this results in a call to dma_direct_allocation(), which has one
> > innocent looking memset():
> 
> 
> memset() can't be used on non-cached memory, memset_io() has to be used
> instead.

No, we use memset on uncached memory all the time.  Note that uncached
memory != __iomem memory, for which you DO have to use memset_io.




Re: [PATCH v6 RESED 0/2] dma: support DMA zone starting above 4GB

2024-08-21 Thread Christoph Hellwig
Thanks,

applied to the dma-mapping tree for Linux 6.12.




Re: [PATCH v2] ata: pata_macio: Use WARN instead of BUG

2024-08-21 Thread Christoph Hellwig
On Thu, Aug 22, 2024 at 12:13:52AM +0300, Sergei Shtylyov wrote:
> On 8/20/24 6:04 AM, Michael Ellerman wrote:
> 
> > The overflow/underflow conditions in pata_macio_qc_prep() should never
> > happen. But if they do there's no need to kill the system entirely, a
> > WARN and failing the IO request should be sufficient and might allow the
> > system to keep running.
> 
>WARN*() can kill your system with panic_on_warn -- Android is particularly
> fond of this kernel parameter but I guess it's not your case... :-)
>Greg KH usually advices against using these macros. :-)

And in this case he is simply totally wrong.  The whole point of WARN_ON
is to have a standardized way to assert conditions.




Re: Since 6.10 - kernel oops/panics on G4 macmini due to change in drivers/ata/pata_macio.c

2024-08-17 Thread Christoph Hellwig
On Sat, Aug 17, 2024 at 09:46:31AM +1000, Michael Ellerman wrote:
> Same behaviour on a kernel with PAGE_SIZE = 4KB.
> 
> I don't know why max_sectors_kb starts out with a different value on my
> system, but anyway the bug is lurking there, even if it doesn't trip by
> default in some configurations.

Various distributions use udev rules to increase it.




Re: [PATCH v3 2/3] dma-mapping: replace zone_dma_bits by zone_dma_limit

2024-08-01 Thread Christoph Hellwig
On Wed, Jul 31, 2024 at 06:24:24PM -0700, Nathan Chancellor wrote:
> Unfortunately, I am not sure either... I do not see anything obviously,
> so perhaps it could just be avoided with the __diag() infrastructure?
> 
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index 3dbc0b89d6fb..b58e7eb9c8f1 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -20,7 +20,12 @@
>   * it for entirely different regions. In that case the arch code needs to
>   * override the variable below for dma-direct to work properly.
>   */
> +__diag_push();
> +__diag_ignore(clang, 13, "-Wconstant-conversion",
> +   "Clang incorrectly thinks the n == 64 case in DMA_BIT_MASK() can 
> happen here,"
> +   "which would truncate with a 32-bit phys_addr_t");
>  phys_addr_t zone_dma_limit __ro_after_init = DMA_BIT_MASK(24);

So..  The code above is clearly wrong, as DMA_BIT_MASK always returns a
u64, and phys_addr_t can be smaller than that.  So at least in this case
the warning seems perfectly valid and the code has issues because it is
mixing different concepts.

Where do you see warnings like this upstream?



Re: linux-next: runtime warning after merge of the dma-mapping tree

2024-08-01 Thread Christoph Hellwig
Ok, I guess this is what Robin was referring to.

A midlayer like SCSI really shouldn't directly call dma layer
functions without knowing that the underlying bus is DMA capable.

I'll see what I can do about it, and in the meantime drop this patch
and the companion from the dma-mapping tree.



Re: [PATCH v3 2/3] dma-mapping: replace zone_dma_bits by zone_dma_limit

2024-07-30 Thread Christoph Hellwig
On Mon, Jul 29, 2024 at 07:12:08PM -0700, Nathan Chancellor wrote:
> >  | ~~   ^~~~
> >include/linux/dma-mapping.h:77:40: note: expanded from macro 
> > 'DMA_BIT_MASK'
> >   77 | #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1))
> >  |^
> >2 warnings generated.
> 
> FWIW, this is likely a false positive due to an issue in Clang with the
> control flow graph for global variables:
> 
> https://github.com/ClangBuiltLinux/linux/issues/92
> 
> DMA_BIT_MASK() has been the biggest offender :/ If there is any way to
> refactor this code to avoid this, that would be great (as that has been
> one of our longest outstanding issues and getting it fixed in the
> compiler does not seem super easy at this point).

I have no idea what you'd want changed here, but I'll happily take
patches.



Re: [PATCH 14/26] block: move the nonrot flag to queue_limits

2024-07-25 Thread Christoph Hellwig
On Thu, Jul 25, 2024 at 01:35:46PM +0200, Wouter Verhelst wrote:
> NBD actually exports a flag for rotational devices; it's defined in
> nbd.h in the NBD userland source as
> 
> #define NBD_FLAG_ROTATIONAL (1 << 4)/* Use elevator algorithm - 
> rotational media */
> 
> which is passed in the same flags field which also contains the
> NBD_FLAG_SEND_FLUSH and NBD_FLAG_SEND_FUA flags.
> 
> Perhaps we might want to look at that flag and set the device to
> rotational if it is specified?

Yes, that sounds good.  Can you send a patch?



Re: [PATCH 07/13] huge_memory: Allow mappings of PUD sized pages

2024-07-02 Thread Christoph Hellwig
On Tue, Jul 02, 2024 at 08:19:01PM +1000, Alistair Popple wrote:
> > (B) As long as we have subpage mapcounts, this prevents vmemmap
> > optimizations [1]. Is that only used for device-dax for now and are
> > there no plans to make use of that for fs-dax?
> 
> I don't have any plans to. This is purely focussed on refcounting pages
> "like normal" so we can get rid of all the DAX special casing.
> 
> > (C) We managed without so far :)
> 
> Indeed, although Christoph has asked repeatedly ([1], [2] and likely
> others) that this gets fixed and I finally got sick of it coming up
> everytime I need to touch something with ZONE_DEVICE pages :)
> 
> Also it removes the need for people to understand the special DAX page
> recounting scheme and ends up removing a bunch of cruft as a bonus:
> 
>  59 files changed, 485 insertions(+), 869 deletions(-)
> 
> And that's before I clean up all the pgmap reference handling. It also
> removes the pXX_trans_huge and pXX_leaf distinction. So we managed, but
> things could be better IMHO.

Yes.  I can't wait for this series making the finish line.  There might
be more chance for cleanups and optimizations around ZONE_DEVICE, but
this alone is a huge step forward.



Re: [PATCH 06/13] mm/memory: Add dax_insert_pfn

2024-07-02 Thread Christoph Hellwig
On Tue, Jul 02, 2024 at 09:18:31AM +0200, David Hildenbrand wrote:
> We have this comparably nasty vmf_insert_mixed() that FS dax abused to 
> insert into !VM_MIXED VMAs. Is that abuse now stopping and are there maybe 
> ways to get rid of vmf_insert_mixed()?

Unfortunately it is also used by a few drm drivers and not just DAX.


Re: [axboe-block:for-next] [block] 1122c0c1cc: aim7.jobs-per-min 22.6% improvement

2024-07-02 Thread Christoph Hellwig
On Mon, Jul 01, 2024 at 04:22:19PM +0800, Oliver Sang wrote:
> from below, it seems the patchset doesn't introduce any performance 
> improvement
> but a regression now. is this expected?

Not having the improvement at least alleviate my concerns about data
integrity.  I'm still curious where it comes from as it isn't exactly
expected.



Re: [PATCH 10/13] fs/dax: Properly refcount fs dax pages

2024-06-26 Thread Christoph Hellwig
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index eb61598..b7a31ae 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -126,11 +126,11 @@ static vm_fault_t __dev_dax_pte_fault(struct dev_dax 
> *dev_dax,
>   return VM_FAULT_SIGBUS;
>   }
>  
> - pfn = phys_to_pfn_t(phys, PFN_DEV|PFN_MAP);
> + pfn = phys_to_pfn_t(phys, 0);
>  
>   dax_set_mapping(vmf, pfn, fault_size);
>  
> - return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
> + return dax_insert_pfn(vmf->vma, vmf->address, pfn, vmf->flags & 
> FAULT_FLAG_WRITE);

Plenty overly long lines here and later.

Q: Should dax_insert_pfn take a vm_fault structure instead of the vma?
Or are there potential use cases that aren't from the fault path?
Similarly, instead of the bool write, passing the fault flags might actually
make things more readable than the bool.

Also at least currently it seems like there are no modular users despite
the export, or am I missing something?

> + blk_queue_flag_set(QUEUE_FLAG_DAX, q);

Just as a heads up, setting of these flags has changed a lot in
linux-next.

>  {
> + /*
> +  * Make sure we flush any cached data to the page now that it's free.
> +  */
> + if (PageDirty(page))
> + dax_flush(NULL, page_address(page), page_size(page));
> +

Adding the magic dax_dev == NULL case to dax_flush and going through it
vs just calling arch_wb_cache_pmem directly here seems odd.

But I also don't quite understand how it is related to the rest
of the patch anyway.

> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -373,6 +373,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
>   unsigned long start = addr;
>  
>   ptl = pmd_trans_huge_lock(pmd, vma);
> + if (vma_is_dax(vma))
> + ptl = NULL;
>   if (ptl) {

This feels sufficiently magic to warrant a comment.

>   if (!pmd_present(*pmd))
>   goto out;
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index b7e1599..f11ee0d 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1016,7 +1016,8 @@ static void __ref __init_zone_device_page(struct page 
> *page, unsigned long pfn,
>*/
>   if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>   pgmap->type == MEMORY_DEVICE_COHERENT ||
> - pgmap->type == MEMORY_DEVICE_PCI_P2PDMA)
> + pgmap->type == MEMORY_DEVICE_PCI_P2PDMA ||
> + pgmap->type == MEMORY_DEVICE_FS_DAX)
>   set_page_count(page, 0);
>  }

So we'll skip this for MEMORY_DEVICE_GENERIC only.  Does anyone remember
if that's actively harmful or just not needed?  If the latter it might
be simpler to just set the page count unconditionally here.



Re: [PATCH 05/13] mm: Allow compound zone device pages

2024-06-26 Thread Christoph Hellwig
On Thu, Jun 27, 2024 at 10:54:20AM +1000, Alistair Popple wrote:
>  static struct nouveau_dmem_chunk *nouveau_page_to_chunk(struct page *page)
>  {
> - return container_of(page->pgmap, struct nouveau_dmem_chunk, pagemap);
> + return container_of(page_dev_pagemap(page), struct nouveau_dmem_chunk, 
> pagemap);

Overly long line hee (and quite a few more).



Re: [PATCH 04/13] fs/dax: Add dax_page_free callback

2024-06-26 Thread Christoph Hellwig
On Thu, Jun 27, 2024 at 10:54:19AM +1000, Alistair Popple wrote:
> When a fs dax page is freed it has to notify filesystems that the page
> has been unpinned/unmapped and is free. Currently this involves
> special code in the page free paths to detect a transition of refcount
> from 2 to 1 and to call some fs dax specific code.
> 
> A future change will require this to happen when the page refcount
> drops to zero. In this case we can use the existing
> pgmap->ops->page_free() callback so wire that up for all devices that
> support FS DAX (nvdimm and virtio).

Given that ->page_free is only called from free_zone_device_folio
and right next to a switch on the type, can't we just do the
wake_up_var there without the somewhat confusing indirect call that
just goes back into common code without any driver logic?



Re: [PATCH 03/13] fs/dax: Refactor wait for dax idle page

2024-06-26 Thread Christoph Hellwig
On Thu, Jun 27, 2024 at 10:54:18AM +1000, Alistair Popple wrote:
> A FS DAX page is considered idle when its refcount drops to one. This
> is currently open-coded in all file systems supporting FS DAX. Move
> the idle detection to a common function to make future changes easier.
> 
> Signed-off-by: Alistair Popple 
> Reviewed-by: Jan Kara 

I'm pretty sure I already review this ages ago, but:

Reviewed-by: Christoph Hellwig 



Re: [PATCH 02/13] pci/p2pdma: Don't initialise page refcount to one

2024-06-26 Thread Christoph Hellwig
On Thu, Jun 27, 2024 at 10:54:17AM +1000, Alistair Popple wrote:
> The reference counts for ZONE_DEVICE private pages should be
> initialised by the driver when the page is actually allocated by the
> driver allocator, not when they are first created. This is currently
> the case for MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_COHERENT pages
> but not MEMORY_DEVICE_PCI_P2PDMA pages so fix that up.
> 
> Signed-off-by: Alistair Popple 
> ---
>  drivers/pci/p2pdma.c | 2 ++
>  mm/memremap.c| 8 
>  mm/mm_init.c | 4 +++-
>  3 files changed, 9 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 4f47a13..1e9ea32 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -128,6 +128,8 @@ static int p2pmem_alloc_mmap(struct file *filp, struct 
> kobject *kobj,
>   goto out;
>   }
>  
> + set_page_count(virt_to_page(kaddr), 1);

Can we have a comment here?  Without that it feels a bit too much like
black magic when reading the code.

> + if (folio->page.pgmap->type == MEMORY_DEVICE_PRIVATE ||
> + folio->page.pgmap->type == MEMORY_DEVICE_COHERENT)
> + put_dev_pagemap(folio->page.pgmap);
> + else if (folio->page.pgmap->type != MEMORY_DEVICE_PCI_P2PDMA)
>   /*
>* Reset the refcount to 1 to prepare for handing out the page
>* again.
>*/
>   folio_set_count(folio, 1);

Where the else if evaluates to MEMORY_DEVICE_FS_DAX ||
MEMORY_DEVICE_GENERIC.  Maybe make this a switch statement handling
all cases of the enum to make it clear and have the compiler generate
a warning when a new type is added without being handled here?

> @@ -1014,7 +1015,8 @@ static void __ref __init_zone_device_page(struct page 
> *page, unsigned long pfn,
>* which will set the page count to 1 when allocating the page.
>*/
>   if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> + pgmap->type == MEMORY_DEVICE_COHERENT ||
> + pgmap->type == MEMORY_DEVICE_PCI_P2PDMA)
>   set_page_count(page, 0);

Similarly here a switch with explanation of what will be handled and
what not would be nice.



Re: [PATCH 06/13] mm/memory: Add dax_insert_pfn

2024-06-26 Thread Christoph Hellwig
On Thu, Jun 27, 2024 at 10:54:21AM +1000, Alistair Popple wrote:
> +extern void prep_compound_page(struct page *page, unsigned int order);

No need for the extern.

>  static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t 
> *pte,
> - unsigned long addr, struct page *page, pgprot_t prot)
> + unsigned long addr, struct page *page, pgprot_t prot, 
> bool mkwrite)

Overly long line.

> + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, 
> mkwrite);

.. same here.

> +vm_fault_t dax_insert_pfn(struct vm_area_struct *vma,
> + unsigned long addr, pfn_t pfn_t, bool write)

This could probably use a kerneldoc comment.



Re: [axboe-block:for-next] [block] 1122c0c1cc: aim7.jobs-per-min 22.6% improvement

2024-06-26 Thread Christoph Hellwig
On Thu, Jun 27, 2024 at 10:35:38AM +0800, Oliver Sang wrote:
> 
> I failed to apply patch in your previous reply to 1122c0c1cc or current tip
> of axboe-block/for-next:
> c1440ed442a58 (axboe-block/for-next) Merge branch 'for-6.11/block' into 
> for-next

That already includes it.

> 
> but it's ok to apply upon next:
> * 0fc4bfab2cd45 (tag: next-20240625) Add linux-next specific files for 
> 20240625
> 
> I've already started the test based on this applyment.
> is the expectation that patch should not introduce performance change 
> comparing
> to 0fc4bfab2cd45?
> 
> or if this applyment is not ok, please just give me guidance. Thanks!

The expectation is that the latest block branch (and thus linux-next)
doesn't see this performance change.



Re: [axboe-block:for-next] [block] bd4a633b6f: fsmark.files_per_sec -64.5% regression

2024-06-25 Thread Christoph Hellwig
On Wed, Jun 26, 2024 at 02:11:11PM +0800, Oliver Sang wrote:
> hi, Christoph Hellwig,
> 
> On Mon, Jun 24, 2024 at 10:35:37AM +0200, Christoph Hellwig wrote:
> > This is odd to say at least.  Any chance you can check the value
> > of /sys/block/$DEVICE/queue/rotational for the relevant device before
> > and after this commit?  And is this an ATA or NVMe SSD?
> > 
> 
> yeah, as Niklas mentioned, it's an ATA SSD.
> 
> I checked the /sys/block/$DEVICE/queue/rotational before and after this 
> commit,
> both show '0'. not sure if this is expected.
> 
> anyway, I noticed you send a patch [1]
> 
> so I applied this patch upon bd4a633b6f, and found the performance restored.

Thanks for testing!



Re: [axboe-block:for-next] [block] 1122c0c1cc: aim7.jobs-per-min 22.6% improvement

2024-06-25 Thread Christoph Hellwig
On Wed, Jun 26, 2024 at 10:10:49AM +0800, Oliver Sang wrote:
> I'm not sure I understand this test request. as in title, we see a good
> improvement of aim7 for 1122c0c1cc, and we didn't observe other issues for
> this commit.

The improvement suggests we are not sending cache flushes when we should
send them, or at least just handle them in md.

> do you mean this improvement is not expected or exposes some problems instead?
> then by below patch, should the performance back to the level of parent of
> 1122c0c1cc?
> 
> sure! it's our great pleasure to test your patches. I noticed there are
> [1]
> https://lore.kernel.org/all/20240625110603.50885-2-...@lst.de/
> which includes "[PATCH 1/7] md: set md-specific flags for all queue limits"
> [2]
> https://lore.kernel.org/all/20240625145955.115252-2-...@lst.de/
> which includes "[PATCH 1/8] md: set md-specific flags for all queue limits"
> 
> which one you suggest us to test?
> do we only need to apply the first patch "md: set md-specific flags for all 
> queue limits"
> upon 1122c0c1cc?
> then is the expectation the performance back to parent of 1122c0c1cc?

Either just the patch in reply or the entire [2] series would be fine.

Thanks!



Re: [axboe-block:for-next] [block] 1122c0c1cc: aim7.jobs-per-min 22.6% improvement

2024-06-25 Thread Christoph Hellwig
Hi Oliver,

can you test the patch below?  It restores the previous behavior if
the device did not have a volatile write cache.  I think at least
for raid0 and raid1 without bitmap the new behavior actually is correct
and better, but it will need fixes for other modes.  If the underlying
devices did have a volatile write cache I'm a bit lost what the problem
was and this probably won't fix the issue.

---
>From 81c816827197f811e14add7a79220ed9eef6af02 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig 
Date: Tue, 25 Jun 2024 08:48:18 +0200
Subject: md: set md-specific flags for all queue limits

The md driver wants to enforce a number of flags for all devices, even
when not inheriting them from the underlying devices.  To make sure these
flags survive the queue_limits_set calls that md uses to update the
queue limits without deriving them from the previous limits add a new
queue limits without deriving them form the previous limits add a new
md_init_stacking_limits helper that calls blk_set_stacking_limits and sets
these flags.

Fixes: 1122c0c1cc71 ("block: move cache control settings out of queue->flags")
Signed-off-by: Christoph Hellwig 
---
 drivers/md/md.c | 13 -
 drivers/md/md.h |  1 +
 drivers/md/raid0.c  |  2 +-
 drivers/md/raid1.c  |  2 +-
 drivers/md/raid10.c |  2 +-
 drivers/md/raid5.c  |  2 +-
 6 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 69ea54aedd99a1..8368438e58e989 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5853,6 +5853,13 @@ static void mddev_delayed_delete(struct work_struct *ws)
kobject_put(&mddev->kobj);
 }
 
+void md_init_stacking_limits(struct queue_limits *lim)
+{
+   blk_set_stacking_limits(lim);
+   lim->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
+   BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
+}
+
 struct mddev *md_alloc(dev_t dev, char *name)
 {
/*
@@ -5871,10 +5878,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
int shift;
int unit;
int error;
-   struct queue_limits lim = {
-   .features   = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
- BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT,
-   };
 
/*
 * Wait for any previous instance of this device to be completely
@@ -5914,7 +5917,7 @@ struct mddev *md_alloc(dev_t dev, char *name)
 */
mddev->hold_active = UNTIL_STOP;
 
-   disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+   disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (IS_ERR(disk)) {
error = PTR_ERR(disk);
goto out_free_mddev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index c4d7ebf9587d07..28cb4b0b6c1740 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -893,6 +893,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned 
long *res, int scale);
 
 extern int mddev_init(struct mddev *mddev);
 extern void mddev_destroy(struct mddev *mddev);
+void md_init_stacking_limits(struct queue_limits *lim);
 struct mddev *md_alloc(dev_t dev, char *name);
 void mddev_put(struct mddev *mddev);
 extern int md_run(struct mddev *mddev);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 62634e2a33bd0f..32d58752477847 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -379,7 +379,7 @@ static int raid0_set_limits(struct mddev *mddev)
struct queue_limits lim;
int err;
 
-   blk_set_stacking_limits(&lim);
+   md_init_stacking_limits(&lim);
lim.max_hw_sectors = mddev->chunk_sectors;
lim.max_write_zeroes_sectors = mddev->chunk_sectors;
lim.io_min = mddev->chunk_sectors << 9;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1a0eba65b8a92b..04a0c2ca173245 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3194,7 +3194,7 @@ static int raid1_set_limits(struct mddev *mddev)
struct queue_limits lim;
int err;
 
-   blk_set_stacking_limits(&lim);
+   md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
if (err) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3334aa803c8380..2a9c4ee982e023 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -3974,7 +3974,7 @@ static int raid10_set_queue_limits(struct mddev *mddev)
struct queue_limits lim;
int err;
 
-   blk_set_stacking_limits(&lim);
+   md_init_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0192a6323f09ba..10219205160bbf 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7708,7 +7708,7 @@ static int raid5_set_limits(struct mddev *mddev)
 */
stripe = round

Re: [PATCH 14/26] block: move the nonrot flag to queue_limits

2024-06-24 Thread Christoph Hellwig
On Mon, Jun 24, 2024 at 11:08:16AM -0600, Keith Busch wrote:
> On Mon, Jun 17, 2024 at 08:04:41AM +0200, Christoph Hellwig wrote:
> > -#define blk_queue_nonrot(q)test_bit(QUEUE_FLAG_NONROT, 
> > &(q)->queue_flags)
> > +#define blk_queue_nonrot(q)((q)->limits.features & 
> > BLK_FEAT_ROTATIONAL)
> 
> This is inverted. Should be:
> 
>  #define blk_queue_nonrot(q)  (!((q)->limits.features & BLK_FEAT_ROTATIONAL))

Ah yes.  And the sysfs attribute doesn't go through the macro and
won't show the effect.  I'll send a fixup.




Re: [axboe-block:for-next] [block] bd4a633b6f: fsmark.files_per_sec -64.5% regression

2024-06-24 Thread Christoph Hellwig
On Mon, Jun 24, 2024 at 03:45:57PM +0200, Niklas Cassel wrote:
> Seems to be ATA SSD:
> https://download.01.org/0day-ci/archive/20240624/202406241546.6bbd44a7-oliver.s...@intel.com/job.yaml
> 
> ssd_partitions: 
> "/dev/disk/by-id/ata-INTEL_SSDSC2BG012T4_BTHC428201ZX1P2OGN-part1"
> 
> Most likely btrfs does something different depending on the nonrot flag
> being set or not. (And like you are suggesting, most likely the value of
> the nonrot flag is somehow different after commit bd4a633b6f)

Yes, btrfs does.  That's why I'm curious about the before and after,
as I can't see any way how they would be set differently.  Right now
I can only claim with virtual AHCI devices, which claim to be rotational,
though.



Re: [axboe-block:for-next] [block] bd4a633b6f: fsmark.files_per_sec -64.5% regression

2024-06-24 Thread Christoph Hellwig
This is odd to say at least.  Any chance you can check the value
of /sys/block/$DEVICE/queue/rotational for the relevant device before
and after this commit?  And is this an ATA or NVMe SSD?



Re: move features flags into queue_limits v2

2024-06-19 Thread Christoph Hellwig
On Wed, Jun 19, 2024 at 08:21:14AM -0600, Jens Axboe wrote:
> Please check for-6.11/block, as I pulled in the changes to the main
> block branch and that threw some merge conflicts mostly due to Damien's
> changes in for-6.11/block. While fixing those up, I also came across
> oddities like:
> 
> (limits->features & limits->features & BLK_FEAT_ZONED)) {
> 
> which don't make much sense and hence I changed them to
> 
> (limits->features & BLK_FEAT_ZONED)) {

Yeah.  The above is harmless but of course completely pointless.


[PATCH 26/26] block: move the bounce flag into the features field

2024-06-16 Thread Christoph Hellwig
Move the bounce flag into the features field to reclaim a little bit of
space.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-settings.c| 1 -
 block/blk.h | 2 +-
 drivers/scsi/scsi_lib.c | 2 +-
 include/linux/blkdev.h  | 6 --
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 96e07f24bd9aa1..d0e9096f93ca8a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -479,7 +479,6 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
b->max_write_zeroes_sectors);
t->max_zone_append_sectors = 
min(queue_limits_max_zone_append_sectors(t),
 
queue_limits_max_zone_append_sectors(b));
-   t->bounce = max(t->bounce, b->bounce);
 
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
diff --git a/block/blk.h b/block/blk.h
index 79e8d5d4fe0caf..fa32f7fad5d7e6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -394,7 +394,7 @@ struct bio *__blk_queue_bounce(struct bio *bio, struct 
request_queue *q);
 static inline bool blk_queue_may_bounce(struct request_queue *q)
 {
return IS_ENABLED(CONFIG_BOUNCE) &&
-   q->limits.bounce == BLK_BOUNCE_HIGH &&
+   (q->limits.features & BLK_FEAT_BOUNCE_HIGH) &&
max_low_pfn >= max_pfn;
 }
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 54f771ec8cfb5e..e2f7bfb2b9e450 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1986,7 +1986,7 @@ void scsi_init_limits(struct Scsi_Host *shost, struct 
queue_limits *lim)
shost->dma_alignment, dma_get_cache_alignment() - 1);
 
if (shost->no_highmem)
-   lim->bounce = BLK_BOUNCE_HIGH;
+   lim->features |= BLK_FEAT_BOUNCE_HIGH;
 
dma_set_seg_boundary(dev, shost->dma_boundary);
dma_set_max_seg_size(dev, shost->max_segment_size);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2c433ebf6f2030..e96ba7b97288d2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -325,6 +325,9 @@ enum {
 
/* skip this queue in blk_mq_(un)quiesce_tagset */
BLK_FEAT_SKIP_TAGSET_QUIESCE= (1u << 13),
+
+   /* bounce all highmem pages */
+   BLK_FEAT_BOUNCE_HIGH= (1u << 14),
 };
 
 /*
@@ -332,7 +335,7 @@ enum {
  */
 #define BLK_FEAT_INHERIT_MASK \
(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \
-BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED)
+BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH)
 
 /* internal flags in queue_limits.flags */
 enum {
@@ -352,7 +355,6 @@ enum blk_bounce {
 struct queue_limits {
unsigned intfeatures;
unsigned intflags;
-   enum blk_bounce bounce;
unsigned long   seg_boundary_mask;
unsigned long   virt_boundary_mask;
 
-- 
2.43.0



[PATCH 25/26] block: move the skip_tagset_quiesce flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the skip_tagset_quiesce flag into the queue_limits feature field so
that it can be set atomically with the queue frozen.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/nvme/host/core.c | 8 +---
 include/linux/blkdev.h   | 6 --
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 8b5a68861c119b..344f9e503bdb32 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -93,7 +93,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
-   QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
 };
 #undef QUEUE_FLAG_NAME
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 31e752e8d632cd..bf410d10b12006 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4489,13 +4489,15 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, 
struct blk_mq_tag_set *set,
return ret;
 
if (ctrl->ops->flags & NVME_F_FABRICS) {
-   ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
+   struct queue_limits lim = {
+   .features   = BLK_FEAT_SKIP_TAGSET_QUIESCE,
+   };
+
+   ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
-   blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
-  ctrl->connect_q);
}
 
ctrl->tagset = set;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ab0f7dfba556eb..2c433ebf6f2030 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -322,6 +322,9 @@ enum {
 
/* supports PCI(e) p2p requests */
BLK_FEAT_PCI_P2PDMA = (1u << 12),
+
+   /* skip this queue in blk_mq_(un)quiesce_tagset */
+   BLK_FEAT_SKIP_TAGSET_QUIESCE= (1u << 13),
 };
 
 /*
@@ -594,7 +597,6 @@ struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE 28  /* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
-#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/
 
 #define QUEUE_FLAG_MQ_DEFAULT  (1UL << QUEUE_FLAG_SAME_COMP)
 
@@ -629,7 +631,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct 
request_queue *q);
 #define blk_queue_registered(q)test_bit(QUEUE_FLAG_REGISTERED, 
&(q)->queue_flags)
 #define blk_queue_sq_sched(q)  test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 #define blk_queue_skip_tagset_quiesce(q) \
-   test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags)
+   ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
-- 
2.43.0



[PATCH 24/26] block: move the pci_p2pdma flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the pci_p2pdma flag into the queue_limits feature field so that it
can be set atomically with the queue frozen.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/nvme/host/core.c | 8 +++-
 include/linux/blkdev.h   | 7 ---
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f2fd72f4414ae8..8b5a68861c119b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -90,7 +90,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
-   QUEUE_FLAG_NAME(PCI_P2PDMA),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5ecf762d7c8837..31e752e8d632cd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3735,6 +3735,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct 
nvme_ns_info *info)
 
if (ctrl->opts && ctrl->opts->data_digest)
lim.features |= BLK_FEAT_STABLE_WRITES;
+   if (ctrl->ops->supports_pci_p2pdma &&
+   ctrl->ops->supports_pci_p2pdma(ctrl))
+   lim.features |= BLK_FEAT_PCI_P2PDMA;
 
disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
if (IS_ERR(disk))
@@ -3744,11 +3747,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct 
nvme_ns_info *info)
 
ns->disk = disk;
ns->queue = disk->queue;
-
-   if (ctrl->ops->supports_pci_p2pdma &&
-   ctrl->ops->supports_pci_p2pdma(ctrl))
-   blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
-
ns->ctrl = ctrl;
kref_init(&ns->kref);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1077cb8d8fd808..ab0f7dfba556eb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -319,6 +319,9 @@ enum {
 
/* supports Zone Reset All */
BLK_FEAT_ZONE_RESETALL  = (1u << 11),
+
+   /* supports PCI(e) p2p requests */
+   BLK_FEAT_PCI_P2PDMA = (1u << 12),
 };
 
 /*
@@ -588,7 +591,6 @@ struct request_queue {
 #define QUEUE_FLAG_STATS   20  /* track IO start and completion times 
*/
 #define QUEUE_FLAG_REGISTERED  22  /* queue has been registered to a disk 
*/
 #define QUEUE_FLAG_QUIESCED24  /* queue has been quiesced */
-#define QUEUE_FLAG_PCI_P2PDMA  25  /* device supports PCI p2p requests */
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE 28  /* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
@@ -611,8 +613,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct 
request_queue *q);
 #define blk_queue_zone_resetall(q) \
((q)->limits.features & BLK_FEAT_ZONE_RESETALL)
 #define blk_queue_dax(q)   ((q)->limits.features & BLK_FEAT_DAX)
-#define blk_queue_pci_p2pdma(q)\
-   test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
+#define blk_queue_pci_p2pdma(q)((q)->limits.features & 
BLK_FEAT_PCI_P2PDMA)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 #define blk_queue_rq_alloc_time(q) \
test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
-- 
2.43.0



[PATCH 23/26] block: move the zone_resetall flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the zone_resetall flag into the queue_limits feature field so that
it can be set atomically with the queue frozen.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c | 1 -
 drivers/block/null_blk/zoned.c | 3 +--
 drivers/block/ublk_drv.c   | 4 +---
 drivers/block/virtio_blk.c | 3 +--
 drivers/nvme/host/zns.c| 3 +--
 drivers/scsi/sd_zbc.c  | 5 +
 include/linux/blkdev.h | 6 --
 7 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3a21527913840d..f2fd72f4414ae8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -91,7 +91,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA),
-   QUEUE_FLAG_NAME(ZONE_RESETALL),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index ca8e739e76b981..b42c00f1313254 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -158,7 +158,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
sector += dev->zone_size_sects;
}
 
-   lim->features |= BLK_FEAT_ZONED;
+   lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
lim->chunk_sectors = dev->zone_size_sects;
lim->max_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
@@ -171,7 +171,6 @@ int null_register_zoned_dev(struct nullb *nullb)
struct request_queue *q = nullb->q;
struct gendisk *disk = nullb->disk;
 
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
disk->nr_zones = bdev_nr_zones(disk->part0);
 
pr_info("%s: using %s zone append\n",
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 69c16018cbb19a..4fdff13fc23b8a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -248,8 +248,6 @@ static int ublk_dev_param_zoned_validate(const struct 
ublk_device *ub)
 
 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
-
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
 }
 
@@ -2196,7 +2194,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, 
struct io_uring_cmd *cmd)
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
return -EOPNOTSUPP;
 
-   lim.features |= BLK_FEAT_ZONED;
+   lim.features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones =  p->max_open_zones;
lim.max_zone_append_sectors = p->max_zone_append_sectors;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cea45b296f8bec..6c64a67ab9c901 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk 
*vblk,
 
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
 
-   lim->features |= BLK_FEAT_ZONED;
+   lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
 
virtio_cread(vdev, struct virtio_blk_config,
 zoned.max_open_zones, &v);
@@ -1548,7 +1548,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 */
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
(lim.features & BLK_FEAT_ZONED)) {
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk);
if (err)
goto out_cleanup_disk;
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 06f2417aa50de7..99bb89c2495ae3 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -108,13 +108,12 @@ int nvme_query_zone_info(struct nvme_ns *ns, unsigned 
lbaf,
 void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
struct nvme_zone_info *zi)
 {
-   lim->features |= BLK_FEAT_ZONED;
+   lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
lim->max_open_zones = zi->max_open_zones;
lim->max_active_zones = zi->max_active_zones;
lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
lim->chunk_sectors = ns->head->zsze =
nvme_lba_to_sect(ns->head, zi->zone_size);
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
 }
 
 static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index d3f8466

[PATCH 22/26] block: move the zoned flag into the features field

2024-06-16 Thread Christoph Hellwig
Move the zoned flag into the features field to reclaim a little
bit of space.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-settings.c   |  5 ++---
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c   |  2 +-
 drivers/block/virtio_blk.c |  5 +++--
 drivers/md/dm-table.c  | 11 ++-
 drivers/md/dm-zone.c   |  2 +-
 drivers/md/dm-zoned-target.c   |  2 +-
 drivers/nvme/host/zns.c|  2 +-
 drivers/scsi/sd_zbc.c  |  2 +-
 include/linux/blkdev.h |  9 ++---
 10 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 026ba68d829856..96e07f24bd9aa1 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -68,7 +68,7 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
 
 static int blk_validate_zoned_limits(struct queue_limits *lim)
 {
-   if (!lim->zoned) {
+   if (!(lim->features & BLK_FEAT_ZONED)) {
if (WARN_ON_ONCE(lim->max_open_zones) ||
WARN_ON_ONCE(lim->max_active_zones) ||
WARN_ON_ONCE(lim->zone_write_granularity) ||
@@ -602,8 +602,7 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
   b->max_secure_erase_sectors);
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
-   t->zoned = max(t->zoned, b->zoned);
-   if (!t->zoned) {
+   if (!(t->features & BLK_FEAT_ZONED)) {
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index f118d304f31080..ca8e739e76b981 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -158,7 +158,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
sector += dev->zone_size_sects;
}
 
-   lim->zoned = true;
+   lim->features |= BLK_FEAT_ZONED;
lim->chunk_sectors = dev->zone_size_sects;
lim->max_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 4fcde099935868..69c16018cbb19a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2196,7 +2196,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, 
struct io_uring_cmd *cmd)
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
return -EOPNOTSUPP;
 
-   lim.zoned = true;
+   lim.features |= BLK_FEAT_ZONED;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones =  p->max_open_zones;
lim.max_zone_append_sectors = p->max_zone_append_sectors;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 13a2f24f176628..cea45b296f8bec 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk 
*vblk,
 
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
 
-   lim->zoned = true;
+   lim->features |= BLK_FEAT_ZONED;
 
virtio_cread(vdev, struct virtio_blk_config,
 zoned.max_open_zones, &v);
@@ -1546,7 +1546,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 * All steps that follow use the VQs therefore they need to be
 * placed after the virtio_device_ready() call above.
 */
-   if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
+   if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+   (lim.features & BLK_FEAT_ZONED)) {
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk);
if (err)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index ca1f136575cff4..df6313c3fe6ba4 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1605,12 +1605,12 @@ int dm_calculate_queue_limits(struct dm_table *t,
ti->type->iterate_devices(ti, dm_set_device_limits,
  &ti_limits);
 
-   if (!zoned && ti_limits.zoned) {
+   if (!zoned && (ti_limits.features & BLK_FEAT_ZONED)) {
/*
 * After stacking all limits, validate all devices
 * in table support this zoned model and zone sectors.
 */
-   zoned = ti_limits.zoned;
+   zoned = (ti_limits.features & BLK_FEAT_ZONED);
zone_sectors

[PATCH 21/26] block: move the poll flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the poll flag into the queue_limits feature field so that it can
be set atomically with the queue frozen.

Stacking drivers are simplified in that they now can simply set the
flag, and blk_stack_limits will clear it when the features is not
supported by any of the underlying devices.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-core.c  |  5 ++--
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c| 31 +++-
 block/blk-settings.c  | 10 ---
 block/blk-sysfs.c |  4 +--
 drivers/md/dm-table.c | 54 +--
 drivers/nvme/host/multipath.c | 12 +---
 include/linux/blkdev.h|  4 ++-
 8 files changed, 45 insertions(+), 76 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2b45a4df9a1aa1..8d9fbd353fc7fc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -791,7 +791,7 @@ void submit_bio_noacct(struct bio *bio)
}
}
 
-   if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+   if (!(q->limits.features & BLK_FEAT_POLL))
bio_clear_polled(bio);
 
switch (bio_op(bio)) {
@@ -915,8 +915,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, 
unsigned int flags)
return 0;
 
q = bdev_get_queue(bdev);
-   if (cookie == BLK_QC_T_NONE ||
-   !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+   if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
return 0;
 
blk_flush_plug(current->plug, false);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f4fa820251ce83..3a21527913840d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -87,7 +87,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 43235acc87505f..e2b9710ddc5ad1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4109,6 +4109,12 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
 }
 
+static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
+{
+   return set->nr_maps > HCTX_TYPE_POLL &&
+   set->map[HCTX_TYPE_POLL].nr_queues;
+}
+
 struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata)
 {
@@ -4119,6 +4125,8 @@ struct request_queue *blk_mq_alloc_queue(struct 
blk_mq_tag_set *set,
if (!lim)
lim = &default_lim;
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
+   if (blk_mq_can_poll(set))
+   lim->features |= BLK_FEAT_POLL;
 
q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
@@ -4273,17 +4281,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set 
*set,
mutex_unlock(&q->sysfs_lock);
 }
 
-static void blk_mq_update_poll_flag(struct request_queue *q)
-{
-   struct blk_mq_tag_set *set = q->tag_set;
-
-   if (set->nr_maps > HCTX_TYPE_POLL &&
-   set->map[HCTX_TYPE_POLL].nr_queues)
-   blk_queue_flag_set(QUEUE_FLAG_POLL, q);
-   else
-   blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
-}
-
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
 {
@@ -4311,7 +4308,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set 
*set,
q->tag_set = set;
 
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
-   blk_mq_update_poll_flag(q);
 
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->flush_list);
@@ -4798,8 +4794,10 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
 fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
+   struct queue_limits lim;
+
blk_mq_realloc_hw_ctxs(set, q);
-   blk_mq_update_poll_flag(q);
+
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
 
@@ -4811,6 +4809,13 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
set->nr_hw_queues = prev_nr_hw_queues;
goto fallback;
}
+   lim = queue_limits_start_update(q);
+   if (blk_mq_can_poll(set))
+   lim.features |= BLK_FEAT_POLL;
+   else
+   lim.features &= ~BLK_FEAT_POLL;
+   if (queue_limits_commit_update(q, &lim) < 0)
+   pr_warn("updating

[PATCH 20/26] block: move the dax flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the dax flag into the queue_limits feature field so that it can be
set atomically with the queue frozen.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/md/dm-table.c| 4 ++--
 drivers/nvdimm/pmem.c| 7 ++-
 drivers/s390/block/dcssblk.c | 2 +-
 include/linux/blkdev.h   | 6 --
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 62b132e9a9ce3b..f4fa820251ce83 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,7 +88,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(POLL),
-   QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 84d636712c7284..e44697037e86f4 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1834,11 +1834,11 @@ int dm_table_set_restrictions(struct dm_table *t, 
struct request_queue *q,
limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
 
if (dm_table_supports_dax(t, device_not_dax_capable)) {
-   blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+   limits->features |= BLK_FEAT_DAX;
if (dm_table_supports_dax(t, 
device_not_dax_synchronous_capable))
set_dax_synchronous(t->md->dax_dev);
} else
-   blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
+   limits->features &= ~BLK_FEAT_DAX;
 
if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
dax_write_cache(t->md->dax_dev, true);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b821dcf018f6ae..1dd74c969d5a09 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -465,7 +465,6 @@ static int pmem_attach_disk(struct device *dev,
struct dax_device *dax_dev;
struct nd_pfn_sb *pfn_sb;
struct pmem_device *pmem;
-   struct request_queue *q;
struct gendisk *disk;
void *addr;
int rc;
@@ -499,6 +498,8 @@ static int pmem_attach_disk(struct device *dev,
}
if (fua)
lim.features |= BLK_FEAT_FUA;
+   if (is_nd_pfn(dev))
+   lim.features |= BLK_FEAT_DAX;
 
if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) {
@@ -509,7 +510,6 @@ static int pmem_attach_disk(struct device *dev,
disk = blk_alloc_disk(&lim, nid);
if (IS_ERR(disk))
return PTR_ERR(disk);
-   q = disk->queue;
 
pmem->disk = disk;
pmem->pgmap.owner = pmem;
@@ -547,9 +547,6 @@ static int pmem_attach_disk(struct device *dev,
}
pmem->virt_addr = addr;
 
-   if (pmem->pfn_flags & PFN_MAP)
-   blk_queue_flag_set(QUEUE_FLAG_DAX, q);
-
disk->fops  = &pmem_fops;
disk->private_data  = pmem;
nvdimm_namespace_disk_name(ndns, disk->disk_name);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 6d1689a2717e5f..d5a5d11ae0dcdf 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -548,6 +548,7 @@ dcssblk_add_store(struct device *dev, struct 
device_attribute *attr, const char
 {
struct queue_limits lim = {
.logical_block_size = 4096,
+   .features   = BLK_FEAT_DAX,
};
int rc, i, j, num_of_segments;
struct dcssblk_dev_info *dev_info;
@@ -643,7 +644,6 @@ dcssblk_add_store(struct device *dev, struct 
device_attribute *attr, const char
dev_info->gd->fops = &dcssblk_devops;
dev_info->gd->private_data = dev_info;
dev_info->gd->flags |= GENHD_FL_NO_PART;
-   blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue);
 
seg_byte_size = (dev_info->end - dev_info->start + 1);
set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3d4519d609d95..7022e06a3dd9a3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -307,6 +307,9 @@ enum {
 
/* supports REQ_NOWAIT */
BLK_FEAT_NOWAIT = (1u << 7),
+
+   /* supports DAX */
+   BLK_FEAT_DAX= (1u << 8),
 };
 
 /*
@@ -575,7 +578,6 @@ struct request_queue {
 #define QUEUE_FLAG_SAME_FORCE  12  /* force complete on same CPU */
 #define QUEUE_FLAG_INIT_DONE   14  /* queue is initialized */
 #define QUEUE_FLAG_POLL16  /* IO polling enabled if set */
-#define QUEUE_FLAG_DAX 19  /* device support

[PATCH 19/26] block: move the nowait flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the nowait flag into the queue_limits feature field so that it can
be set atomically with the queue frozen.

Stacking drivers are simplified in that they now can simply set the
flag, and blk_stack_limits will clear it when the features is not
supported by any of the underlying devices.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c|  2 +-
 block/blk-settings.c  |  9 +
 drivers/block/brd.c   |  4 ++--
 drivers/md/dm-table.c | 18 +++---
 drivers/md/md.c   | 18 +-
 drivers/nvme/host/multipath.c |  3 +--
 include/linux/blkdev.h|  9 +
 8 files changed, 22 insertions(+), 42 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 957774e40b1d0c..62b132e9a9ce3b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -96,7 +96,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(ZONE_RESETALL),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
-   QUEUE_FLAG_NAME(NOWAIT),
QUEUE_FLAG_NAME(SQ_SCHED),
QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
 };
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf67dc13f7dd4c..43235acc87505f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4118,7 +4118,7 @@ struct request_queue *blk_mq_alloc_queue(struct 
blk_mq_tag_set *set,
 
if (!lim)
lim = &default_lim;
-   lim->features |= BLK_FEAT_IO_STAT;
+   lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
 
q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 536ee202fcdccb..bf4622c19b5c09 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -459,6 +459,15 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
 
t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
 
+   /*
+* BLK_FEAT_NOWAIT needs to be supported both by the stacking driver
+* and all underlying devices.  The stacking driver sets the flag
+* before stacking the limits, and this will clear the flag if any
+* of the underlying devices does not support it.
+*/
+   if (!(b->features & BLK_FEAT_NOWAIT))
+   t->features &= ~BLK_FEAT_NOWAIT;
+
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_user_sectors = min_not_zero(t->max_user_sectors,
b->max_user_sectors);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index d77deb571dbd06..a300645cd9d4a5 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -335,7 +335,8 @@ static int brd_alloc(int i)
.max_hw_discard_sectors = UINT_MAX,
.max_discard_segments   = 1,
.discard_granularity= PAGE_SIZE,
-   .features   = BLK_FEAT_SYNCHRONOUS,
+   .features   = BLK_FEAT_SYNCHRONOUS |
+ BLK_FEAT_NOWAIT,
};
 
list_for_each_entry(brd, &brd_devices, brd_list)
@@ -367,7 +368,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);

-   blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
err = add_disk(disk);
if (err)
goto out_cleanup_disk;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index aaf379cb15d91f..84d636712c7284 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -582,7 +582,7 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 static void dm_set_stacking_limits(struct queue_limits *limits)
 {
blk_set_stacking_limits(limits);
-   limits->features |= BLK_FEAT_IO_STAT;
+   limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
 }
 
 /*
@@ -1746,12 +1746,6 @@ static bool dm_table_supports_write_zeroes(struct 
dm_table *t)
return true;
 }
 
-static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
-sector_t start, sector_t len, void *data)
-{
-   return !bdev_nowait(dev->bdev);
-}
-
 static bool dm_table_supports_nowait(struct dm_table *t)
 {
for (unsigned int i = 0; i < t->num_targets; i++) {
@@ -1759,10 +1753,6 @@ static bool dm_table_supports_nowait(struct dm_table *t)
 
if (!dm_target_supports_nowait(ti->type))
return false;
-
-   if (!ti->type->iterate_devices ||
-   ti->type->iterate_devices(ti, device_not_nowait_capable, 
NULL))
-   return false;
}
 
return true;
@@ -1824,10 +1814,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct 
request_queue *q,
 {
int r;
 
-   if (dm_t

[PATCH 17/26] block: move the stable_writes flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the stable_writes flag into the queue_limits feature field so that
it can be set atomically with the queue frozen.

The flag is now inherited by blk_stack_limits, which greatly simplifies
the code in dm, and fixes md, which previously did not pass on the flag
set on lower devices.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c |  1 -
 block/blk-sysfs.c  | 29 +
 drivers/block/drbd/drbd_main.c |  5 ++---
 drivers/block/rbd.c|  9 +++--
 drivers/block/zram/zram_drv.c  |  2 +-
 drivers/md/dm-table.c  | 19 ---
 drivers/md/raid5.c |  6 --
 drivers/mmc/core/queue.c   |  5 +++--
 drivers/nvme/host/core.c   |  9 +
 drivers/nvme/host/multipath.c  |  4 
 drivers/scsi/iscsi_tcp.c   |  8 
 include/linux/blkdev.h |  9 ++---
 12 files changed, 29 insertions(+), 77 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index cbe99444ed1a54..eb73f1d348e5a9 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,7 +88,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6f58530fb3c08e..cde525724831ef 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -296,37 +296,10 @@ static ssize_t queue_##_name##_store(struct request_queue 
*q,  \
return queue_feature_store(q, page, count, _feature);\
 }
 
-#define QUEUE_SYSFS_BIT_FNS(name, flag, neg)   \
-static ssize_t \
-queue_##name##_show(struct request_queue *q, char *page)   \
-{  \
-   int bit;\
-   bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
-   return queue_var_show(neg ? !bit : bit, page);  \
-}  \
-static ssize_t \
-queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
-{  \
-   unsigned long val;  \
-   ssize_t ret;\
-   ret = queue_var_store(&val, page, count);   \
-   if (ret < 0)\
-return ret;\
-   if (neg)\
-   val = !val; \
-   \
-   if (val)\
-   blk_queue_flag_set(QUEUE_FLAG_##flag, q);   \
-   else\
-   blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \
-   return ret; \
-}
-
 QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
 QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
 QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT)
-QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
-#undef QUEUE_SYSFS_BIT_FNS
+QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES);
 
 static ssize_t queue_zoned_show(struct request_queue *q, char *page)
 {
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2ef29a47807550..f92673f05c7abc 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2698,7 +2698,8 @@ enum drbd_ret_code drbd_create_device(struct 
drbd_config_context *adm_ctx, unsig
 */
.max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8,
.features   = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
- BLK_FEAT_ROTATIONAL,
+ BLK_FEAT_ROTATIONAL |
+ BLK_FEAT_STABLE_WRITES,
};
 
device = minor_to_device(minor);
@@ -2737,8 +2738,6 @@ enum drbd_ret_code drbd_create_device(struct 
drbd_config_context *adm_ctx, unsig
sprintf(disk->disk_name, "drbd%d", minor);
disk->private_data = device;
 
-   blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
-
device->md_io.page = alloc_page(GF

[PATCH 18/26] block: move the synchronous flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the synchronous flag into the queue_limits feature field so that it
can be set atomically with the queue frozen.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c| 1 -
 drivers/block/brd.c   | 2 +-
 drivers/block/zram/zram_drv.c | 4 ++--
 drivers/nvdimm/btt.c  | 3 +--
 drivers/nvdimm/pmem.c | 4 ++--
 include/linux/blkdev.h| 7 ---
 6 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index eb73f1d348e5a9..957774e40b1d0c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -85,7 +85,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
QUEUE_FLAG_NAME(NOXMERGES),
-   QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(POLL),
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index b25dc463b5e3a6..d77deb571dbd06 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -335,6 +335,7 @@ static int brd_alloc(int i)
.max_hw_discard_sectors = UINT_MAX,
.max_discard_segments   = 1,
.discard_granularity= PAGE_SIZE,
+   .features   = BLK_FEAT_SYNCHRONOUS,
};
 
list_for_each_entry(brd, &brd_devices, brd_list)
@@ -366,7 +367,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);

-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
err = add_disk(disk);
if (err)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f8f1b5b54795ac..efcb8d9d274c31 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2208,7 +2208,8 @@ static int zram_add(void)
 #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
.max_write_zeroes_sectors   = UINT_MAX,
 #endif
-   .features   = BLK_FEAT_STABLE_WRITES,
+   .features   = BLK_FEAT_STABLE_WRITES |
+ BLK_FEAT_SYNCHRONOUS,
};
struct zram *zram;
int ret, device_id;
@@ -2246,7 +2247,6 @@ static int zram_add(void)
 
/* Actual capacity set using sysfs (/sys/block/zram/disksize */
set_capacity(zram->disk, 0);
-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)
goto out_cleanup_disk;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index e474afa8e9f68d..e79c06d65bb77b 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1501,6 +1501,7 @@ static int btt_blk_init(struct btt *btt)
.logical_block_size = btt->sector_size,
.max_hw_sectors = UINT_MAX,
.max_integrity_segments = 1,
+   .features   = BLK_FEAT_SYNCHRONOUS,
};
int rc;
 
@@ -1518,8 +1519,6 @@ static int btt_blk_init(struct btt *btt)
btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt;
 
-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue);
-
set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL);
if (rc)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 501cf226df0187..b821dcf018f6ae 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -455,7 +455,8 @@ static int pmem_attach_disk(struct device *dev,
.logical_block_size = pmem_sector_size(ndns),
.physical_block_size= PAGE_SIZE,
.max_hw_sectors = UINT_MAX,
-   .features   = BLK_FEAT_WRITE_CACHE,
+   .features   = BLK_FEAT_WRITE_CACHE |
+ BLK_FEAT_SYNCHRONOUS,
};
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
@@ -546,7 +547,6 @@ static int pmem_attach_disk(struct device *dev,
}
pmem->virt_addr = addr;
 
-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
if (pmem->pfn_flags & PFN_MAP)
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8936eb6ba60956..cee7b44a142513 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -301,6 +301,9 @@ enum {
 
/* don't modify data until writeback is done */
BLK_FEAT_STABLE_WRITES  = (1u << 5),
+
+   /* always compl

[PATCH 16/26] block: move the io_stat flag setting to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the io_stat flag into the queue_limits feature field so that it can
be set atomically with the queue frozen.

Simplify md and dm to set the flag unconditionally instead of avoiding
setting a simple flag for cases where it already is set by other means,
which is a bit pointless.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c|  6 +-
 block/blk-sysfs.c |  2 +-
 drivers/md/dm-table.c | 12 +---
 drivers/md/dm.c   | 13 +++--
 drivers/md/md.c   |  5 ++---
 drivers/nvme/host/multipath.c |  2 +-
 include/linux/blkdev.h|  9 +
 8 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6b7edb50bfd3fa..cbe99444ed1a54 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -84,7 +84,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
-   QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 58b0d6c7cc34d6..cf67dc13f7dd4c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4116,7 +4116,11 @@ struct request_queue *blk_mq_alloc_queue(struct 
blk_mq_tag_set *set,
struct request_queue *q;
int ret;
 
-   q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
+   if (!lim)
+   lim = &default_lim;
+   lim->features |= BLK_FEAT_IO_STAT;
+
+   q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
return q;
q->queuedata = queuedata;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9174aca3b85526..6f58530fb3c08e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -324,7 +324,7 @@ queue_##name##_store(struct request_queue *q, const char 
*page, size_t count) \
 
 QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
 QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
-QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT)
 QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0a3838e45affd4..5d5431e531aea9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -579,6 +579,12 @@ int dm_split_args(int *argc, char ***argvp, char *input)
return 0;
 }
 
+static void dm_set_stacking_limits(struct queue_limits *limits)
+{
+   blk_set_stacking_limits(limits);
+   limits->features |= BLK_FEAT_IO_STAT;
+}
+
 /*
  * Impose necessary and sufficient conditions on a devices's table such
  * that any incoming bio which respects its logical_block_size can be
@@ -617,7 +623,7 @@ static int validate_hardware_logical_block_alignment(struct 
dm_table *t,
for (i = 0; i < t->num_targets; i++) {
ti = dm_table_get_target(t, i);
 
-   blk_set_stacking_limits(&ti_limits);
+   dm_set_stacking_limits(&ti_limits);
 
/* combine all target devices' limits */
if (ti->type->iterate_devices)
@@ -1591,7 +1597,7 @@ int dm_calculate_queue_limits(struct dm_table *t,
unsigned int zone_sectors = 0;
bool zoned = false;
 
-   blk_set_stacking_limits(limits);
+   dm_set_stacking_limits(limits);
 
t->integrity_supported = true;
for (unsigned int i = 0; i < t->num_targets; i++) {
@@ -1604,7 +1610,7 @@ int dm_calculate_queue_limits(struct dm_table *t,
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
 
-   blk_set_stacking_limits(&ti_limits);
+   dm_set_stacking_limits(&ti_limits);
 
if (!ti->type->iterate_devices) {
/* Set I/O hints portion of queue limits */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 13037d6a6f62a2..8a976cee448bed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2386,22 +2386,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct 
dm_table *t)
struct table_device *td;
int r;
 
-   switch (type) {
-   case DM_TYPE_REQUEST_BASED:
+   WARN_ON_ONCE(type == DM_TYPE_NONE);
+
+   if (type == DM_TYPE_REQUEST_BASED) {
md->disk->fops = &dm_rq_blk_dops;
r = dm_mq_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based dm 
mapped device");
return r;
}
-   break;
-   case DM_TYPE_BIO_BASED:
-   case DM_TYPE_DAX_BIO_BASED:
-   blk_queue_flag_set(QUEUE_FLAG_IO

[PATCH 15/26] block: move the add_random flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the add_random flag into the queue_limits feature field so that it
can be set atomically with the queue frozen.

Note that this also removes code from dm to clear the flag based on
the underlying devices, which can't be reached as dm devices will
always start out without the flag set.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 block/blk-mq-debugfs.c|  1 -
 block/blk-sysfs.c |  6 +++---
 drivers/block/mtip32xx/mtip32xx.c |  1 -
 drivers/md/dm-table.c | 18 --
 drivers/mmc/core/queue.c  |  2 --
 drivers/mtd/mtd_blkdevs.c |  3 ---
 drivers/s390/block/scm_blk.c  |  4 
 drivers/scsi/scsi_lib.c   |  3 +--
 drivers/scsi/sd.c | 11 +++
 include/linux/blkdev.h|  5 +++--
 10 files changed, 10 insertions(+), 44 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4d0e62ec88f033..6b7edb50bfd3fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -86,7 +86,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FAIL_IO),
QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
-   QUEUE_FLAG_NAME(ADD_RANDOM),
QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 637ed3bbbfb46f..9174aca3b85526 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -323,7 +323,7 @@ queue_##name##_store(struct request_queue *q, const char 
*page, size_t count) \
 }
 
 QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
-QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
+QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
 QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
 QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 #undef QUEUE_SYSFS_BIT_FNS
@@ -561,7 +561,7 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry 
= {
 
 QUEUE_RW_ENTRY(queue_rotational, "rotational");
 QUEUE_RW_ENTRY(queue_iostats, "iostats");
-QUEUE_RW_ENTRY(queue_random, "add_random");
+QUEUE_RW_ENTRY(queue_add_random, "add_random");
 QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
 
 #ifdef CONFIG_BLK_WBT
@@ -665,7 +665,7 @@ static struct attribute *queue_attrs[] = {
&queue_nomerges_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
-   &queue_random_entry.attr,
+   &queue_add_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_fua_entry.attr,
diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index 1dbbf72659d549..c6ef0546ffc9d2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3485,7 +3485,6 @@ static int mtip_block_initialize(struct driver_data *dd)
goto start_service_thread;
 
/* Set device limits. */
-   blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
dma_set_max_seg_size(&dd->pdev->dev, 0x40);
 
/* Set the capacity of the device in 512 byte sectors. */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index c062af32970934..0a3838e45affd4 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1716,14 +1716,6 @@ static int device_dax_write_cache_enabled(struct 
dm_target *ti,
return false;
 }
 
-static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
-sector_t start, sector_t len, void *data)
-{
-   struct request_queue *q = bdev_get_queue(dev->bdev);
-
-   return !blk_queue_add_random(q);
-}
-
 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev 
*dev,
   sector_t start, sector_t len, void 
*data)
 {
@@ -1876,16 +1868,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct 
request_queue *q,
else
blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
 
-   /*
-* Determine whether or not this queue's I/O timings contribute
-* to the entropy pool, Only request-based targets use this.
-* Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
-* have it set.
-*/
-   if (blk_queue_add_random(q) &&
-   dm_table_any_dev_attr(t, device_is_not_random, NULL))
-   blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
-
/*
 * For a zoned target, setup the zones related queue attributes
 * and resources necessary for zone append emulation if necessary.
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index b4f62fa845864c..da00904d4a3c7e 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -387,8 +387,6 @@ static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq,

[PATCH 14/26] block: move the nonrot flag to queue_limits

2024-06-16 Thread Christoph Hellwig
Move the nonrot flag into the queue_limits feature field so that it can
be set atomically with the queue frozen.

Use the chance to switch to defaulting to non-rotational and require
the driver to opt into rotational, which matches the polarity of the
sysfs interface.

For the z2ram, ps3vram, 2x memstick, ubiblock and dcssblk the new
rotational flag is not set as they clearly are not rotational despite
this being a behavior change.  There are some other drivers that
unconditionally set the rotational flag to keep the existing behavior
as they arguably can be used on rotational devices even if that is
probably not their main use today (e.g. virtio_blk and drbd).

The flag is automatically inherited in blk_stack_limits matching the
existing behavior in dm and md.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
---
 arch/m68k/emu/nfblock.c |  1 +
 arch/um/drivers/ubd_kern.c  |  1 -
 arch/xtensa/platforms/iss/simdisk.c |  5 +++-
 block/blk-mq-debugfs.c  |  1 -
 block/blk-sysfs.c   | 39 ++---
 drivers/block/amiflop.c |  5 +++-
 drivers/block/aoe/aoeblk.c  |  1 +
 drivers/block/ataflop.c |  5 +++-
 drivers/block/brd.c |  2 --
 drivers/block/drbd/drbd_main.c  |  3 ++-
 drivers/block/floppy.c  |  3 ++-
 drivers/block/loop.c|  8 +++---
 drivers/block/mtip32xx/mtip32xx.c   |  1 -
 drivers/block/n64cart.c |  2 --
 drivers/block/nbd.c |  5 
 drivers/block/null_blk/main.c   |  1 -
 drivers/block/pktcdvd.c |  1 +
 drivers/block/ps3disk.c |  3 ++-
 drivers/block/rbd.c |  3 ---
 drivers/block/rnbd/rnbd-clt.c   |  4 ---
 drivers/block/sunvdc.c  |  1 +
 drivers/block/swim.c|  5 +++-
 drivers/block/swim3.c   |  5 +++-
 drivers/block/ublk_drv.c|  9 +++
 drivers/block/virtio_blk.c  |  4 ++-
 drivers/block/xen-blkfront.c|  1 -
 drivers/block/zram/zram_drv.c   |  2 --
 drivers/cdrom/gdrom.c   |  1 +
 drivers/md/bcache/super.c   |  2 --
 drivers/md/dm-table.c   | 12 -
 drivers/md/md.c | 13 --
 drivers/mmc/core/queue.c|  1 -
 drivers/mtd/mtd_blkdevs.c   |  1 -
 drivers/nvdimm/btt.c|  1 -
 drivers/nvdimm/pmem.c   |  1 -
 drivers/nvme/host/core.c|  1 -
 drivers/nvme/host/multipath.c   |  1 -
 drivers/s390/block/dasd_genhd.c |  1 -
 drivers/s390/block/scm_blk.c|  1 -
 drivers/scsi/sd.c   |  4 +--
 include/linux/blkdev.h  | 10 
 41 files changed, 83 insertions(+), 88 deletions(-)

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 642fb80c5c4e31..8eea7ef9115146 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -98,6 +98,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 {
struct queue_limits lim = {
.logical_block_size = bsize,
+   .features   = BLK_FEAT_ROTATIONAL,
};
struct nfhd_device *dev;
int dev_id = id - NFHD_DEV_OFFSET;
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 19e01691ea0ea7..9f1e76ddda5a26 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -882,7 +882,6 @@ static int ubd_add(int n, char **error_out)
goto out_cleanup_tags;
}
 
-   blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
disk->major = UBD_MAJOR;
disk->first_minor = n << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
diff --git a/arch/xtensa/platforms/iss/simdisk.c 
b/arch/xtensa/platforms/iss/simdisk.c
index defc67909a9c74..d6d2b533a5744d 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -263,6 +263,9 @@ static const struct proc_ops simdisk_proc_ops = {
 static int __init simdisk_setup(struct simdisk *dev, int which,
struct proc_dir_entry *procdir)
 {
+   struct queue_limits lim = {
+   .features   = BLK_FEAT_ROTATIONAL,
+   };
char tmp[2] = { '0' + which, 0 };
int err;
 
@@ -271,7 +274,7 @@ static int __init simdisk_setup(struct simdisk *dev, int 
which,
spin_lock_init(&dev->lock);
dev->users = 0;
 
-   dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE);
+   dev->gd = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(dev->gd)) {
err = PTR_ERR(dev->gd);
goto out;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e8b9db7c30c455..4d0e62ec88f033 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -84,7 +84,6 @@ static const char *const blk_queue_flag_name[] = {

[PATCH 13/26] block: move cache control settings out of queue->flags

2024-06-16 Thread Christoph Hellwig
Move the cache control settings into the queue_limits so that the flags
can be set atomically with the device queue frozen.

Add new features and flags field for the driver set flags, and internal
(usually sysfs-controlled) flags in the block layer.  Note that we'll
eventually remove enough field from queue_limits to bring it back to the
previous size.

The disable flag is inverted compared to the previous meaning, which
means it now survives a rescan, similar to the max_sectors and
max_discard_sectors user limits.

The FLUSH and FUA flags are now inherited by blk_stack_limits, which
simplifies the code in dm a lot, but also causes a slight behavior
change in that dm-switch and dm-unstripe now advertise a write cache
despite setting num_flush_bios to 0.  The I/O path will handle this
gracefully, but as far as I can tell the lack of num_flush_bios
and thus flush support is a pre-existing data integrity bug in those
targets that really needs fixing, after which a non-zero num_flush_bios
should be required in dm for targets that map to underlying devices.

Signed-off-by: Christoph Hellwig 
Acked-by: Ulf Hansson  [mmc]
---
 .../block/writeback_cache_control.rst | 67 +++
 arch/um/drivers/ubd_kern.c|  2 +-
 block/blk-core.c  |  2 +-
 block/blk-flush.c |  9 ++-
 block/blk-mq-debugfs.c|  2 -
 block/blk-settings.c  | 29 ++--
 block/blk-sysfs.c | 29 +---
 block/blk-wbt.c   |  4 +-
 drivers/block/drbd/drbd_main.c|  2 +-
 drivers/block/loop.c  |  9 +--
 drivers/block/nbd.c   | 14 ++--
 drivers/block/null_blk/main.c | 12 ++--
 drivers/block/ps3disk.c   |  7 +-
 drivers/block/rnbd/rnbd-clt.c | 10 +--
 drivers/block/ublk_drv.c  |  8 ++-
 drivers/block/virtio_blk.c| 20 --
 drivers/block/xen-blkfront.c  |  8 ++-
 drivers/md/bcache/super.c |  7 +-
 drivers/md/dm-table.c | 39 +++
 drivers/md/md.c   |  8 ++-
 drivers/mmc/core/block.c  | 42 ++--
 drivers/mmc/core/queue.c  | 12 ++--
 drivers/mmc/core/queue.h  |  3 +-
 drivers/mtd/mtd_blkdevs.c |  5 +-
 drivers/nvdimm/pmem.c |  4 +-
 drivers/nvme/host/core.c  |  7 +-
 drivers/nvme/host/multipath.c |  6 --
 drivers/scsi/sd.c | 28 +---
 include/linux/blkdev.h| 38 +--
 29 files changed, 227 insertions(+), 206 deletions(-)

diff --git a/Documentation/block/writeback_cache_control.rst 
b/Documentation/block/writeback_cache_control.rst
index b208488d0aae85..c575e08beda8e3 100644
--- a/Documentation/block/writeback_cache_control.rst
+++ b/Documentation/block/writeback_cache_control.rst
@@ -46,41 +46,50 @@ worry if the underlying devices need any explicit cache 
flushing and how
 the Forced Unit Access is implemented.  The REQ_PREFLUSH and REQ_FUA flags
 may both be set on a single bio.
 
+Feature settings for block drivers
+--
 
-Implementation details for bio based block drivers
---
+For devices that do not support volatile write caches there is no driver
+support required, the block layer completes empty REQ_PREFLUSH requests before
+entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
+requests that have a payload.
 
-These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
-directly below the submit_bio interface.  For remapping drivers the REQ_FUA
-bits need to be propagated to underlying devices, and a global flush needs
-to be implemented for bios with the REQ_PREFLUSH bit set.  For real device
-drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
-on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
-data can be completed successfully without doing any work.  Drivers for
-devices with volatile caches need to implement the support for these
-flags themselves without any help from the block layer.
+For devices with volatile write caches the driver needs to tell the block layer
+that it supports flushing caches by setting the
 
+   BLK_FEAT_WRITE_CACHE
 
-Implementation details for request_fn based block drivers
--
+flag in the queue_limits feature field.  For devices that also support the FUA
+bit the block layer needs to be told to pass on the REQ_FUA bit by also setting
+the
 
-For devices that do not support volatile write caches there is no driver
-support required, the 

[PATCH 12/26] block: remove blk_flush_policy

2024-06-16 Thread Christoph Hellwig
Fold blk_flush_policy into the only caller to prepare for pending changes
to it.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
---
 block/blk-flush.c | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index c17cf8ed8113db..2234f8b3fc05f2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -100,23 +100,6 @@ blk_get_flush_queue(struct request_queue *q, struct 
blk_mq_ctx *ctx)
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
 }
 
-static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
-{
-   unsigned int policy = 0;
-
-   if (blk_rq_sectors(rq))
-   policy |= REQ_FSEQ_DATA;
-
-   if (fflags & (1UL << QUEUE_FLAG_WC)) {
-   if (rq->cmd_flags & REQ_PREFLUSH)
-   policy |= REQ_FSEQ_PREFLUSH;
-   if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
-   (rq->cmd_flags & REQ_FUA))
-   policy |= REQ_FSEQ_POSTFLUSH;
-   }
-   return policy;
-}
-
 static unsigned int blk_flush_cur_seq(struct request *rq)
 {
return 1 << ffz(rq->flush.seq);
@@ -399,12 +382,26 @@ bool blk_insert_flush(struct request *rq)
 {
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags;  /* may change, cache */
-   unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
+   unsigned int policy = 0;
 
/* FLUSH/FUA request must never be merged */
WARN_ON_ONCE(rq->bio != rq->biotail);
 
+   if (blk_rq_sectors(rq))
+   policy |= REQ_FSEQ_DATA;
+
+   /*
+* Check which flushes we need to sequence for this operation.
+*/
+   if (fflags & (1UL << QUEUE_FLAG_WC)) {
+   if (rq->cmd_flags & REQ_PREFLUSH)
+   policy |= REQ_FSEQ_PREFLUSH;
+   if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
+   (rq->cmd_flags & REQ_FUA))
+   policy |= REQ_FSEQ_POSTFLUSH;
+   }
+
/*
 * @policy now records what operations need to be done.  Adjust
 * REQ_PREFLUSH and FUA for the driver.
-- 
2.43.0



[PATCH 11/26] block: freeze the queue in queue_attr_store

2024-06-16 Thread Christoph Hellwig
queue_attr_store updates attributes used to control generating I/O, and
can cause malformed bios if changed with I/O in flight.  Freeze the queue
in common code instead of adding it to almost every attribute.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
---
 block/blk-mq.c| 5 +++--
 block/blk-sysfs.c | 9 ++---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0d4cd39c3d25da..58b0d6c7cc34d6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4631,13 +4631,15 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
int ret;
unsigned long i;
 
+   if (WARN_ON_ONCE(!q->mq_freeze_depth))
+   return -EINVAL;
+
if (!set)
return -EINVAL;
 
if (q->nr_requests == nr)
return 0;
 
-   blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
 
ret = 0;
@@ -4671,7 +4673,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
}
 
blk_mq_unquiesce_queue(q);
-   blk_mq_unfreeze_queue(q);
 
return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f0f9314ab65c61..5c787965b7d09e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -189,12 +189,9 @@ static ssize_t queue_discard_max_store(struct 
request_queue *q,
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
 
-   blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
err = queue_limits_commit_update(q, &lim);
-   blk_mq_unfreeze_queue(q);
-
if (err)
return err;
return ret;
@@ -241,11 +238,9 @@ queue_max_sectors_store(struct request_queue *q, const 
char *page, size_t count)
if (ret < 0)
return ret;
 
-   blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_sectors = max_sectors_kb << 1;
err = queue_limits_commit_update(q, &lim);
-   blk_mq_unfreeze_queue(q);
if (err)
return err;
return ret;
@@ -585,13 +580,11 @@ static ssize_t queue_wb_lat_store(struct request_queue 
*q, const char *page,
 * ends up either enabling or disabling wbt completely. We can't
 * have IO inflight if that happens.
 */
-   blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
 
wbt_set_min_lat(q, val);
 
blk_mq_unquiesce_queue(q);
-   blk_mq_unfreeze_queue(q);
 
return count;
 }
@@ -722,9 +715,11 @@ queue_attr_store(struct kobject *kobj, struct attribute 
*attr,
if (!entry->store)
return -EIO;
 
+   blk_mq_freeze_queue(q);
mutex_lock(&q->sysfs_lock);
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
+   blk_mq_unfreeze_queue(q);
return res;
 }
 
-- 
2.43.0



[PATCH 10/26] nbd: move setting the cache control flags to __nbd_set_size

2024-06-16 Thread Christoph Hellwig
Move setting the cache control flags in nbd in preparation for moving
these flags into the queue_limits structure.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Reviewed-by: Josef Bacik 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
---
 drivers/block/nbd.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index ad887d614d5b3f..44b8c671921e5c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -342,6 +342,12 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t 
bytesize,
lim.max_hw_discard_sectors = UINT_MAX;
else
lim.max_hw_discard_sectors = 0;
+   if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH))
+   blk_queue_write_cache(nbd->disk->queue, false, false);
+   else if (nbd->config->flags & NBD_FLAG_SEND_FUA)
+   blk_queue_write_cache(nbd->disk->queue, true, true);
+   else
+   blk_queue_write_cache(nbd->disk->queue, true, false);
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
@@ -1286,19 +1292,10 @@ static void nbd_bdev_reset(struct nbd_device *nbd)
 
 static void nbd_parse_flags(struct nbd_device *nbd)
 {
-   struct nbd_config *config = nbd->config;
-   if (config->flags & NBD_FLAG_READ_ONLY)
+   if (nbd->config->flags & NBD_FLAG_READ_ONLY)
set_disk_ro(nbd->disk, true);
else
set_disk_ro(nbd->disk, false);
-   if (config->flags & NBD_FLAG_SEND_FLUSH) {
-   if (config->flags & NBD_FLAG_SEND_FUA)
-   blk_queue_write_cache(nbd->disk->queue, true, true);
-   else
-   blk_queue_write_cache(nbd->disk->queue, true, false);
-   }
-   else
-   blk_queue_write_cache(nbd->disk->queue, false, false);
 }
 
 static void send_disconnects(struct nbd_device *nbd)
-- 
2.43.0



[PATCH 09/26] virtio_blk: remove virtblk_update_cache_mode

2024-06-16 Thread Christoph Hellwig
virtblk_update_cache_mode boils down to a single call to
blk_queue_write_cache.  Remove it in preparation for moving the cache
control flags into the queue_limits.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Johannes Thumshirn 
---
 drivers/block/virtio_blk.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2351f411fa4680..378b241911ca87 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1089,14 +1089,6 @@ static int virtblk_get_cache_mode(struct virtio_device 
*vdev)
return writeback;
 }
 
-static void virtblk_update_cache_mode(struct virtio_device *vdev)
-{
-   u8 writeback = virtblk_get_cache_mode(vdev);
-   struct virtio_blk *vblk = vdev->priv;
-
-   blk_queue_write_cache(vblk->disk->queue, writeback, false);
-}
-
 static const char *const virtblk_cache_types[] = {
"write through", "write back"
 };
@@ -1116,7 +1108,7 @@ cache_type_store(struct device *dev, struct 
device_attribute *attr,
return i;
 
virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
-   virtblk_update_cache_mode(vdev);
+   blk_queue_write_cache(disk->queue, virtblk_get_cache_mode(vdev), false);
return count;
 }
 
@@ -1528,7 +1520,8 @@ static int virtblk_probe(struct virtio_device *vdev)
vblk->index = index;
 
/* configure queue flush support */
-   virtblk_update_cache_mode(vdev);
+   blk_queue_write_cache(vblk->disk->queue, virtblk_get_cache_mode(vdev),
+   false);
 
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
-- 
2.43.0



[PATCH 08/26] loop: fold loop_update_rotational into loop_reconfigure_limits

2024-06-16 Thread Christoph Hellwig
This prepares for moving the rotational flag into the queue_limits and
also fixes it for the case where the loop device is backed by a block
device.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Bart Van Assche 
---
 drivers/block/loop.c | 23 ---
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6a4826708a3acf..8991de8fb1bb0b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -916,24 +916,6 @@ static void loop_free_idle_workers_timer(struct timer_list 
*timer)
return loop_free_idle_workers(lo, false);
 }
 
-static void loop_update_rotational(struct loop_device *lo)
-{
-   struct file *file = lo->lo_backing_file;
-   struct inode *file_inode = file->f_mapping->host;
-   struct block_device *file_bdev = file_inode->i_sb->s_bdev;
-   struct request_queue *q = lo->lo_queue;
-   bool nonrot = true;
-
-   /* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
-   if (file_bdev)
-   nonrot = bdev_nonrot(file_bdev);
-
-   if (nonrot)
-   blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
-   else
-   blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
-}
-
 /**
  * loop_set_status_from_info - configure device from loop_info
  * @lo: struct loop_device to configure
@@ -1003,6 +985,10 @@ static int loop_reconfigure_limits(struct loop_device 
*lo, unsigned short bsize)
lim.logical_block_size = bsize;
lim.physical_block_size = bsize;
lim.io_min = bsize;
+   if (!backing_bdev || bdev_nonrot(backing_bdev))
+   blk_queue_flag_set(QUEUE_FLAG_NONROT, lo->lo_queue);
+   else
+   blk_queue_flag_clear(QUEUE_FLAG_NONROT, lo->lo_queue);
loop_config_discard(lo, &lim);
return queue_limits_commit_update(lo->lo_queue, &lim);
 }
@@ -1099,7 +1085,6 @@ static int loop_configure(struct loop_device *lo, 
blk_mode_t mode,
if (WARN_ON_ONCE(error))
goto out_unlock;
 
-   loop_update_rotational(lo);
loop_update_dio(lo);
loop_sysfs_init(lo);
 
-- 
2.43.0



[PATCH 07/26] loop: also use the default block size from an underlying block device

2024-06-16 Thread Christoph Hellwig
Fix the code in loop_reconfigure_limits to pick a default block size for
O_DIRECT file descriptors to also work when the loop device sits on top
of a block device and not just on a regular file on a block device based
file system.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Bart Van Assche 
---
 drivers/block/loop.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index eea3e4919e356e..6a4826708a3acf 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -988,10 +988,16 @@ static int loop_reconfigure_limits(struct loop_device 
*lo, unsigned short bsize)
 {
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
+   struct block_device *backing_bdev = NULL;
struct queue_limits lim;
 
+   if (S_ISBLK(inode->i_mode))
+   backing_bdev = I_BDEV(inode);
+   else if (inode->i_sb->s_bdev)
+   backing_bdev = inode->i_sb->s_bdev;
+
if (!bsize)
-   bsize = loop_default_blocksize(lo, inode->i_sb->s_bdev);
+   bsize = loop_default_blocksize(lo, backing_bdev);
 
lim = queue_limits_start_update(lo->lo_queue);
lim.logical_block_size = bsize;
-- 
2.43.0



[PATCH 06/26] loop: regularize upgrading the block size for direct I/O

2024-06-16 Thread Christoph Hellwig
The LOOP_CONFIGURE path automatically upgrades the block size to that
of the underlying file for O_DIRECT file descriptors, but the
LOOP_SET_BLOCK_SIZE path does not.  Fix this by lifting the code to
pick the block size into common code.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Bart Van Assche 
---
 drivers/block/loop.c | 25 +++--
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ce197cbea5f434..eea3e4919e356e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -975,10 +975,24 @@ loop_set_status_from_info(struct loop_device *lo,
return 0;
 }
 
+static unsigned short loop_default_blocksize(struct loop_device *lo,
+   struct block_device *backing_bdev)
+{
+   /* In case of direct I/O, match underlying block size */
+   if ((lo->lo_backing_file->f_flags & O_DIRECT) && backing_bdev)
+   return bdev_logical_block_size(backing_bdev);
+   return SECTOR_SIZE;
+}
+
 static int loop_reconfigure_limits(struct loop_device *lo, unsigned short 
bsize)
 {
+   struct file *file = lo->lo_backing_file;
+   struct inode *inode = file->f_mapping->host;
struct queue_limits lim;
 
+   if (!bsize)
+   bsize = loop_default_blocksize(lo, inode->i_sb->s_bdev);
+
lim = queue_limits_start_update(lo->lo_queue);
lim.logical_block_size = bsize;
lim.physical_block_size = bsize;
@@ -997,7 +1011,6 @@ static int loop_configure(struct loop_device *lo, 
blk_mode_t mode,
int error;
loff_t size;
bool partscan;
-   unsigned short bsize;
bool is_loop;
 
if (!file)
@@ -1076,15 +1089,7 @@ static int loop_configure(struct loop_device *lo, 
blk_mode_t mode,
if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_write_cache(lo->lo_queue, true, false);
 
-   if (config->block_size)
-   bsize = config->block_size;
-   else if ((lo->lo_backing_file->f_flags & O_DIRECT) && 
inode->i_sb->s_bdev)
-   /* In case of direct I/O, match underlying block size */
-   bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
-   else
-   bsize = 512;
-
-   error = loop_reconfigure_limits(lo, bsize);
+   error = loop_reconfigure_limits(lo, config->block_size);
if (WARN_ON_ONCE(error))
goto out_unlock;
 
-- 
2.43.0



[PATCH 05/26] loop: always update discard settings in loop_reconfigure_limits

2024-06-16 Thread Christoph Hellwig
Simplify loop_reconfigure_limits by always updating the discard limits.
This adds a little more work to loop_set_block_size, but doesn't change
the outcome as the discard flag won't change.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Bart Van Assche 
---
 drivers/block/loop.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fd671028fa8554..ce197cbea5f434 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -975,8 +975,7 @@ loop_set_status_from_info(struct loop_device *lo,
return 0;
 }
 
-static int loop_reconfigure_limits(struct loop_device *lo, unsigned short 
bsize,
-   bool update_discard_settings)
+static int loop_reconfigure_limits(struct loop_device *lo, unsigned short 
bsize)
 {
struct queue_limits lim;
 
@@ -984,8 +983,7 @@ static int loop_reconfigure_limits(struct loop_device *lo, 
unsigned short bsize,
lim.logical_block_size = bsize;
lim.physical_block_size = bsize;
lim.io_min = bsize;
-   if (update_discard_settings)
-   loop_config_discard(lo, &lim);
+   loop_config_discard(lo, &lim);
return queue_limits_commit_update(lo->lo_queue, &lim);
 }
 
@@ -1086,7 +1084,7 @@ static int loop_configure(struct loop_device *lo, 
blk_mode_t mode,
else
bsize = 512;
 
-   error = loop_reconfigure_limits(lo, bsize, true);
+   error = loop_reconfigure_limits(lo, bsize);
if (WARN_ON_ONCE(error))
goto out_unlock;
 
@@ -1496,7 +1494,7 @@ static int loop_set_block_size(struct loop_device *lo, 
unsigned long arg)
invalidate_bdev(lo->lo_device);
 
blk_mq_freeze_queue(lo->lo_queue);
-   err = loop_reconfigure_limits(lo, arg, false);
+   err = loop_reconfigure_limits(lo, arg);
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue);
 
-- 
2.43.0



[PATCH 03/26] sd: move zone limits setup out of sd_read_block_characteristics

2024-06-16 Thread Christoph Hellwig
Move a bit of code that sets up the zone flag and the write granularity
into sd_zbc_read_zones to be with the rest of the zoned limits.

Signed-off-by: Christoph Hellwig 
---
 drivers/scsi/sd.c | 21 +
 drivers/scsi/sd_zbc.c |  9 +
 2 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 664523048ce819..66f7d1e3429c86 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -3312,29 +3312,10 @@ static void sd_read_block_characteristics(struct 
scsi_disk *sdkp,
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
}
 
-
-#ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */
-   if (sdkp->device->type == TYPE_ZBC) {
-   lim->zoned = true;
-
-   /*
-* Per ZBC and ZAC specifications, writes in sequential write
-* required zones of host-managed devices must be aligned to
-* the device physical block size.
-*/
-   lim->zone_write_granularity = sdkp->physical_block_size;
-   } else {
-   /*
-* Host-aware devices are treated as conventional.
-*/
-   lim->zoned = false;
-   }
-#endif /* CONFIG_BLK_DEV_ZONED */
-
if (!sdkp->first_scan)
return;
 
-   if (lim->zoned)
+   if (sdkp->device->type == TYPE_ZBC)
sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block 
device\n");
else if (sdkp->zoned == 1)
sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as 
regular disk\n");
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 8cc9c025017961..360ec980499529 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -601,6 +601,15 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct 
queue_limits *lim,
if (sdkp->device->type != TYPE_ZBC)
return 0;
 
+   lim->zoned = true;
+
+   /*
+* Per ZBC and ZAC specifications, writes in sequential write required
+* zones of host-managed devices must be aligned to the device physical
+* block size.
+*/
+   lim->zone_write_granularity = sdkp->physical_block_size;
+
/* READ16/WRITE16/SYNC16 is mandatory for ZBC devices */
sdkp->device->use_16_for_rw = 1;
sdkp->device->use_10_for_rw = 0;
-- 
2.43.0



[PATCH 04/26] loop: stop using loop_reconfigure_limits in __loop_clr_fd

2024-06-16 Thread Christoph Hellwig
__loop_clr_fd wants to clear all settings on the device.  Prepare for
moving more settings into the block limits by open coding
loop_reconfigure_limits.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Bart Van Assche 
---
 drivers/block/loop.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 93780f41646b75..fd671028fa8554 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1133,6 +1133,7 @@ static int loop_configure(struct loop_device *lo, 
blk_mode_t mode,
 
 static void __loop_clr_fd(struct loop_device *lo, bool release)
 {
+   struct queue_limits lim;
struct file *filp;
gfp_t gfp = lo->old_gfp_mask;
 
@@ -1156,7 +1157,14 @@ static void __loop_clr_fd(struct loop_device *lo, bool 
release)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
-   loop_reconfigure_limits(lo, 512, false);
+
+   /* reset the block size to the default */
+   lim = queue_limits_start_update(lo->lo_queue);
+   lim.logical_block_size = SECTOR_SIZE;
+   lim.physical_block_size = SECTOR_SIZE;
+   lim.io_min = SECTOR_SIZE;
+   queue_limits_commit_update(lo->lo_queue, &lim);
+
invalidate_disk(lo->lo_disk);
loop_sysfs_exit(lo);
/* let user-space know about this change */
-- 
2.43.0



[PATCH 02/26] sd: remove sd_is_zoned

2024-06-16 Thread Christoph Hellwig
Since commit 7437bb73f087 ("block: remove support for the host aware zone
model"), only ZBC devices expose a zoned access model.  sd_is_zoned is
used to check for that and thus return false for host aware devices.

Replace the helper with the simple open coded TYPE_ZBC check to fix this.

Fixes: 7437bb73f087 ("block: remove support for the host aware zone model")
Signed-off-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Reviewed-by: Damien Le Moal 
Reviewed-by: Hannes Reinecke 
Reviewed-by: Johannes Thumshirn 
---
 drivers/scsi/sd.c |  6 +-
 drivers/scsi/sd.h |  5 -
 drivers/scsi/sd_zbc.c | 13 -
 3 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index e01393ed42076b..664523048ce819 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -466,10 +466,6 @@ provisioning_mode_store(struct device *dev, struct 
device_attribute *attr,
if (sdp->type != TYPE_DISK)
return -EINVAL;
 
-   /* ignore the provisioning mode for ZBC devices */
-   if (sd_is_zoned(sdkp))
-   return count;
-
mode = sysfs_match_string(lbp_mode, buf);
if (mode < 0)
return -EINVAL;
@@ -2288,7 +2284,7 @@ static int sd_done(struct scsi_cmnd *SCpnt)
}
 
  out:
-   if (sd_is_zoned(sdkp))
+   if (sdkp->device->type == TYPE_ZBC)
good_bytes = sd_zbc_complete(SCpnt, good_bytes, &sshdr);
 
SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 726f1613f6cb56..7603b3c67b233f 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -222,11 +222,6 @@ static inline sector_t sectors_to_logical(struct 
scsi_device *sdev, sector_t sec
 
 void sd_dif_config_host(struct scsi_disk *sdkp, struct queue_limits *lim);
 
-static inline int sd_is_zoned(struct scsi_disk *sdkp)
-{
-   return sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC;
-}
-
 #ifdef CONFIG_BLK_DEV_ZONED
 
 int sd_zbc_read_zones(struct scsi_disk *sdkp, struct queue_limits *lim,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index f685838d9ed214..8cc9c025017961 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -232,7 +232,7 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t 
sector,
int zone_idx = 0;
int ret;
 
-   if (!sd_is_zoned(sdkp))
+   if (sdkp->device->type != TYPE_ZBC)
/* Not a zoned device */
return -EOPNOTSUPP;
 
@@ -300,7 +300,7 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd 
*cmd)
struct scsi_disk *sdkp = scsi_disk(rq->q->disk);
sector_t sector = blk_rq_pos(rq);
 
-   if (!sd_is_zoned(sdkp))
+   if (sdkp->device->type != TYPE_ZBC)
/* Not a zoned device */
return BLK_STS_IOERR;
 
@@ -521,7 +521,7 @@ static int sd_zbc_check_capacity(struct scsi_disk *sdkp, 
unsigned char *buf,
 
 static void sd_zbc_print_zones(struct scsi_disk *sdkp)
 {
-   if (!sd_is_zoned(sdkp) || !sdkp->capacity)
+   if (sdkp->device->type != TYPE_ZBC || !sdkp->capacity)
return;
 
if (sdkp->capacity & (sdkp->zone_info.zone_blocks - 1))
@@ -598,13 +598,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, struct 
queue_limits *lim,
u32 zone_blocks = 0;
int ret;
 
-   if (!sd_is_zoned(sdkp)) {
-   /*
-* Device managed or normal SCSI disk, no special handling
-* required.
-*/
+   if (sdkp->device->type != TYPE_ZBC)
return 0;
-   }
 
/* READ16/WRITE16/SYNC16 is mandatory for ZBC devices */
sdkp->device->use_16_for_rw = 1;
-- 
2.43.0



move features flags into queue_limits v2

2024-06-16 Thread Christoph Hellwig
Hi all,

this is the third and last major series to convert settings to
queue_limits for this merge window.  After a bunch of prep patches to
get various drivers in shape, it moves all the queue_flags that specify
driver controlled features into the queue limits so that they can be
set atomically and are separated from the blk-mq internal flags.

Note that I've only Cc'ed the maintainers for drivers with non-mechanical
changes as the Cc list is already huge.

This series sits on top of the for-6.11/block-limits branch.

A git tree is available here:

git://git.infradead.org/users/hch/block.git block-limit-flags

Gitweb:


http://git.infradead.org/?p=users/hch/block.git;a=shortlog;h=refs/heads/block-limit-flags


Changes since v1:
 - fix an inverted condition
 - fix the runtime flush disable in xen-blkfront
 - remove sd_is_zoned entirely
 - use SECTOR_SIZE in a few more places
 - fix REQ_NOWAIT disabling for dm targets that don't support it
 - fix typos
 - reword various commit logs

Diffstat:
 Documentation/block/writeback_cache_control.rst |   67 
 arch/m68k/emu/nfblock.c |1 
 arch/um/drivers/ubd_kern.c  |3 
 arch/xtensa/platforms/iss/simdisk.c |5 
 block/blk-core.c|7 
 block/blk-flush.c   |   36 ++--
 block/blk-mq-debugfs.c  |   13 -
 block/blk-mq.c  |   42 +++--
 block/blk-settings.c|   46 ++
 block/blk-sysfs.c   |  118 ---
 block/blk-wbt.c |4 
 block/blk.h |2 
 drivers/block/amiflop.c |5 
 drivers/block/aoe/aoeblk.c  |1 
 drivers/block/ataflop.c |5 
 drivers/block/brd.c |6 
 drivers/block/drbd/drbd_main.c  |6 
 drivers/block/floppy.c  |3 
 drivers/block/loop.c|   79 --
 drivers/block/mtip32xx/mtip32xx.c   |2 
 drivers/block/n64cart.c |2 
 drivers/block/nbd.c |   24 +--
 drivers/block/null_blk/main.c   |   13 -
 drivers/block/null_blk/zoned.c  |3 
 drivers/block/pktcdvd.c |1 
 drivers/block/ps3disk.c |8 -
 drivers/block/rbd.c |   12 -
 drivers/block/rnbd/rnbd-clt.c   |   14 -
 drivers/block/sunvdc.c  |1 
 drivers/block/swim.c|5 
 drivers/block/swim3.c   |5 
 drivers/block/ublk_drv.c|   21 +-
 drivers/block/virtio_blk.c  |   37 ++--
 drivers/block/xen-blkfront.c|   53 +++---
 drivers/block/zram/zram_drv.c   |6 
 drivers/cdrom/gdrom.c   |1 
 drivers/md/bcache/super.c   |9 -
 drivers/md/dm-table.c   |  183 +---
 drivers/md/dm-zone.c|2 
 drivers/md/dm-zoned-target.c|2 
 drivers/md/dm.c |   13 -
 drivers/md/md.c |   40 -
 drivers/md/raid5.c  |6 
 drivers/mmc/core/block.c|   42 ++---
 drivers/mmc/core/queue.c|   20 +-
 drivers/mmc/core/queue.h|3 
 drivers/mtd/mtd_blkdevs.c   |9 -
 drivers/nvdimm/btt.c|4 
 drivers/nvdimm/pmem.c   |   14 -
 drivers/nvme/host/core.c|   33 ++--
 drivers/nvme/host/multipath.c   |   24 ---
 drivers/nvme/host/zns.c |3 
 drivers/s390/block/dasd_genhd.c |1 
 drivers/s390/block/dcssblk.c|2 
 drivers/s390/block/scm_blk.c|5 
 drivers/scsi/iscsi_tcp.c|8 -
 drivers/scsi/scsi_lib.c |5 
 drivers/scsi/sd.c   |   66 +++-
 drivers/scsi/sd.h   |5 
 drivers/scsi/sd_zbc.c   |   25 +--
 include/linux/blkdev.h  |  119 ++-
 61 files changed, 572 insertions(+), 728 deletions(-)


[PATCH 01/26] xen-blkfront: don't disable cache flushes when they fail

2024-06-16 Thread Christoph Hellwig
blkfront always had a robust negotiation protocol for detecting a write
cache.  Stop simply disabling cache flushes in the block layer as the
flags handling is moving to the atomic queue limits API that needs
user context to freeze the queue for that.  Instead handle the case
of the feature flags cleared inside of blkfront.  This removes old
debug code to check for such a mismatch which was previously impossible
to hit, including the check for passthrough requests that blkfront
never used to start with.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 44 +++-
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9b4ec3e4908cce..851b03844edd13 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -788,6 +788,11 @@ static int blkif_queue_rw_req(struct request *req, struct 
blkfront_ring_info *ri
 * A barrier request a superset of FUA, so we can
 * implement it the same way.  (It's also a FLUSH+FUA,
 * since it is guaranteed ordered WRT previous writes.)
+*
+* Note that we can end up here with a FUA write and the
+* flags cleared.  This happens when the flag was
+* run-time disabled after a failing I/O, and we'll
+* simply submit it as a normal write.
 */
if (info->feature_flush && info->feature_fua)
ring_req->operation =
@@ -795,8 +800,6 @@ static int blkif_queue_rw_req(struct request *req, struct 
blkfront_ring_info *ri
else if (info->feature_flush)
ring_req->operation =
BLKIF_OP_FLUSH_DISKCACHE;
-   else
-   ring_req->operation = 0;
}
ring_req->u.rw.nr_segments = num_grant;
if (unlikely(require_extra_req)) {
@@ -887,16 +890,6 @@ static inline void flush_requests(struct 
blkfront_ring_info *rinfo)
notify_remote_via_irq(rinfo->irq);
 }
 
-static inline bool blkif_request_flush_invalid(struct request *req,
-  struct blkfront_info *info)
-{
-   return (blk_rq_is_passthrough(req) ||
-   ((req_op(req) == REQ_OP_FLUSH) &&
-!info->feature_flush) ||
-   ((req->cmd_flags & REQ_FUA) &&
-!info->feature_fua));
-}
-
 static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
  const struct blk_mq_queue_data *qd)
 {
@@ -908,12 +901,22 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx 
*hctx,
rinfo = get_rinfo(info, qid);
blk_mq_start_request(qd->rq);
spin_lock_irqsave(&rinfo->ring_lock, flags);
-   if (RING_FULL(&rinfo->ring))
-   goto out_busy;
 
-   if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
-   goto out_err;
+   /*
+* Check if the backend actually supports flushes.
+*
+* While the block layer won't send us flushes if we don't claim to
+* support them, the Xen protocol allows the backend to revoke support
+* at any time.  That is of course a really bad idea and dangerous, but
+* has been allowed for 10+ years.  In that case we simply clear the
+* flags, and directly return here for an empty flush and ignore the
+* FUA flag later on.
+*/
+   if (unlikely(req_op(qd->rq) == REQ_OP_FLUSH && !info->feature_flush))
+   goto complete;
 
+   if (RING_FULL(&rinfo->ring))
+   goto out_busy;
if (blkif_queue_request(qd->rq, rinfo))
goto out_busy;
 
@@ -921,14 +924,14 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx 
*hctx,
spin_unlock_irqrestore(&rinfo->ring_lock, flags);
return BLK_STS_OK;
 
-out_err:
-   spin_unlock_irqrestore(&rinfo->ring_lock, flags);
-   return BLK_STS_IOERR;
-
 out_busy:
blk_mq_stop_hw_queue(hctx);
spin_unlock_irqrestore(&rinfo->ring_lock, flags);
return BLK_STS_DEV_RESOURCE;
+complete:
+   spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+   blk_mq_end_request(qd->rq, BLK_STS_OK);
+   return BLK_STS_OK;
 }
 
 static void blkif_complete_rq(struct request *rq)
@@ -1627,7 +1630,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
blkif_req(req)->error = BLK_STS_OK;
info->feature_fua = 0;
info->

Re: [PATCH 02/26] sd: move zone limits setup out of sd_read_block_characteristics

2024-06-16 Thread Christoph Hellwig
On Mon, Jun 17, 2024 at 08:01:04AM +0900, Damien Le Moal wrote:
> On 6/13/24 18:39, Christoph Hellwig wrote:
> > On Tue, Jun 11, 2024 at 02:51:24PM +0900, Damien Le Moal wrote:
> >>> + if (sdkp->device->type == TYPE_ZBC)
> >>
> >> Nit: use sd_is_zoned() here ?
> > 
> > Actually - is there much in even keeping sd_is_zoned now that the
> > host aware support is removed?  Just open coding the type check isn't
> > any more code, and probably easier to follow.
> 
> Removing this helper is fine by me.

FYI, I've removed it yesterday, but not done much of the cleanups suggested
here.  We should probably do those in a follow-up, including removing
the !ZBC check in sd_zbc_check_zoned_characteristics.



Re: [PATCH 10/26] xen-blkfront: don't disable cache flushes when they fail

2024-06-13 Thread Christoph Hellwig
On Wed, Jun 12, 2024 at 05:56:15PM +0200, Roger Pau Monné wrote:
> Right.  AFAICT advertising "feature-barrier" and/or
> "feature-flush-cache" could be done based on whether blkback
> understand those commands, not on whether the underlying storage
> supports the equivalent of them.
> 
> Worst case we can print a warning message once about the underlying
> storage failing to complete flush/barrier requests, and that data
> integrity might not be guaranteed going forward, and not propagate the
> error to the upper layer?
> 
> What would be the consequence of propagating a flush error to the
> upper layers?

If you propage the error to the upper layer you will generate an
I/O error there, which usually leads to a file system shutdown.

> Given the description of the feature in the blkif header, I'm afraid
> we cannot guarantee that seeing the feature exposed implies barrier or
> flush support, since the request could fail at any time (or even from
> the start of the disk attachment) and it would still sadly be a correct
> implementation given the description of the options.

Well, then we could do something like the patch below, which keeps
the existing behavior, but insolates the block layer from it and
removes the only user of blk_queue_write_cache from interrupt
context:

---
>From e6e82c769ab209a77302994c3829cf6ff7a595b8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig 
Date: Thu, 30 May 2024 08:58:52 +0200
Subject: xen-blkfront: don't disable cache flushes when they fail

blkfront always had a robust negotiation protocol for detecting a write
cache.  Stop simply disabling cache flushes in the block layer as the
flags handling is moving to the atomic queue limits API that needs
user context to freeze the queue for that.  Instead handle the case
of the feature flags cleared inside of blkfront.  This removes old
debug code to check for such a mismatch which was previously impossible
to hit, including the check for passthrough requests that blkfront
never used to start with.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 44 +++-
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9b4ec3e4908cce..e2c92d5095ff17 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -788,6 +788,14 @@ static int blkif_queue_rw_req(struct request *req, struct 
blkfront_ring_info *ri
 * A barrier request a superset of FUA, so we can
 * implement it the same way.  (It's also a FLUSH+FUA,
 * since it is guaranteed ordered WRT previous writes.)
+*
+* Note that can end up here with a FUA write and the
+* flags cleared.  This happens when the flag was
+* run-time disabled and raced with I/O submission in
+* the block layer.  We submit it as a normal write
+* here.  A pure flush should never end up here with
+* the flags cleared as they are completed earlier for
+* the !feature_flush case.
 */
if (info->feature_flush && info->feature_fua)
ring_req->operation =
@@ -795,8 +803,6 @@ static int blkif_queue_rw_req(struct request *req, struct 
blkfront_ring_info *ri
else if (info->feature_flush)
ring_req->operation =
BLKIF_OP_FLUSH_DISKCACHE;
-   else
-   ring_req->operation = 0;
}
ring_req->u.rw.nr_segments = num_grant;
if (unlikely(require_extra_req)) {
@@ -887,16 +893,6 @@ static inline void flush_requests(struct 
blkfront_ring_info *rinfo)
notify_remote_via_irq(rinfo->irq);
 }
 
-static inline bool blkif_request_flush_invalid(struct request *req,
-  struct blkfront_info *info)
-{
-   return (blk_rq_is_passthrough(req) ||
-   ((req_op(req) == REQ_OP_FLUSH) &&
-!info->feature_flush) ||
-   ((req->cmd_flags & REQ_FUA) &&
-!info->feature_fua));
-}
-
 static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
  const struct blk_mq_queue_data *qd)
 {
@@ -908,23 +904,30 @@ static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx 
*hctx,
rinfo = get_rinfo(info, qid);
blk_mq_start_request(qd->rq);
spin_lock_irqsave(&rinfo->ring_lock, flags);
-   if (RING_FULL(&rinfo->ring))
-   goto out_busy;

Re: [PATCH 02/26] sd: move zone limits setup out of sd_read_block_characteristics

2024-06-13 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 02:51:24PM +0900, Damien Le Moal wrote:
> > +   if (sdkp->device->type == TYPE_ZBC)
> 
> Nit: use sd_is_zoned() here ?

Actually - is there much in even keeping sd_is_zoned now that the
host aware support is removed?  Just open coding the type check isn't
any more code, and probably easier to follow.



Re: [PATCH 10/26] xen-blkfront: don't disable cache flushes when they fail

2024-06-12 Thread Christoph Hellwig
On Wed, Jun 12, 2024 at 10:01:18AM +0200, Roger Pau Monné wrote:
> On Tue, Jun 11, 2024 at 07:19:10AM +0200, Christoph Hellwig wrote:
> > blkfront always had a robust negotiation protocol for detecting a write
> > cache.  Stop simply disabling cache flushes when they fail as that is
> > a grave error.
> 
> It's my understanding the current code attempts to cover up for the
> lack of guarantees the feature itself provides:

> So even when the feature is exposed, the backend might return
> EOPNOTSUPP for the flush/barrier operations.

How is this supposed to work?  I mean in the worst case we could
just immediately complete the flush requests in the driver, but
we're really lying to any upper layer.

> Such failure is tied on whether the underlying blkback storage
> supports REQ_OP_WRITE with REQ_PREFLUSH operation.  blkback will
> expose "feature-barrier" and/or "feature-flush-cache" without knowing
> whether the underlying backend supports those operations, hence the
> weird fallback in blkfront.

If we are just talking about the Linux blkback driver (I know there
probably are a few other implementations) it won't ever do that.
I see it has code to do so, but the Linux block layer doesn't
allow the flush operation to randomly fail if it was previously
advertised.  Note that even blkfront conforms to this as it fixes
up the return value when it gets this notsupp error to ok.

> Overall blkback should ensure that REQ_PREFLUSH is supported before
> exposing "feature-barrier" or "feature-flush-cache", as then the
> exposed features would really match what the underlying backend
> supports (rather than the commands blkback knows about).

Yes.  The in-tree xen-blkback does that, but even without that the
Linux block layer actually makes sure flushes sent by upper layers
always succeed even when not supported.



Re: [PATCH 21/26] block: move the poll flag to queue_limits

2024-06-11 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 05:21:07PM +0900, Damien Le Moal wrote:
> Kind of the same remark as for io_stat about this not really being a device
> feature. But I guess seeing "features" as a queue feature rather than just a
> device feature makes it OK to have poll (and io_stat) as a feature rather than
> a flag.

So unlike io_stat this very much is a feature and a feature only as
we don't even allow changing it.  It purely exposes a device (or
rather driver) capability.


Re: [PATCH 19/26] block: move the nowait flag to queue_limits

2024-06-11 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 05:16:37PM +0900, Damien Le Moal wrote:
> > @@ -1825,9 +1815,7 @@ int dm_table_set_restrictions(struct dm_table *t, 
> > struct request_queue *q,
> > int r;
> >  
> > if (dm_table_supports_nowait(t))
> > -   blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
> > -   else
> > -   blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
> > +   limits->features &= ~BLK_FEAT_NOWAIT;
> 
> Shouldn't you set the flag here instead of clearing it ?

No, but the dm_table_supports_nowait check needs to be inverted.
 


Re: [PATCH 16/26] block: move the io_stat flag setting to queue_limits

2024-06-11 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 05:09:45PM +0900, Damien Le Moal wrote:
> On 6/11/24 2:19 PM, Christoph Hellwig wrote:
> > Move the io_stat flag into the queue_limits feature field so that it
> > can be set atomically and all I/O is frozen when changing the flag.
> 
> Why a feature ? It seems more appropriate for io_stat to be a flag rather than
> a feature as that is a block layer thing rather than a device characteristic, 
> no ?

Because it must actually be supported by the driver for bio based
drivers.  Then again we also support changing it through sysfs, so
we might actually need both.  At least unlike say the cache it's
not actively harmful when enabled despite not being supported.

I can look into that, but I'll do it in another series after getting
all the driver changes out.


Re: [PATCH 13/26] block: move cache control settings out of queue->flags

2024-06-11 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 04:55:04PM +0900, Damien Le Moal wrote:
> On 6/11/24 2:19 PM, Christoph Hellwig wrote:
> > Move the cache control settings into the queue_limits so that they
> > can be set atomically and all I/O is frozen when changing the
> > flags.
> 
> ...so that they can be set atomically with the device queue frozen when
> changing the flags.
> 
> may be better.

Sure.

If there was anything below I've skipped it after skipping over two
pages of full quotes.



Re: [PATCH 13/26] block: move cache control settings out of queue->flags

2024-06-11 Thread Christoph Hellwig
A friendly reminder that I've skipped over the full quote.  Please
properly quote mails if you want your replies to be seen.



Re: [PATCH 10/26] xen-blkfront: don't disable cache flushes when they fail

2024-06-11 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 04:30:39PM +0900, Damien Le Moal wrote:
> On 6/11/24 2:19 PM, Christoph Hellwig wrote:
> > blkfront always had a robust negotiation protocol for detecting a write
> > cache.  Stop simply disabling cache flushes when they fail as that is
> > a grave error.
> > 
> > Signed-off-by: Christoph Hellwig 
> 
> Looks good to me but maybe mention that removal of xlvbd_flush() as well ?
> And it feels like the "stop disabling cache flushes when they fail" part 
> should
> be a fix patch sent separately...

I'll move the patch to the front of the series to get more attention from
the maintainers, but otherwise the xlvbd_flush removal is the really
trivial part here.


Re: [PATCH 02/26] sd: move zone limits setup out of sd_read_block_characteristics

2024-06-11 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 02:51:24PM +0900, Damien Le Moal wrote:
> > -   if (!sd_is_zoned(sdkp))
> > +   if (!sd_is_zoned(sdkp)) {
> > +   lim->zoned = false;
> 
> Maybe we should clear the other zone related limits here ? If the drive is
> reformatted/converted from SMR to CMR (FORMAT WITH PRESET), the other zone
> limits may be set already, no ?

Yes, but we would not end up here.  The device type is constant over
the struct of the scsi_device and we'd have to fully reprobe it.

So we don't need to clear any flags, including the actual zoned flag
here.



Re: [PATCH 06/26] loop: also use the default block size from an underlying block device

2024-06-10 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 02:58:56PM +0900, Damien Le Moal wrote:
> > +   if (S_ISBLK(inode->i_mode))
> > +   backing_bdev = I_BDEV(inode);
> > +   else if (inode->i_sb->s_bdev)
> > +   backing_bdev = inode->i_sb->s_bdev;
> > +
> 
> Why not move this hunk inside the below "if" ? (backing_dev declaration can go
> there too).

Because another use will pop up a bit later :)



Re: [PATCH 05/26] loop: regularize upgrading the lock size for direct I/O

2024-06-10 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 02:56:59PM +0900, Damien Le Moal wrote:
> > +   if (!bsize)
> > +   bsize = loop_default_blocksize(lo, inode->i_sb->s_bdev);
> 
> If bsize is specified and there is a backing dev used with direct IO, should 
> it
> be checked that bsize is a multiple of bdev_logical_block_size(backing_bdev) ?

For direct I/O that check would be useful.  For buffered I/O we can do
read-modify-write cycles.  However this series is already huge and not
primarily about improving the loop driver parameter validation, so
I'll defer this for now.


Re: [PATCH 03/26] loop: stop using loop_reconfigure_limits in __loop_clr_fd

2024-06-10 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 02:53:19PM +0900, Damien Le Moal wrote:
> > +   /* reset the block size to the default */
> > +   lim = queue_limits_start_update(lo->lo_queue);
> > +   lim.logical_block_size = 512;
> 
> Nit: SECTOR_SIZE ? maybe ?

Yes.  I was following the existing code, but SECTOR_SIZE is probably
a better choice here.



Re: [PATCH 02/26] sd: move zone limits setup out of sd_read_block_characteristics

2024-06-10 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 07:52:39AM +0200, Christoph Hellwig wrote:
> > Maybe we should clear the other zone related limits here ? If the drive is
> > reformatted/converted from SMR to CMR (FORMAT WITH PRESET), the other zone
> > limits may be set already, no ?
> 
> blk_validate_zoned_limits already takes care of that.

Sorry, brainfart.  The integrity code does that, but not the zoned
code.  I suspect the core code might be a better place for it,
though.



Re: [PATCH 02/26] sd: move zone limits setup out of sd_read_block_characteristics

2024-06-10 Thread Christoph Hellwig
On Tue, Jun 11, 2024 at 02:51:24PM +0900, Damien Le Moal wrote:
> > -   if (lim->zoned)
> > +   if (sdkp->device->type == TYPE_ZBC)
> 
> Nit: use sd_is_zoned() here ?

Yes.

> > -   if (!sd_is_zoned(sdkp))
> > +   if (!sd_is_zoned(sdkp)) {
> > +   lim->zoned = false;
> 
> Maybe we should clear the other zone related limits here ? If the drive is
> reformatted/converted from SMR to CMR (FORMAT WITH PRESET), the other zone
> limits may be set already, no ?

blk_validate_zoned_limits already takes care of that.



[PATCH 26/26] block: move the bounce flag into the feature field

2024-06-10 Thread Christoph Hellwig
Move the bounce field into the flags field to reclaim a little bit of
space.

Signed-off-by: Christoph Hellwig 
---
 block/blk-settings.c| 1 -
 block/blk.h | 2 +-
 drivers/scsi/scsi_lib.c | 2 +-
 include/linux/blkdev.h  | 6 --
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 96e07f24bd9aa1..d0e9096f93ca8a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -479,7 +479,6 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
b->max_write_zeroes_sectors);
t->max_zone_append_sectors = 
min(queue_limits_max_zone_append_sectors(t),
 
queue_limits_max_zone_append_sectors(b));
-   t->bounce = max(t->bounce, b->bounce);
 
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
diff --git a/block/blk.h b/block/blk.h
index 79e8d5d4fe0caf..fa32f7fad5d7e6 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -394,7 +394,7 @@ struct bio *__blk_queue_bounce(struct bio *bio, struct 
request_queue *q);
 static inline bool blk_queue_may_bounce(struct request_queue *q)
 {
return IS_ENABLED(CONFIG_BOUNCE) &&
-   q->limits.bounce == BLK_BOUNCE_HIGH &&
+   (q->limits.features & BLK_FEAT_BOUNCE_HIGH) &&
max_low_pfn >= max_pfn;
 }
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 54f771ec8cfb5e..e2f7bfb2b9e450 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1986,7 +1986,7 @@ void scsi_init_limits(struct Scsi_Host *shost, struct 
queue_limits *lim)
shost->dma_alignment, dma_get_cache_alignment() - 1);
 
if (shost->no_highmem)
-   lim->bounce = BLK_BOUNCE_HIGH;
+   lim->features |= BLK_FEAT_BOUNCE_HIGH;
 
dma_set_seg_boundary(dev, shost->dma_boundary);
dma_set_max_seg_size(dev, shost->max_segment_size);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d7ad25def6e50b..d1d9787e76ce73 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -325,6 +325,9 @@ enum {
 
/* skip this queue in blk_mq_(un)quiesce_tagset */
BLK_FEAT_SKIP_TAGSET_QUIESCE= (1u << 13),
+
+   /* bounce all highmem pages */
+   BLK_FEAT_BOUNCE_HIGH= (1u << 14),
 };
 
 /*
@@ -332,7 +335,7 @@ enum {
  */
 #define BLK_FEAT_INHERIT_MASK \
(BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_ROTATIONAL | \
-BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED)
+BLK_FEAT_STABLE_WRITES | BLK_FEAT_ZONED | BLK_FEAT_BOUNCE_HIGH)
 
 /* internal flags in queue_limits.flags */
 enum {
@@ -352,7 +355,6 @@ enum blk_bounce {
 struct queue_limits {
unsigned intfeatures;
unsigned intflags;
-   enum blk_bounce bounce;
unsigned long   seg_boundary_mask;
unsigned long   virt_boundary_mask;
 
-- 
2.43.0



[PATCH 24/26] block: move the pci_p2pdma flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the pci_p2pdma flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/nvme/host/core.c | 8 +++-
 include/linux/blkdev.h   | 7 ---
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f2fd72f4414ae8..8b5a68861c119b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -90,7 +90,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
-   QUEUE_FLAG_NAME(PCI_P2PDMA),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5ecf762d7c8837..31e752e8d632cd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3735,6 +3735,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct 
nvme_ns_info *info)
 
if (ctrl->opts && ctrl->opts->data_digest)
lim.features |= BLK_FEAT_STABLE_WRITES;
+   if (ctrl->ops->supports_pci_p2pdma &&
+   ctrl->ops->supports_pci_p2pdma(ctrl))
+   lim.features |= BLK_FEAT_PCI_P2PDMA;
 
disk = blk_mq_alloc_disk(ctrl->tagset, &lim, ns);
if (IS_ERR(disk))
@@ -3744,11 +3747,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct 
nvme_ns_info *info)
 
ns->disk = disk;
ns->queue = disk->queue;
-
-   if (ctrl->ops->supports_pci_p2pdma &&
-   ctrl->ops->supports_pci_p2pdma(ctrl))
-   blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
-
ns->ctrl = ctrl;
kref_init(&ns->kref);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ffb7a42871b4ed..cc4f6e64e8e3f5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -319,6 +319,9 @@ enum {
 
/* supports Zone Reset All */
BLK_FEAT_ZONE_RESETALL  = (1u << 11),
+
+   /* supports PCI(e) p2p requests */
+   BLK_FEAT_PCI_P2PDMA = (1u << 12),
 };
 
 /*
@@ -588,7 +591,6 @@ struct request_queue {
 #define QUEUE_FLAG_STATS   20  /* track IO start and completion times 
*/
 #define QUEUE_FLAG_REGISTERED  22  /* queue has been registered to a disk 
*/
 #define QUEUE_FLAG_QUIESCED24  /* queue has been quiesced */
-#define QUEUE_FLAG_PCI_P2PDMA  25  /* device supports PCI p2p requests */
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE 28  /* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
@@ -611,8 +613,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct 
request_queue *q);
 #define blk_queue_zone_resetall(q) \
((q)->limits.features & BLK_FEAT_ZONE_RESETALL)
 #define blk_queue_dax(q)   ((q)->limits.features & BLK_FEAT_DAX)
-#define blk_queue_pci_p2pdma(q)\
-   test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
+#define blk_queue_pci_p2pdma(q)((q)->limits.features & 
BLK_FEAT_PCI_P2PDMA)
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
 #define blk_queue_rq_alloc_time(q) \
test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags)
-- 
2.43.0



[PATCH 25/26] block: move the skip_tagset_quiesce flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the skip_tagset_quiesce flag into the queue_limits feature field so
that it can be set atomically and all I/O is frozen when changing the
flag.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/nvme/host/core.c | 8 +---
 include/linux/blkdev.h   | 6 --
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 8b5a68861c119b..344f9e503bdb32 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -93,7 +93,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
-   QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
 };
 #undef QUEUE_FLAG_NAME
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 31e752e8d632cd..bf410d10b12006 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4489,13 +4489,15 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, 
struct blk_mq_tag_set *set,
return ret;
 
if (ctrl->ops->flags & NVME_F_FABRICS) {
-   ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
+   struct queue_limits lim = {
+   .features   = BLK_FEAT_SKIP_TAGSET_QUIESCE,
+   };
+
+   ctrl->connect_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
-   blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
-  ctrl->connect_q);
}
 
ctrl->tagset = set;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cc4f6e64e8e3f5..d7ad25def6e50b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -322,6 +322,9 @@ enum {
 
/* supports PCI(e) p2p requests */
BLK_FEAT_PCI_P2PDMA = (1u << 12),
+
+   /* skip this queue in blk_mq_(un)quiesce_tagset */
+   BLK_FEAT_SKIP_TAGSET_QUIESCE= (1u << 13),
 };
 
 /*
@@ -594,7 +597,6 @@ struct request_queue {
 #define QUEUE_FLAG_RQ_ALLOC_TIME 27/* record rq->alloc_time_ns */
 #define QUEUE_FLAG_HCTX_ACTIVE 28  /* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
-#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/
 
 #define QUEUE_FLAG_MQ_DEFAULT  (1UL << QUEUE_FLAG_SAME_COMP)
 
@@ -629,7 +631,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct 
request_queue *q);
 #define blk_queue_registered(q)test_bit(QUEUE_FLAG_REGISTERED, 
&(q)->queue_flags)
 #define blk_queue_sq_sched(q)  test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
 #define blk_queue_skip_tagset_quiesce(q) \
-   test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags)
+   ((q)->limits.features & BLK_FEAT_SKIP_TAGSET_QUIESCE)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
-- 
2.43.0



[PATCH 23/26] block: move the zone_resetall flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the zone_resetall flag into the queue_limits feature field so that
it can be set atomically and all I/O is frozen when changing the flag.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c | 1 -
 drivers/block/null_blk/zoned.c | 3 +--
 drivers/block/ublk_drv.c   | 4 +---
 drivers/block/virtio_blk.c | 3 +--
 drivers/nvme/host/zns.c| 3 +--
 drivers/scsi/sd_zbc.c  | 5 +
 include/linux/blkdev.h | 6 --
 7 files changed, 9 insertions(+), 16 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3a21527913840d..f2fd72f4414ae8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -91,7 +91,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA),
-   QUEUE_FLAG_NAME(ZONE_RESETALL),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
QUEUE_FLAG_NAME(SQ_SCHED),
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index ca8e739e76b981..b42c00f1313254 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -158,7 +158,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
sector += dev->zone_size_sects;
}
 
-   lim->features |= BLK_FEAT_ZONED;
+   lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
lim->chunk_sectors = dev->zone_size_sects;
lim->max_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
@@ -171,7 +171,6 @@ int null_register_zoned_dev(struct nullb *nullb)
struct request_queue *q = nullb->q;
struct gendisk *disk = nullb->disk;
 
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
disk->nr_zones = bdev_nr_zones(disk->part0);
 
pr_info("%s: using %s zone append\n",
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 69c16018cbb19a..4fdff13fc23b8a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -248,8 +248,6 @@ static int ublk_dev_param_zoned_validate(const struct 
ublk_device *ub)
 
 static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
 {
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
-
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
 }
 
@@ -2196,7 +2194,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, 
struct io_uring_cmd *cmd)
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
return -EOPNOTSUPP;
 
-   lim.features |= BLK_FEAT_ZONED;
+   lim.features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones =  p->max_open_zones;
lim.max_zone_append_sectors = p->max_zone_append_sectors;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index cea45b296f8bec..6c64a67ab9c901 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk 
*vblk,
 
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
 
-   lim->features |= BLK_FEAT_ZONED;
+   lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
 
virtio_cread(vdev, struct virtio_blk_config,
 zoned.max_open_zones, &v);
@@ -1548,7 +1548,6 @@ static int virtblk_probe(struct virtio_device *vdev)
 */
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
(lim.features & BLK_FEAT_ZONED)) {
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk);
if (err)
goto out_cleanup_disk;
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 06f2417aa50de7..99bb89c2495ae3 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -108,13 +108,12 @@ int nvme_query_zone_info(struct nvme_ns *ns, unsigned 
lbaf,
 void nvme_update_zone_info(struct nvme_ns *ns, struct queue_limits *lim,
struct nvme_zone_info *zi)
 {
-   lim->features |= BLK_FEAT_ZONED;
+   lim->features |= BLK_FEAT_ZONED | BLK_FEAT_ZONE_RESETALL;
lim->max_open_zones = zi->max_open_zones;
lim->max_active_zones = zi->max_active_zones;
lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
lim->chunk_sectors = ns->head->zsze =
nvme_lba_to_sect(ns->head, zi->zone_size);
-   blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
 }
 
 static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index 26b6e92350cda9..8c79f588f

[PATCH 22/26] block: move the zoned flag into the feature field

2024-06-10 Thread Christoph Hellwig
Move the boolean zoned field into the flags field to reclaim a little
bit of space.

Signed-off-by: Christoph Hellwig 
---
 block/blk-settings.c   |  5 ++---
 drivers/block/null_blk/zoned.c |  2 +-
 drivers/block/ublk_drv.c   |  2 +-
 drivers/block/virtio_blk.c |  5 +++--
 drivers/md/dm-table.c  | 11 ++-
 drivers/md/dm-zone.c   |  2 +-
 drivers/md/dm-zoned-target.c   |  2 +-
 drivers/nvme/host/zns.c|  2 +-
 drivers/scsi/sd_zbc.c  |  4 ++--
 include/linux/blkdev.h |  9 ++---
 10 files changed, 24 insertions(+), 20 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 026ba68d829856..96e07f24bd9aa1 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -68,7 +68,7 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
 
 static int blk_validate_zoned_limits(struct queue_limits *lim)
 {
-   if (!lim->zoned) {
+   if (!(lim->features & BLK_FEAT_ZONED)) {
if (WARN_ON_ONCE(lim->max_open_zones) ||
WARN_ON_ONCE(lim->max_active_zones) ||
WARN_ON_ONCE(lim->zone_write_granularity) ||
@@ -602,8 +602,7 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
   b->max_secure_erase_sectors);
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
-   t->zoned = max(t->zoned, b->zoned);
-   if (!t->zoned) {
+   if (!(t->features & BLK_FEAT_ZONED)) {
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index f118d304f31080..ca8e739e76b981 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -158,7 +158,7 @@ int null_init_zoned_dev(struct nullb_device *dev,
sector += dev->zone_size_sects;
}
 
-   lim->zoned = true;
+   lim->features |= BLK_FEAT_ZONED;
lim->chunk_sectors = dev->zone_size_sects;
lim->max_zone_append_sectors = dev->zone_append_max_sectors;
lim->max_open_zones = dev->zone_max_open;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 4fcde099935868..69c16018cbb19a 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2196,7 +2196,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, 
struct io_uring_cmd *cmd)
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
return -EOPNOTSUPP;
 
-   lim.zoned = true;
+   lim.features |= BLK_FEAT_ZONED;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones =  p->max_open_zones;
lim.max_zone_append_sectors = p->max_zone_append_sectors;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 13a2f24f176628..cea45b296f8bec 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -728,7 +728,7 @@ static int virtblk_read_zoned_limits(struct virtio_blk 
*vblk,
 
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
 
-   lim->zoned = true;
+   lim->features |= BLK_FEAT_ZONED;
 
virtio_cread(vdev, struct virtio_blk_config,
 zoned.max_open_zones, &v);
@@ -1546,7 +1546,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 * All steps that follow use the VQs therefore they need to be
 * placed after the virtio_device_ready() call above.
 */
-   if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
+   if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+   (lim.features & BLK_FEAT_ZONED)) {
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk);
if (err)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 653c253b6f7f32..48ccd9a396d8e6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1605,12 +1605,12 @@ int dm_calculate_queue_limits(struct dm_table *t,
ti->type->iterate_devices(ti, dm_set_device_limits,
  &ti_limits);
 
-   if (!zoned && ti_limits.zoned) {
+   if (!zoned && (ti_limits.features & BLK_FEAT_ZONED)) {
/*
 * After stacking all limits, validate all devices
 * in table support this zoned model and zone sectors.
 */
-   zoned = ti_limits.zoned;
+   zoned = (ti_limits.features & BLK_FEAT_ZONED);
zone_sectors = ti_limits.chu

[PATCH 20/26] block: move the dax flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the dax flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/md/dm-table.c| 4 ++--
 drivers/nvdimm/pmem.c| 7 ++-
 drivers/s390/block/dcssblk.c | 2 +-
 include/linux/blkdev.h   | 6 --
 5 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 62b132e9a9ce3b..f4fa820251ce83 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,7 +88,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(POLL),
-   QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index eee43d27733f9a..d3a960aee03c6a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1834,11 +1834,11 @@ int dm_table_set_restrictions(struct dm_table *t, 
struct request_queue *q,
limits->features |= BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA;
 
if (dm_table_supports_dax(t, device_not_dax_capable)) {
-   blk_queue_flag_set(QUEUE_FLAG_DAX, q);
+   limits->features |= BLK_FEAT_DAX;
if (dm_table_supports_dax(t, 
device_not_dax_synchronous_capable))
set_dax_synchronous(t->md->dax_dev);
} else
-   blk_queue_flag_clear(QUEUE_FLAG_DAX, q);
+   limits->features &= ~BLK_FEAT_DAX;
 
if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL))
dax_write_cache(t->md->dax_dev, true);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index b821dcf018f6ae..1dd74c969d5a09 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -465,7 +465,6 @@ static int pmem_attach_disk(struct device *dev,
struct dax_device *dax_dev;
struct nd_pfn_sb *pfn_sb;
struct pmem_device *pmem;
-   struct request_queue *q;
struct gendisk *disk;
void *addr;
int rc;
@@ -499,6 +498,8 @@ static int pmem_attach_disk(struct device *dev,
}
if (fua)
lim.features |= BLK_FEAT_FUA;
+   if (is_nd_pfn(dev))
+   lim.features |= BLK_FEAT_DAX;
 
if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) {
@@ -509,7 +510,6 @@ static int pmem_attach_disk(struct device *dev,
disk = blk_alloc_disk(&lim, nid);
if (IS_ERR(disk))
return PTR_ERR(disk);
-   q = disk->queue;
 
pmem->disk = disk;
pmem->pgmap.owner = pmem;
@@ -547,9 +547,6 @@ static int pmem_attach_disk(struct device *dev,
}
pmem->virt_addr = addr;
 
-   if (pmem->pfn_flags & PFN_MAP)
-   blk_queue_flag_set(QUEUE_FLAG_DAX, q);
-
disk->fops  = &pmem_fops;
disk->private_data  = pmem;
nvdimm_namespace_disk_name(ndns, disk->disk_name);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 6d1689a2717e5f..d5a5d11ae0dcdf 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -548,6 +548,7 @@ dcssblk_add_store(struct device *dev, struct 
device_attribute *attr, const char
 {
struct queue_limits lim = {
.logical_block_size = 4096,
+   .features   = BLK_FEAT_DAX,
};
int rc, i, j, num_of_segments;
struct dcssblk_dev_info *dev_info;
@@ -643,7 +644,6 @@ dcssblk_add_store(struct device *dev, struct 
device_attribute *attr, const char
dev_info->gd->fops = &dcssblk_devops;
dev_info->gd->private_data = dev_info;
dev_info->gd->flags |= GENHD_FL_NO_PART;
-   blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue);
 
seg_byte_size = (dev_info->end - dev_info->start + 1);
set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 59c2327692589b..c2545580c5b134 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -307,6 +307,9 @@ enum {
 
/* supports REQ_NOWAIT */
BLK_FEAT_NOWAIT = (1u << 7),
+
+   /* supports DAX */
+   BLK_FEAT_DAX= (1u << 8),
 };
 
 /*
@@ -575,7 +578,6 @@ struct request_queue {
 #define QUEUE_FLAG_SAME_FORCE  12  /* force complete on same CPU */
 #define QUEUE_FLAG_INIT_DONE   14  /* queue is initialized */
 #define QUEUE_FLAG_POLL16  /* IO polling enabled if set */
-#define QUEUE_FLAG_DAX 19  /* device support

[PATCH 21/26] block: move the poll flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the poll flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Stacking drivers are simplified in that they now can simply set the
flag, and blk_stack_limits will clear it when the feature is not
supported by any of the underlying devices.

Signed-off-by: Christoph Hellwig 
---
 block/blk-core.c  |  5 ++--
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c| 31 +++-
 block/blk-settings.c  | 10 ---
 block/blk-sysfs.c |  4 +--
 drivers/md/dm-table.c | 54 +--
 drivers/nvme/host/multipath.c | 12 +---
 include/linux/blkdev.h|  4 ++-
 8 files changed, 45 insertions(+), 76 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2b45a4df9a1aa1..8d9fbd353fc7fc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -791,7 +791,7 @@ void submit_bio_noacct(struct bio *bio)
}
}
 
-   if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+   if (!(q->limits.features & BLK_FEAT_POLL))
bio_clear_polled(bio);
 
switch (bio_op(bio)) {
@@ -915,8 +915,7 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, 
unsigned int flags)
return 0;
 
q = bdev_get_queue(bdev);
-   if (cookie == BLK_QC_T_NONE ||
-   !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+   if (cookie == BLK_QC_T_NONE || !(q->limits.features & BLK_FEAT_POLL))
return 0;
 
blk_flush_plug(current->plug, false);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f4fa820251ce83..3a21527913840d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -87,7 +87,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(QUIESCED),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 43235acc87505f..e2b9710ddc5ad1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4109,6 +4109,12 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
 }
 
+static bool blk_mq_can_poll(struct blk_mq_tag_set *set)
+{
+   return set->nr_maps > HCTX_TYPE_POLL &&
+   set->map[HCTX_TYPE_POLL].nr_queues;
+}
+
 struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata)
 {
@@ -4119,6 +4125,8 @@ struct request_queue *blk_mq_alloc_queue(struct 
blk_mq_tag_set *set,
if (!lim)
lim = &default_lim;
lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
+   if (blk_mq_can_poll(set))
+   lim->features |= BLK_FEAT_POLL;
 
q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
@@ -4273,17 +4281,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set 
*set,
mutex_unlock(&q->sysfs_lock);
 }
 
-static void blk_mq_update_poll_flag(struct request_queue *q)
-{
-   struct blk_mq_tag_set *set = q->tag_set;
-
-   if (set->nr_maps > HCTX_TYPE_POLL &&
-   set->map[HCTX_TYPE_POLL].nr_queues)
-   blk_queue_flag_set(QUEUE_FLAG_POLL, q);
-   else
-   blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
-}
-
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
 {
@@ -4311,7 +4308,6 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set 
*set,
q->tag_set = set;
 
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
-   blk_mq_update_poll_flag(q);
 
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->flush_list);
@@ -4798,8 +4794,10 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
 fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
+   struct queue_limits lim;
+
blk_mq_realloc_hw_ctxs(set, q);
-   blk_mq_update_poll_flag(q);
+
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
 
@@ -4811,6 +4809,13 @@ static void __blk_mq_update_nr_hw_queues(struct 
blk_mq_tag_set *set,
set->nr_hw_queues = prev_nr_hw_queues;
goto fallback;
}
+   lim = queue_limits_start_update(q);
+   if (blk_mq_can_poll(set))
+   lim.features |= BLK_FEAT_POLL;
+   else
+   lim.features &= ~BLK_FEAT_POLL;
+   if (queue_limits_commit_update(q, &lim) < 0)
+   pr_warn("updating

[PATCH 19/26] block: move the nowait flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the nowait flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Stacking drivers are simplified in that they now can simply set the
flag, and blk_stack_limits will clear it when the feature is not
supported by any of the underlying devices.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c|  2 +-
 block/blk-settings.c  |  9 +
 drivers/block/brd.c   |  4 ++--
 drivers/md/dm-table.c | 16 ++--
 drivers/md/md.c   | 18 +-
 drivers/nvme/host/multipath.c |  3 +--
 include/linux/blkdev.h|  9 +
 8 files changed, 21 insertions(+), 41 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 957774e40b1d0c..62b132e9a9ce3b 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -96,7 +96,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(ZONE_RESETALL),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
QUEUE_FLAG_NAME(HCTX_ACTIVE),
-   QUEUE_FLAG_NAME(NOWAIT),
QUEUE_FLAG_NAME(SQ_SCHED),
QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
 };
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf67dc13f7dd4c..43235acc87505f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4118,7 +4118,7 @@ struct request_queue *blk_mq_alloc_queue(struct 
blk_mq_tag_set *set,
 
if (!lim)
lim = &default_lim;
-   lim->features |= BLK_FEAT_IO_STAT;
+   lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
 
q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 536ee202fcdccb..bf4622c19b5c09 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -459,6 +459,15 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
 
t->features |= (b->features & BLK_FEAT_INHERIT_MASK);
 
+   /*
+* BLK_FEAT_NOWAIT needs to be supported both by the stacking driver
+* and all underlying devices.  The stacking driver sets the flag
+* before stacking the limits, and this will clear the flag if any
+* of the underlying devices does not support it.
+*/
+   if (!(b->features & BLK_FEAT_NOWAIT))
+   t->features &= ~BLK_FEAT_NOWAIT;
+
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_user_sectors = min_not_zero(t->max_user_sectors,
b->max_user_sectors);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index d77deb571dbd06..a300645cd9d4a5 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -335,7 +335,8 @@ static int brd_alloc(int i)
.max_hw_discard_sectors = UINT_MAX,
.max_discard_segments   = 1,
.discard_granularity= PAGE_SIZE,
-   .features   = BLK_FEAT_SYNCHRONOUS,
+   .features   = BLK_FEAT_SYNCHRONOUS |
+ BLK_FEAT_NOWAIT,
};
 
list_for_each_entry(brd, &brd_devices, brd_list)
@@ -367,7 +368,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);

-   blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
err = add_disk(disk);
if (err)
goto out_cleanup_disk;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index f4e1b50ffdcda5..eee43d27733f9a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -582,7 +582,7 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 static void dm_set_stacking_limits(struct queue_limits *limits)
 {
blk_set_stacking_limits(limits);
-   limits->features |= BLK_FEAT_IO_STAT;
+   limits->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
 }
 
 /*
@@ -1746,12 +1746,6 @@ static bool dm_table_supports_write_zeroes(struct 
dm_table *t)
return true;
 }
 
-static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
-sector_t start, sector_t len, void *data)
-{
-   return !bdev_nowait(dev->bdev);
-}
-
 static bool dm_table_supports_nowait(struct dm_table *t)
 {
for (unsigned int i = 0; i < t->num_targets; i++) {
@@ -1759,10 +1753,6 @@ static bool dm_table_supports_nowait(struct dm_table *t)
 
if (!dm_target_supports_nowait(ti->type))
return false;
-
-   if (!ti->type->iterate_devices ||
-   ti->type->iterate_devices(ti, device_not_nowait_capable, 
NULL))
-   return false;
}
 
return true;
@@ -1825,9 +1815,7 @@ int dm_table_set_restrictions(struct dm_table *t, struct 
request_queue *q,
int r;
 
 

[PATCH 18/26] block: move the synchronous flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the synchronous flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c| 1 -
 drivers/block/brd.c   | 2 +-
 drivers/block/zram/zram_drv.c | 4 ++--
 drivers/nvdimm/btt.c  | 3 +--
 drivers/nvdimm/pmem.c | 4 ++--
 include/linux/blkdev.h| 7 ---
 6 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index eb73f1d348e5a9..957774e40b1d0c 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -85,7 +85,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
QUEUE_FLAG_NAME(NOXMERGES),
-   QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(POLL),
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index b25dc463b5e3a6..d77deb571dbd06 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -335,6 +335,7 @@ static int brd_alloc(int i)
.max_hw_discard_sectors = UINT_MAX,
.max_discard_segments   = 1,
.discard_granularity= PAGE_SIZE,
+   .features   = BLK_FEAT_SYNCHRONOUS,
};
 
list_for_each_entry(brd, &brd_devices, brd_list)
@@ -366,7 +367,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);

-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
err = add_disk(disk);
if (err)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index f8f1b5b54795ac..efcb8d9d274c31 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2208,7 +2208,8 @@ static int zram_add(void)
 #if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
.max_write_zeroes_sectors   = UINT_MAX,
 #endif
-   .features   = BLK_FEAT_STABLE_WRITES,
+   .features   = BLK_FEAT_STABLE_WRITES |
+ BLK_FEAT_SYNCHRONOUS,
};
struct zram *zram;
int ret, device_id;
@@ -2246,7 +2247,6 @@ static int zram_add(void)
 
/* Actual capacity set using sysfs (/sys/block/zram/disksize */
set_capacity(zram->disk, 0);
-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)
goto out_cleanup_disk;
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index e474afa8e9f68d..e79c06d65bb77b 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1501,6 +1501,7 @@ static int btt_blk_init(struct btt *btt)
.logical_block_size = btt->sector_size,
.max_hw_sectors = UINT_MAX,
.max_integrity_segments = 1,
+   .features   = BLK_FEAT_SYNCHRONOUS,
};
int rc;
 
@@ -1518,8 +1519,6 @@ static int btt_blk_init(struct btt *btt)
btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt;
 
-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue);
-
set_capacity(btt->btt_disk, btt->nlba * btt->sector_size >> 9);
rc = device_add_disk(&btt->nd_btt->dev, btt->btt_disk, NULL);
if (rc)
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 501cf226df0187..b821dcf018f6ae 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -455,7 +455,8 @@ static int pmem_attach_disk(struct device *dev,
.logical_block_size = pmem_sector_size(ndns),
.physical_block_size= PAGE_SIZE,
.max_hw_sectors = UINT_MAX,
-   .features   = BLK_FEAT_WRITE_CACHE,
+   .features   = BLK_FEAT_WRITE_CACHE |
+ BLK_FEAT_SYNCHRONOUS,
};
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
@@ -546,7 +547,6 @@ static int pmem_attach_disk(struct device *dev,
}
pmem->virt_addr = addr;
 
-   blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
if (pmem->pfn_flags & PFN_MAP)
blk_queue_flag_set(QUEUE_FLAG_DAX, q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index db14c61791e022..4d908e29c760da 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -301,6 +301,9 @@ enum {
 
/* don't modify data until writeback is done */
BLK_FEAT_STABLE_WRITES  = (1u << 5),
+
+   /* always completes in

[PATCH 17/26] block: move the stable_write flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the stable_writes flag into the queue_limits feature field so that
it can be set atomically and all I/O is frozen when changing the flag.

The flag is now inherited by blk_stack_limits, which greatly simplifies
the code in dm, and fixes md, which previously did not pass on the flag
set on lower devices.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c |  1 -
 block/blk-sysfs.c  | 29 +
 drivers/block/drbd/drbd_main.c |  5 ++---
 drivers/block/rbd.c|  9 +++--
 drivers/block/zram/zram_drv.c  |  2 +-
 drivers/md/dm-table.c  | 19 ---
 drivers/md/raid5.c |  6 --
 drivers/mmc/core/queue.c   |  5 +++--
 drivers/nvme/host/core.c   |  9 +
 drivers/nvme/host/multipath.c  |  4 
 drivers/scsi/iscsi_tcp.c   |  8 
 include/linux/blkdev.h |  9 ++---
 12 files changed, 29 insertions(+), 77 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index cbe99444ed1a54..eb73f1d348e5a9 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -88,7 +88,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6f58530fb3c08e..cde525724831ef 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -296,37 +296,10 @@ static ssize_t queue_##_name##_store(struct request_queue 
*q,  \
return queue_feature_store(q, page, count, _feature);\
 }
 
-#define QUEUE_SYSFS_BIT_FNS(name, flag, neg)   \
-static ssize_t \
-queue_##name##_show(struct request_queue *q, char *page)   \
-{  \
-   int bit;\
-   bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
-   return queue_var_show(neg ? !bit : bit, page);  \
-}  \
-static ssize_t \
-queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
-{  \
-   unsigned long val;  \
-   ssize_t ret;\
-   ret = queue_var_store(&val, page, count);   \
-   if (ret < 0)\
-return ret;\
-   if (neg)\
-   val = !val; \
-   \
-   if (val)\
-   blk_queue_flag_set(QUEUE_FLAG_##flag, q);   \
-   else\
-   blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \
-   return ret; \
-}
-
 QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
 QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
 QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT)
-QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
-#undef QUEUE_SYSFS_BIT_FNS
+QUEUE_SYSFS_FEATURE(stable_writes, BLK_FEAT_STABLE_WRITES);
 
 static ssize_t queue_zoned_show(struct request_queue *q, char *page)
 {
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2ef29a47807550..f92673f05c7abc 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2698,7 +2698,8 @@ enum drbd_ret_code drbd_create_device(struct 
drbd_config_context *adm_ctx, unsig
 */
.max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8,
.features   = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA |
- BLK_FEAT_ROTATIONAL,
+ BLK_FEAT_ROTATIONAL |
+ BLK_FEAT_STABLE_WRITES,
};
 
device = minor_to_device(minor);
@@ -2737,8 +2738,6 @@ enum drbd_ret_code drbd_create_device(struct 
drbd_config_context *adm_ctx, unsig
sprintf(disk->disk_name, "drbd%d", minor);
disk->private_data = device;
 
-   blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
-
device->md_io.page = alloc_page(GF

[PATCH 16/26] block: move the io_stat flag setting to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the io_stat flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Simplify md and dm to set the flag unconditionally instead of avoiding
setting a simple flag for cases where it already is set by other means,
which is a bit pointless.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c|  1 -
 block/blk-mq.c|  6 +-
 block/blk-sysfs.c |  2 +-
 drivers/md/dm-table.c | 12 +---
 drivers/md/dm.c   | 13 +++--
 drivers/md/md.c   |  5 ++---
 drivers/nvme/host/multipath.c |  2 +-
 include/linux/blkdev.h|  9 +
 8 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 6b7edb50bfd3fa..cbe99444ed1a54 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -84,7 +84,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOMERGES),
QUEUE_FLAG_NAME(SAME_COMP),
QUEUE_FLAG_NAME(FAIL_IO),
-   QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 58b0d6c7cc34d6..cf67dc13f7dd4c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4116,7 +4116,11 @@ struct request_queue *blk_mq_alloc_queue(struct 
blk_mq_tag_set *set,
struct request_queue *q;
int ret;
 
-   q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
+   if (!lim)
+   lim = &default_lim;
+   lim->features |= BLK_FEAT_IO_STAT;
+
+   q = blk_alloc_queue(lim, set->numa_node);
if (IS_ERR(q))
return q;
q->queuedata = queuedata;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9174aca3b85526..6f58530fb3c08e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -324,7 +324,7 @@ queue_##name##_store(struct request_queue *q, const char 
*page, size_t count) \
 
 QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
 QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
-QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+QUEUE_SYSFS_FEATURE(iostats, BLK_FEAT_IO_STAT)
 QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 #undef QUEUE_SYSFS_BIT_FNS
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 7654babc2775c1..3e3b713502f61e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -579,6 +579,12 @@ int dm_split_args(int *argc, char ***argvp, char *input)
return 0;
 }
 
+static void dm_set_stacking_limits(struct queue_limits *limits)
+{
+   blk_set_stacking_limits(limits);
+   limits->features |= BLK_FEAT_IO_STAT;
+}
+
 /*
  * Impose necessary and sufficient conditions on a devices's table such
  * that any incoming bio which respects its logical_block_size can be
@@ -617,7 +623,7 @@ static int validate_hardware_logical_block_alignment(struct 
dm_table *t,
for (i = 0; i < t->num_targets; i++) {
ti = dm_table_get_target(t, i);
 
-   blk_set_stacking_limits(&ti_limits);
+   dm_set_stacking_limits(&ti_limits);
 
/* combine all target devices' limits */
if (ti->type->iterate_devices)
@@ -1591,7 +1597,7 @@ int dm_calculate_queue_limits(struct dm_table *t,
unsigned int zone_sectors = 0;
bool zoned = false;
 
-   blk_set_stacking_limits(limits);
+   dm_set_stacking_limits(limits);
 
t->integrity_supported = true;
for (unsigned int i = 0; i < t->num_targets; i++) {
@@ -1604,7 +1610,7 @@ int dm_calculate_queue_limits(struct dm_table *t,
for (unsigned int i = 0; i < t->num_targets; i++) {
struct dm_target *ti = dm_table_get_target(t, i);
 
-   blk_set_stacking_limits(&ti_limits);
+   dm_set_stacking_limits(&ti_limits);
 
if (!ti->type->iterate_devices) {
/* Set I/O hints portion of queue limits */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 13037d6a6f62a2..8a976cee448bed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2386,22 +2386,15 @@ int dm_setup_md_queue(struct mapped_device *md, struct 
dm_table *t)
struct table_device *td;
int r;
 
-   switch (type) {
-   case DM_TYPE_REQUEST_BASED:
+   WARN_ON_ONCE(type == DM_TYPE_NONE);
+
+   if (type == DM_TYPE_REQUEST_BASED) {
md->disk->fops = &dm_rq_blk_dops;
r = dm_mq_init_request_queue(md, t);
if (r) {
DMERR("Cannot initialize queue for request-based dm 
mapped device");
return r;
}
-   break;
-   case DM_TYPE_BIO_BASED:
-   case DM_TYPE_DAX_BIO_BASED:
-   blk_queue_flag_set(QUEUE_FLAG_IO

[PATCH 15/26] block: move the add_random flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the add_random flag into the queue_limits feature field so that it
can be set atomically and all I/O is frozen when changing the flag.

Note that this also removes code from dm to clear the flag based on
the underlying devices, which can't be reached as dm devices will
always start out without the flag set.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq-debugfs.c|  1 -
 block/blk-sysfs.c |  6 +++---
 drivers/block/mtip32xx/mtip32xx.c |  1 -
 drivers/md/dm-table.c | 18 --
 drivers/mmc/core/queue.c  |  2 --
 drivers/mtd/mtd_blkdevs.c |  3 ---
 drivers/s390/block/scm_blk.c  |  4 
 drivers/scsi/scsi_lib.c   |  3 +--
 drivers/scsi/sd.c | 11 +++
 include/linux/blkdev.h|  5 +++--
 10 files changed, 10 insertions(+), 44 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 4d0e62ec88f033..6b7edb50bfd3fa 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -86,7 +86,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FAIL_IO),
QUEUE_FLAG_NAME(IO_STAT),
QUEUE_FLAG_NAME(NOXMERGES),
-   QUEUE_FLAG_NAME(ADD_RANDOM),
QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(INIT_DONE),
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 637ed3bbbfb46f..9174aca3b85526 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -323,7 +323,7 @@ queue_##name##_store(struct request_queue *q, const char 
*page, size_t count) \
 }
 
 QUEUE_SYSFS_FEATURE(rotational, BLK_FEAT_ROTATIONAL)
-QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
+QUEUE_SYSFS_FEATURE(add_random, BLK_FEAT_ADD_RANDOM)
 QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
 QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
 #undef QUEUE_SYSFS_BIT_FNS
@@ -561,7 +561,7 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry 
= {
 
 QUEUE_RW_ENTRY(queue_rotational, "rotational");
 QUEUE_RW_ENTRY(queue_iostats, "iostats");
-QUEUE_RW_ENTRY(queue_random, "add_random");
+QUEUE_RW_ENTRY(queue_add_random, "add_random");
 QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
 
 #ifdef CONFIG_BLK_WBT
@@ -665,7 +665,7 @@ static struct attribute *queue_attrs[] = {
&queue_nomerges_entry.attr,
&queue_iostats_entry.attr,
&queue_stable_writes_entry.attr,
-   &queue_random_entry.attr,
+   &queue_add_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_fua_entry.attr,
diff --git a/drivers/block/mtip32xx/mtip32xx.c 
b/drivers/block/mtip32xx/mtip32xx.c
index 1dbbf72659d549..c6ef0546ffc9d2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3485,7 +3485,6 @@ static int mtip_block_initialize(struct driver_data *dd)
goto start_service_thread;
 
/* Set device limits. */
-   blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
dma_set_max_seg_size(&dd->pdev->dev, 0x40);
 
/* Set the capacity of the device in 512 byte sectors. */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3514a57c2df5d2..7654babc2775c1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1716,14 +1716,6 @@ static int device_dax_write_cache_enabled(struct 
dm_target *ti,
return false;
 }
 
-static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
-sector_t start, sector_t len, void *data)
-{
-   struct request_queue *q = bdev_get_queue(dev->bdev);
-
-   return !blk_queue_add_random(q);
-}
-
 static int device_not_write_zeroes_capable(struct dm_target *ti, struct dm_dev 
*dev,
   sector_t start, sector_t len, void 
*data)
 {
@@ -1876,16 +1868,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct 
request_queue *q,
else
blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, q);
 
-   /*
-* Determine whether or not this queue's I/O timings contribute
-* to the entropy pool, Only request-based targets use this.
-* Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not
-* have it set.
-*/
-   if (blk_queue_add_random(q) &&
-   dm_table_any_dev_attr(t, device_is_not_random, NULL))
-   blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
-
/*
 * For a zoned target, setup the zones related queue attributes
 * and resources necessary for zone append emulation if necessary.
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index b4f62fa845864c..da00904d4a3c7e 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -387,8 +387,6 @@ static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq,

[PATCH 14/26] block: move the nonrot flag to queue_limits

2024-06-10 Thread Christoph Hellwig
Move the nonrot flag into the queue_limits feature field so that it can be
set atomically and all I/O is frozen when changing the flag.

Use the chance to switch to defaulting to non-rotational and require
the driver to opt into rotational, which matches the polarity of the
sysfs interface.

For the z2ram, ps3vram, 2x memstick, ubiblock and dcssblk the new
rotational flag is not set as they clearly are not rotational despite
this being a behavior change.  There are some other drivers that
unconditionally set the rotational flag to keep the existing behavior
as they arguably can be used on rotational devices even if that is
probably not their main use today (e.g. virtio_blk and drbd).

The flag is automatically inherited in blk_stack_limits matching the
existing behavior in dm and md.

Signed-off-by: Christoph Hellwig 
---
 arch/m68k/emu/nfblock.c |  1 +
 arch/um/drivers/ubd_kern.c  |  1 -
 arch/xtensa/platforms/iss/simdisk.c |  5 +++-
 block/blk-mq-debugfs.c  |  1 -
 block/blk-sysfs.c   | 39 ++---
 drivers/block/amiflop.c |  5 +++-
 drivers/block/aoe/aoeblk.c  |  1 +
 drivers/block/ataflop.c |  5 +++-
 drivers/block/brd.c |  2 --
 drivers/block/drbd/drbd_main.c  |  3 ++-
 drivers/block/floppy.c  |  3 ++-
 drivers/block/loop.c|  8 +++---
 drivers/block/mtip32xx/mtip32xx.c   |  1 -
 drivers/block/n64cart.c |  2 --
 drivers/block/nbd.c |  5 
 drivers/block/null_blk/main.c   |  1 -
 drivers/block/pktcdvd.c |  1 +
 drivers/block/ps3disk.c |  3 ++-
 drivers/block/rbd.c |  3 ---
 drivers/block/rnbd/rnbd-clt.c   |  4 ---
 drivers/block/sunvdc.c  |  1 +
 drivers/block/swim.c|  5 +++-
 drivers/block/swim3.c   |  5 +++-
 drivers/block/ublk_drv.c|  9 +++
 drivers/block/virtio_blk.c  |  4 ++-
 drivers/block/xen-blkfront.c|  1 -
 drivers/block/zram/zram_drv.c   |  2 --
 drivers/cdrom/gdrom.c   |  1 +
 drivers/md/bcache/super.c   |  2 --
 drivers/md/dm-table.c   | 12 -
 drivers/md/md.c | 13 --
 drivers/mmc/core/queue.c|  1 -
 drivers/mtd/mtd_blkdevs.c   |  1 -
 drivers/nvdimm/btt.c|  1 -
 drivers/nvdimm/pmem.c   |  1 -
 drivers/nvme/host/core.c|  1 -
 drivers/nvme/host/multipath.c   |  1 -
 drivers/s390/block/dasd_genhd.c |  1 -
 drivers/s390/block/scm_blk.c|  1 -
 drivers/scsi/sd.c   |  4 +--
 include/linux/blkdev.h  | 10 
 41 files changed, 83 insertions(+), 88 deletions(-)

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 642fb80c5c4e31..8eea7ef9115146 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -98,6 +98,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
 {
struct queue_limits lim = {
.logical_block_size = bsize,
+   .features   = BLK_FEAT_ROTATIONAL,
};
struct nfhd_device *dev;
int dev_id = id - NFHD_DEV_OFFSET;
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 19e01691ea0ea7..9f1e76ddda5a26 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -882,7 +882,6 @@ static int ubd_add(int n, char **error_out)
goto out_cleanup_tags;
}
 
-   blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
disk->major = UBD_MAJOR;
disk->first_minor = n << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
diff --git a/arch/xtensa/platforms/iss/simdisk.c 
b/arch/xtensa/platforms/iss/simdisk.c
index defc67909a9c74..d6d2b533a5744d 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -263,6 +263,9 @@ static const struct proc_ops simdisk_proc_ops = {
 static int __init simdisk_setup(struct simdisk *dev, int which,
struct proc_dir_entry *procdir)
 {
+   struct queue_limits lim = {
+   .features   = BLK_FEAT_ROTATIONAL,
+   };
char tmp[2] = { '0' + which, 0 };
int err;
 
@@ -271,7 +274,7 @@ static int __init simdisk_setup(struct simdisk *dev, int 
which,
spin_lock_init(&dev->lock);
dev->users = 0;
 
-   dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE);
+   dev->gd = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(dev->gd)) {
err = PTR_ERR(dev->gd);
goto out;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e8b9db7c30c455..4d0e62ec88f033 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -84,7 +84,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NOMERGES),

[PATCH 13/26] block: move cache control settings out of queue->flags

2024-06-10 Thread Christoph Hellwig
Move the cache control settings into the queue_limits so that they
can be set atomically and all I/O is frozen when changing the
flags.

Add new features and flags field for the driver set flags, and internal
(usually sysfs-controlled) flags in the block layer.  Note that we'll
eventually remove enough field from queue_limits to bring it back to the
previous size.

The disable flag is inverted compared to the previous meaning, which
means it now survives a rescan, similar to the max_sectors and
max_discard_sectors user limits.

The FLUSH and FUA flags are now inherited by blk_stack_limits, which
simplifies the code in dm a lot, but also causes a slight behavior
change in that dm-switch and dm-unstripe now advertise a write cache
despite setting num_flush_bios to 0.  The I/O path will handle this
gracefully, but as far as I can tell the lack of num_flush_bios
and thus flush support is a pre-existing data integrity bug in those
targets that really needs fixing, after which a non-zero num_flush_bios
should be required in dm for targets that map to underlying devices.

Signed-off-by: Christoph Hellwig 
---
 .../block/writeback_cache_control.rst | 67 +++
 arch/um/drivers/ubd_kern.c|  2 +-
 block/blk-core.c  |  2 +-
 block/blk-flush.c |  9 ++-
 block/blk-mq-debugfs.c|  2 -
 block/blk-settings.c  | 29 ++--
 block/blk-sysfs.c | 29 +---
 block/blk-wbt.c   |  4 +-
 drivers/block/drbd/drbd_main.c|  2 +-
 drivers/block/loop.c  |  9 +--
 drivers/block/nbd.c   | 14 ++--
 drivers/block/null_blk/main.c | 12 ++--
 drivers/block/ps3disk.c   |  7 +-
 drivers/block/rnbd/rnbd-clt.c | 10 +--
 drivers/block/ublk_drv.c  |  8 ++-
 drivers/block/virtio_blk.c| 20 --
 drivers/block/xen-blkfront.c  |  9 ++-
 drivers/md/bcache/super.c |  7 +-
 drivers/md/dm-table.c | 39 +++
 drivers/md/md.c   |  8 ++-
 drivers/mmc/core/block.c  | 42 ++--
 drivers/mmc/core/queue.c  | 12 ++--
 drivers/mmc/core/queue.h  |  3 +-
 drivers/mtd/mtd_blkdevs.c |  5 +-
 drivers/nvdimm/pmem.c |  4 +-
 drivers/nvme/host/core.c  |  7 +-
 drivers/nvme/host/multipath.c |  6 --
 drivers/scsi/sd.c | 28 +---
 include/linux/blkdev.h| 38 +--
 29 files changed, 227 insertions(+), 207 deletions(-)

diff --git a/Documentation/block/writeback_cache_control.rst 
b/Documentation/block/writeback_cache_control.rst
index b208488d0aae85..9cfe27f90253c7 100644
--- a/Documentation/block/writeback_cache_control.rst
+++ b/Documentation/block/writeback_cache_control.rst
@@ -46,41 +46,50 @@ worry if the underlying devices need any explicit cache 
flushing and how
 the Forced Unit Access is implemented.  The REQ_PREFLUSH and REQ_FUA flags
 may both be set on a single bio.
 
+Feature settings for block drivers
+--
 
-Implementation details for bio based block drivers
---
+For devices that do not support volatile write caches there is no driver
+support required, the block layer completes empty REQ_PREFLUSH requests before
+entering the driver and strips off the REQ_PREFLUSH and REQ_FUA bits from
+requests that have a payload.
 
-These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit
-directly below the submit_bio interface.  For remapping drivers the REQ_FUA
-bits need to be propagated to underlying devices, and a global flush needs
-to be implemented for bios with the REQ_PREFLUSH bit set.  For real device
-drivers that do not have a volatile cache the REQ_PREFLUSH and REQ_FUA bits
-on non-empty bios can simply be ignored, and REQ_PREFLUSH requests without
-data can be completed successfully without doing any work.  Drivers for
-devices with volatile caches need to implement the support for these
-flags themselves without any help from the block layer.
+For devices with volatile write caches the driver needs to tell the block layer
+that it supports flushing caches by setting the
 
+   BLK_FEAT_WRITE_CACHE
 
-Implementation details for request_fn based block drivers
--
+flag in the queue_limits feature field.  For devices that also support the FUA
+bit the block layer needs to be told to pass on the REQ_FUA bit by also setting
+the
 
-For devices that do not support volatile write caches there is no driver
-support required, the block layer comp

[PATCH 12/26] block: remove blk_flush_policy

2024-06-10 Thread Christoph Hellwig
Fold blk_flush_policy into the only caller to prepare for pending changes
to it.

Signed-off-by: Christoph Hellwig 
---
 block/blk-flush.c | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index c17cf8ed8113db..2234f8b3fc05f2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -100,23 +100,6 @@ blk_get_flush_queue(struct request_queue *q, struct 
blk_mq_ctx *ctx)
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
 }
 
-static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
-{
-   unsigned int policy = 0;
-
-   if (blk_rq_sectors(rq))
-   policy |= REQ_FSEQ_DATA;
-
-   if (fflags & (1UL << QUEUE_FLAG_WC)) {
-   if (rq->cmd_flags & REQ_PREFLUSH)
-   policy |= REQ_FSEQ_PREFLUSH;
-   if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
-   (rq->cmd_flags & REQ_FUA))
-   policy |= REQ_FSEQ_POSTFLUSH;
-   }
-   return policy;
-}
-
 static unsigned int blk_flush_cur_seq(struct request *rq)
 {
return 1 << ffz(rq->flush.seq);
@@ -399,12 +382,26 @@ bool blk_insert_flush(struct request *rq)
 {
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags;  /* may change, cache */
-   unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
+   unsigned int policy = 0;
 
/* FLUSH/FUA request must never be merged */
WARN_ON_ONCE(rq->bio != rq->biotail);
 
+   if (blk_rq_sectors(rq))
+   policy |= REQ_FSEQ_DATA;
+
+   /*
+* Check which flushes we need to sequence for this operation.
+*/
+   if (fflags & (1UL << QUEUE_FLAG_WC)) {
+   if (rq->cmd_flags & REQ_PREFLUSH)
+   policy |= REQ_FSEQ_PREFLUSH;
+   if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
+   (rq->cmd_flags & REQ_FUA))
+   policy |= REQ_FSEQ_POSTFLUSH;
+   }
+
/*
 * @policy now records what operations need to be done.  Adjust
 * REQ_PREFLUSH and FUA for the driver.
-- 
2.43.0



[PATCH 11/26] block: freeze the queue in queue_attr_store

2024-06-10 Thread Christoph Hellwig
queue_attr_store updates attributes that control how I/O is generated, and
changing them with I/O in flight can cause malformed bios.  Freeze the queue
in common code instead of adding it to almost every attribute.

Signed-off-by: Christoph Hellwig 
---
 block/blk-mq.c| 5 +++--
 block/blk-sysfs.c | 9 ++---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0d4cd39c3d25da..58b0d6c7cc34d6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4631,13 +4631,15 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
int ret;
unsigned long i;
 
+   if (WARN_ON_ONCE(!q->mq_freeze_depth))
+   return -EINVAL;
+
if (!set)
return -EINVAL;
 
if (q->nr_requests == nr)
return 0;
 
-   blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
 
ret = 0;
@@ -4671,7 +4673,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, 
unsigned int nr)
}
 
blk_mq_unquiesce_queue(q);
-   blk_mq_unfreeze_queue(q);
 
return ret;
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f0f9314ab65c61..5c787965b7d09e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -189,12 +189,9 @@ static ssize_t queue_discard_max_store(struct 
request_queue *q,
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
 
-   blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
err = queue_limits_commit_update(q, &lim);
-   blk_mq_unfreeze_queue(q);
-
if (err)
return err;
return ret;
@@ -241,11 +238,9 @@ queue_max_sectors_store(struct request_queue *q, const 
char *page, size_t count)
if (ret < 0)
return ret;
 
-   blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_sectors = max_sectors_kb << 1;
err = queue_limits_commit_update(q, &lim);
-   blk_mq_unfreeze_queue(q);
if (err)
return err;
return ret;
@@ -585,13 +580,11 @@ static ssize_t queue_wb_lat_store(struct request_queue 
*q, const char *page,
 * ends up either enabling or disabling wbt completely. We can't
 * have IO inflight if that happens.
 */
-   blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
 
wbt_set_min_lat(q, val);
 
blk_mq_unquiesce_queue(q);
-   blk_mq_unfreeze_queue(q);
 
return count;
 }
@@ -722,9 +715,11 @@ queue_attr_store(struct kobject *kobj, struct attribute 
*attr,
if (!entry->store)
return -EIO;
 
+   blk_mq_freeze_queue(q);
mutex_lock(&q->sysfs_lock);
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
+   blk_mq_unfreeze_queue(q);
return res;
 }
 
-- 
2.43.0



[PATCH 10/26] xen-blkfront: don't disable cache flushes when they fail

2024-06-10 Thread Christoph Hellwig
blkfront always had a robust negotiation protocol for detecting a write
cache.  Stop simply disabling cache flushes when they fail as that is
a grave error.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/xen-blkfront.c | 29 +
 1 file changed, 9 insertions(+), 20 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9b4ec3e4908cce..9794ac2d3299d1 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -982,18 +982,6 @@ static const char *flush_info(struct blkfront_info *info)
return "barrier or flush: disabled;";
 }
 
-static void xlvbd_flush(struct blkfront_info *info)
-{
-   blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
- info->feature_fua ? true : false);
-   pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
-   info->gd->disk_name, flush_info(info),
-   "persistent grants:", info->feature_persistent ?
-   "enabled;" : "disabled;", "indirect descriptors:",
-   info->max_indirect_segments ? "enabled;" : "disabled;",
-   "bounce buffer:", info->bounce ? "enabled" : "disabled;");
-}
-
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
 {
int major;
@@ -1162,7 +1150,15 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->sector_size = sector_size;
info->physical_sector_size = physical_sector_size;
 
-   xlvbd_flush(info);
+   blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
+ info->feature_fua ? true : false);
+
+   pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
+   info->gd->disk_name, flush_info(info),
+   "persistent grants:", info->feature_persistent ?
+   "enabled;" : "disabled;", "indirect descriptors:",
+   info->max_indirect_segments ? "enabled;" : "disabled;",
+   "bounce buffer:", info->bounce ? "enabled" : "disabled;");
 
if (info->vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);
@@ -1622,13 +1618,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
   info->gd->disk_name, 
op_name(bret.operation));
blkif_req(req)->error = BLK_STS_NOTSUPP;
}
-   if (unlikely(blkif_req(req)->error)) {
-   if (blkif_req(req)->error == BLK_STS_NOTSUPP)
-   blkif_req(req)->error = BLK_STS_OK;
-   info->feature_fua = 0;
-   info->feature_flush = 0;
-   xlvbd_flush(info);
-   }
fallthrough;
case BLKIF_OP_READ:
case BLKIF_OP_WRITE:
-- 
2.43.0



[PATCH 09/26] nbd: move setting the cache control flags to __nbd_set_size

2024-06-10 Thread Christoph Hellwig
Move setting the cache control flags in nbd to __nbd_set_size in
preparation for moving these flags into the queue_limits structure.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/nbd.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index ad887d614d5b3f..44b8c671921e5c 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -342,6 +342,12 @@ static int __nbd_set_size(struct nbd_device *nbd, loff_t 
bytesize,
lim.max_hw_discard_sectors = UINT_MAX;
else
lim.max_hw_discard_sectors = 0;
+   if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH))
+   blk_queue_write_cache(nbd->disk->queue, false, false);
+   else if (nbd->config->flags & NBD_FLAG_SEND_FUA)
+   blk_queue_write_cache(nbd->disk->queue, true, true);
+   else
+   blk_queue_write_cache(nbd->disk->queue, true, false);
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
@@ -1286,19 +1292,10 @@ static void nbd_bdev_reset(struct nbd_device *nbd)
 
 static void nbd_parse_flags(struct nbd_device *nbd)
 {
-   struct nbd_config *config = nbd->config;
-   if (config->flags & NBD_FLAG_READ_ONLY)
+   if (nbd->config->flags & NBD_FLAG_READ_ONLY)
set_disk_ro(nbd->disk, true);
else
set_disk_ro(nbd->disk, false);
-   if (config->flags & NBD_FLAG_SEND_FLUSH) {
-   if (config->flags & NBD_FLAG_SEND_FUA)
-   blk_queue_write_cache(nbd->disk->queue, true, true);
-   else
-   blk_queue_write_cache(nbd->disk->queue, true, false);
-   }
-   else
-   blk_queue_write_cache(nbd->disk->queue, false, false);
 }
 
 static void send_disconnects(struct nbd_device *nbd)
-- 
2.43.0



[PATCH 08/26] virtio_blk: remove virtblk_update_cache_mode

2024-06-10 Thread Christoph Hellwig
virtblk_update_cache_mode boils down to a single call to
blk_queue_write_cache.  Remove it in preparation for moving the cache
control flags into the queue_limits.

Signed-off-by: Christoph Hellwig 
---
 drivers/block/virtio_blk.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2351f411fa4680..378b241911ca87 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -1089,14 +1089,6 @@ static int virtblk_get_cache_mode(struct virtio_device 
*vdev)
return writeback;
 }
 
-static void virtblk_update_cache_mode(struct virtio_device *vdev)
-{
-   u8 writeback = virtblk_get_cache_mode(vdev);
-   struct virtio_blk *vblk = vdev->priv;
-
-   blk_queue_write_cache(vblk->disk->queue, writeback, false);
-}
-
 static const char *const virtblk_cache_types[] = {
"write through", "write back"
 };
@@ -1116,7 +1108,7 @@ cache_type_store(struct device *dev, struct 
device_attribute *attr,
return i;
 
virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
-   virtblk_update_cache_mode(vdev);
+   blk_queue_write_cache(disk->queue, virtblk_get_cache_mode(vdev), false);
return count;
 }
 
@@ -1528,7 +1520,8 @@ static int virtblk_probe(struct virtio_device *vdev)
vblk->index = index;
 
/* configure queue flush support */
-   virtblk_update_cache_mode(vdev);
+   blk_queue_write_cache(vblk->disk->queue, virtblk_get_cache_mode(vdev),
+   false);
 
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
-- 
2.43.0



  1   2   3   4   5   6   7   8   9   10   >