Re: [PATCH v4] memregion: Add cpu_cache_invalidate_memregion() interface

2022-10-27 Thread Davidlohr Bueso

On Thu, 27 Oct 2022, Dave Hansen wrote:


diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 67745ceab0db..b68661d0633b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,6 +69,7 @@ config X86
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE  if ACPI
select ARCH_HAS_CACHE_LINE_SIZE
+   select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION  if X86_64


What is 64-bit only about this?

I don't expect to have a lot of NVDIMMs or CXL devices on 32-bit
kernels, but it would be nice to remove this if it's not strictly
needed.  Or, to add a changelog nugget that says:

Restrict this to X86_64 kernels.  It would probably work on 32-
bit, but there is no practical reason to use 32-bit kernels and
no one is testing them.


Yes, this was to further limit the potential users.



[PATCH v3 -next] memregion: Add cpu_cache_invalidate_memregion() interface

2022-09-19 Thread Davidlohr Bueso
With CXL security features, the nvdimm requirement to globally flush CPU
caches is no longer specific to that subsystem, even beyond the scope of
security_ops. CXL will need such semantics for features not necessarily
limited to persistent memory.

The functionality this is enabling is to be able to instantaneously
secure erase potentially terabytes of memory at once and the kernel
needs to be sure that none of the data from before the erase is still
present in the cache. It is also used when unlocking a memory device
where speculative reads and firmware accesses could have cached poison
from before the device was unlocked.

This capability is typically only used once per-boot (for unlock), or
once per bare metal provisioning event (secure erase), like when handing
off the system to another tenant or decommissioning a device. It may
also be used for dynamic CXL region provisioning.

Users must first call cpu_cache_has_invalidate_memregion() to know whether
this functionality is available on the architecture. Only enable it on
x86-64 via the wbinvd() hammer. Hypervisors are not supported as TDX
guests may trigger a virtualization exception and may need proper handling
to recover. See:

   e2efb6359e62 ("ACPICA: Avoid cache flush inside virtual machines")

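As a rough illustration of the call pattern described above, a hypothetical
driver-side user would look roughly like this (the device structure and the
erase helper are made up for the sketch; only the two memregion calls and
IORES_DESC_PERSISTENT_MEMORY come from this patch):

#include <linux/memregion.h>

/* Hypothetical driver path performing a secure erase. */
static int example_secure_erase(struct example_dev *dev)
{
        int rc;

        /* Bail out early if the platform cannot invalidate CPU caches. */
        if (!cpu_cache_has_invalidate_memregion())
                return -EINVAL;

        rc = example_issue_erase_command(dev);  /* device-specific, made up */
        if (rc)
                return rc;

        /* Drop any cachelines holding pre-erase data before reading again. */
        return cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
}
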
Signed-off-by: Davidlohr Bueso 
---
Changes from v2 
(https://lore.kernel.org/all/20220829212918.4039240-1-d...@stgolabs.net/):
- Change the names and params (Dan).
- GPL symbols (Boris).
- Mentioned VMM check in the changelog (Boris).


 arch/x86/Kconfig |  1 +
 arch/x86/mm/pat/set_memory.c | 15 +
 drivers/acpi/nfit/intel.c| 41 
 include/linux/memregion.h| 35 ++
 lib/Kconfig  |  3 +++
 5 files changed, 72 insertions(+), 23 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2e8f6fd28e59..fa5cc581315a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,6 +69,7 @@ config X86
select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE
select ARCH_HAS_ACPI_TABLE_UPGRADE  if ACPI
select ARCH_HAS_CACHE_LINE_SIZE
+   select ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION  if X86_64
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLEif !X86_PAE
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 0656db33574d..7d940ae2fede 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -330,6 +330,21 @@ void arch_invalidate_pmem(void *addr, size_t size)
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 #endif
 
+#ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION
+bool cpu_cache_has_invalidate_memregion(void)
+{
+   return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
+}
+EXPORT_SYMBOL_GPL(cpu_cache_has_invalidate_memregion);
+
+int cpu_cache_invalidate_memregion(int res_desc)
+{
+   wbinvd_on_all_cpus();
+   return 0;
+}
+EXPORT_SYMBOL_GPL(cpu_cache_invalidate_memregion);
+#endif
+
 static void __cpa_flush_all(void *arg)
 {
unsigned long cache = (unsigned long)arg;
diff --git a/drivers/acpi/nfit/intel.c b/drivers/acpi/nfit/intel.c
index 8dd792a55730..b2bfbf5797da 100644
--- a/drivers/acpi/nfit/intel.c
+++ b/drivers/acpi/nfit/intel.c
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include 
+#include <linux/memregion.h>
 #include 
 #include "intel.h"
 #include "nfit.h"
@@ -190,8 +191,6 @@ static int intel_security_change_key(struct nvdimm *nvdimm,
}
 }
 
-static void nvdimm_invalidate_cache(void);
-
 static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
const struct nvdimm_key_data *key_data)
 {
@@ -213,6 +212,9 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, &nfit_mem->dsm_mask))
return -ENOTTY;
 
+   if (!cpu_cache_has_invalidate_memregion())
+   return -EINVAL;
+
memcpy(nd_cmd.cmd.passphrase, key_data->data,
sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
@@ -228,7 +230,7 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
}
 
/* DIMM unlocked, invalidate all CPU caches before we read it */
-   nvdimm_invalidate_cache();
+   cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
 
return 0;
 }
@@ -297,8 +299,11 @@ static int __maybe_unused intel_security_erase(struct 
nvdimm *nvdimm,
if (!test_bit(cmd, &nfit_mem->dsm_mask))
return -ENOTTY;
 
+   if (!cpu_cache_has_invalidate_memregion())
+   return -EINVAL;
+
/* flush all cache before we erase DIMM */
-   nvdimm_invalidate_cache();
+   cpu_cache_invalidate_memregion(IORES_DESC_PERSISTENT_MEMORY);
memcpy(nd_cmd.cmd.passphrase, key->data,
  

Re: [PATCH -next] memregion: Add arch_flush_memregion() interface

2022-09-13 Thread Davidlohr Bueso

On Fri, 09 Sep 2022, Jonathan Cameron wrote:


On Thu, 8 Sep 2022 16:22:26 -0700
Dan Williams  wrote:


Andrew Morton wrote:
> On Thu, 8 Sep 2022 15:51:50 -0700 Dan Williams  
wrote:
>
> > Jonathan Cameron wrote:
> > > On Wed, 7 Sep 2022 18:07:31 -0700
> > > Dan Williams  wrote:
> > >
> > > > Andrew Morton wrote:
> > > > > I really dislike the term "flush".  Sometimes it means writeback,
> > > > > sometimes it means invalidate.  Perhaps at other times it means
> > > > > both.
> > > > >
> > > > > Can we please be very clear in comments and changelogs about exactly
> > > > > what this "flush" does.   With bonus points for being more specific 
in the
> > > > > function naming?
> > > > >
> > > >
> > > > That's a good point, "flush" has been cargo-culted along in Linux's
> > > > cache management APIs to mean write-back-and-invalidate. In this case I
> > > > think this API is purely about invalidate. It just so happens that x86
> > > > has not historically had a global invalidate instruction readily
> > > > available which leads to the overuse of wbinvd.
> > > >
> > > > It would be nice to make clear that this API is purely about
> > > > invalidating any data cached for a physical address impacted by address
> > > > space management event (secure erase / new region provision). Write-back
> > > > is an unnecessary side-effect.
> > > >
> > > > So how about:
> > > >
> > > > s/arch_flush_memregion/cpu_cache_invalidate_memregion/?
> > >
> > > Want to indicate it 'might' write back perhaps?
> > > So could be invalidate or clean and invalidate (using arm ARM terms just 
to add
> > > to the confusion ;)
> > >
> > > Feels like there will be potential race conditions where that matters as 
we might
> > > force stale data to be written back.
> > >
> > > Perhaps a comment is enough for that. Anyone have the "famous last words" 
feeling?
> >
> > Is "invalidate" not clear that write-back is optional? Maybe not.
>
> Yes, I'd say that "invalidate" means "dirty stuff may or may not have
> been written back".  Ditto for invalidate_inode_pages2().
>
> > Also, I realized that we tried to include the address range to allow for
> > the possibility of flushing by virtual address range, but that
> > overcomplicates the use. I.e. if someone issues a secure erase and the
> > region association is not established, does that mean that the
> > cache invalidation is not needed? It could be the case that someone
> > disables a device, does the secure erase, and then reattaches to the
> > same region. The cache invalidation is needed, but at the time of the
> > secure erase the HPA was unknown.
> >
> > All this to say that I feel the bikeshedding will need to continue until
> > morale improves.
> >
> > I notice that the DMA API uses 'sync' to indicate, "make this memory
> > consistent/coherent for the CPU or the device", so how about an API like
> >
> > memregion_sync_for_cpu(int res_desc)
> >
> > ...where the @res_desc would be IORES_DESC_CXL for all CXL and
> > IORES_DESC_PERSISTENT_MEMORY for the current nvdimm use case.
>
> "sync" is another of my pet peeves ;) In filesystem land, at least.
> Does it mean "start writeback and return" or does it mean "start
> writeback, wait for its completion then return".

Ok, no "sync" :).

/**
 * cpu_cache_invalidate_memregion - drop any CPU cached data for
 * memregions described by @res_des
 * @res_desc: one of the IORES_DESC_* types
 *
 * Perform cache maintenance after a memory event / operation that
 * changes the contents of physical memory in a cache-incoherent manner.
 * For example, memory-device secure erase, or provisioning new CXL
 * regions. This routine may or may not write back any dirty contents
 * while performing the invalidation.
 *
 * Returns 0 on success or negative error code on a failure to perform
 * the cache maintenance.
 */
int cpu_cache_invalidate_memregion(int res_desc)


lgtm


Likewise, and I don't see anyone else objecting so I'll go ahead and send
a new iteration.

Thanks,
Davidlohr



Re: [PATCH -next] memregion: Add arch_flush_memregion() interface

2022-09-07 Thread Davidlohr Bueso

Not sure of the proper way to route this (akpm?). But unless there are any
remaining objections, could this be picked up?

Thanks,
Davidlohr



Re: [PATCH -next] memregion: Add arch_flush_memregion() interface

2022-09-07 Thread Davidlohr Bueso

On Wed, 07 Sep 2022, Borislav Petkov wrote:


On Mon, Aug 29, 2022 at 02:29:18PM -0700, Davidlohr Bueso wrote:

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 1abd5438f126..18463cb704fb 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -330,6 +330,20 @@ void arch_invalidate_pmem(void *addr, size_t size)
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 #endif

+#ifdef CONFIG_ARCH_HAS_MEMREGION_INVALIDATE
+bool arch_has_flush_memregion(void)
+{
+   return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);


This looks really weird. Why does this need to care about HV at all?


So the context here is:

e2efb6359e62 ("ACPICA: Avoid cache flush inside virtual machines")



Does that nfit stuff even run in guests?


No, nor does cxl. This was mostly in general a precautionary check such
that the api is unavailable in VMs.




+EXPORT_SYMBOL(arch_has_flush_memregion);


...


+EXPORT_SYMBOL(arch_flush_memregion);


Why aren't those exports _GPL?


Fine by me.

Thanks,
Davidlohr



Re: [PATCH -next] memregion: Add arch_flush_memregion() interface

2022-09-07 Thread Davidlohr Bueso

On Wed, 07 Sep 2022, Dan Williams wrote:


Davidlohr Bueso wrote:

On Wed, 07 Sep 2022, Borislav Petkov wrote:

>On Mon, Aug 29, 2022 at 02:29:18PM -0700, Davidlohr Bueso wrote:
>> diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
>> index 1abd5438f126..18463cb704fb 100644
>> --- a/arch/x86/mm/pat/set_memory.c
>> +++ b/arch/x86/mm/pat/set_memory.c
>> @@ -330,6 +330,20 @@ void arch_invalidate_pmem(void *addr, size_t size)
>>  EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
>>  #endif
>>
>> +#ifdef CONFIG_ARCH_HAS_MEMREGION_INVALIDATE
>> +bool arch_has_flush_memregion(void)
>> +{
>> +  return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
>
>This looks really weird. Why does this need to care about HV at all?

So the context here is:

e2efb6359e62 ("ACPICA: Avoid cache flush inside virtual machines")

>
>Does that nfit stuff even run in guests?

No, nor does cxl. This was mostly in general a precautionary check such
that the api is unavailable in VMs.


To be clear, nfit stuff and CXL do run in guests, but they do not
support secure-erase in a guest.


Yes, I meant the feats this api enables.


However, the QEMU CXL enabling is building the ability to do *guest
physical* address space management, but in that case the driver can be
paravirtualized to realize that it is not managing host-physical address
space and does not need to flush caches. That will need some indicator
to differentiate virtual CXL memory expanders from assigned devices. Is
there such a thing as a PCIe-virtio extended capability to differentiate
physical vs emulated devices?


In any case such check would be specific to each user (cxl in this case),
and outside the scope of _this_ particular api. Here we just really want to
avoid the broken TDX guest bits.

Thanks,
Davidlohr



[PATCH -next] memregion: Add arch_flush_memregion() interface

2022-08-29 Thread Davidlohr Bueso
With CXL security features, the nvdimm requirement to globally flush CPU
caches is no longer specific to that subsystem, even beyond the scope of
security_ops. CXL will need such semantics for features not necessarily
limited to persistent memory.

The functionality this is enabling is to be able to instantaneously
secure erase potentially terabytes of memory at once and the kernel
needs to be sure that none of the data from before the secure erase is still
present in the cache. It is also used when unlocking a memory device
where speculative reads and firmware accesses could have cached poison
from before the device was unlocked.

This capability is typically only used once per-boot (for unlock), or
once per bare metal provisioning event (secure erase), like when handing
off the system to another tenant or decommissioning a device.

Users must first call arch_has_flush_memregion() to know whether this
functionality is available on the architecture. Only enable it on x86-64
via the wbinvd() hammer.
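
For clarity, the expected usage is a capability check followed by the flush
over the affected physical range. A minimal sketch of a caller (the function
name and range arguments below are hypothetical):

#include <linux/memregion.h>

/* Hypothetical: invalidate caches after erasing a physical range. */
static int example_post_erase_flush(phys_addr_t base, resource_size_t size)
{
        if (!arch_has_flush_memregion())
                return -EINVAL; /* e.g. running in a VM or on an unsupported arch */

        arch_flush_memregion(base, size);
        return 0;
}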

Signed-off-by: Davidlohr Bueso 
---

Changes from v2 
(https://lore.kernel.org/all/20220819171024.1766857-1-d...@stgolabs.net/):
- Redid to use memregion based interfaces + VMM check on x86 (Dan)
- Restricted the flushing to x86-64.

Note: Since we still are dealing with a physical "range" at this level,
added the spa range for nfit even though this is unused.

 arch/x86/Kconfig |  1 +
 arch/x86/mm/pat/set_memory.c | 14 +++
 drivers/acpi/nfit/intel.c| 45 ++--
 include/linux/memregion.h| 25 
 lib/Kconfig  |  3 +++
 5 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..594e6b6a4925 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -81,6 +81,7 @@ config X86
select ARCH_HAS_KCOVif X86_64
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
+   select ARCH_HAS_MEMREGION_INVALIDATEif X86_64
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_APIif X86_64
select ARCH_HAS_PTE_DEVMAP  if X86_64
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 1abd5438f126..18463cb704fb 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -330,6 +330,20 @@ void arch_invalidate_pmem(void *addr, size_t size)
 EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
 #endif
 
+#ifdef CONFIG_ARCH_HAS_MEMREGION_INVALIDATE
+bool arch_has_flush_memregion(void)
+{
+   return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR);
+}
+EXPORT_SYMBOL(arch_has_flush_memregion);
+
+void arch_flush_memregion(phys_addr_t phys, resource_size_t size)
+{
+   wbinvd_on_all_cpus();
+}
+EXPORT_SYMBOL(arch_flush_memregion);
+#endif
+
 static void __cpa_flush_all(void *arg)
 {
unsigned long cache = (unsigned long)arg;
diff --git a/drivers/acpi/nfit/intel.c b/drivers/acpi/nfit/intel.c
index 8dd792a55730..32e622f51cde 100644
--- a/drivers/acpi/nfit/intel.c
+++ b/drivers/acpi/nfit/intel.c
@@ -3,6 +3,7 @@
 #include 
 #include 
 #include 
+#include <linux/memregion.h>
 #include 
 #include "intel.h"
 #include "nfit.h"
@@ -190,12 +191,11 @@ static int intel_security_change_key(struct nvdimm 
*nvdimm,
}
 }
 
-static void nvdimm_invalidate_cache(void);
-
 static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
const struct nvdimm_key_data *key_data)
 {
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+   struct acpi_nfit_system_address *spa = nfit_mem->spa_dcr;
struct {
struct nd_cmd_pkg pkg;
struct nd_intel_unlock_unit cmd;
@@ -213,6 +213,9 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, &nfit_mem->dsm_mask))
return -ENOTTY;
 
+   if (!arch_has_flush_memregion())
+   return -EINVAL;
+
memcpy(nd_cmd.cmd.passphrase, key_data->data,
sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
@@ -228,7 +231,7 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
}
 
/* DIMM unlocked, invalidate all CPU caches before we read it */
-   nvdimm_invalidate_cache();
+   arch_flush_memregion(spa->address, spa->length);
 
return 0;
 }
@@ -279,6 +282,7 @@ static int __maybe_unused intel_security_erase(struct 
nvdimm *nvdimm,
 {
int rc;
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
+   struct acpi_nfit_system_address *spa = nfit_mem->spa_dcr;
unsigned int cmd = ptype == NVDIMM_MASTER ?
NVDIMM_INTEL_MASTER_SECURE_ERASE : NVDIMM_INTEL_SECURE_ERASE;
struct {
@@ -297,8 +301,11 @@ static int __maybe_unused intel_security_erase(struct 

Re: [PATCH v2] arch/cacheflush: Introduce flush_all_caches()

2022-08-23 Thread Davidlohr Bueso

On Mon, 22 Aug 2022, Dan Williams wrote:


Davidlohr Bueso wrote:

On Sun, 21 Aug 2022, Christoph Hellwig wrote:

>On Fri, Aug 19, 2022 at 10:10:24AM -0700, Davidlohr Bueso wrote:
>> index b192d917a6d0..ac4d4fd4e508 100644
>> --- a/arch/x86/include/asm/cacheflush.h
>> +++ b/arch/x86/include/asm/cacheflush.h
>> @@ -10,4 +10,8 @@
>>
>>  void clflush_cache_range(void *addr, unsigned int size);
>>
>> +/* see comments in the stub version */
>> +#define flush_all_caches() \
>> +  do { wbinvd_on_all_cpus(); } while(0)
>
>Yikes.  This is just a horrible, horrible name and placement for a bad
>hack that should have no generic relevance.

Why does this have no generic relevance? There have already been discussions
on how much wbinvd is hated[0].

>Please fix up the naming to make it clear that this function is for a
>very specific nvdimm use case, and move it to a nvdimm-specific header
>file.

Do you have any suggestions for a name? And, as the changelog describes,
this is not nvdimm specific anymore, and the whole point of all this is
volatile memory components for cxl, hence nvdimm namespace is bogus.

[0] 
https://lore.kernel.org/all/yvtc2u1j%2fqip8...@worktop.programming.kicks-ass.net/


While it is not nvdimm specific anymore, it's still specific to "memory
devices that can bulk invalidate a physical address space". I.e. it's
not as generic as its location in arch/x86/include/asm/cacheflush.h
would imply. So, similar to arch_invalidate_pmem(), lets keep it in a
device-driver-specific header file, because hch and peterz are right, we
need to make this much more clear that it is not for general
consumption.


Fine, I won't argue - although I don't particularly agree, at least wrt
the naming. Imo my naming does _exactly_ what it should do and is much
easier to read than arch_has_flush_memregion(), which is counterintuitive
when we are in fact flushing everything. Nor does it make it any clearer
about virt vs physical mappings (except that it's no longer associated
with cacheflush). But, excepting arm's cacheflush.h and its rare archs with
braino caches, users get way too much credit in their namespace usage.

But yes, there is no doubt that my version is more inviting than it should be,
which made me think of naming it flush_all_caches_careful() so the user
is forced to at least check the function (or so one would hope).

Anyway, I'll send a new version based on the below - I particularly agree
with the hypervisor bits.

Thanks,
Davidlohr



Re: [PATCH v2] arch/cacheflush: Introduce flush_all_caches()

2022-08-22 Thread Davidlohr Bueso

On Sun, 21 Aug 2022, Christoph Hellwig wrote:


On Fri, Aug 19, 2022 at 10:10:24AM -0700, Davidlohr Bueso wrote:

index b192d917a6d0..ac4d4fd4e508 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -10,4 +10,8 @@

 void clflush_cache_range(void *addr, unsigned int size);

+/* see comments in the stub version */
+#define flush_all_caches() \
+   do { wbinvd_on_all_cpus(); } while(0)


Yikes.  This is just a horrible, horrible name and placement for a bad
hack that should have no generic relevance.


Why does this have no generic relevance? There have already been discussions
on how much wbinvd is hated[0].


Please fix up the naming to make it clear that this function is for a
very specific nvdimm use case, and move it to a nvdimm-specific header
file.


Do you have any suggestions for a name? And, as the changelog describes,
this is not nvdimm specific anymore, and the whole point of all this is
volatile memory components for cxl, hence nvdimm namespace is bogus.

[0] 
https://lore.kernel.org/all/yvtc2u1j%2fqip8...@worktop.programming.kicks-ass.net/

Thanks,
Davidlohr



Re: [PATCH v2] arch/cacheflush: Introduce flush_all_caches()

2022-08-20 Thread Davidlohr Bueso

On Fri, 19 Aug 2022, Ira Weiny wrote:


Did you mean "must"?


Yep.


+ * such as those which caches are in a consistent state. The
+ * caller can verify the situation early on.
+ */
+#ifndef flush_all_caches
+# define flush_all_caches_capable() false
+static inline void flush_all_caches(void)
+{
+   WARN_ON_ONCE("cache invalidation required\n");
+}


With the addition of flush_all_caches_capable() will flush_all_caches() ever be
called?


No, it should not. Hence you get a splat if you call it bogusly.



[PATCH v2] arch/cacheflush: Introduce flush_all_caches()

2022-08-19 Thread Davidlohr Bueso
With CXL security features, the nvdimm requirement to globally flush CPU
caches is no longer specific to that subsystem, even beyond the scope of
security_ops. CXL will need such semantics for features not necessarily
limited to persistent memory.

The functionality this is enabling is to be able to instantaneously
secure erase potentially terabytes of memory at once and the kernel
needs to be sure that none of the data from before the secure erase is still
present in the cache. It is also used when unlocking a memory device
where speculative reads and firmware accesses could have cached poison
from before the device was unlocked.

This capability is typically only used once per-boot (for unlock), or
once per bare metal provisioning event (secure erase), like when handing
off the system to another tenant or decommissioning a device. That small
scope plus the fact that none of this is available to a VM limits the
potential damage.

While the scope of this is for physical address space, add a new
flush_all_caches() in cacheflush headers such that each architecture
can define it, when capable. For x86 just use the wbinvd hammer and
prevent any other arch from being capable.
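
The shape of that split, condensed as a sketch (the generic fallback is taken
from the stub quoted in the review above, the x86 define from the diff below;
the x86 flush_all_caches_capable() definition is assumed here for illustration
since it is not visible in the hunks):

/* include/asm-generic/cacheflush.h: fallback for incapable architectures */
#ifndef flush_all_caches
# define flush_all_caches_capable() false
static inline void flush_all_caches(void)
{
        WARN_ON_ONCE("cache invalidation required\n");
}
#endif

/* arch/x86/include/asm/cacheflush.h: x86-64 override via the wbinvd hammer */
#define flush_all_caches() \
        do { wbinvd_on_all_cpus(); } while (0)
/* assumed for the sketch: x86 also defines flush_all_caches_capable() as true */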

Signed-off-by: Davidlohr Bueso 
---

Changes from v1 
(https://lore.kernel.org/all/20220815160706.tqd42dv24tgb7x7y@offworld/):
- Added comments and improved changelog to reflect this is
  routine should be avoided and not considered a general API (Peter, Dan).

 arch/x86/include/asm/cacheflush.h |  4 +++
 drivers/acpi/nfit/intel.c | 41 ++-
 include/asm-generic/cacheflush.h  | 31 +++
 3 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h 
b/arch/x86/include/asm/cacheflush.h
index b192d917a6d0..ac4d4fd4e508 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -10,4 +10,8 @@
 
 void clflush_cache_range(void *addr, unsigned int size);
 
+/* see comments in the stub version */
+#define flush_all_caches() \
+   do { wbinvd_on_all_cpus(); } while(0)
+
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/drivers/acpi/nfit/intel.c b/drivers/acpi/nfit/intel.c
index 8dd792a55730..f2f6c31e6ab7 100644
--- a/drivers/acpi/nfit/intel.c
+++ b/drivers/acpi/nfit/intel.c
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "intel.h"
 #include "nfit.h"
 
@@ -190,8 +191,6 @@ static int intel_security_change_key(struct nvdimm *nvdimm,
}
 }
 
-static void nvdimm_invalidate_cache(void);
-
 static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
const struct nvdimm_key_data *key_data)
 {
@@ -210,6 +209,9 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
};
int rc;
 
+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, &nfit_mem->dsm_mask))
return -ENOTTY;
 
@@ -228,7 +230,7 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
}
 
/* DIMM unlocked, invalidate all CPU caches before we read it */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
 
return 0;
 }
@@ -294,11 +296,14 @@ static int __maybe_unused intel_security_erase(struct 
nvdimm *nvdimm,
},
};
 
+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(cmd, &nfit_mem->dsm_mask))
return -ENOTTY;
 
/* flush all cache before we erase DIMM */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
memcpy(nd_cmd.cmd.passphrase, key->data,
sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
@@ -318,7 +323,7 @@ static int __maybe_unused intel_security_erase(struct 
nvdimm *nvdimm,
}
 
/* DIMM erased, invalidate all CPU caches before we read it */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
return 0;
 }
 
@@ -338,6 +343,9 @@ static int __maybe_unused 
intel_security_query_overwrite(struct nvdimm *nvdimm)
},
};
 
+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(NVDIMM_INTEL_QUERY_OVERWRITE, &nfit_mem->dsm_mask))
return -ENOTTY;
 
@@ -355,7 +363,7 @@ static int __maybe_unused 
intel_security_query_overwrite(struct nvdimm *nvdimm)
}
 
/* flush all cache before we make the nvdimms available */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
return 0;
 }
 
@@ -377,11 +385,14 @@ static int __maybe_unused intel_security_overwrite(struct 
nvdimm *nvdimm,
},
};
 
+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(NVDIMM_INTEL_OVERWRITE, &nfit_mem->dsm_mask))
return -ENOTTY;
 
/* f

Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()

2022-08-16 Thread Davidlohr Bueso

On Tue, 16 Aug 2022, Dan Williams wrote:


On Tue, Aug 16, 2022 at 10:30 AM Davidlohr Bueso  wrote:


On Tue, 16 Aug 2022, Dan Williams wrote:

>Peter Zijlstra wrote:
>> On Mon, Aug 15, 2022 at 09:07:06AM -0700, Davidlohr Bueso wrote:
>> > diff --git a/arch/x86/include/asm/cacheflush.h 
b/arch/x86/include/asm/cacheflush.h
>> > index b192d917a6d0..ce2ec9556093 100644
>> > --- a/arch/x86/include/asm/cacheflush.h
>> > +++ b/arch/x86/include/asm/cacheflush.h
>> > @@ -10,4 +10,7 @@
>> >
>> >  void clflush_cache_range(void *addr, unsigned int size);
>> >
>> > +#define flush_all_caches() \
>> > +  do { wbinvd_on_all_cpus(); } while(0)
>> > +
>>
>> This is horrific... we've done our utmost best to remove all WBINVD
>> usage and here you're adding it back in the most horrible form possible
>> ?!?
>>
>> Please don't do this, do *NOT* use WBINVD.
>
>Unfortunately there are few good options here, and the changelog did
>not make clear that this is continuing legacy [1], not adding new wbinvd
>usage.

While I was hoping that it was obvious from the intel.c changes that this
was not a new wbinvd, I can certainly improve the changelog with the below.


I also think this cache_flush_region() API wants a prominent comment
clarifying the limited applicability of this API. I.e. that it is not
for general purpose usage, not for VMs, and only for select bare metal
scenarios that instantaneously invalidate wide swaths of memory.
Otherwise, I can now see how this looks like a potentially scary
expansion of the usage of wbinvd.


Sure.

Also, in the future we might be able to bypass this hammer in the presence
of persistent cpu caches.

Thanks,
Davidlohr



Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()

2022-08-16 Thread Davidlohr Bueso

On Tue, 16 Aug 2022, Dan Williams wrote:


Peter Zijlstra wrote:

On Mon, Aug 15, 2022 at 09:07:06AM -0700, Davidlohr Bueso wrote:
> diff --git a/arch/x86/include/asm/cacheflush.h 
b/arch/x86/include/asm/cacheflush.h
> index b192d917a6d0..ce2ec9556093 100644
> --- a/arch/x86/include/asm/cacheflush.h
> +++ b/arch/x86/include/asm/cacheflush.h
> @@ -10,4 +10,7 @@
>
>  void clflush_cache_range(void *addr, unsigned int size);
>
> +#define flush_all_caches() \
> +  do { wbinvd_on_all_cpus(); } while(0)
> +

This is horrific... we've done our utmost best to remove all WBINVD
usage and here you're adding it back in the most horrible form possible
?!?

Please don't do this, do *NOT* use WBINVD.


Unfortunately there are few good options here, and the changelog did
not make clear that this is continuing legacy [1], not adding new wbinvd
usage.


While I was hoping that it was obvious from the intel.c changes that this
was not a new wbinvd, I can certainly improve the changelog with the below.

Thanks,
Davidlohr



The functionality this is enabling is to be able to instantaneously
secure erase potentially terabytes of memory at once and the kernel
needs to be sure that none of the data from before the secure erase is still
present in the cache. It is also used when unlocking a memory device
where speculative reads and firmware accesses could have cached poison
from before the device was unlocked.

This capability is typically only used once per-boot (for unlock), or
once per bare metal provisioning event (secure erase), like when handing
off the system to another tenant. That small scope plus the fact that
none of this is available to a VM limits the potential damage. So,
similar to the mitigation we did in [2] that did not kill off wbinvd
completely, this is limited to specific scenarios and should be disabled
in any scenario where wbinvd is painful / forbidden.

[1]: 4c6926a23b76 ("acpi/nfit, libnvdimm: Add unlock of nvdimm support for Intel 
DIMMs")
[2]: e2efb6359e62 ("ACPICA: Avoid cache flush inside virtual machines")




[PATCH] arch/cacheflush: Introduce flush_all_caches()

2022-08-15 Thread Davidlohr Bueso

With CXL security features, the nvdimm requirement to globally
flush CPU caches is no longer specific to that subsystem, even
beyond the scope of security_ops. CXL will need such semantics
for features not necessarily limited to persistent memory.

While the scope of this is for physical address space, add a
new flush_all_caches() in cacheflush headers such that each
architecture can define it, when capable. For x86 just use the
wbinvd hammer and prevent any other arch from being capable.
While there can be performance penalties or delayed response
times, these calls are both rare and explicitly security
related, so the cost matters less.

Signed-off-by: Davidlohr Bueso 
---

After a few iterations I circled back to an interface without granularity.
It just doesn't make sense right now to define a range if arm64 will not
support this (won't do VA-based physical address space flushes) and, until
it comes up with consistent caches, security operations will simply be
unsupported.

 arch/x86/include/asm/cacheflush.h |  3 +++
 drivers/acpi/nfit/intel.c | 41 ++-
 include/asm-generic/cacheflush.h  | 22 +
 3 files changed, 43 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/cacheflush.h 
b/arch/x86/include/asm/cacheflush.h
index b192d917a6d0..ce2ec9556093 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -10,4 +10,7 @@

 void clflush_cache_range(void *addr, unsigned int size);

+#define flush_all_caches() \
+   do { wbinvd_on_all_cpus(); } while(0)
+
 #endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/drivers/acpi/nfit/intel.c b/drivers/acpi/nfit/intel.c
index 8dd792a55730..f2f6c31e6ab7 100644
--- a/drivers/acpi/nfit/intel.c
+++ b/drivers/acpi/nfit/intel.c
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "intel.h"
 #include "nfit.h"

@@ -190,8 +191,6 @@ static int intel_security_change_key(struct nvdimm *nvdimm,
}
 }

-static void nvdimm_invalidate_cache(void);
-
 static int __maybe_unused intel_security_unlock(struct nvdimm *nvdimm,
const struct nvdimm_key_data *key_data)
 {
@@ -210,6 +209,9 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
};
int rc;

+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(NVDIMM_INTEL_UNLOCK_UNIT, &nfit_mem->dsm_mask))
return -ENOTTY;

@@ -228,7 +230,7 @@ static int __maybe_unused intel_security_unlock(struct 
nvdimm *nvdimm,
}

/* DIMM unlocked, invalidate all CPU caches before we read it */
-   nvdimm_invalidate_cache();
+   flush_all_caches();

return 0;
 }
@@ -294,11 +296,14 @@ static int __maybe_unused intel_security_erase(struct 
nvdimm *nvdimm,
},
};

+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(cmd, &nfit_mem->dsm_mask))
return -ENOTTY;

/* flush all cache before we erase DIMM */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
memcpy(nd_cmd.cmd.passphrase, key->data,
sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
@@ -318,7 +323,7 @@ static int __maybe_unused intel_security_erase(struct 
nvdimm *nvdimm,
}

/* DIMM erased, invalidate all CPU caches before we read it */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
return 0;
 }

@@ -338,6 +343,9 @@ static int __maybe_unused 
intel_security_query_overwrite(struct nvdimm *nvdimm)
},
};

+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(NVDIMM_INTEL_QUERY_OVERWRITE, &nfit_mem->dsm_mask))
return -ENOTTY;

@@ -355,7 +363,7 @@ static int __maybe_unused 
intel_security_query_overwrite(struct nvdimm *nvdimm)
}

/* flush all cache before we make the nvdimms available */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
return 0;
 }

@@ -377,11 +385,14 @@ static int __maybe_unused intel_security_overwrite(struct 
nvdimm *nvdimm,
},
};

+   if (!flush_all_caches_capable())
+   return -EINVAL;
+
if (!test_bit(NVDIMM_INTEL_OVERWRITE, &nfit_mem->dsm_mask))
return -ENOTTY;

/* flush all cache before we erase DIMM */
-   nvdimm_invalidate_cache();
+   flush_all_caches();
memcpy(nd_cmd.cmd.passphrase, nkey->data,
sizeof(nd_cmd.cmd.passphrase));
rc = nvdimm_ctl(nvdimm, ND_CMD_CALL, &nd_cmd, sizeof(nd_cmd), NULL);
@@ -401,22 +412,6 @@ static int __maybe_unused intel_security_overwrite(struct 
nvdimm *nvdimm,
}
 }

-/*
- * TODO: define a cross arch wbinvd equivalent when/if
- * NVDIMM_FAMILY_INTEL command support arrives on another arch

Re: [RFC] [PATCH] ipc/util.c: Use binary search for max_idx

2021-04-11 Thread Davidlohr Bueso

On Wed, 07 Apr 2021, Manfred Spraul wrote:


If semctl(), msgctl() and shmctl() are called with IPC_INFO, SEM_INFO,
MSG_INFO or SHM_INFO, then the return value is the index of the highest
used entry in the kernel's internal array recording information about
all SysV objects of the requested type for the current namespace.
(This information can be used with repeated ..._STAT or ..._STAT_ANY
operations to obtain information about all SysV objects on the system.)

If the current highest used entry is destroyed, then the new highest
used entry is determined by looping over all possible values.
With the introduction of IPCMNI_EXTEND_SHIFT, this could be a
loop over 16 million entries.


So the linear search, starting from the end of the range, won't suck
as long as there aren't big holes in the range from the max_idx to
the next max_idx, minimizing the number of idr_find calls - this was
accepted as the cost of caching the max_idx for O(1) access.
Of course, when not the case, the binary search can save a lot of cycles,
and I agree it's overall better than trying to predict rmid patterns.



As there is no get_last() function for idr structures:
Implement a "get_last()" using a binary search.


Right, nor do I think there are any users that try to avoid the lookup
by caching the value.



As far as I see, ipc is the only user that needs get_last(), thus
implement it in ipc/util.c and not in a central location.


I find your implementation to be both obscure and elegant :)

Some nit comments below, but I agree with the idea, with the observation
that the search will always do the worst-case number of logN loops, even
when the retval is found (in which case normal implementations would just
break out of the loop).

Acked-by: Davidlohr Bueso 



Signed-off-by: Manfred Spraul 
---
ipc/util.c | 44 +++-
1 file changed, 39 insertions(+), 5 deletions(-)

diff --git a/ipc/util.c b/ipc/util.c
index cfa0045e748d..0121bf6b2617 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -64,6 +64,7 @@
#include 
#include 
#include 
+#include <linux/log2.h>

#include 

@@ -450,6 +451,40 @@ static void ipc_kht_remove(struct ipc_ids *ids, struct 
kern_ipc_perm *ipcp)
   ipc_kht_params);
}

+/**
+ * ipc_get_maxusedidx - get highest in-use index
+ * @ids: ipc identifier set
+ * @limit: highest possible index.
+ *
+ * The function determines the highest in use index value.
+ * ipc_ids.rwsem needs to be owned by the caller.
+ * If no ipc object is allocated, then -1 is returned.
+ */
+static int ipc_get_maxusedidx(struct ipc_ids *ids, int limit)
+{
+   void *val;
+   int tmpidx;
+   int i;
+   int retval;
+
+   i = ilog2(limit+1);
+
+   retval = 0;
+   for (; i >= 0; i--) {
+   tmpidx = retval | (1<<i);
+   tmpidx = tmpidx - 1;
+   val = idr_get_next(&ids->ipcs_idr, &tmpidx);
+   if (val)
+   retval |= (1<<i);

Perhaps get rid of 'val', and just do this instead?

if (idr_get_next(...))
   retval = tmpidx;


+   }


Mostly thinking out-loud. Suppose the caller is doing RMID(10) == max_idx,
and no holes, so the next expected max_idx should be 9:

ids->max_idx == ipc_get_maxusedidx(10-1) == 9;

So this will loop from 1 << 3 to 1 << 0:

   idr_get_next(7)  == T   ==> retval = 8
   idr_get_next(11) == F
   idr_get_next(9)  == T   ==> retval = 10
   idr_get_next(10) == F

   return 10 - 1; // good

And with holes, ie: 1 2 7 10, so RMID(10) should update next to 7:

ipc_get_maxusedidx(10-1) == 7

   idr_get_next(7)  == T   ==> retval = 8
   idr_get_next(11) == F
   idr_get_next(9)  == F
   idr_get_next(8)  == F

   return 8 - 1; // good
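
To double-check those walks, here is a small standalone C model of the same
binary search, with a plain array standing in for the idr (all names below are
made up for the model, not kernel API):

#include <stdbool.h>
#include <stdio.h>

#define LIMIT 16
static bool in_use[LIMIT];

/* Model of idr_get_next(): smallest in-use index >= idx, or -1 if none. */
static int next_used(int idx)
{
        for (; idx < LIMIT; idx++)
                if (in_use[idx])
                        return idx;
        return -1;
}

/* Highest in-use index <= limit, or -1 if none; always does ilog2() probes. */
static int max_used_idx(int limit)
{
        int retval = 0;
        int i;

        for (i = 31 - __builtin_clz(limit + 1); i >= 0; i--) {
                int tmpidx = (retval | (1 << i)) - 1;

                if (next_used(tmpidx) >= 0)
                        retval |= 1 << i;
        }
        return retval - 1;
}

int main(void)
{
        in_use[1] = in_use[2] = in_use[7] = true;       /* holes: 1 2 7 */
        printf("%d\n", max_used_idx(9));                /* prints 7 */
        return 0;
}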


+   retval--;
+   return retval;


Instead, just do?
 return retval - 1;


+}



+
/**
 * ipc_rmid - remove an ipc identifier
 * @ids: ipc identifier set
@@ -468,11 +503,10 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm 
*ipcp)
ipcp->deleted = true;

if (unlikely(idx == ids->max_idx)) {
-   do {
-   idx--;
-   if (idx == -1)
-   break;
-   } while (!idr_find(&ids->ipcs_idr, idx));
+
+   idx = ids->max_idx-1;



+   if (idx >= 0)
+   idx = ipc_get_maxusedidx(ids, idx);





ids->max_idx = idx;


We already have ipc_get_maxidx(), so the naming here is a bit strange.
How about renaming ipc_get_maxusedidx() to ipc_get_nextidx() and letting it
handle the whole logic, passing along ids->max_idx without decrementing?

  if (unlikely(idx == ids->max_idx))
 ids->max_idx = ipc_get_nextidx(ids->max_idx);

Thanks,
Davidlohr


Re: [PATCH 2/2] fs/epoll: restore waking from ep_done_scan()

2021-04-05 Thread Davidlohr Bueso

On Mon, 05 Apr 2021, Andrew Morton wrote:


Tricky.  339ddb53d373 was merged in December 2019.  So do we backport
this fix?  Could any userspace code be depending upon the
post-339ddb53d373 behavior?


As with previous trouble caused by this commit, I vote for restoring the
behavior and backporting the fix, basically the equivalent of adding (which
was my intention):

Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")



Or do we just leave the post-339ddb53d373 code as-is?  Presumably the
issue is very rarely encountered, and changing it back has its own
risks.


While I also consider this scenario rare (normally new ready events can become
ready and trigger new wakeups), I'm seeing reports in real applications of task
hangs due to this change of semantics. Alternatively, users can update their 
code
to timeout in such scenarios, but it is ultimately the kernel's fault. 
Furthermore
it hasn't really been all _that_ long since the commit was merged, so I don't 
think
it merits a change in behavior.

As for the risks of restoring the behavior, afaict this only fixed a double 
wakeup
in an obscure nested epoll scenario, so I'm not too worried there sacrificing
performance for functionality. That said, there are fixes, for example 
65759097d80
(epoll: call final ep_events_available() check under the lock) that would 
perhaps
be rendered unnecessary.

Thanks,
Davidlohr


[PATCH 2/2] fs/epoll: restore waking from ep_done_scan()

2021-04-05 Thread Davidlohr Bueso
339ddb53d373 (fs/epoll: remove unnecessary wakeups of nested epoll) changed
the userspace visible behavior of exclusive waiters blocked on a common
epoll descriptor upon a single event becoming ready. Previously, all tasks
doing epoll_wait would awake, and now only one is awoken, potentially causing
missed wakeups on applications that rely on this behavior, such as Apache Qpid.

While the aforementioned commit aims at having only a single wakeup path in
ep_poll_callback (with the exception of the epoll_ctl cases), we need to restore
the wakeup in what was the old ep_scan_ready_list() such that the next thread
can be awoken, in a cascading style, after the waker's corresponding 
ep_send_events().

Signed-off-by: Davidlohr Bueso 
---
 fs/eventpoll.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 73138ea68342..1e596e1d0bba 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -657,6 +657,12 @@ static void ep_done_scan(struct eventpoll *ep,
 */
list_splice(txlist, &ep->rdllist);
__pm_relax(ep->ws);
+
+   if (!list_empty(&ep->rdllist)) {
+   if (waitqueue_active(&ep->wq))
+   wake_up(&ep->wq);
+   }
+
write_unlock_irq(&ep->lock);
 }
 
-- 
2.26.2



[PATCH 1/2] kselftest: introduce new epoll test case

2021-04-05 Thread Davidlohr Bueso
This incorporates the testcase originally reported in:

 https://bugzilla.kernel.org/show_bug.cgi?id=208943

Which ensures an event is reported to all threads blocked on the
same epoll descriptor; otherwise only a single thread will receive
the wakeup once the event becomes ready.

Signed-off-by: Davidlohr Bueso 
---
 .../filesystems/epoll/epoll_wakeup_test.c | 44 +++
 1 file changed, 44 insertions(+)

diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c 
b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
index ad7fabd575f9..65ede506305c 100644
--- a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -3449,4 +3449,48 @@ TEST(epoll63)
close(sfd[1]);
 }
 
+/*
+ *        t0   t1
+ *    (ew) \   / (ew)
+ *          e0
+ *           | (lt)
+ *          s0
+ */
+TEST(epoll64)
+{
+   pthread_t waiter[2];
+   struct epoll_event e;
+   struct epoll_mtcontext ctx = { 0 };
+
+   signal(SIGUSR1, signal_handler);
+
+   ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+   ctx.efd[0] = epoll_create(1);
+   ASSERT_GE(ctx.efd[0], 0);
+
+   e.events = EPOLLIN;
+   ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+   /*
+* main will act as the emitter once both waiter threads are
+* blocked and expects to both be awoken upon the ready event.
+*/
+   ctx.main = pthread_self();
+   ASSERT_EQ(pthread_create(&waiter[0], NULL, waiter_entry1a, &ctx), 0);
+   ASSERT_EQ(pthread_create(&waiter[1], NULL, waiter_entry1a, &ctx), 0);
+
+   usleep(10);
+   ASSERT_EQ(write(ctx.sfd[1], "w", 1), 1);
+
+   ASSERT_EQ(pthread_join(waiter[0], NULL), 0);
+   ASSERT_EQ(pthread_join(waiter[1], NULL), 0);
+
+   EXPECT_EQ(ctx.count, 2);
+
+   close(ctx.efd[0]);
+   close(ctx.sfd[0]);
+   close(ctx.sfd[1]);
+}
+
 TEST_HARNESS_MAIN
-- 
2.26.2



[PATCH 0/2] fs/epoll: restore user-visible behavior upon event ready

2021-04-05 Thread Davidlohr Bueso
Hi,

This series tries to address a change in user visible behavior,
reported in:

 https://bugzilla.kernel.org/show_bug.cgi?id=208943


Epoll does not report an event to all the threads running epoll_wait()
on the same epoll descriptor. Unsurprisingly, this was bisected back to
339ddb53d373 (fs/epoll: remove unnecessary wakeups of nested epoll), which
has had various problems in the past, beyond only nested epoll usage.

Thanks!

Davidlohr Bueso (2):
  kselftest: introduce new epoll test case
  fs/epoll: restore waking from ep_done_scan()

 fs/eventpoll.c|  6 +++
 .../filesystems/epoll/epoll_wakeup_test.c | 44 +++
 2 files changed, 50 insertions(+)

--
2.26.2



[PATCH -tip] locking: Move CONFIG_LOCK_EVENT_COUNTS into Kernel hacking section

2021-03-29 Thread Davidlohr Bueso
It's a lot more intuitive to have it in the locking section of the kernel
hacking part rather than under "General architecture-dependent options".

Signed-off-by: Davidlohr Bueso 
---
 arch/Kconfig  | 9 -
 lib/Kconfig.debug | 9 +
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index ecfd3520b676..d6f9aeaaf9f2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1113,15 +1113,6 @@ config HAVE_ARCH_PREL32_RELOCATIONS
 config ARCH_USE_MEMREMAP_PROT
bool
 
-config LOCK_EVENT_COUNTS
-   bool "Locking event counts collection"
-   depends on DEBUG_FS
-   help
- Enable light-weight counting of various locking related events
- in the system with minimal performance impact. This reduces
- the chance of application behavior change because of timing
- differences. The counts are reported via debugfs.
-
 # Select if the architecture has support for applying RELR relocations.
 config ARCH_HAS_RELR
bool
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2779c29d9981..76639ff5998c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1401,6 +1401,15 @@ config DEBUG_LOCKING_API_SELFTESTS
  The following locking APIs are covered: spinlocks, rwlocks,
  mutexes and rwsems.
 
+config LOCK_EVENT_COUNTS
+   bool "Locking event counts collection"
+   depends on DEBUG_FS
+   help
+ Enable light-weight counting of various locking related events
+ in the system with minimal performance impact. This reduces
+ the chance of application behavior change because of timing
+ differences. The counts are reported via debugfs.
+
 config LOCK_TORTURE_TEST
tristate "torture tests for locking"
depends on DEBUG_KERNEL
-- 
2.26.2



[tip: locking/core] MAINTAINERS: Add myself as futex reviewer

2021-03-28 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the locking/core branch of tip:

Commit-ID: bd9a5fc2edb0bdcb0756298daa31ddd6a02f0634
Gitweb:
https://git.kernel.org/tip/bd9a5fc2edb0bdcb0756298daa31ddd6a02f0634
Author:Davidlohr Bueso 
AuthorDate:Fri, 22 Jan 2021 09:11:01 -08:00
Committer: Thomas Gleixner 
CommitterDate: Sun, 28 Mar 2021 21:12:42 +02:00

MAINTAINERS: Add myself as futex reviewer

I'm volunteering to help review some of the pain.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Link: https://lore.kernel.org/r/20210122171101.15991-1-d...@stgolabs.net

---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index aa84121..8c31c77 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7363,6 +7363,7 @@ M:Thomas Gleixner 
 M: Ingo Molnar 
 R: Peter Zijlstra 
 R: Darren Hart 
+R: Davidlohr Bueso 
 L: linux-kernel@vger.kernel.org
 S: Maintained
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
locking/core


Re: [PATCH] drivers/block: Goodbye BLK_DEV_UMEM

2021-03-23 Thread Davidlohr Bueso

I'm also Ccing Neil, who is one of the authors.

On Tue, 23 Mar 2021, Bueso wrote:


This removes the driver on the premise that it has been unused for a long
time. This is a better approach compared to changing untestable code nobody
cares about in the first place. Similarly, the umem.com website now shows a
mere GoDaddy parking ad.

Suggested-by: Christoph Hellwig 
Signed-off-by: Davidlohr Bueso 
---
arch/mips/configs/malta_defconfig   |1 -
arch/mips/configs/malta_kvm_defconfig   |1 -
arch/mips/configs/maltaup_xpa_defconfig |1 -
drivers/block/Kconfig   |   17 -
drivers/block/Makefile  |1 -
drivers/block/umem.c| 1130 ---
drivers/block/umem.h|  132 ---
7 files changed, 1283 deletions(-)
delete mode 100644 drivers/block/umem.c
delete mode 100644 drivers/block/umem.h

diff --git a/arch/mips/configs/malta_defconfig 
b/arch/mips/configs/malta_defconfig
index 211bd3d6e6cb..9cb2cf2595e0 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -227,7 +227,6 @@ CONFIG_MTD_PHYSMAP_OF=y
CONFIG_MTD_UBI=m
CONFIG_MTD_UBI_GLUEBI=m
CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_UMEM=m
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_NBD=m
diff --git a/arch/mips/configs/malta_kvm_defconfig 
b/arch/mips/configs/malta_kvm_defconfig
index 62b1969b4f55..ab8d1df0f255 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -232,7 +232,6 @@ CONFIG_MTD_PHYSMAP_OF=y
CONFIG_MTD_UBI=m
CONFIG_MTD_UBI_GLUEBI=m
CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_UMEM=m
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_NBD=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig 
b/arch/mips/configs/maltaup_xpa_defconfig
index 636311d67a53..c93e5a39c215 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -230,7 +230,6 @@ CONFIG_MTD_PHYSMAP_OF=y
CONFIG_MTD_UBI=m
CONFIG_MTD_UBI_GLUEBI=m
CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_UMEM=m
CONFIG_BLK_DEV_LOOP=m
CONFIG_BLK_DEV_CRYPTOLOOP=m
CONFIG_BLK_DEV_NBD=m
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index fd236158f32d..56e7b8450120 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -121,23 +121,6 @@ source "drivers/block/mtip32xx/Kconfig"

source "drivers/block/zram/Kconfig"

-config BLK_DEV_UMEM
-   tristate "Micro Memory MM5415 Battery Backed RAM support"
-   depends on PCI
-   help
- Saying Y here will include support for the MM5415 family of
- battery backed (Non-volatile) RAM cards.
- <http://www.umem.com/>
-
- The cards appear as block devices that can be partitioned into
- as many as 15 partitions.
-
- To compile this driver as a module, choose M here: the
- module will be called umem.
-
- The umem driver has not yet been allocated a MAJOR number, so
- one is chosen dynamically.
-
config BLK_DEV_UBD
bool "Virtual block device"
depends on UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index e3e3f1c79a82..9878eb3dbfd5 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -24,7 +24,6 @@ obj-$(CONFIG_XILINX_SYSACE)   += xsysace.o
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
obj-$(CONFIG_SUNVDC)+= sunvdc.o

-obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
obj-$(CONFIG_BLK_DEV_NBD)   += nbd.o
obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
obj-$(CONFIG_VIRTIO_BLK)+= virtio_blk.o
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
deleted file mode 100644
index 664280f23bee..
--- a/drivers/block/umem.c
+++ /dev/null
@@ -1,1130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mm.c - Micro Memory(tm) PCI memory board block device driver - v2.3
- *
- * (C) 2001 San Mehat 
- * (C) 2001 Johannes Erdfelt 
- * (C) 2001 NeilBrown 
- *
- * This driver for the Micro Memory PCI Memory Module with Battery Backup
- * is Copyright Micro Memory Inc 2001-2002.  All rights reserved.
- *
- * This driver provides a standard block device interface for Micro Memory(tm)
- * PCI based RAM boards.
- * 10/05/01: Phap Nguyen - Rebuilt the driver
- * 10/22/01: Phap Nguyen - v2.1 Added disk partitioning
- * 29oct2001:NeilBrown   - Use make_request_fn instead of request_fn
- *   - use stand disk partitioning (so fdisk works).
- * 08nov2001:NeilBrown  - change driver name from "mm" to "umem"
- *  - incorporate into main kernel
- * 08apr2002:NeilBrown   - Move some of interrupt handle to tasklet
- *  - use spin_lock_bh instead of _irq
- *  - Never block on make_request.  queue
- *bh's instead.
- *  - unregister umem from devfs at mod unload
- *  - Change version to 2.3
- * 07

[PATCH] drivers/block: Goodbye BLK_DEV_UMEM

2021-03-23 Thread Davidlohr Bueso
This removes the driver on the premise that it has been unused for a long
time. This is a better approach compared to changing untestable code nobody
cares about in the first place. Similarly, the umem.com website now shows a
mere GoDaddy parking ad.

Suggested-by: Christoph Hellwig 
Signed-off-by: Davidlohr Bueso 
---
 arch/mips/configs/malta_defconfig   |1 -
 arch/mips/configs/malta_kvm_defconfig   |1 -
 arch/mips/configs/maltaup_xpa_defconfig |1 -
 drivers/block/Kconfig   |   17 -
 drivers/block/Makefile  |1 -
 drivers/block/umem.c| 1130 ---
 drivers/block/umem.h|  132 ---
 7 files changed, 1283 deletions(-)
 delete mode 100644 drivers/block/umem.c
 delete mode 100644 drivers/block/umem.h

diff --git a/arch/mips/configs/malta_defconfig 
b/arch/mips/configs/malta_defconfig
index 211bd3d6e6cb..9cb2cf2595e0 100644
--- a/arch/mips/configs/malta_defconfig
+++ b/arch/mips/configs/malta_defconfig
@@ -227,7 +227,6 @@ CONFIG_MTD_PHYSMAP_OF=y
 CONFIG_MTD_UBI=m
 CONFIG_MTD_UBI_GLUEBI=m
 CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_UMEM=m
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
diff --git a/arch/mips/configs/malta_kvm_defconfig 
b/arch/mips/configs/malta_kvm_defconfig
index 62b1969b4f55..ab8d1df0f255 100644
--- a/arch/mips/configs/malta_kvm_defconfig
+++ b/arch/mips/configs/malta_kvm_defconfig
@@ -232,7 +232,6 @@ CONFIG_MTD_PHYSMAP_OF=y
 CONFIG_MTD_UBI=m
 CONFIG_MTD_UBI_GLUEBI=m
 CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_UMEM=m
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
diff --git a/arch/mips/configs/maltaup_xpa_defconfig 
b/arch/mips/configs/maltaup_xpa_defconfig
index 636311d67a53..c93e5a39c215 100644
--- a/arch/mips/configs/maltaup_xpa_defconfig
+++ b/arch/mips/configs/maltaup_xpa_defconfig
@@ -230,7 +230,6 @@ CONFIG_MTD_PHYSMAP_OF=y
 CONFIG_MTD_UBI=m
 CONFIG_MTD_UBI_GLUEBI=m
 CONFIG_BLK_DEV_FD=m
-CONFIG_BLK_DEV_UMEM=m
 CONFIG_BLK_DEV_LOOP=m
 CONFIG_BLK_DEV_CRYPTOLOOP=m
 CONFIG_BLK_DEV_NBD=m
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index fd236158f32d..56e7b8450120 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -121,23 +121,6 @@ source "drivers/block/mtip32xx/Kconfig"
 
 source "drivers/block/zram/Kconfig"
 
-config BLK_DEV_UMEM
-   tristate "Micro Memory MM5415 Battery Backed RAM support"
-   depends on PCI
-   help
- Saying Y here will include support for the MM5415 family of
- battery backed (Non-volatile) RAM cards.
- <http://www.umem.com/>
-
- The cards appear as block devices that can be partitioned into
- as many as 15 partitions.
-
- To compile this driver as a module, choose M here: the
- module will be called umem.
-
- The umem driver has not yet been allocated a MAJOR number, so
- one is chosen dynamically.
-
 config BLK_DEV_UBD
bool "Virtual block device"
depends on UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index e3e3f1c79a82..9878eb3dbfd5 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -24,7 +24,6 @@ obj-$(CONFIG_XILINX_SYSACE)   += xsysace.o
 obj-$(CONFIG_CDROM_PKTCDVD)+= pktcdvd.o
 obj-$(CONFIG_SUNVDC)   += sunvdc.o
 
-obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
 obj-$(CONFIG_BLK_DEV_NBD)  += nbd.o
 obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
 obj-$(CONFIG_VIRTIO_BLK)   += virtio_blk.o
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
deleted file mode 100644
index 664280f23bee..
--- a/drivers/block/umem.c
+++ /dev/null
@@ -1,1130 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * mm.c - Micro Memory(tm) PCI memory board block device driver - v2.3
- *
- * (C) 2001 San Mehat 
- * (C) 2001 Johannes Erdfelt 
- * (C) 2001 NeilBrown 
- *
- * This driver for the Micro Memory PCI Memory Module with Battery Backup
- * is Copyright Micro Memory Inc 2001-2002.  All rights reserved.
- *
- * This driver provides a standard block device interface for Micro Memory(tm)
- * PCI based RAM boards.
- * 10/05/01: Phap Nguyen - Rebuilt the driver
- * 10/22/01: Phap Nguyen - v2.1 Added disk partitioning
- * 29oct2001:NeilBrown   - Use make_request_fn instead of request_fn
- *   - use stand disk partitioning (so fdisk works).
- * 08nov2001:NeilBrown  - change driver name from "mm" to "umem"
- *  - incorporate into main kernel
- * 08apr2002:NeilBrown   - Move some of interrupt handle to tasklet
- *  - use spin_lock_bh instead of _irq
- *  - Never block on make_request.  queue
- *bh's instead.
- *  - unregister umem from devfs at mod unload
- *  - Change version to 2.3
- * 07Nov2001:Phap Nguyen - Select pci read command: 06, 

Re: [PATCH] block/umem: convert tasklet to threaded irq

2021-03-23 Thread Davidlohr Bueso

On Tue, 23 Mar 2021, Jens Axboe wrote:


Me too, I'd be surprised if anyone has used it in... forever. We can
probably drop it - I really dislike making core changes to something
that can't even be tested. Davidlohr, assuming you had no way of
testing this change?


No, no way of testing these changes - I got here via git grep.


[PATCH] block/umem: convert tasklet to threaded irq

2021-03-22 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so. A more suitable equivalent
is to convert to a threaded irq instead and deal with the async
processing in task context.
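
For reference, the general shape of such a conversion is the following
minimal sketch (hypothetical foo_* names, not the umem driver itself):

	/* Before: the hard irq handler only polls the device and defers to a tasklet. */
	static irqreturn_t foo_interrupt(int irq, void *data)
	{
		struct foo_dev *foo = data;

		foo->status = readl(foo->regs + FOO_STATUS);
		tasklet_schedule(&foo->tasklet);
		return IRQ_HANDLED;
	}

	/* After: the deferred half becomes the irq thread function... */
	static irqreturn_t foo_thread_fn(int irq, void *data)
	{
		struct foo_dev *foo = data;

		foo_process_completions(foo);	/* runs in task context, may block */
		return IRQ_HANDLED;
	}

	/* ...and the hard irq handler asks for the thread to be woken. */
	static irqreturn_t foo_hardirq(int irq, void *data)
	{
		struct foo_dev *foo = data;

		foo->status = readl(foo->regs + FOO_STATUS);
		return IRQ_WAKE_THREAD;
	}

	/* registration replaces request_irq() + tasklet_init() */
	ret = request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
				   IRQF_SHARED, "foo", foo);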

Signed-off-by: Davidlohr Bueso 
---
 drivers/block/umem.c | 23 ++-
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 982732dbe82e..6b0a110f9233 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -120,7 +120,6 @@ struct cardinfo {
 
int  Active, Ready;
 
-   struct tasklet_struct   tasklet;
unsigned int dma_status;
 
struct {
@@ -243,7 +242,7 @@ static void dump_dmastat(struct cardinfo *card, unsigned 
int dmastat)
  * overloaded to record whether it was a read or a write.
  *
  * The interrupt handler only polls the device to clear the interrupt.
- * The processing of the result is done in a tasklet.
+ * The processing of the result is done in threaded irq.
  */
 
 static void mm_start_io(struct cardinfo *card)
@@ -405,7 +404,7 @@ static int add_bio(struct cardinfo *card)
return 1;
 }
 
-static void process_page(unsigned long data)
+static irqreturn_t process_page(int irq, void *__card)
 {
/* check if any of the requests in the page are DMA_COMPLETE,
 * and deal with them appropriately.
@@ -415,10 +414,10 @@ static void process_page(unsigned long data)
 */
struct mm_page *page;
struct bio *return_bio = NULL;
-   struct cardinfo *card = (struct cardinfo *)data;
+   struct cardinfo *card = (struct cardinfo *)__card;
unsigned int dma_status = card->dma_status;
 
-   spin_lock(&card->lock);
+   spin_lock_bh(&card->lock);
if (card->Active < 0)
goto out_unlock;
page = >mm_pages[card->Active];
@@ -493,7 +492,7 @@ static void process_page(unsigned long data)
mm_start_io(card);
}
  out_unlock:
-   spin_unlock(&card->lock);
+   spin_unlock_bh(&card->lock);
 
while (return_bio) {
struct bio *bio = return_bio;
@@ -502,6 +501,8 @@ static void process_page(unsigned long data)
bio->bi_next = NULL;
bio_endio(bio);
}
+
+   return IRQ_HANDLED;
 }
 
 static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule)
@@ -637,11 +638,10 @@ HW_TRACE(0x30);
 
/* and process the DMA descriptors */
card->dma_status = dma_status;
-   tasklet_schedule(&card->tasklet);
 
 HW_TRACE(0x36);
 
-   return IRQ_HANDLED;
+   return IRQ_WAKE_THREAD;
 }
 
 /*
@@ -891,8 +891,6 @@ static int mm_pci_probe(struct pci_dev *dev, const struct 
pci_device_id *id)
if (!card->queue)
goto failed_alloc;
 
-   tasklet_init(&card->tasklet, process_page, (unsigned long)card);
-
card->check_batteries = 0;
 
mem_present = readb(card->csr_remap + MEMCTRLSTATUS_MEMORY);
@@ -951,8 +949,8 @@ static int mm_pci_probe(struct pci_dev *dev, const struct 
pci_device_id *id)
data = ~data;
data += 1;
 
-   if (request_irq(dev->irq, mm_interrupt, IRQF_SHARED, DRIVER_NAME,
-   card)) {
+   if (request_threaded_irq(dev->irq, mm_interrupt, process_page,
+IRQF_SHARED, DRIVER_NAME, card)) {
dev_printk(KERN_ERR, &card->dev->dev,
"Unable to allocate IRQ\n");
ret = -ENODEV;
@@ -1015,7 +1013,6 @@ static void mm_pci_remove(struct pci_dev *dev)
 {
struct cardinfo *card = pci_get_drvdata(dev);
 
-   tasklet_kill(&card->tasklet);
free_irq(dev->irq, card);
iounmap(card->csr_remap);
 
-- 
2.26.2



Re: [PATCH] xsysace: Remove SYSACE driver

2021-03-22 Thread Davidlohr Bueso

Hi,

On Mon, 09 Nov 2020, Michal Simek wrote:


Sysace IP is no longer used on Xilinx PowerPC 405/440 and Microblaze
systems. The driver is not regularly tested and has very likely not been
working for quite a long time, which is why it should be removed.


Is there a reason this patch was never merged? Can the driver be removed? I ran
into this as a potential tasklet user that can be replaced/removed.

Thanks,
Davidlohr



Signed-off-by: Michal Simek 
---

Based on discussion
https://lore.kernel.org/linux-arm-kernel/5ab9a2a1-20e3-c7b2-f666-2034df436...@kernel.dk/

I have grepped the kernel and found one old ppc platform still using it. I have
included it in this patch to have a discussion about it.

---
MAINTAINERS |1 -
arch/microblaze/boot/dts/system.dts |8 -
arch/powerpc/boot/dts/icon.dts  |7 -
arch/powerpc/configs/44x/icon_defconfig |1 -
drivers/block/Kconfig   |6 -
drivers/block/Makefile  |1 -
drivers/block/xsysace.c | 1273 ---
7 files changed, 1297 deletions(-)
delete mode 100644 drivers/block/xsysace.c

diff --git a/MAINTAINERS b/MAINTAINERS
index cba8ddf87a08..38556c009758 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2741,7 +2741,6 @@ T:git https://github.com/Xilinx/linux-xlnx.git
F:  Documentation/devicetree/bindings/i2c/cdns,i2c-r1p10.yaml
F:  Documentation/devicetree/bindings/i2c/xlnx,xps-iic-2.00.a.yaml
F:  arch/arm/mach-zynq/
-F: drivers/block/xsysace.c
F:  drivers/clocksource/timer-cadence-ttc.c
F:  drivers/cpuidle/cpuidle-zynq.c
F:  drivers/edac/synopsys_edac.c
diff --git a/arch/microblaze/boot/dts/system.dts 
b/arch/microblaze/boot/dts/system.dts
index 5b236527176e..b7ee1056779e 100644
--- a/arch/microblaze/boot/dts/system.dts
+++ b/arch/microblaze/boot/dts/system.dts
@@ -310,14 +310,6 @@ RS232_Uart_1: serial@8400 {
xlnx,odd-parity = <0x0>;
xlnx,use-parity = <0x0>;
} ;
-   SysACE_CompactFlash: sysace@8360 {
-   compatible = "xlnx,xps-sysace-1.00.a";
-   interrupt-parent = <_intc_0>;
-   interrupts = < 4 2 >;
-   reg = < 0x8360 0x1 >;
-   xlnx,family = "virtex5";
-   xlnx,mem-width = <0x10>;
-   } ;
debug_module: debug@8440 {
compatible = "xlnx,mdm-1.00.d";
reg = < 0x8440 0x1 >;
diff --git a/arch/powerpc/boot/dts/icon.dts b/arch/powerpc/boot/dts/icon.dts
index fbaa60b8f87a..4fd7a4fbb4fb 100644
--- a/arch/powerpc/boot/dts/icon.dts
+++ b/arch/powerpc/boot/dts/icon.dts
@@ -197,13 +197,6 @@ partition@fa {
reg = <0x00fa 0x0006>;
};
};
-
-   SysACE_CompactFlash: sysace@1,0 {
-   compatible = "xlnx,sysace";
-   interrupt-parent = <>;
-   interrupts = <24 0x4>;
-   reg = <0x0001 0x 0x1>;
-   };
};

UART0: serial@f200 {
diff --git a/arch/powerpc/configs/44x/icon_defconfig 
b/arch/powerpc/configs/44x/icon_defconfig
index 930948a1da76..fb9a15573546 100644
--- a/arch/powerpc/configs/44x/icon_defconfig
+++ b/arch/powerpc/configs/44x/icon_defconfig
@@ -28,7 +28,6 @@ CONFIG_MTD_CFI_AMDSTD=y
CONFIG_MTD_PHYSMAP_OF=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=35000
-CONFIG_XILINX_SYSACE=y
CONFIG_SCSI=y
CONFIG_BLK_DEV_SD=y
CONFIG_SCSI_CONSTANTS=y
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ecceaaa1a66f..9cb02861298d 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -388,12 +388,6 @@ config SUNVDC

source "drivers/s390/block/Kconfig"

-config XILINX_SYSACE
-   tristate "Xilinx SystemACE support"
-   depends on 4xx || MICROBLAZE
-   help
- Include support for the Xilinx SystemACE CompactFlash interface
-
config XEN_BLKDEV_FRONTEND
tristate "Xen virtual block device support"
depends on XEN
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index e1f63117ee94..5ddd9370972a 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_ATARI_FLOPPY)+= ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM)   += z2ram.o
obj-$(CONFIG_BLK_DEV_RAM)   += brd.o
obj-$(CONFIG_BLK_DEV_LOOP)  += loop.o
-obj-$(CONFIG_XILINX_SYSACE)+= xsysace.o
obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
obj-$(CONFIG_SUNVDC)+= sunvdc.o
obj-$(CONFIG_BLK_DEV_SKD)   += skd.o
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
deleted file mode 100644
index eb8ef65778c3..

[PATCH v2] powerpc/qspinlock: Use generic smp_cond_load_relaxed

2021-03-18 Thread Davidlohr Bueso
49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed()) added
busy-waiting pausing with a preferred SMT priority pattern, lowering
the priority (reducing decode cycles) during the whole loop slowpath.

However, data shows that while this pattern works well with simple
spinlocks, queued spinlocks benefit more from being kept at medium priority,
using a cpu_relax() instead, which is a low+medium combo on powerpc.
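
For context, the generic definition this now falls back to is roughly the
following (paraphrasing include/asm-generic/barrier.h; a trimmed sketch, not
verbatim):

	#ifndef smp_cond_load_relaxed
	#define smp_cond_load_relaxed(ptr, cond_expr) ({	\
		typeof(ptr) __PTR = (ptr);			\
		__unqual_scalar_typeof(*ptr) VAL;		\
		for (;;) {					\
			VAL = READ_ONCE(*__PTR);		\
			if (cond_expr)				\
				break;				\
			cpu_relax();				\
		}						\
		(typeof(*ptr))VAL;				\
	})
	#endif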

Data is from three benchmarks on a Power9: 9008-22L 64 CPUs with
2 sockets and 8 threads per core.

1. locktorture.

This is data for the lowest and most artificial/pathological level,
with increasing thread counts pounding on the lock. Metrics are total
ops/minute. Despite some small hits in the 4-8 range, scenarios are
either neutral or favorable to this patch.

+=+==+==+===+
| # tasks | vanilla  | dirty| %diff |
+=+==+==+===+
| 2   | 46718565 | 48751350 | 4.35  |
+-+--+--+---+
| 4   | 51740198 | 50369082 | -2.65 |
+-+--+--+---+
| 8   | 63756510 | 62568821 | -1.86 |
+-+--+--+---+
| 16  | 67824531 | 70966546 | 4.63  |
+-+--+--+---+
| 32  | 53843519 | 61155508 | 13.58 |
+-+--+--+---+
| 64  | 53005778 | 53104412 | 0.18  |
+-+--+--+---+
| 128 | 53331980 | 54606910 | 2.39  |
+=+==+==+===+

2. sockperf (tcp throughput)

Here a client will do one-way throughput tests to a localhost server, with
increasing message sizes, dealing with the sk_lock. This patch is shown to put
the performance of the qspinlock back on par with that of the simple lock:

                  simple-spinlock            vanilla                 dirty
Hmean 14        73.50 (   0.00%)      54.44 * -25.93%*      73.45 *  -0.07%*
Hmean 100      654.47 (   0.00%)     385.61 * -41.08%*     771.43 *  17.87%*
Hmean 300     2719.39 (   0.00%)    2181.67 * -19.77%*    2666.50 *  -1.94%*
Hmean 500     4400.59 (   0.00%)    3390.77 * -22.95%*    4322.14 *  -1.78%*
Hmean 850     6726.21 (   0.00%)    5264.03 * -21.74%*    6863.12 *   2.04%*

3. dbench (tmpfs)

Configured to run with up to ncpusx8 clients, it shows both latency and
throughput metrics. For the latency, with the exception of the 64 case,
there is really nothing to go by:
                              vanilla                 dirty
Amean  latency-1       1.67 (   0.00%)       1.67 *   0.09%*
Amean  latency-2       2.15 (   0.00%)       2.08 *   3.36%*
Amean  latency-4       2.50 (   0.00%)       2.56 *  -2.27%*
Amean  latency-8       2.49 (   0.00%)       2.48 *   0.31%*
Amean  latency-16      2.69 (   0.00%)       2.72 *  -1.37%*
Amean  latency-32      2.96 (   0.00%)       3.04 *  -2.60%*
Amean  latency-64      7.78 (   0.00%)       8.17 *  -5.07%*
Amean  latency-512   186.91 (   0.00%)     186.41 *   0.27%*

For the dbench4 Throughput (misleading but traditional) there's a small
but rather constant improvement:

                           vanilla                 dirty
Hmean  1         849.13 (   0.00%)     851.51 *   0.28%*
Hmean  2        1664.03 (   0.00%)    1663.94 *  -0.01%*
Hmean  4        3073.70 (   0.00%)    3104.29 *   1.00%*
Hmean  8        5624.02 (   0.00%)    5694.16 *   1.25%*
Hmean  16       9169.49 (   0.00%)    9324.43 *   1.69%*
Hmean  32      11969.37 (   0.00%)   12127.09 *   1.32%*
Hmean  64      15021.12 (   0.00%)   15243.14 *   1.48%*
Hmean  512     14891.27 (   0.00%)   15162.11 *   1.82%*

Measuring the dbench4 Per-VFS Operation latency, shows some very minor
differences within the noise level, around the 0-1% ranges.

Fixes: 49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed())
Acked-by: Nicholas Piggin 
Signed-off-by: Davidlohr Bueso 
---
Changes from v1:
Added small description and labeling smp_cond_load_relaxed requested by Nick.
Added Nick's ack.

 arch/powerpc/include/asm/barrier.h   | 16 
 arch/powerpc/include/asm/qspinlock.h |  7 +++
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index aecfde829d5d..7ae29cfb06c0 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -80,22 +80,6 @@ do { 
\
___p1;  \
 })
 
-#ifdef CONFIG_PPC64
-#define smp_cond_load_relaxed(ptr, cond_expr) ({   \
-   typeof(ptr) __PTR = (ptr);  \
-   __unqual_scalar_typeof(*ptr) VAL;   \
-   VAL = READ_ONCE(*__PTR);\
-   if (unlikely(!(cond_expr))) {   \
-   spin_begin

Re: [PATCH 3/3] powerpc/qspinlock: Use generic smp_cond_load_relaxed

2021-03-18 Thread Davidlohr Bueso

On Tue, 16 Mar 2021, Nicholas Piggin wrote:


One request, could you add a comment in place that references
smp_cond_load_relaxed() so this commit can be found again if
someone looks at it? Something like this

/*
* smp_cond_load_relaxed was found to have performance problems if
* implemented with spin_begin()/spin_end().
*/


Sure, let me see where I can fit that in and send out a v2.

Similarly, but unrelated to this patch, is there any chance we could
remove the whole spin_until_cond() machinery and make it specific to
powerpc? This was introduced in 2017 and doesn't really have any users
outside of powerpc, except for these:

drivers/firmware/arm_scmi/driver.c:	spin_until_cond(scmi_xfer_done_no_timeout(cinfo, xfer, stop));
drivers/firmware/arm_scmi/shmem.c:	spin_until_cond(ioread32(&shmem->channel_status) &
drivers/net/ethernet/xilinx/ll_temac_main.c:	spin_until_cond(hard_acs_rdy_or_timeout(lp, timeout));

... which afaict only the xilinx one can actually build on powerpc.
Regardless, these could be converted to smp_cond_load_relaxed(), being
the more standard way to do optimized busy-waiting, caring more about
the family of barriers than ad-hoc SMT priorities. Of course, I have
no way of testing any of these changes.
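
For illustration, converting an open-coded busy-wait (hypothetical flag and
READY bit, not one of the drivers above) would look roughly like:

	/* open-coded busy-wait */
	while (!(READ_ONCE(*flag) & READY))
		cpu_relax();

	/* with the generic helper; VAL names the freshly loaded value */
	smp_cond_load_relaxed(flag, VAL & READY);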


I wonder if it should have a Fixes: tag to the original commit as
well.


I'm not sure either. I've actually been informed recently of other
workloads that benefit from the revert on large Power9 boxes. So I'll
go ahead and add it.



Otherwise,

Acked-by: Nicholas Piggin 


Thanks,
Davidlohr


Re: [PATCH 4/4] locking/locktorture: Fix incorrect use of ww_acquire_ctx in ww_mutex test

2021-03-16 Thread Davidlohr Bueso

On Tue, 16 Mar 2021, Waiman Long wrote:


The ww_acquire_ctx structure for ww_mutex needs to persist for a complete
lock/unlock cycle. In the ww_mutex test in locktorture, however, both
ww_acquire_init() and ww_acquire_fini() are called within the lock
function only. This causes a lockdep splat of "WARNING: Nested lock
was not taken" when lockdep is enabled in the kernel.

To fix this problem, we need to move the ww_acquire_fini() after the
ww_mutex_unlock() in torture_ww_mutex_unlock(). In other word, we need
to pass state information from the lock function to the unlock function.


Right, and afaict this _is_ the way ww_acquire_fini() should be called:

 * Releases a w/w acquire context. This must be called _after_ all acquired w/w
 * mutexes have been released with ww_mutex_unlock.


Change the writelock and writeunlock function prototypes to allow that
and change the torture_ww_mutex_lock() and torture_ww_mutex_unlock()
accordingly.


But wouldn't just making ctx a global variable be enough instead? That way
we don't deal with memory allocation for every lock/unlock operation (yuck).
Plus the ENOMEM would need to be handled/propagated accordingly - the code
really doesn't expect any failure from ->writelock().
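
For reference, the expected ww_acquire_ctx lifecycle looks like this minimal
sketch (hypothetical my_ww_class and locks; -EDEADLK/slowpath handling elided):

	struct ww_acquire_ctx ctx;

	ww_acquire_init(&ctx, &my_ww_class);

	ww_mutex_lock(&lock_a, &ctx);
	ww_mutex_lock(&lock_b, &ctx);

	/* ... critical section ... */

	ww_mutex_unlock(&lock_b);
	ww_mutex_unlock(&lock_a);

	ww_acquire_fini(&ctx);	/* only after *all* w/w mutexes are unlocked */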

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 0ab94e1f1276..606c0f6c1657 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -362,6 +362,8 @@ static DEFINE_WW_MUTEX(torture_ww_mutex_0, &torture_ww_class);
 static DEFINE_WW_MUTEX(torture_ww_mutex_1, &torture_ww_class);
 static DEFINE_WW_MUTEX(torture_ww_mutex_2, &torture_ww_class);

+static struct ww_acquire_ctx ctx;
+
 static int torture_ww_mutex_lock(void)
 __acquires(torture_ww_mutex_0)
 __acquires(torture_ww_mutex_1)
@@ -372,7 +374,6 @@ __acquires(torture_ww_mutex_2)
struct list_head link;
struct ww_mutex *lock;
} locks[3], *ll, *ln;
-   struct ww_acquire_ctx ctx;

locks[0].lock = &torture_ww_mutex_0;
list_add(&locks[0].link, &list);
@@ -403,7 +404,6 @@ __acquires(torture_ww_mutex_2)
list_move(&ll->link, &list);
}

-   ww_acquire_fini(&ctx);
return 0;
 }

@@ -415,6 +415,8 @@ __releases(torture_ww_mutex_2)
ww_mutex_unlock(&torture_ww_mutex_0);
ww_mutex_unlock(&torture_ww_mutex_1);
ww_mutex_unlock(&torture_ww_mutex_2);
+
+   ww_acquire_fini(&ctx);
 }

 static struct lock_torture_ops ww_mutex_lock_ops = {


Re: [PATCH 3/4] locking/ww_mutex: Treat ww_mutex_lock() like a trylock

2021-03-16 Thread Davidlohr Bueso

On Tue, 16 Mar 2021, Waiman Long wrote:


It was found that running the ww_mutex_lock-torture test produced the
following lockdep splat almost immediately:

[  103.892638] ==
[  103.892639] WARNING: possible circular locking dependency detected
[  103.892641] 5.12.0-rc3-debug+ #2 Tainted: G S  W
[  103.892643] --
[  103.892643] lock_torture_wr/3234 is trying to acquire lock:
[  103.892646] c0b35b10 (torture_ww_mutex_2.base){+.+.}-{3:3}, at: 
torture_ww_mutex_lock+0x316/0x720 [locktorture]
[  103.892660]
[  103.892660] but task is already holding lock:
[  103.892661] c0b35cd0 (torture_ww_mutex_0.base){+.+.}-{3:3}, at: 
torture_ww_mutex_lock+0x3e2/0x720 [locktorture]
[  103.892669]
[  103.892669] which lock already depends on the new lock.
[  103.892669]
[  103.892670]
[  103.892670] the existing dependency chain (in reverse order) is:
[  103.892671]
[  103.892671] -> #2 (torture_ww_mutex_0.base){+.+.}-{3:3}:
[  103.892675]lock_acquire+0x1c5/0x830
[  103.892682]__ww_mutex_lock.constprop.15+0x1d1/0x2e50
[  103.892687]ww_mutex_lock+0x4b/0x180
[  103.892690]torture_ww_mutex_lock+0x316/0x720 [locktorture]
[  103.892694]lock_torture_writer+0x142/0x3a0 [locktorture]
[  103.892698]kthread+0x35f/0x430
[  103.892701]ret_from_fork+0x1f/0x30
[  103.892706]
[  103.892706] -> #1 (torture_ww_mutex_1.base){+.+.}-{3:3}:
[  103.892709]lock_acquire+0x1c5/0x830
[  103.892712]__ww_mutex_lock.constprop.15+0x1d1/0x2e50
[  103.892715]ww_mutex_lock+0x4b/0x180
[  103.892717]torture_ww_mutex_lock+0x316/0x720 [locktorture]
[  103.892721]lock_torture_writer+0x142/0x3a0 [locktorture]
[  103.892725]kthread+0x35f/0x430
[  103.892727]ret_from_fork+0x1f/0x30
[  103.892730]
[  103.892730] -> #0 (torture_ww_mutex_2.base){+.+.}-{3:3}:
[  103.892733]check_prevs_add+0x3fd/0x2470
[  103.892736]__lock_acquire+0x2602/0x3100
[  103.892738]lock_acquire+0x1c5/0x830
[  103.892740]__ww_mutex_lock.constprop.15+0x1d1/0x2e50
[  103.892743]ww_mutex_lock+0x4b/0x180
[  103.892746]torture_ww_mutex_lock+0x316/0x720 [locktorture]
[  103.892749]lock_torture_writer+0x142/0x3a0 [locktorture]
[  103.892753]kthread+0x35f/0x430
[  103.892755]ret_from_fork+0x1f/0x30
[  103.892757]
[  103.892757] other info that might help us debug this:
[  103.892757]
[  103.892758] Chain exists of:
[  103.892758]   torture_ww_mutex_2.base --> torture_ww_mutex_1.base --> 
torture_ww_mutex_0.base
[  103.892758]
[  103.892763]  Possible unsafe locking scenario:
[  103.892763]
[  103.892764]CPU0CPU1
[  103.892765]
[  103.892765]   lock(torture_ww_mutex_0.base);
[  103.892767]lock(torture_ww_mutex_1.base);
[  103.892770]lock(torture_ww_mutex_0.base);
[  103.892772]   lock(torture_ww_mutex_2.base);
[  103.892774]
[  103.892774]  *** DEADLOCK ***

Since ww_mutex is supposed to be deadlock-proof if used properly, such
deadlock scenario should not happen. To avoid this false positive splat,
treat ww_mutex_lock() like a trylock().

After applying this patch, the locktorture test can run for a long time
without triggering the circular locking dependency splat.

Signed-off-by: Waiman Long 


Acked-by: Davidlohr Bueso 


Re: [PATCH 1/4] locking/ww_mutex: Simplify use_ww_ctx & ww_ctx handling

2021-03-16 Thread Davidlohr Bueso

On Tue, 16 Mar 2021, Waiman Long wrote:


The use_ww_ctx flag is passed to mutex_optimistic_spin(), but the
function doesn't use it. The frequent use of the (use_ww_ctx && ww_ctx)
combination is repetitive.


I always found that very fugly.



In fact, ww_ctx should not be used at all if !use_ww_ctx.  Simplify
ww_mutex code by dropping use_ww_ctx from mutex_optimistic_spin() and
clear ww_ctx if !use_ww_ctx. In this way, we can replace (use_ww_ctx &&
ww_ctx) by just (ww_ctx).

Signed-off-by: Waiman Long 


Acked-by: Davidlohr Bueso 


---
kernel/locking/mutex.c | 25 ++---
1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index adb935090768..622ebdfcd083 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex 
*lock)
 */
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -702,7 +702,7 @@ mutex_optimistic_spin(struct mutex *lock, struct 
ww_acquire_ctx *ww_ctx,
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
struct ww_mutex *ww;
int ret;

+   if (!use_ww_ctx)
+   ww_ctx = NULL;
+
might_sleep();

#ifdef CONFIG_DEBUG_MUTEXES
@@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
#endif

ww = container_of(lock, struct ww_mutex, base);
-   if (use_ww_ctx && ww_ctx) {
+   if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;

@@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);

if (__mutex_trylock(lock) ||
-   mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+   mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
-   if (use_ww_ctx && ww_ctx)
+   if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
@@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
 * After waiting to acquire the wait_lock, try again.
 */
if (__mutex_trylock(lock)) {
-   if (use_ww_ctx && ww_ctx)
+   if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);

goto skip_wait;
@@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
goto err;
}

-   if (use_ww_ctx && ww_ctx) {
+   if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
@@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
 * ww_mutex needs to always recheck its position since its 
waiter
 * list is not FIFO ordered.
 */
-   if ((use_ww_ctx && ww_ctx) || !first) {
+   if (ww_ctx || !first) {
first = __mutex_waiter_is_first(lock, &waiter);
if (first)
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
@@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
 * or we must see its unlock and acquire.
 */
if (__mutex_trylock(lock) ||
-   (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+   (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;

spin_lock(&lock->wait_lock);
@@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
acquired:
__set_current_state(TASK_RUNNING);

-   if (use_ww_ctx && ww_ctx) {
+   if (ww_ctx) {
/*
 * Wound-Wait; we stole the lock (!first_waiter), check the
 * waiters as anyone might want to wound us.
@@ -1068,7 +1071,7 @@ __mutex_lock_common(struct mutex *lock, long state, 
unsigned int subclass,
/* got the lock - cl

Re: [PATCH 2/2] futex: Leave the pi lock stealer in a consistent state upon successful fault

2021-03-16 Thread Davidlohr Bueso

On Tue, 16 Mar 2021, Peter Zijlstra wrote:


IIRC we made the explicit choice to never loop here. That saves having
to worry about getting stuck in in-kernel loops.

Userspace triggering the case where the futex goes corrupt is UB, after
that we have no obligation for anything to still work. It's on them,
they get to deal with the bits remaining.


I was kind of expecting this answer, honestly. After all, we are warned
about violations to the 10th:

 * [10] There is no transient state which leaves owner and user space
 *  TID out of sync. Except one error case where the kernel is denied
 *  write access to the user address, see fixup_pi_state_owner().

(btw, should we actually WARN_ON_ONCE this case such that the user is
well aware things are screwed up?)

However, as 34b1a1ce145 describes, enough care was taken about users to
protect them against spurious runaway tasks. And this is why I decided
to even send the patch; it fixes, without sacrificing performance or
additional complexity, a potentially user visible issue which could be
due to programming error. And unlike 34b1a1ce145, where a stealer that
cannot fault ends up dropping the lock, here the stealer can actually
amend things and not break semantics because of another task's stupidity.
But yeah, this could also be considered in the category of inept attempts
to fix a rotten situation.

Thanks,
Davidlohr


[tip: irq/core] tasklet: Remove tasklet_kill_immediate

2021-03-16 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the irq/core branch of tip:

Commit-ID: 3a0ade0c521a542f8a25e96ce8ea0dfaa532ac75
Gitweb:
https://git.kernel.org/tip/3a0ade0c521a542f8a25e96ce8ea0dfaa532ac75
Author:Davidlohr Bueso 
AuthorDate:Sat, 06 Mar 2021 13:36:58 -08:00
Committer: Thomas Gleixner 
CommitterDate: Tue, 16 Mar 2021 15:06:31 +01:00

tasklet: Remove tasklet_kill_immediate

Ever since RCU was converted to softirq, it has no users.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Acked-by: Paul E. McKenney 
Link: https://lore.kernel.org/r/20210306213658.12862-1-d...@stgolabs.net

---
 include/linux/interrupt.h |  1 -
 kernel/softirq.c  | 32 
 2 files changed, 33 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 76f1161..2b98156 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -716,7 +716,6 @@ static inline void tasklet_enable(struct tasklet_struct *t)
 }
 
 extern void tasklet_kill(struct tasklet_struct *t);
-extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
 void (*func)(unsigned long), unsigned long data);
 extern void tasklet_setup(struct tasklet_struct *t,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9908ec4..8b44ab9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -658,38 +658,6 @@ static void run_ksoftirqd(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * tasklet_kill_immediate is called to remove a tasklet which can already be
- * scheduled for execution on @cpu.
- *
- * Unlike tasklet_kill, this function removes the tasklet
- * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
- *
- * When this function is called, @cpu must be in the CPU_DEAD state.
- */
-void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
-{
-   struct tasklet_struct **i;
-
-   BUG_ON(cpu_online(cpu));
-   BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
-
-   if (!test_bit(TASKLET_STATE_SCHED, &t->state))
-   return;
-
-   /* CPU is dead, so no lock needed. */
-   for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
-   if (*i == t) {
-   *i = t->next;
-   /* If this was the tail element, move the tail ptr */
-   if (*i == NULL)
-   per_cpu(tasklet_vec, cpu).tail = i;
-   return;
-   }
-   }
-   BUG();
-}
-
 static int takeover_tasklets(unsigned int cpu)
 {
/* CPU is dead, so no lock needed. */


Re: [PATCH 1/2] futex: Fix irq mismatch in exit_pi_state_list()

2021-03-15 Thread Davidlohr Bueso

On Mon, 15 Mar 2021, Peter Zijlstra wrote:


Or am I reading this wrong?


No, I read it wrong. Please ignore this patch, there are rather a few
cases that do this trickery.

Thanks,
Davidlohr


[PATCH -tip 0/2] futex: Two pi fixes

2021-03-14 Thread Davidlohr Bueso
Hi,

Some unrelated fixlets found via code inspection. Please consider for v5.13.

Thanks!

Davidlohr Bueso (2):
  futex: Fix irq mismatch in exit_pi_state_list()
  futex: Leave the pi lock stealer in a consistent state upon successful
fault

 kernel/futex.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

-- 
2.26.2



[PATCH 2/2] futex: Leave the pi lock stealer in a consistent state upon successful fault

2021-03-14 Thread Davidlohr Bueso
Before 34b1a1ce145 (futex: Handle faults correctly for PI futexes) any
concurrent pi_state->owner fixup would assume that the task that fixed
things on our behalf also correctly updated the userspace value. This
is not always the case anymore, and can result in scenarios where a lock
stealer returns a successful FUTEX_LOCK_PI operation, but raced during a fault
with an enqueued top waiter in an immutable state, so the uval TID was
not updated for the stealer - breaking otherwise expected (and valid)
semantics and confusing the stealer task:

with pi_state->owner == victim.

   victim                                     stealer

   futex_lock_pi() {
     queue_me(&q, hb);
     rt_mutex_timed_futex_lock() {
                                               futex_lock_pi() {
                                                 // lock steal
                                                 rt_mutex_timed_futex_lock();
       // timeout
     }

     spin_lock(q.lock_ptr);
     fixup_owner(!locked) {
       fixup_pi_state_owner(NULL) {
         oldowner = pi_state->owner
         newowner = stealer;
        handle_err:
         // drop locks
         ret = fault_in_user_writeable() {     spin_lock(q.lock_ptr);
                                               fixup_owner(locked) {
         } // -EFAULT                            fixup_pi_state_owner(current) {
                                                   oldowner = pi_state->owner
                                                   newowner = current;
                                                  handle_err:
                                                   // drop locks
                                                   ret = fault_in_user_writeable() {

         // take locks
         if (pi_state->owner != oldowner) // false
           pi_state_update_owner(rt_mutex_owner());
                                                   } // SUCCESS
       }
     }
     // all locks dropped                          // take locks
                                                   if (pi_state->owner != oldowner) // success
                                                 } return 1;
This leaves: (pi_state == pi_mutex owner == stealer) AND (uval TID == victim).

This patch proposes that the lock stealer retry upon seeing that someone
changed pi_state->owner while all locks were dropped, provided the fault was
successful. This allows the stealer to self-fixup the user state of the lock,
albeit in the reverse order compared to traditionally updating userspace
before pi_state, but this is an extraordinary scenario.

For the case of normal fixups, this does add some unnecessary overhead
by having to deal with the userspace value when things are already ok, but
that case is pretty rare and we have already given up on performance
when releasing all locks, for faulting/blocking.

Signed-off-by: Davidlohr Bueso 
---
 kernel/futex.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index ded7af2ba87f..95ce10c4e33d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2460,7 +2460,6 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, 
struct futex_q *q,
 
case -EAGAIN:
cond_resched();
-   err = 0;
break;
 
default:
@@ -2474,11 +2473,22 @@ static int __fixup_pi_state_owner(u32 __user *uaddr, 
struct futex_q *q,
/*
 * Check if someone else fixed it for us:
 */
-   if (pi_state->owner != oldowner)
+   if (pi_state->owner != oldowner) {
+   /*
+* The change might have come from the rare immutable
+* state below, which leaves the userspace value out of
+* sync. But if we are the lock stealer and can update
+* the uval, do so, instead of reporting a successful
+* lock operation with an invalid user state.
+*/
+   if (!err && argowner == current)
+   goto retry;
+
return argowner == current;
+   }
 
/* Retry if err was -EAGAIN or the fault in succeeded */
-   if (!err)
+   if (err == -EAGAIN || !err)
goto retry;
 
/*
-- 
2.26.2



[PATCH 1/2] futex: Fix irq mismatch in exit_pi_state_list()

2021-03-14 Thread Davidlohr Bueso
The pi_mutex->wait_lock is irq safe and needs to enable local
interrupts upon unlocking, matching its corresponding
raw_spin_lock_irq().
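
In other words, the critical section is meant to pair as (sketch):

	raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);	/* disables local interrupts */
	/* ... */
	raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);	/* re-enables them on the way out */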

Fixes: c74aef2d06a9f (futex: Fix pi_state->owner serialization)
Signed-off-by: Davidlohr Bueso 
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 475055715371..ded7af2ba87f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -885,7 +885,7 @@ static void exit_pi_state_list(struct task_struct *curr)
 */
if (head->next != next) {
/* retain curr->pi_lock for the loop invariant */
-   raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+   raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
put_pi_state(pi_state);
continue;
-- 
2.26.2



[tip: locking/core] kernel/futex: Make futex_wait_requeue_pi() only call fixup_owner()

2021-03-11 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the locking/core branch of tip:

Commit-ID: a1565aa4699847febfdfd6af3bf06ca17a9e16af
Gitweb:
https://git.kernel.org/tip/a1565aa4699847febfdfd6af3bf06ca17a9e16af
Author:Davidlohr Bueso 
AuthorDate:Fri, 26 Feb 2021 09:50:27 -08:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 11 Mar 2021 19:19:17 +01:00

kernel/futex: Make futex_wait_requeue_pi() only call fixup_owner()

A small cleanup that allows for fixup_pi_state_owner() only to be called
from fixup_owner(), and make requeue_pi uniformly call fixup_owner()
regardless of the state in which the fixup is actually needed. Of course
this makes the caller's first pi_state->owner != current check redundant,
but that should't really matter.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: https://lore.kernel.org/r/20210226175029.50335-2-d...@stgolabs.net

---
 kernel/futex.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index db8002d..ee09995 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3241,15 +3241,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, 
unsigned int flags,
 * reference count.
 */
 
-   /* Check if the requeue code acquired the second futex for us. */
+   /*
+* Check if the requeue code acquired the second futex for us and do
+* any pertinent fixup.
+*/
if (!q.rt_waiter) {
-   /*
-* Got the lock. We might not be the anticipated owner if we
-* did a lock-steal - fix up the PI-state in that case.
-*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
-   ret = fixup_pi_state_owner(uaddr2, , current);
+   ret = fixup_owner(uaddr2, , true);
/*
 * Drop the reference to the pi state which
 * the requeue_pi() code acquired for us.


[tip: locking/core] kernel/futex: Move hb unlock out of unqueue_me_pi()

2021-03-11 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the locking/core branch of tip:

Commit-ID: a3f2428d2b9c9ca70f52818774a2f6e0e30a0f0b
Gitweb:
https://git.kernel.org/tip/a3f2428d2b9c9ca70f52818774a2f6e0e30a0f0b
Author:Davidlohr Bueso 
AuthorDate:Fri, 26 Feb 2021 09:50:28 -08:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 11 Mar 2021 19:19:17 +01:00

kernel/futex: Move hb unlock out of unqueue_me_pi()

This improves code readability, and makes the locking more obvious
as it becomes symmetric for the caller.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: https://lore.kernel.org/r/20210226175029.50335-3-d...@stgolabs.net

---
 kernel/futex.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index ee09995..dcd6132 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2318,19 +2318,15 @@ retry:
 
 /*
  * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
- * and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
  */
 static void unqueue_me_pi(struct futex_q *q)
-   __releases(q->lock_ptr)
 {
__unqueue_futex(q);
 
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
q->pi_state = NULL;
-
-   spin_unlock(q->lock_ptr);
 }
 
 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
@@ -2913,8 +2909,8 @@ no_block:
if (res)
ret = (res < 0) ? res : 0;
 
-   /* Unqueue and drop the lock */
unqueue_me_pi();
+   spin_unlock(q.lock_ptr);
goto out;
 
 out_unlock_put_key:
@@ -3290,8 +3286,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, 
unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
 
-   /* Unqueue and drop the lock. */
unqueue_me_pi();
+   spin_unlock(q.lock_ptr);
}
 
if (ret == -EINTR) {


[tip: locking/core] kernel/futex: Kill rt_mutex_next_owner()

2021-03-11 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the locking/core branch of tip:

Commit-ID: 9a4b99fce659c03699f1cb5003ebe7c57c084d49
Gitweb:
https://git.kernel.org/tip/9a4b99fce659c03699f1cb5003ebe7c57c084d49
Author:Davidlohr Bueso 
AuthorDate:Fri, 26 Feb 2021 09:50:26 -08:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 11 Mar 2021 19:19:17 +01:00

kernel/futex: Kill rt_mutex_next_owner()

Update wake_futex_pi() and kill the call altogether. This is possible because:

(i) The case of fixup_owner() in which the pi_mutex was stolen from the
signaled enqueued top-waiter which fails to trylock and doesn't see a
current owner of the rtmutex but needs to acknowledge a non-enqueued
higher priority waiter, which is the other alternative. This used to be
handled by rt_mutex_next_owner(), which guaranteed 
fixup_pi_state_owner('newowner')
never to be nil. Nowadays the logic is handled by an EAGAIN loop, without
the need of rt_mutex_next_owner(). Specifically:

c1e2f0eaf015 (futex: Avoid violating the 10th rule of futex)
9f5d1c336a10 (futex: Handle transient "ownerless" rtmutex state correctly)

(ii) rt_mutex_next_owner() and rt_mutex_top_waiter() are semantically
equivalent, as of:

c28d62cf52d7 (locking/rtmutex: Handle non enqueued waiters gracefully in 
remove_waiter())

So instead of keeping the call around, just use the good ole 
rt_mutex_top_waiter().
No change in semantics.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: https://lore.kernel.org/r/20210226175029.50335-1-d...@stgolabs.net

---
 kernel/futex.c  |  7 +--
 kernel/locking/rtmutex.c| 20 
 kernel/locking/rtmutex_common.h |  1 -
 3 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index e68db77..db8002d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1494,13 +1494,14 @@ static void mark_wake_futex(struct wake_q_head *wake_q, 
struct futex_q *q)
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state 
*pi_state)
 {
u32 curval, newval;
+   struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
int ret = 0;
 
-   new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-   if (WARN_ON_ONCE(!new_owner)) {
+   top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+   if (WARN_ON_ONCE(!top_waiter)) {
/*
 * As per the comment in futex_unlock_pi() this should not 
happen.
 *
@@ -1513,6 +1514,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, 
struct futex_pi_state *pi_
goto out_unlock;
}
 
+   new_owner = top_waiter->task;
+
/*
 * We pass it to the next owner. The WAITERS bit is always kept
 * enabled while there is PI state around. We cleanup the owner
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 48fff64..29f09d0 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1793,26 +1793,6 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
 }
 
 /**
- * rt_mutex_next_owner - return the next owner of the lock
- *
- * @lock: the rt lock query
- *
- * Returns the next owner of the lock or NULL
- *
- * Caller has to serialize against other accessors to the lock
- * itself.
- *
- * Special API call for PI-futex support
- */
-struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
-{
-   if (!rt_mutex_has_waiters(lock))
-   return NULL;
-
-   return rt_mutex_top_waiter(lock)->task;
-}
-
-/**
  * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
  * @lock:  the rt_mutex we were woken on
  * @to:the timeout, null if none. hrtimer should 
already have
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index ca6fb48..a5007f0 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -130,7 +130,6 @@ enum rtmutex_chainwalk {
 /*
  * PI-futex support (proxy locking functions, etc.):
  */
-extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
   struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);


[tip: locking/core] kernel/futex: Explicitly document pi_lock for pi_state owner fixup

2021-03-11 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the locking/core branch of tip:

Commit-ID: c2e4bfe0eef313eeb1c4c9d921be7a9d91d5d71a
Gitweb:
https://git.kernel.org/tip/c2e4bfe0eef313eeb1c4c9d921be7a9d91d5d71a
Author:Davidlohr Bueso 
AuthorDate:Fri, 26 Feb 2021 09:50:29 -08:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 11 Mar 2021 19:19:17 +01:00

kernel/futex: Explicitly document pi_lock for pi_state owner fixup

This seems to belong in the serialization and lifetime rules section.
pi_state_update_owner() will take the pi_mutex's owner's pi_lock to
do whatever fixup, successful or not.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Acked-by: Peter Zijlstra (Intel) 
Link: https://lore.kernel.org/r/20210226175029.50335-4-d...@stgolabs.net

---
 kernel/futex.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/futex.c b/kernel/futex.c
index dcd6132..4750557 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -981,6 +981,7 @@ static inline void exit_pi_state_list(struct task_struct 
*curr) { }
  * p->pi_lock:
  *
  * p->pi_state_list -> pi_state->list, relation
+ * pi_mutex->owner -> pi_state->owner, relation
  *
  * pi_state->refcount:
  *


Re: [PATCH 3/3] powerpc/qspinlock: Use generic smp_cond_load_relaxed

2021-03-09 Thread Davidlohr Bueso

On Tue, 09 Mar 2021, Michal Suchánek wrote:


On Mon, Mar 08, 2021 at 05:59:50PM -0800, Davidlohr Bueso wrote:

49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed()) added
busy-waiting pausing with a preferred SMT priority pattern, lowering
the priority (reducing decode cycles) during the whole loop slowpath.

However, data shows that while this pattern works well with simple

 ^^

spinlocks, queued spinlocks benefit more being kept in medium priority,
with a cpu_relax() instead, being a low+medium combo on powerpc.

...


diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index aecfde829d5d..7ae29cfb06c0 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -80,22 +80,6 @@ do { 
\
___p1;  \
 })

-#ifdef CONFIG_PPC64

Maybe it should be kept for the simple spinlock case then?


It is kept; note that simple spinlocks don't use smp_cond_load_relaxed,
but instead deal with the priorities in arch_spin_lock(), so it will
spin at low priority until it sees a chance to take the lock, at which
point it switches back to medium.
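
Roughly, the simple lock slowpath does the following (a heavily trimmed sketch
of arch/powerpc's simple_spinlock.h, shared-processor yielding elided):

	static inline void arch_spin_lock(arch_spinlock_t *lock)
	{
		while (1) {
			if (likely(__arch_spin_trylock(lock) == 0))
				break;
			do {
				HMT_low();	/* low SMT priority while the lock is busy */
			} while (unlikely(lock->slock != 0));
			HMT_medium();		/* back to medium before retrying the trylock */
		}
	}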

Thanks,
Davidlohr


[PATCH 2/3] powerpc/spinlock: Unserialize spin_is_locked

2021-03-08 Thread Davidlohr Bueso
c6f5d02b6a0f (locking/spinlocks/arm64: Remove smp_mb() from
arch_spin_is_locked()) made it pretty official that the call
semantics do not imply any sort of barriers, and any user that
gets creative must explicitly do any serialization.

This creativity, however, is nowadays pretty limited:

1. spin_unlock_wait() has been removed from the kernel in favor
of a lock/unlock combo. Furthermore, queued spinlocks have now
for a number of years no longer relied on _Q_LOCKED_VAL for the
call, but any non-zero value to indicate a locked state. There
were cases where the delayed locked store could lead to breaking
mutual exclusion with crossed locking; such as with sysv ipc and
netfilter being the most extreme.

2. The audit Andrea did verified that the remaining spin_is_locked()
callers no longer rely on such semantics. Most callers just use it to assert
a lock is taken, in a debug nature. The only user that gets cute is
NOLOCK qdisc, as of:

   96009c7d500e (sched: replace __QDISC_STATE_RUNNING bit with a spin lock)

... which ironically went in the next day after c6f5d02b6a0f. This
change replaces test_bit() with spin_is_locked() to know whether
to take the busylock heuristic to reduce contention on the main
qdisc lock. So any races against spin_is_locked() for archs that
use LL/SC for spin_lock() will be benign and not break any mutual
exclusion; furthermore, both the seqlock and busylock have the same
scope.
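
For illustration, the typical surviving use is nothing more than a sanity
check of the form (hypothetical lock, no ordering relied upon):

	WARN_ON_ONCE(!spin_is_locked(&foo->lock));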

Cc: parri.and...@gmail.com
Cc: pab...@redhat.com
Signed-off-by: Davidlohr Bueso 
---
 arch/powerpc/include/asm/qspinlock.h   | 12 
 arch/powerpc/include/asm/simple_spinlock.h |  3 +--
 2 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index 3ce1a0bee4fe..b052b0624816 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -44,18 +44,6 @@ static __always_inline void queued_spin_lock(struct 
qspinlock *lock)
 }
 #define queued_spin_lock queued_spin_lock
 
-static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
-{
-   /*
-* This barrier was added to simple spinlocks by commit 51d7d5205d338,
-* but it should now be possible to remove it, asm arm64 has done with
-* commit c6f5d02b6a0f.
-*/
-   smp_mb();
-   return atomic_read(&lock->val);
-}
-#define queued_spin_is_locked queued_spin_is_locked
-
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define SPIN_THRESHOLD (1<<15) /* not tuned */
 
diff --git a/arch/powerpc/include/asm/simple_spinlock.h 
b/arch/powerpc/include/asm/simple_spinlock.h
index 3e87258f73b1..1b935396522a 100644
--- a/arch/powerpc/include/asm/simple_spinlock.h
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -38,8 +38,7 @@ static __always_inline int 
arch_spin_value_unlocked(arch_spinlock_t lock)
 
 static inline int arch_spin_is_locked(arch_spinlock_t *lock)
 {
-   smp_mb();
-   return !arch_spin_value_unlocked(*lock);
+   return !arch_spin_value_unlocked(READ_ONCE(*lock));
 }
 
 /*
-- 
2.26.2



[PATCH 0/3] powerpc/qspinlock: Some tuning updates

2021-03-08 Thread Davidlohr Bueso
Hi,

A few updates while going through the powerpc port of the qspinlock.

Patches 1 and 2 are straightforward, while patch 3 can be considered
more of an rfc as I've only tested on a single machine, and there
could be an alternative way if it doesn't end up being nearly a
universal performance win.

Thanks!

Davidlohr Bueso (3):
  powerpc/spinlock: Define smp_mb__after_spinlock only once
  powerpc/spinlock: Unserialize spin_is_locked
  powerpc/qspinlock: Use generic smp_cond_load_relaxed

 arch/powerpc/include/asm/barrier.h | 16 
 arch/powerpc/include/asm/qspinlock.h   | 14 --
 arch/powerpc/include/asm/simple_spinlock.h |  6 +-
 arch/powerpc/include/asm/spinlock.h|  3 +++
 4 files changed, 4 insertions(+), 35 deletions(-)

--
2.26.2



[PATCH 1/3] powerpc/spinlock: Define smp_mb__after_spinlock only once

2021-03-08 Thread Davidlohr Bueso
Instead of both queued and simple spinlocks doing it. Move
it into the arch's spinlock.h.

Signed-off-by: Davidlohr Bueso 
---
 arch/powerpc/include/asm/qspinlock.h   | 2 --
 arch/powerpc/include/asm/simple_spinlock.h | 3 ---
 arch/powerpc/include/asm/spinlock.h| 3 +++
 3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index b752d34517b3..3ce1a0bee4fe 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -44,8 +44,6 @@ static __always_inline void queued_spin_lock(struct qspinlock 
*lock)
 }
 #define queued_spin_lock queued_spin_lock
 
-#define smp_mb__after_spinlock()   smp_mb()
-
 static __always_inline int queued_spin_is_locked(struct qspinlock *lock)
 {
/*
diff --git a/arch/powerpc/include/asm/simple_spinlock.h 
b/arch/powerpc/include/asm/simple_spinlock.h
index 9c3c30534333..3e87258f73b1 100644
--- a/arch/powerpc/include/asm/simple_spinlock.h
+++ b/arch/powerpc/include/asm/simple_spinlock.h
@@ -282,7 +282,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 #define arch_read_relax(lock)  rw_yield(lock)
 #define arch_write_relax(lock) rw_yield(lock)
 
-/* See include/linux/spinlock.h */
-#define smp_mb__after_spinlock()   smp_mb()
-
 #endif /* _ASM_POWERPC_SIMPLE_SPINLOCK_H */
diff --git a/arch/powerpc/include/asm/spinlock.h 
b/arch/powerpc/include/asm/spinlock.h
index 6ec72282888d..bd75872a6334 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -10,6 +10,9 @@
 #include 
 #endif
 
+/* See include/linux/spinlock.h */
+#define smp_mb__after_spinlock()   smp_mb()
+
 #ifndef CONFIG_PARAVIRT_SPINLOCKS
 static inline void pv_spinlocks_init(void) { }
 #endif
-- 
2.26.2



[PATCH 3/3] powerpc/qspinlock: Use generic smp_cond_load_relaxed

2021-03-08 Thread Davidlohr Bueso
49a7d46a06c3 (powerpc: Implement smp_cond_load_relaxed()) added
busy-waiting pausing with a preferred SMT priority pattern, lowering
the priority (reducing decode cycles) during the whole loop slowpath.

However, data shows that while this pattern works well with simple
spinlocks, queued spinlocks benefit more from being kept at medium priority,
using a cpu_relax() instead, which is a low+medium combo on powerpc.

Data is from three benchmarks on a Power9: 9008-22L 64 CPUs with
2 sockets and 8 threads per core.

1. locktorture.

This is data for the lowest and most artificial/pathological level,
with increasing thread counts pounding on the lock. Metrics are total
ops/minute. Despite some small hits in the 4-8 range, scenarios are
either neutral or favorable to this patch.

+=+==+==+===+
| # tasks | vanilla  | dirty| %diff |
+=+==+==+===+
| 2   | 46718565 | 48751350 | 4.35  |
+-+--+--+---+
| 4   | 51740198 | 50369082 | -2.65 |
+-+--+--+---+
| 8   | 63756510 | 62568821 | -1.86 |
+-+--+--+---+
| 16  | 67824531 | 70966546 | 4.63  |
+-+--+--+---+
| 32  | 53843519 | 61155508 | 13.58 |
+-+--+--+---+
| 64  | 53005778 | 53104412 | 0.18  |
+-+--+--+---+
| 128 | 53331980 | 54606910 | 2.39  |
+=+==+==+===+

2. sockperf (tcp throughput)

Here a client will do one-way throughput tests to a localhost server, with
increasing message sizes, dealing with the sk_lock. This patch is shown to put
the performance of the qspinlock back on par with that of the simple lock:

                  simple-spinlock            vanilla                 dirty
Hmean 14        73.50 (   0.00%)      54.44 * -25.93%*      73.45 *  -0.07%*
Hmean 100      654.47 (   0.00%)     385.61 * -41.08%*     771.43 *  17.87%*
Hmean 300     2719.39 (   0.00%)    2181.67 * -19.77%*    2666.50 *  -1.94%*
Hmean 500     4400.59 (   0.00%)    3390.77 * -22.95%*    4322.14 *  -1.78%*
Hmean 850     6726.21 (   0.00%)    5264.03 * -21.74%*    6863.12 *   2.04%*

3. dbench (tmpfs)

Configured to run with up to ncpusx8 clients, it shows both latency and
throughput metrics. For the latency, with the exception of the 64 case,
there is really nothing to go by:
                              vanilla                 dirty
Amean  latency-1       1.67 (   0.00%)       1.67 *   0.09%*
Amean  latency-2       2.15 (   0.00%)       2.08 *   3.36%*
Amean  latency-4       2.50 (   0.00%)       2.56 *  -2.27%*
Amean  latency-8       2.49 (   0.00%)       2.48 *   0.31%*
Amean  latency-16      2.69 (   0.00%)       2.72 *  -1.37%*
Amean  latency-32      2.96 (   0.00%)       3.04 *  -2.60%*
Amean  latency-64      7.78 (   0.00%)       8.17 *  -5.07%*
Amean  latency-512   186.91 (   0.00%)     186.41 *   0.27%*

For the dbench4 Throughput (misleading but traditional) there's a small
but rather constant improvement:

                           vanilla                 dirty
Hmean  1         849.13 (   0.00%)     851.51 *   0.28%*
Hmean  2        1664.03 (   0.00%)    1663.94 *  -0.01%*
Hmean  4        3073.70 (   0.00%)    3104.29 *   1.00%*
Hmean  8        5624.02 (   0.00%)    5694.16 *   1.25%*
Hmean  16       9169.49 (   0.00%)    9324.43 *   1.69%*
Hmean  32      11969.37 (   0.00%)   12127.09 *   1.32%*
Hmean  64      15021.12 (   0.00%)   15243.14 *   1.48%*
Hmean  512     14891.27 (   0.00%)   15162.11 *   1.82%*

Measuring the dbench4 Per-VFS Operation latency, shows some very minor
differences within the noise level, around the 0-1% ranges.

Signed-off-by: Davidlohr Bueso 
---
 arch/powerpc/include/asm/barrier.h | 16 
 1 file changed, 16 deletions(-)

diff --git a/arch/powerpc/include/asm/barrier.h 
b/arch/powerpc/include/asm/barrier.h
index aecfde829d5d..7ae29cfb06c0 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -80,22 +80,6 @@ do { 
\
___p1;  \
 })
 
-#ifdef CONFIG_PPC64
-#define smp_cond_load_relaxed(ptr, cond_expr) ({   \
-   typeof(ptr) __PTR = (ptr);  \
-   __unqual_scalar_typeof(*ptr) VAL;   \
-   VAL = READ_ONCE(*__PTR);\
-   if (unlikely(!(cond_expr))) {   \
-   spin_begin();   \
-   do {\
-   VAL = READ_ONCE(*__PTR);\
-   } while (!(cond_expr)); \
-   spin_end

[PATCH] tasklet: Remove tasklet_kill_immediate

2021-03-06 Thread Davidlohr Bueso
Ever since RCU was converted to softirq, it has no users.

Signed-off-by: Davidlohr Bueso 
---
 include/linux/interrupt.h |  1 -
 kernel/softirq.c  | 32 
 2 files changed, 33 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 967e25767153..36a2ac6baf9a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -712,7 +712,6 @@ static inline void tasklet_enable(struct tasklet_struct *t)
 }
 
 extern void tasklet_kill(struct tasklet_struct *t);
-extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
 void (*func)(unsigned long), unsigned long data);
 extern void tasklet_setup(struct tasklet_struct *t,
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9908ec4a9bfe..8b44ab9a2f69 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -658,38 +658,6 @@ static void run_ksoftirqd(unsigned int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-/*
- * tasklet_kill_immediate is called to remove a tasklet which can already be
- * scheduled for execution on @cpu.
- *
- * Unlike tasklet_kill, this function removes the tasklet
- * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
- *
- * When this function is called, @cpu must be in the CPU_DEAD state.
- */
-void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
-{
-   struct tasklet_struct **i;
-
-   BUG_ON(cpu_online(cpu));
-   BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
-
-   if (!test_bit(TASKLET_STATE_SCHED, &t->state))
-   return;
-
-   /* CPU is dead, so no lock needed. */
-   for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
-   if (*i == t) {
-   *i = t->next;
-   /* If this was the tail element, move the tail ptr */
-   if (*i == NULL)
-   per_cpu(tasklet_vec, cpu).tail = i;
-   return;
-   }
-   }
-   BUG();
-}
-
 static int takeover_tasklets(unsigned int cpu)
 {
/* CPU is dead, so no lock needed. */
-- 
2.26.2



[PATCH 2/4] kernel/futex: Make futex_wait_requeue_pi() only call fixup_owner()

2021-02-26 Thread Davidlohr Bueso
A small cleanup that allows for fixup_pi_state_owner() only to be
called from fixup_owner(), and makes requeue_pi uniformly call
fixup_owner() regardless of the state in which the fixup is actually
needed. Of course this makes the caller's first pi_state->owner != current
check redundant, but that shouldn't really matter.

Signed-off-by: Davidlohr Bueso 
---
 kernel/futex.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index db8002dbca7a..ee09995d707b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -3241,15 +3241,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, 
unsigned int flags,
 * reference count.
 */
 
-   /* Check if the requeue code acquired the second futex for us. */
+   /*
+* Check if the requeue code acquired the second futex for us and do
+* any pertinent fixup.
+*/
if (!q.rt_waiter) {
-   /*
-* Got the lock. We might not be the anticipated owner if we
-* did a lock-steal - fix up the PI-state in that case.
-*/
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
-   ret = fixup_pi_state_owner(uaddr2, , current);
+   ret = fixup_owner(uaddr2, , true);
/*
 * Drop the reference to the pi state which
 * the requeue_pi() code acquired for us.
-- 
2.26.2



[PATCH 1/4] kernel/futex: Kill rt_mutex_next_owner()

2021-02-26 Thread Davidlohr Bueso
Update wake_futex_pi() and kill the call altogether. This is possible because:

(i) The case of fixup_owner() in which the pi_mutex was stolen from the
signaled enqueued top-waiter which fails to trylock and doesn't see a
current owner of the rtmutex but needs to acknowledge an non-enqueued
higher priority waiter, which is the other alternative. This used to be
handled by rt_mutex_next_owner(), which guaranteed 
fixup_pi_state_owner('newowner')
never to be nil. Nowadays the logic is handled by an EAGAIN loop, without
the need of rt_mutex_next_owner(). Specifically:

c1e2f0eaf015 (futex: Avoid violating the 10th rule of futex)
9f5d1c336a10 (futex: Handle transient "ownerless" rtmutex state correctly)

(ii) rt_mutex_next_owner() and rt_mutex_top_waiter() are semantically
equivalent, as of:

c28d62cf52d7 (locking/rtmutex: Handle non enqueued waiters gracefully in 
remove_waiter())

So instead of keeping the call around, just use the good ole 
rt_mutex_top_waiter().
This patch implies no change in semantics.

Signed-off-by: Davidlohr Bueso 
---
 kernel/futex.c  |  7 +--
 kernel/locking/rtmutex.c| 20 
 kernel/locking/rtmutex_common.h |  1 -
 3 files changed, 5 insertions(+), 23 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..db8002dbca7a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1494,13 +1494,14 @@ static void mark_wake_futex(struct wake_q_head *wake_q, 
struct futex_q *q)
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state 
*pi_state)
 {
u32 curval, newval;
+   struct rt_mutex_waiter *top_waiter;
struct task_struct *new_owner;
bool postunlock = false;
DEFINE_WAKE_Q(wake_q);
int ret = 0;
 
-   new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-   if (WARN_ON_ONCE(!new_owner)) {
+   top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
+   if (WARN_ON_ONCE(!top_waiter)) {
/*
 * As per the comment in futex_unlock_pi() this should not 
happen.
 *
@@ -1513,6 +1514,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, 
struct futex_pi_state *pi_
goto out_unlock;
}
 
+   new_owner = top_waiter->task;
+
/*
 * We pass it to the next owner. The WAITERS bit is always kept
 * enabled while there is PI state around. We cleanup the owner
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 03b21135313c..919391c604f1 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1792,26 +1792,6 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return ret;
 }
 
-/**
- * rt_mutex_next_owner - return the next owner of the lock
- *
- * @lock: the rt lock query
- *
- * Returns the next owner of the lock or NULL
- *
- * Caller has to serialize against other accessors to the lock
- * itself.
- *
- * Special API call for PI-futex support
- */
-struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
-{
-   if (!rt_mutex_has_waiters(lock))
-   return NULL;
-
-   return rt_mutex_top_waiter(lock)->task;
-}
-
 /**
  * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
  * @lock:  the rt_mutex we were woken on
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index ca6fb489007b..a5007f00c1b7 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -130,7 +130,6 @@ enum rtmutex_chainwalk {
 /*
  * PI-futex support (proxy locking functions, etc.):
  */
-extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
 extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
   struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
-- 
2.26.2



[PATCH 4/4] kernel/futex: Explicitly document pi_lock for pi_state owner fixup

2021-02-26 Thread Davidlohr Bueso
This seems to belong in the serialization and lifetime rules section.
pi_state_update_owner() will take the pi_mutex owner's pi_lock to
do whatever fixup is needed, successful or not.

Signed-off-by: Davidlohr Bueso 
---
 kernel/futex.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/futex.c b/kernel/futex.c
index dcd6132485e1..475055715371 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -981,6 +981,7 @@ static inline void exit_pi_state_list(struct task_struct 
*curr) { }
  * p->pi_lock:
  *
  * p->pi_state_list -> pi_state->list, relation
+ * pi_mutex->owner -> pi_state->owner, relation
  *
  * pi_state->refcount:
  *
-- 
2.26.2



[PATCH 3/4] kernel/futex: Move hb unlock out of unqueue_me_pi()

2021-02-26 Thread Davidlohr Bueso
This improves code readability and makes the locking more obvious,
as it becomes symmetric for the caller.

Signed-off-by: Davidlohr Bueso 
---
 kernel/futex.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index ee09995d707b..dcd6132485e1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2318,19 +2318,15 @@ static int unqueue_me(struct futex_q *q)
 
 /*
  * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
- * and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
  */
 static void unqueue_me_pi(struct futex_q *q)
-   __releases(q->lock_ptr)
 {
__unqueue_futex(q);
 
BUG_ON(!q->pi_state);
put_pi_state(q->pi_state);
q->pi_state = NULL;
-
-   spin_unlock(q->lock_ptr);
 }
 
 static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
@@ -2913,8 +2909,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int 
flags,
if (res)
ret = (res < 0) ? res : 0;
 
-   /* Unqueue and drop the lock */
unqueue_me_pi(&q);
+   spin_unlock(q.lock_ptr);
goto out;
 
 out_unlock_put_key:
@@ -3290,8 +3286,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, 
unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
 
-   /* Unqueue and drop the lock. */
unqueue_me_pi(&q);
+   spin_unlock(q.lock_ptr);
}
 
if (ret == -EINTR) {
-- 
2.26.2



Re: [PATCH] kgdb: Remove kgdb_schedule_breakpoint()

2021-02-10 Thread Davidlohr Bueso

On Wed, 10 Feb 2021, Daniel Thompson wrote:


To the very best of my knowledge there has never been any in-tree
code that calls this function. It exists largely to support an
out-of-tree driver that provides kgdb-over-ethernet using the
netpoll API.

kgdboe has been out-of-tree for more than 10 years and I don't
recall any serious attempt to upstream it at any point in the last
five. At this stage it looks better to stop carrying this code in
the kernel and integrate the code into the out-of-tree driver
instead.

The long term trajectory for the kernel looks likely to include
effort to remove or reduce the use of tasklets (something that has
also been true for the last 10 years). Thus the main real reason
for this patch is to make explicit that the in-tree kgdb features
do not require tasklets.


I'm happy to see another user gone; I missed that it was even an option
to remove this altogether. Yeah, so in general I started sending random
patches to get rid of some tasklets after seeing the recent extensions
in 12cc923f1cc (tasklet: Introduce new initialization API), which is
really the wrong way to go imo. Some driver maintainers/authors push
back in the name of performance (albeit tasklets provide no guarantees
because of ksoftirqd, for example), some don't care as much. There are
also constantly new users being added (despite the explicit deprecation
of the api) deferring work through tasklets, which makes me wonder if the
tasklet removal is anything but a pipe dream.
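
As an aside, these conversions all follow roughly the same shape. A minimal
sketch of the pattern, using a made-up foo_dev driver (illustrative only, not
taken from any of the patches in this thread):

#include <linux/workqueue.h>
#include <linux/spinlock.h>

struct foo_dev {
	spinlock_t lock;
	struct work_struct work;	/* was: struct tasklet_struct tasklet */
};

static void foo_work_fn(struct work_struct *work)
{
	struct foo_dev *foo = container_of(work, struct foo_dev, work);

	/*
	 * Runs in process context now; plain spin_lock() is enough as long
	 * as no irq-context path takes foo->lock.
	 */
	spin_lock(&foo->lock);
	/* ... deferred processing ... */
	spin_unlock(&foo->lock);
}

static void foo_init(struct foo_dev *foo)
{
	spin_lock_init(&foo->lock);
	INIT_WORK(&foo->work, foo_work_fn);	/* was: tasklet_setup() */
}

static void foo_kick(struct foo_dev *foo)
{
	schedule_work(&foo->work);		/* was: tasklet_schedule() */
}

static void foo_teardown(struct foo_dev *foo)
{
	cancel_work_sync(&foo->work);		/* was: tasklet_kill() */
}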

Acked-by: Davidlohr Bueso 


Signed-off-by: Daniel Thompson 
---

Notes:
   During this cycle two developers have proposed tidying up the
   DECLARE_TASKLET_OLD() in the debug core. Both threads ended with a
   suggestion to remove kgdb_schedule_breakpoint() but I don't recall
   seeing a follow up patch for either thread... so I wrote it myself.


Thanks,
Davidlohr


Re: [PATCH v2 5/7] rbtree, uprobes: Use rbtree helpers

2021-01-26 Thread Davidlohr Bueso

On Mon, 25 Jan 2021, Peter Zijlstra wrote:


Reduce rbtree boilerplate by using the new helpers.


This reminds me of:

https://lore.kernel.org/lkml/20200207180305.11092-6-d...@stgolabs.net/

Would a walk optimization (list+rbtree) serve here? Not sure how big
the uprobes_trees gets.
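
For illustration, a rough sketch of what a list+rbtree combination could look
like (hypothetical node type, not the code from the link above): insertions
and lookups go through the rbtree, while full ordered walks just follow a
list_head and avoid the rb_next() pointer chasing:

#include <linux/rbtree.h>
#include <linux/list.h>

struct node {
	struct rb_node rb;		/* keyed insert/lookup */
	struct list_head entry;		/* cheap ordered full walks */
	unsigned long key;
};

static struct rb_root_cached tree = RB_ROOT_CACHED;
static LIST_HEAD(walk_list);

static void node_insert(struct node *new)
{
	struct rb_node **link = &tree.rb_root.rb_node, *parent = NULL;
	struct node *cur = NULL;
	bool leftmost = true, went_left = false;

	while (*link) {
		parent = *link;
		cur = rb_entry(parent, struct node, rb);
		went_left = new->key < cur->key;
		if (went_left) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&new->rb, parent, link);
	rb_insert_color_cached(&new->rb, &tree, leftmost);

	/*
	 * Keep the list in key order: the node we attached to is either our
	 * in-order predecessor (we went right) or successor (we went left).
	 */
	if (!cur)
		list_add(&new->entry, &walk_list);	  /* first node */
	else if (went_left)
		list_add_tail(&new->entry, &cur->entry);  /* before successor */
	else
		list_add(&new->entry, &cur->entry);	  /* after predecessor */
}

static void node_walk_all(void)
{
	struct node *n;

	list_for_each_entry(n, &walk_list, entry) {
		/* process n without touching the rbtree */
	}
}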

Thanks,
Davidlohr


Re: [PATCH v2 0/7] Generic RB-tree helpers

2021-01-26 Thread Davidlohr Bueso

On Mon, 25 Jan 2021, Peter Zijlstra wrote:


Hai all,

I found myself needing to write yet another rbtree and remembered I had these
patches gathering dust. I've had them in a git tree pretty much ever since I
posted them last and the robot is telling me they build/work/dance/sing fine.

I'm proposing to stick them in tip and get on with life. What say you?


I like consolidating this code with helpers, reduces LoC and improves
readability imo. Feel free to add my:

Acked-by: Davidlohr Bueso 


Re: [PATCH v8 09/18] virt: acrn: Introduce I/O request management

2021-01-24 Thread Davidlohr Bueso

On Fri, 22 Jan 2021, shuo.a@intel.com wrote:


From: Shuo Liu 

An I/O request of a User VM, which is constructed by the hypervisor, is
distributed by the ACRN Hypervisor Service Module to an I/O client
corresponding to the address range of the I/O request.

For each User VM, there is a shared 4-KByte memory region used for I/O
requests communication between the hypervisor and Service VM. An I/O
request is a 256-byte structure buffer, which is 'struct
acrn_io_request', that is filled by an I/O handler of the hypervisor
when a trapped I/O access happens in a User VM. ACRN userspace in the
Service VM first allocates a 4-KByte page and passes the GPA (Guest
Physical Address) of the buffer to the hypervisor. The buffer is used as
an array of 16 I/O request slots with each I/O request slot being 256
bytes. This array is indexed by vCPU ID.

An I/O client, which is 'struct acrn_ioreq_client', is responsible for
handling User VM I/O requests whose accessed GPA falls in a certain
range. Multiple I/O clients can be associated with each User VM. There
is a special client associated with each User VM, called the default
client, that handles all I/O requests that do not fit into the range of
any other I/O clients. The ACRN userspace acts as the default client for
each User VM.

The state transitions of an ACRN I/O request are as follows.

  FREE -> PENDING -> PROCESSING -> COMPLETE -> FREE -> ...

FREE: this I/O request slot is empty
PENDING: a valid I/O request is pending in this slot
PROCESSING: the I/O request is being processed
COMPLETE: the I/O request has been processed

An I/O request in COMPLETE or FREE state is owned by the hypervisor. HSM
and ACRN userspace are in charge of processing the others.
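
As a rough illustration of the ownership rule above (names made up here, not
the actual ACRN UAPI), the states and who may advance them could be modelled
as:

enum ioreq_state {
	IOREQ_STATE_FREE,	/* slot empty, owned by the hypervisor */
	IOREQ_STATE_PENDING,	/* filled by the hypervisor's I/O handler */
	IOREQ_STATE_PROCESSING,	/* claimed by the HSM / an I/O client */
	IOREQ_STATE_COMPLETE,	/* result written back, owned by the hypervisor */
};

/*
 * PENDING and PROCESSING are handled on the Service VM side (HSM or ACRN
 * userspace); FREE and COMPLETE belong to the hypervisor.
 */
static bool ioreq_owned_by_service_vm(enum ioreq_state s)
{
	return s == IOREQ_STATE_PENDING || s == IOREQ_STATE_PROCESSING;
}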

The processing flow of I/O requests is as follows:

a) The I/O handler of the hypervisor will fill an I/O request with
  PENDING state when a trapped I/O access happens in a User VM.
b) The hypervisor makes an upcall, which is a notification interrupt, to
  the Service VM.
c) The upcall handler schedules a worker to dispatch I/O requests.
d) The worker looks for the PENDING I/O requests, assigns them to
  different registered clients based on the address of the I/O accesses,
  updates their state to PROCESSING, and notifies the corresponding
  client to handle.
e) The notified client handles the assigned I/O requests.
f) The HSM updates I/O requests states to COMPLETE and notifies the
  hypervisor of the completion via hypercalls.

Signed-off-by: Shuo Liu 
Reviewed-by: Zhi Wang 
Reviewed-by: Reinette Chatre 
Cc: Zhi Wang 
Cc: Zhenyu Wang 
Cc: Yu Wang 
Cc: Reinette Chatre 
Cc: Greg Kroah-Hartman 
Cc: Davidlohr Bueso 


Thanks for respinning this with a workqueue.

Acked-by: Davidlohr Bueso 


[PATCH] MAINTAINERS: Add myself as futex reviewer

2021-01-22 Thread Davidlohr Bueso
I'm volunteering to help review some of the pain.

Signed-off-by: Davidlohr Bueso 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3978e8b21fc4..6f204358908b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7333,6 +7333,7 @@ M:Thomas Gleixner 
 M: Ingo Molnar 
 R: Peter Zijlstra 
 R: Darren Hart 
+R: Davidlohr Bueso 
 L: linux-kernel@vger.kernel.org
 S: Maintained
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
locking/core
-- 
2.26.2



[PATCH] USB: gadget: udc: Process disconnect synchronously

2021-01-18 Thread Davidlohr Bueso
As the comment in usb_disconnect() hints, do not defer the
disconnect processing, and instead just do it directly in
the irq handler. This allows the driver to avoid using a
nowadays deprecated tasklet.

Signed-off-by: Davidlohr Bueso 
---
 drivers/usb/gadget/udc/snps_udc_core.c | 30 +++---
 1 file changed, 3 insertions(+), 27 deletions(-)

diff --git a/drivers/usb/gadget/udc/snps_udc_core.c 
b/drivers/usb/gadget/udc/snps_udc_core.c
index 6c726d2e1788..d046c09fa566 100644
--- a/drivers/usb/gadget/udc/snps_udc_core.c
+++ b/drivers/usb/gadget/udc/snps_udc_core.c
@@ -36,7 +36,6 @@
 #include 
 #include "amd5536udc.h"
 
-static void udc_tasklet_disconnect(unsigned long);
 static void udc_setup_endpoints(struct udc *dev);
 static void udc_soft_reset(struct udc *dev);
 static struct udc_request *udc_alloc_bna_dummy(struct udc_ep *ep);
@@ -95,9 +94,6 @@ static struct timer_list udc_pollstall_timer;
 static int stop_pollstall_timer;
 static DECLARE_COMPLETION(on_pollstall_exit);
 
-/* tasklet for usb disconnect */
-static DECLARE_TASKLET_OLD(disconnect_tasklet, udc_tasklet_disconnect);
-
 /* endpoint names used for print */
 static const char ep0_string[] = "ep0in";
 static const struct {
@@ -1637,6 +1633,8 @@ static void usb_connect(struct udc *dev)
  */
 static void usb_disconnect(struct udc *dev)
 {
+   u32 tmp;
+
/* Return if already disconnected */
if (!dev->connected)
return;
@@ -1648,23 +1646,6 @@ static void usb_disconnect(struct udc *dev)
/* mask interrupts */
udc_mask_unused_interrupts(dev);
 
-   /* REVISIT there doesn't seem to be a point to having this
-* talk to a tasklet ... do it directly, we already hold
-* the spinlock needed to process the disconnect.
-*/
-
-   tasklet_schedule(_tasklet);
-}
-
-/* Tasklet for disconnect to be outside of interrupt context */
-static void udc_tasklet_disconnect(unsigned long par)
-{
-   struct udc *dev = udc;
-   u32 tmp;
-
-   DBG(dev, "Tasklet disconnect\n");
-   spin_lock_irq(>lock);
-
if (dev->driver) {
spin_unlock(>lock);
dev->driver->disconnect(>gadget);
@@ -1673,13 +1654,10 @@ static void udc_tasklet_disconnect(unsigned long par)
/* empty queues */
for (tmp = 0; tmp < UDC_EP_NUM; tmp++)
empty_req_queue(>ep[tmp]);
-
}
 
/* disable ep0 */
-   ep_init(dev->regs,
-   >ep[UDC_EP0IN_IX]);
-
+   ep_init(dev->regs, >ep[UDC_EP0IN_IX]);
 
if (!soft_reset_occured) {
/* init controller by soft reset */
@@ -1695,8 +1673,6 @@ static void udc_tasklet_disconnect(unsigned long par)
tmp = AMD_ADDBITS(tmp, UDC_DEVCFG_SPD_FS, UDC_DEVCFG_SPD);
writel(tmp, >regs->cfg);
}
-
-   spin_unlock_irq(>lock);
 }
 
 /* Reset the UDC core */
-- 
2.26.2



[PATCH] usb: gadget: u_serial: Remove old tasklet comments

2021-01-18 Thread Davidlohr Bueso
Update old comments as of 8b4c62aef6f (usb: gadget: u_serial: process RX
in workqueue instead of tasklet).

Signed-off-by: Davidlohr Bueso 
---
 drivers/usb/gadget/function/u_serial.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/usb/gadget/function/u_serial.c 
b/drivers/usb/gadget/function/u_serial.c
index 2caccbb6e014..6d59f44ef2cd 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -346,7 +346,7 @@ __acquires(>port_lock)
 }
 
 /*
- * RX tasklet takes data out of the RX queue and hands it up to the TTY
+ * RX work takes data out of the RX queue and hands it up to the TTY
  * layer until it refuses to take any more data (or is throttled back).
  * Then it issues reads for any further data.
  *
@@ -709,7 +709,7 @@ static void gs_close(struct tty_struct *tty, struct file 
*file)
 
/* Iff we're disconnected, there can be no I/O in flight so it's
 * ok to free the circular buffer; else just scrub it.  And don't
-* let the push tasklet fire again until we're re-opened.
+* let the push async work fire again until we're re-opened.
 */
if (gser == NULL)
kfifo_free(>port_write_buf);
-- 
2.26.2



[PATCH] usb: xhci: Replace tasklet with work

2021-01-18 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so.

dbc_rx_push() will now run in process context and have further
concurrency - tasklets being serialized among themselves, but this
is done holding the port_lock, so it should be fine.

Signed-off-by: Davidlohr Bueso 
---
 drivers/usb/host/xhci-dbgcap.h |  2 +-
 drivers/usb/host/xhci-dbgtty.c | 19 +++
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/drivers/usb/host/xhci-dbgcap.h b/drivers/usb/host/xhci-dbgcap.h
index c70b78d504eb..aec66009af51 100644
--- a/drivers/usb/host/xhci-dbgcap.h
+++ b/drivers/usb/host/xhci-dbgcap.h
@@ -104,7 +104,7 @@ struct dbc_port {
struct list_headread_pool;
struct list_headread_queue;
unsigned intn_read;
-   struct tasklet_struct   push;
+   struct work_struct  push;
 
struct list_headwrite_pool;
struct kfifowrite_fifo;
diff --git a/drivers/usb/host/xhci-dbgtty.c b/drivers/usb/host/xhci-dbgtty.c
index ae4e4ab638b5..50da77d92cf6 100644
--- a/drivers/usb/host/xhci-dbgtty.c
+++ b/drivers/usb/host/xhci-dbgtty.c
@@ -108,7 +108,7 @@ dbc_read_complete(struct xhci_dbc *dbc, struct dbc_request 
*req)
 
spin_lock_irqsave(>port_lock, flags);
list_add_tail(>list_pool, >read_queue);
-   tasklet_schedule(>push);
+   schedule_work(>push);
spin_unlock_irqrestore(>port_lock, flags);
 }
 
@@ -272,7 +272,7 @@ static void dbc_tty_unthrottle(struct tty_struct *tty)
unsigned long   flags;
 
spin_lock_irqsave(>port_lock, flags);
-   tasklet_schedule(>push);
+   schedule_work(>push);
spin_unlock_irqrestore(>port_lock, flags);
 }
 
@@ -288,15 +288,18 @@ static const struct tty_operations dbc_tty_ops = {
.unthrottle = dbc_tty_unthrottle,
 };
 
-static void dbc_rx_push(struct tasklet_struct *t)
+static void dbc_rx_push(struct work_struct *work)
 {
struct dbc_request  *req;
struct tty_struct   *tty;
unsigned long   flags;
booldo_push = false;
booldisconnect = false;
-   struct dbc_port *port = from_tasklet(port, t, push);
-   struct list_head*queue = >read_queue;
+   struct dbc_port *port;
+   struct list_head*queue;
+
+   port = container_of(work, struct dbc_port, push);
+   queue = >read_queue;
 
spin_lock_irqsave(>port_lock, flags);
tty = port->port.tty;
@@ -349,7 +352,7 @@ static void dbc_rx_push(struct tasklet_struct *t)
if (!list_empty(queue) && tty) {
if (!tty_throttled(tty)) {
if (do_push)
-   tasklet_schedule(>push);
+   schedule_work(>push);
else
pr_warn("ttyDBC0: RX not scheduled?\n");
}
@@ -382,7 +385,7 @@ xhci_dbc_tty_init_port(struct xhci_dbc *dbc, struct 
dbc_port *port)
 {
tty_port_init(>port);
spin_lock_init(>port_lock);
-   tasklet_setup(>push, dbc_rx_push);
+   INIT_WORK(>push, dbc_rx_push);
INIT_LIST_HEAD(>read_pool);
INIT_LIST_HEAD(>read_queue);
INIT_LIST_HEAD(>write_pool);
@@ -394,7 +397,7 @@ xhci_dbc_tty_init_port(struct xhci_dbc *dbc, struct 
dbc_port *port)
 static void
 xhci_dbc_tty_exit_port(struct dbc_port *port)
 {
-   tasklet_kill(>push);
+   cancel_work_sync(>push);
tty_port_destroy(>port);
 }
 
-- 
2.26.2



Re: [PATCH] mailbox: bcm: Replace tasklet with threaded irq

2021-01-14 Thread Davidlohr Bueso

On Thu, 14 Jan 2021, Jassi Brar wrote:


On Thu, Jan 14, 2021 at 6:21 PM Davidlohr Bueso  wrote:


Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so.

Use a more suitable alternative such as threaded irqs and do the
async work in process context.

Signed-off-by: Davidlohr Bueso 
---

Please cc the author and other contributors to this file, esp when
this is vendor specific code.


So looking at who to Cc I noticed:

commit 8aef00f090bcbe5237c5a6628e7c000890267efe
Author: Rob Rice 
Date:   Mon Nov 14 13:26:01 2016 -0500

mailbox: bcm-pdc: Convert from threaded IRQ to tasklet

Previously used threaded IRQs in the PDC driver to defer
processing the rx DMA ring after getting an rx done interrupt.
Instead, use a tasklet at normal priority for deferred processing.

... which is exactly the opposite of what modern Linux should be
doing. Rob, could this not be done in process context?

Thanks,
Davidlohr


[PATCH] platform/goldfish: Convert pipe tasklet to threaded irq

2021-01-14 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so. A more suitable equivalent
is to convert to a threaded irq instead and deal with the signalled
pipes in task context.

Signed-off-by: Davidlohr Bueso 
---
 drivers/platform/goldfish/goldfish_pipe.c | 28 +--
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/drivers/platform/goldfish/goldfish_pipe.c 
b/drivers/platform/goldfish/goldfish_pipe.c
index 1ab207ec9c94..b67539f9848c 100644
--- a/drivers/platform/goldfish/goldfish_pipe.c
+++ b/drivers/platform/goldfish/goldfish_pipe.c
@@ -212,9 +212,6 @@ struct goldfish_pipe_dev {
int version;
unsigned char __iomem *base;
 
-   /* an irq tasklet to run goldfish_interrupt_task */
-   struct tasklet_struct irq_tasklet;
-
struct miscdevice miscdev;
 };
 
@@ -577,10 +574,10 @@ static struct goldfish_pipe *signalled_pipes_pop_front(
return pipe;
 }
 
-static void goldfish_interrupt_task(unsigned long dev_addr)
+static irqreturn_t goldfish_interrupt_task(int irq, void *dev_addr)
 {
/* Iterate over the signalled pipes and wake them one by one */
-   struct goldfish_pipe_dev *dev = (struct goldfish_pipe_dev *)dev_addr;
+   struct goldfish_pipe_dev *dev = dev_addr;
struct goldfish_pipe *pipe;
int wakes;
 
@@ -599,13 +596,14 @@ static void goldfish_interrupt_task(unsigned long 
dev_addr)
 */
wake_up_interruptible(>wake_queue);
}
+   return IRQ_HANDLED;
 }
 
 static void goldfish_pipe_device_deinit(struct platform_device *pdev,
struct goldfish_pipe_dev *dev);
 
 /*
- * The general idea of the interrupt handling:
+ * The general idea of the (threaded) interrupt handling:
  *
  *  1. device raises an interrupt if there's at least one signalled pipe
  *  2. IRQ handler reads the signalled pipes and their count from the device
@@ -614,8 +612,8 @@ static void goldfish_pipe_device_deinit(struct 
platform_device *pdev,
  *  otherwise it leaves it raised, so IRQ handler will be called
  *  again for the next chunk
  *  4. IRQ handler adds all returned pipes to the device's signalled pipes list
- *  5. IRQ handler launches a tasklet to process the signalled pipes from the
- *  list in a separate context
+ *  5. IRQ handler defers processing the signalled pipes from the list in a
+ *  separate context
  */
 static irqreturn_t goldfish_pipe_interrupt(int irq, void *dev_id)
 {
@@ -645,8 +643,7 @@ static irqreturn_t goldfish_pipe_interrupt(int irq, void 
*dev_id)
 
spin_unlock_irqrestore(>lock, flags);
 
-   tasklet_schedule(>irq_tasklet);
-   return IRQ_HANDLED;
+   return IRQ_WAKE_THREAD;
 }
 
 static int get_free_pipe_id_locked(struct goldfish_pipe_dev *dev)
@@ -811,12 +808,10 @@ static int goldfish_pipe_device_init(struct 
platform_device *pdev,
 {
int err;
 
-   tasklet_init(>irq_tasklet, _interrupt_task,
-(unsigned long)dev);
-
-   err = devm_request_irq(>dev, dev->irq,
-  goldfish_pipe_interrupt,
-  IRQF_SHARED, "goldfish_pipe", dev);
+   err = devm_request_threaded_irq(>dev, dev->irq,
+   goldfish_pipe_interrupt,
+   goldfish_interrupt_task,
+   IRQF_SHARED, "goldfish_pipe", dev);
if (err) {
dev_err(>dev, "unable to allocate IRQ for v2\n");
return err;
@@ -874,7 +869,6 @@ static void goldfish_pipe_device_deinit(struct 
platform_device *pdev,
struct goldfish_pipe_dev *dev)
 {
misc_deregister(>miscdev);
-   tasklet_kill(>irq_tasklet);
kfree(dev->pipes);
free_page((unsigned long)dev->buffers);
 }
-- 
2.26.2



[PATCH] mailbox: bcm: Replace tasklet with threaded irq

2021-01-14 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so.

Use a more suitable alternative such as threaded irqs and do the
async work in process context.

Signed-off-by: Davidlohr Bueso 
---
 drivers/mailbox/bcm-pdc-mailbox.c | 23 +++
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/drivers/mailbox/bcm-pdc-mailbox.c 
b/drivers/mailbox/bcm-pdc-mailbox.c
index 5b375985f7b8..4718722c3c67 100644
--- a/drivers/mailbox/bcm-pdc-mailbox.c
+++ b/drivers/mailbox/bcm-pdc-mailbox.c
@@ -294,9 +294,6 @@ struct pdc_state {
 
unsigned int pdc_irq;
 
-   /* tasklet for deferred processing after DMA rx interrupt */
-   struct tasklet_struct rx_tasklet;
-
/* Number of bytes of receive status prior to each rx frame */
u32 rx_status_len;
/* Whether a BCM header is prepended to each frame */
@@ -953,23 +950,23 @@ static irqreturn_t pdc_irq_handler(int irq, void *data)
iowrite32(intstatus, pdcs->pdc_reg_vbase + PDC_INTSTATUS_OFFSET);
 
/* Wakeup IRQ thread */
-   tasklet_schedule(>rx_tasklet);
-   return IRQ_HANDLED;
+   return IRQ_WAKE_THREAD;
 }
 
 /**
- * pdc_tasklet_cb() - Tasklet callback that runs the deferred processing after
+ * pdc_task_cb() - Threaded IRQ that runs the deferred processing after
  * a DMA receive interrupt. Reenables the receive interrupt.
  * @data: PDC state structure
  */
-static void pdc_tasklet_cb(struct tasklet_struct *t)
+static irqreturn_t pdc_task_cb(int irq, void *data)
 {
-   struct pdc_state *pdcs = from_tasklet(pdcs, t, rx_tasklet);
+   struct pdc_state *pdcs = data;
 
pdc_receive(pdcs);
 
/* reenable interrupts */
iowrite32(PDC_INTMASK, pdcs->pdc_reg_vbase + PDC_INTMASK_OFFSET);
+   return IRQ_HANDLED;
 }
 
 /**
@@ -1405,8 +1402,8 @@ static int pdc_interrupts_init(struct pdc_state *pdcs)
dev_dbg(dev, "pdc device %s irq %u for pdcs %p",
dev_name(dev), pdcs->pdc_irq, pdcs);
 
-   err = devm_request_irq(dev, pdcs->pdc_irq, pdc_irq_handler, 0,
-  dev_name(dev), dev);
+   err = devm_request_threaded_irq(dev, pdcs->pdc_irq, pdc_irq_handler,
+   pdc_task_cb, 0, dev_name(dev), dev);
if (err) {
dev_err(dev, "IRQ %u request failed with err %d\n",
pdcs->pdc_irq, err);
@@ -1588,9 +1585,6 @@ static int pdc_probe(struct platform_device *pdev)
 
pdc_hw_init(pdcs);
 
-   /* Init tasklet for deferred DMA rx processing */
-   tasklet_setup(>rx_tasklet, pdc_tasklet_cb);
-
err = pdc_interrupts_init(pdcs);
if (err)
goto cleanup_buf_pool;
@@ -1606,7 +1600,6 @@ static int pdc_probe(struct platform_device *pdev)
return PDC_SUCCESS;
 
 cleanup_buf_pool:
-   tasklet_kill(>rx_tasklet);
dma_pool_destroy(pdcs->rx_buf_pool);
 
 cleanup_ring_pool:
@@ -1622,8 +1615,6 @@ static int pdc_remove(struct platform_device *pdev)
 
pdc_free_debugfs();
 
-   tasklet_kill(>rx_tasklet);
-
pdc_hw_disable(pdcs);
 
dma_pool_destroy(pdcs->rx_buf_pool);
-- 
2.26.2



[PATCH] parisc: Remove leftover reference to the power_tasklet

2021-01-14 Thread Davidlohr Bueso
This was removed long ago, back in:

 6e16d9409e1 ([PARISC] Convert soft power switch driver to kthread)

Signed-off-by: Davidlohr Bueso 
---
 arch/parisc/include/asm/irq.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/parisc/include/asm/irq.h b/arch/parisc/include/asm/irq.h
index 959e79cd2c14..378f63c4015b 100644
--- a/arch/parisc/include/asm/irq.h
+++ b/arch/parisc/include/asm/irq.h
@@ -47,7 +47,4 @@ extern unsigned long txn_affinity_addr(unsigned int irq, int 
cpu);
 extern int cpu_claim_irq(unsigned int irq, struct irq_chip *, void *);
 extern int cpu_check_affinity(struct irq_data *d, const struct cpumask *dest);
 
-/* soft power switch support (power.c) */
-extern struct tasklet_struct power_tasklet;
-
 #endif /* _ASM_PARISC_IRQ_H */
-- 
2.26.2



[PATCH] kgdb: Schedule breakpoints via workqueue

2021-01-14 Thread Davidlohr Bueso
The original functionality was added back in:

1cee5e35f15 (kgdb: Add the ability to schedule a breakpoint via a tasklet)

However tasklets have long been deprecated as being too heavy on
the system by running in irq context - and this is not a performance
critical path. If a higher priority process wants to run, it must
wait for the tasklet to finish before doing so. Instead, generate
the breakpoint exception in process context.

Signed-off-by: Davidlohr Bueso 
---
Compile-tested only.

 kernel/debug/debug_core.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index af6e8b4fb359..e1ff974c6b6f 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -119,7 +119,7 @@ static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
  */
 static atomic_tmasters_in_kgdb;
 static atomic_tslaves_in_kgdb;
-static atomic_tkgdb_break_tasklet_var;
+static atomic_tkgdb_break_work_var;
 atomic_t   kgdb_setting_breakpoint;
 
 struct task_struct *kgdb_usethread;
@@ -1085,27 +1085,27 @@ static void kgdb_unregister_callbacks(void)
 }
 
 /*
- * There are times a tasklet needs to be used vs a compiled in
+ * There are times a workqueue needs to be used vs a compiled in
  * break point so as to cause an exception outside a kgdb I/O module,
  * such as is the case with kgdboe, where calling a breakpoint in the
  * I/O driver itself would be fatal.
  */
-static void kgdb_tasklet_bpt(unsigned long ing)
+static void kgdb_work_bpt(struct work_struct *unused)
 {
kgdb_breakpoint();
-   atomic_set(&kgdb_break_tasklet_var, 0);
+   atomic_set(&kgdb_break_work_var, 0);
 }
 
-static DECLARE_TASKLET_OLD(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt);
+static DECLARE_WORK(kgdb_async_breakpoint, kgdb_work_bpt);
 
 void kgdb_schedule_breakpoint(void)
 {
-   if (atomic_read(&kgdb_break_tasklet_var) ||
+   if (atomic_read(&kgdb_break_work_var) ||
atomic_read(&kgdb_active) != -1 ||
atomic_read(&kgdb_setting_breakpoint))
return;
-   atomic_inc(&kgdb_break_tasklet_var);
-   tasklet_schedule(&kgdb_tasklet_breakpoint);
+   atomic_inc(&kgdb_break_work_var);
+   schedule_work(&kgdb_async_breakpoint);
 }
 EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
 
-- 
2.26.2



[PATCH] audit: Remove leftover reference to the audit_tasklet

2021-01-14 Thread Davidlohr Bueso
This was replaced with a kauditd_wait kthread long ago,
back in:

 b7d1125817c (AUDIT: Send netlink messages from a separate kernel thread)

Update the stale comment.

Signed-off-by: Davidlohr Bueso 
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/audit.c b/kernel/audit.c
index 1ffc2e059027..8fd735190c12 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2365,7 +2365,7 @@ int audit_signal_info(int sig, struct task_struct *t)
  *
  * We can not do a netlink send inside an irq context because it blocks (last
  * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
- * queue and a tasklet is scheduled to remove them from the queue outside the
+ * queue and a kthread is scheduled to remove them from the queue outside the
  * irq context.  May be called in any context.
  */
 void audit_log_end(struct audit_buffer *ab)
-- 
2.26.2



[PATCH v2] usb/c67x00: Replace tasklet with work

2021-01-12 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so.

c67x00_do_work() will now run in process context and have further
concurrency (tasklets being serialized among themselves), but this
is done holding the c67x00->lock, so it should be fine. Furthermore,
this patch fixes the usage of the lock in the callback as otherwise
it would need to be irq-safe.

Signed-off-by: Davidlohr Bueso 
---
Changes from v1: cleaned up busted white spaces.

 drivers/usb/c67x00/c67x00-hcd.h   |  2 +-
 drivers/usb/c67x00/c67x00-sched.c | 12 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/usb/c67x00/c67x00-hcd.h b/drivers/usb/c67x00/c67x00-hcd.h
index 6b6b04a3fe0f..6332a6b5dce6 100644
--- a/drivers/usb/c67x00/c67x00-hcd.h
+++ b/drivers/usb/c67x00/c67x00-hcd.h
@@ -76,7 +76,7 @@ struct c67x00_hcd {
u16 next_td_addr;
u16 next_buf_addr;
 
-   struct tasklet_struct tasklet;
+   struct work_struct work;
 
struct completion endpoint_disable;
 
diff --git a/drivers/usb/c67x00/c67x00-sched.c 
b/drivers/usb/c67x00/c67x00-sched.c
index e65f1a0ae80b..c7d3e907be81 100644
--- a/drivers/usb/c67x00/c67x00-sched.c
+++ b/drivers/usb/c67x00/c67x00-sched.c
@@ -1123,24 +1123,26 @@ static void c67x00_do_work(struct c67x00_hcd *c67x00)
 
 /* -- 
*/
 
-static void c67x00_sched_tasklet(struct tasklet_struct *t)
+static void c67x00_sched_work(struct work_struct *work)
 {
-   struct c67x00_hcd *c67x00 = from_tasklet(c67x00, t, tasklet);
+   struct c67x00_hcd *c67x00;
+
+   c67x00 = container_of(work, struct c67x00_hcd, work);
c67x00_do_work(c67x00);
 }
 
 void c67x00_sched_kick(struct c67x00_hcd *c67x00)
 {
-   tasklet_hi_schedule(>tasklet);
+   queue_work(system_highpri_wq, >work);
 }
 
 int c67x00_sched_start_scheduler(struct c67x00_hcd *c67x00)
 {
-   tasklet_setup(>tasklet, c67x00_sched_tasklet);
+   INIT_WORK(>work, c67x00_sched_work);
return 0;
 }
 
 void c67x00_sched_stop_scheduler(struct c67x00_hcd *c67x00)
 {
-   tasklet_kill(>tasklet);
+   cancel_work_sync(>work);
 }
-- 
2.26.2



Re: [PATCH v7 09/18] virt: acrn: Introduce I/O request management

2021-01-12 Thread Davidlohr Bueso

On Tue, 12 Jan 2021, Shuo A Liu wrote:


On Mon 11.Jan'21 at 13:52:19 -0800, Davidlohr Bueso wrote:

Could this not be done in process context instead?


It could be. The original consideration with the tasklet was more about
performance, as the I/O request dispatching is a hot code path. I think
an irq thread has little performance impact? I can try converting
the tasklet to an irq thread.


Yes, there is some added latency between when the work is scheduled and
actually executed - however this should not be a problem for this scenario,
and furthermore consider that tasklets do not guarantee performance either,
as ksoftirqd comes into the picture under heavy load.
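
A minimal sketch of the threaded-irq shape being suggested (hypothetical
device, not the ACRN code): the hard handler just acks the device and returns
IRQ_WAKE_THREAD, and the heavier dispatch runs in the irq thread, in process
context:

#include <linux/device.h>
#include <linux/interrupt.h>

struct foo_dev {
	int irq;
	/* ... queues of pending requests, locks, etc ... */
};

static irqreturn_t foo_hardirq(int irq, void *data)
{
	/* Quick hard-irq work only (ack, grab status), then punt. */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_irq_thread(int irq, void *data)
{
	struct foo_dev *foo = data;

	/*
	 * Runs in a dedicated kernel thread (process context): it can sleep
	 * and is subject to normal scheduling, unlike a tasklet.
	 */
	/* ... dispatch foo's pending requests ... */
	(void)foo;
	return IRQ_HANDLED;
}

static int foo_setup_irq(struct device *dev, struct foo_dev *foo)
{
	return devm_request_threaded_irq(dev, foo->irq, foo_hardirq,
					 foo_irq_thread, 0, "foo", foo);
}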

Thanks,
Davidlohr


Re: [PATCH v7 09/18] virt: acrn: Introduce I/O request management

2021-01-11 Thread Davidlohr Bueso

On Wed, 06 Jan 2021, shuo.a@intel.com wrote:

The processing flow of I/O requests is as follows:

a) The I/O handler of the hypervisor will fill an I/O request with
  PENDING state when a trapped I/O access happens in a User VM.
b) The hypervisor makes an upcall, which is a notification interrupt, to
  the Service VM.
c) The upcall handler schedules a tasklet to dispatch I/O requests.
d) The tasklet looks for the PENDING I/O requests, assigns them to
  different registered clients based on the address of the I/O accesses,
  updates their state to PROCESSING, and notifies the corresponding
  client to handle.


Hmm so tasklets are deprecated (and have been for a while) and it's sad
to see incoming new users in modern Linux. This wouldn't be the first one,
however. We should be _removing_ users, not adding them... In addition, this
extends the whole tasklet_disable/enable() hack.

Could this not be done in process context instead?

Thanks,
Davidlohr


Re: [PATCH] usb/c67x00: Replace tasklet with work

2021-01-11 Thread Davidlohr Bueso

On Mon, 11 Jan 2021, Hillf Danton wrote:


On Sun, 10 Jan 2021 20:40:50 -0800 Davidlohr Bueso wrote:

Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so.

c67x00_do_work() will now run in process context and have further
concurrency (tasklets being serialized among themselves), but this
is done holding the c67x00->lock, so it should be fine. Furthermore,
this patch fixes the usage of the lock in the callback as otherwise
it would need to be irq-safe.


Can you add a couple of words about the need to be irq-safe because
no lock is taken for scheduling either tasklet or work?


I was referring to the locking in the c67x00_do_work() tasklet callback.
Because it currently runs in irq context, the lock should be taken with irqs
(or at least BH) disabled, but after this patch that is no longer the case.



Signed-off-by: Davidlohr Bueso 
---
 drivers/usb/c67x00/c67x00-hcd.h   |  2 +-
 drivers/usb/c67x00/c67x00-sched.c | 12 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/usb/c67x00/c67x00-hcd.h b/drivers/usb/c67x00/c67x00-hcd.h
index 6b6b04a3fe0f..6332a6b5dce6 100644
--- a/drivers/usb/c67x00/c67x00-hcd.h
+++ b/drivers/usb/c67x00/c67x00-hcd.h
@@ -76,7 +76,7 @@ struct c67x00_hcd {
u16 next_td_addr;
u16 next_buf_addr;

-   struct tasklet_struct tasklet;
+   struct work_struct work;

struct completion endpoint_disable;

diff --git a/drivers/usb/c67x00/c67x00-sched.c 
b/drivers/usb/c67x00/c67x00-sched.c
index e65f1a0ae80b..af60f4fdd340 100644
--- a/drivers/usb/c67x00/c67x00-sched.c
+++ b/drivers/usb/c67x00/c67x00-sched.c
@@ -1123,24 +1123,26 @@ static void c67x00_do_work(struct c67x00_hcd *c67x00)

 /* -- 
*/

-static void c67x00_sched_tasklet(struct tasklet_struct *t)
+static void c67x00_sched_work(struct work_struct *work)
 {
-   struct c67x00_hcd *c67x00 = from_tasklet(c67x00, t, tasklet);
+   struct c67x00_hcd *c67x00;
+
+   c67x00 = container_of(work, struct c67x00_hcd, work);
c67x00_do_work(c67x00);
 }

 void c67x00_sched_kick(struct c67x00_hcd *c67x00)
 {
-   tasklet_hi_schedule(>tasklet);
+queue_work(system_highpri_wq, >work);


Better if one line comment is added for highpri, given this is not a
performance critical path.


I'm not sure of the value here, considering the priority is not being
changed by this patch. There are a few drivers that use the highpri
workqueue and care about latencies, but they are still not performance
critical (to the overall system, that is, which is what I meant by that).
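
For reference, the difference under discussion is just which system workqueue
the item goes on; a generic sketch (not this driver):

#include <linux/workqueue.h>

static void foo_fn(struct work_struct *work)
{
	/* ... deferred processing ... */
}
static DECLARE_WORK(foo_work, foo_fn);

static void foo_kick(void)
{
	/* Regular system workqueue: normal-priority kworkers. */
	schedule_work(&foo_work);
}

static void foo_kick_highpri(void)
{
	/*
	 * system_highpri_wq is WQ_HIGHPRI: served by high-priority (nice -20)
	 * kworker pools, for latency-sensitive but still non-critical work.
	 */
	queue_work(system_highpri_wq, &foo_work);
}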

Thanks,
Davidlohr


[PATCH] usb/c67x00: Replace tasklet with work

2021-01-10 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the system
by running in irq context - and this is not a performance critical
path. If a higher priority process wants to run, it must wait for
the tasklet to finish before doing so.

c67x00_do_work() will now run in process context and have further
concurrency (tasklets being serialized among themselves), but this
is done holding the c67x00->lock, so it should be fine. Furthermore,
this patch fixes the usage of the lock in the callback as otherwise
it would need to be irq-safe.

Signed-off-by: Davidlohr Bueso 
---
 drivers/usb/c67x00/c67x00-hcd.h   |  2 +-
 drivers/usb/c67x00/c67x00-sched.c | 12 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/usb/c67x00/c67x00-hcd.h b/drivers/usb/c67x00/c67x00-hcd.h
index 6b6b04a3fe0f..6332a6b5dce6 100644
--- a/drivers/usb/c67x00/c67x00-hcd.h
+++ b/drivers/usb/c67x00/c67x00-hcd.h
@@ -76,7 +76,7 @@ struct c67x00_hcd {
u16 next_td_addr;
u16 next_buf_addr;
 
-   struct tasklet_struct tasklet;
+   struct work_struct work;
 
struct completion endpoint_disable;
 
diff --git a/drivers/usb/c67x00/c67x00-sched.c 
b/drivers/usb/c67x00/c67x00-sched.c
index e65f1a0ae80b..af60f4fdd340 100644
--- a/drivers/usb/c67x00/c67x00-sched.c
+++ b/drivers/usb/c67x00/c67x00-sched.c
@@ -1123,24 +1123,26 @@ static void c67x00_do_work(struct c67x00_hcd *c67x00)
 
 /* -- 
*/
 
-static void c67x00_sched_tasklet(struct tasklet_struct *t)
+static void c67x00_sched_work(struct work_struct *work)
 {
-   struct c67x00_hcd *c67x00 = from_tasklet(c67x00, t, tasklet);
+   struct c67x00_hcd *c67x00;
+
+   c67x00 = container_of(work, struct c67x00_hcd, work);
c67x00_do_work(c67x00);
 }
 
 void c67x00_sched_kick(struct c67x00_hcd *c67x00)
 {
-   tasklet_hi_schedule(>tasklet);
+queue_work(system_highpri_wq, >work);
 }
 
 int c67x00_sched_start_scheduler(struct c67x00_hcd *c67x00)
 {
-   tasklet_setup(>tasklet, c67x00_sched_tasklet);
+INIT_WORK(>work, c67x00_sched_work);
return 0;
 }
 
 void c67x00_sched_stop_scheduler(struct c67x00_hcd *c67x00)
 {
-   tasklet_kill(>tasklet);
+cancel_work_sync(>work);
 }
-- 
2.26.2



[PATCH] usb/gadget: f_midi: Replace tasklet with work

2021-01-10 Thread Davidlohr Bueso
Currently a tasklet is used to transmit input substream buffer
data. However, tasklets have long been deprecated as being too
heavy on the system by running in irq context - and this is not
a performance critical path. If a higher priority process wants
to run, it must wait for the tasklet to finish before doing so.

Deferring work to a workqueue and executing in process context
should be fine considering the callback already does
f_midi_do_transmit() under the transmit_lock, and thus the changes in
concurrency semantics are ok - tasklets only being serialized
against themselves.

Cc: Takashi Iwai 
Signed-off-by: Davidlohr Bueso 
---
 drivers/usb/gadget/function/f_midi.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/usb/gadget/function/f_midi.c 
b/drivers/usb/gadget/function/f_midi.c
index 8fff995b8dd5..71a1a26e85c7 100644
--- a/drivers/usb/gadget/function/f_midi.c
+++ b/drivers/usb/gadget/function/f_midi.c
@@ -87,7 +87,7 @@ struct f_midi {
struct snd_rawmidi_substream *out_substream[MAX_PORTS];
 
unsigned long   out_triggered;
-   struct tasklet_struct   tasklet;
+   struct work_struct  work;
unsigned int in_ports;
unsigned int out_ports;
int index;
@@ -698,9 +698,11 @@ static void f_midi_transmit(struct f_midi *midi)
f_midi_drop_out_substreams(midi);
 }
 
-static void f_midi_in_tasklet(struct tasklet_struct *t)
+static void f_midi_in_work(struct work_struct *work)
 {
-   struct f_midi *midi = from_tasklet(midi, t, tasklet);
+   struct f_midi *midi;
+
+   midi = container_of(work, struct f_midi, work);
f_midi_transmit(midi);
 }
 
@@ -737,7 +739,7 @@ static void f_midi_in_trigger(struct snd_rawmidi_substream 
*substream, int up)
VDBG(midi, "%s() %d\n", __func__, up);
midi->in_ports_array[substream->number].active = up;
if (up)
-   tasklet_hi_schedule(>tasklet);
+   queue_work(system_highpri_wq, >work);
 }
 
 static int f_midi_out_open(struct snd_rawmidi_substream *substream)
@@ -875,7 +877,7 @@ static int f_midi_bind(struct usb_configuration *c, struct 
usb_function *f)
int status, n, jack = 1, i = 0, endpoint_descriptor_index = 0;
 
midi->gadget = cdev->gadget;
-   tasklet_setup(>tasklet, f_midi_in_tasklet);
+   INIT_WORK(>work, f_midi_in_work);
status = f_midi_register_card(midi);
if (status < 0)
goto fail_register;
-- 
2.26.2



Re: [PATCH] perf: Break deadlock involving exec_update_mutex

2020-12-10 Thread Davidlohr Bueso

On Tue, 08 Dec 2020, Peter Zijlstra wrote:


I suppose I'll queue the below into tip/perf/core for next merge window,
unless you want it in a hurry?


I'm thinking we'd still want Eric's series on top of this for the reader
benefits of the lock.

Thanks,
Davidlohr


Re: [PATCH v2 5/5] locking/rwsem: Remove reader optimistic spinning

2020-12-07 Thread Davidlohr Bueso

On Fri, 20 Nov 2020, Waiman Long wrote:


Reader optimistic spinning is helpful when the reader critical section
is short and there aren't that many readers around. It also improves
the chance that a reader can get the lock as writer optimistic spinning
disproportionally favors writers much more than readers.

Since commit d3681e269fff ("locking/rwsem: Wake up almost all readers
in wait queue"), all the waiting readers are woken up so that they can
all get the read lock and run in parallel. When the number of contending
readers is large, allowing reader optimistic spinning will likely cause
reader fragmentation where multiple smaller groups of readers can get
the read lock in a sequential manner separated by writers. That reduces
reader parallelism.

One possible way to address that drawback is to limit the number of
readers (preferably one) that can do optimistic spinning. These readers
act as representatives of all the waiting readers in the wait queue as
they will wake up all those waiting readers once they get the lock.

Alternatively, as reader optimistic lock stealing has already enhanced
fairness to readers, it may be easier to just remove reader optimistic
spinning and simplifying the optimistic spinning code as a result.

Performance measurements (locking throughput kops/s) using a locking
microbenchmark with 50/50 reader/writer distribution and turbo-boost
disabled was done on a 2-socket Cascade Lake system (48-core 96-thread)
to see the impacts of these changes:

 1) Vanilla - 5.10-rc3 kernel
 2) Before  - 5.10-rc3 kernel with previous patches in this series
 3) limit-rspin - 5.10-rc3 kernel with limited reader spinning patch
 4) no-rspin - 5.10-rc3 kernel with reader spinning disabled

 # of threads  CS Load   Vanilla   Before   limit-rspin   no-rspin
 ------------  -------   -------   ------   -----------   --------
       2          1        5,185    5,662       5,214       5,077
       4          1        5,107    4,983       5,188       4,760
       8          1        4,782    4,564       4,720       4,628
      16          1        4,680    4,053       4,567       3,402
      32          1        4,299    1,115       1,118       1,098
      64          1        3,218      983       1,001         957
      96          1        1,938      944         957         930

       2         20        2,008    2,128       2,264       1,665
       4         20        1,390    1,033       1,046       1,101
       8         20        1,472    1,155       1,098       1,213
      16         20        1,332    1,077       1,089       1,122
      32         20          967      914         917         980
      64         20          787      874         891         858
      96         20          730      836         847         844

       2        100          372      356         360         355
       4        100          492      425         434         392
       8        100          533      537         529         538
      16        100          548      572         568         598
      32        100          499      520         527         537
      64        100          466      517         526         512
      96        100          406      497         506         509

The column "CS Load" represents the number of pause instructions issued
in the locking critical section. A CS load of 1 is extremely short and
is not likely in real situations. A load of 20 (moderate) and 100 (long)
are more realistic.

It can be seen that the previous patches in this series have reduced
performance in general except in highly contended cases with moderate
or long critical sections that performance improves a bit. This change
is mostly caused by the "Prevent potential lock starvation" patch that
reduce reader optimistic spinning and hence reduce reader fragmentation.

The patch that further limit reader optimistic spinning doesn't seem to
have too much impact on overall performance as shown in the benchmark
data.

The patch that disables reader optimistic spinning shows reduced
performance at lightly loaded cases, but comparable or slightly better
performance on with heavier contention.

This patch just removes reader optimistic spinning for now. As readers
are not going to do optimistic spinning anymore, we don't need to
consider if the OSQ is empty or not when doing lock stealing.

Signed-off-by: Waiman Long 


Reviewed-by: Davidlohr Bueso 


Re: [PATCH 3/5] locking/rwsem: Enable reader optimistic lock stealing

2020-12-07 Thread Davidlohr Bueso

On Tue, 17 Nov 2020, Waiman Long wrote:


If the optimistic spinning queue is empty and the rwsem does not have
the handoff or write-lock bits set, it is actually not necessary to
call rwsem_optimistic_spin() to spin on it. Instead, it can steal the
lock directly as its reader bias is in the count already.  If it is
the first reader in this state, it will try to wake up other readers
in the wait queue.

Signed-off-by: Waiman Long 


Reviewed-by: Davidlohr Bueso 


Re: [RFC PATCH] usb: hcd: complete URBs in threaded-IRQ context instead of tasklet

2020-12-04 Thread Davidlohr Bueso

On Thu, 03 Dec 2020, Bueso wrote:


On Mon, 16 Apr 2018, Sebastian Andrzej Siewior wrote:


On 2018-03-08 10:57:39 [+0100], To Mauro Carvalho Chehab wrote:

On 2018-02-27 14:39:34 [-0300], Mauro Carvalho Chehab wrote:

Hi Sebastian,

Hi Mauro,


Sorry for taking some time to test it, has been busy those days...

:)


Anyway, I tested it today. Didn't work. It keep losing data.


Okay, this was unexpected. What I learned from the thread is that you
use the dwc2 controller and once upgrade to a kernel which completes the
URBs in BH context then you starting losing data from your DVB-s USB
device. And it was assumed that this is because BH/ksoftirq is getting
"paused" if it is running for too long. If that is the case then a
revert of "let us complete the URB in BH context" should get it working
again. Is that so?


ping


I ran into this while looking at getting rid of tasklets in drivers/usb.

Mauro, were you ever able to try reverting 8add17cf8e4 like Sebastian suggested?
If not would you mind trying the below, please? Considering this thread is from
over two years ago, it's a rebase of Sebastian's patch to complete URBs in
process context, plus the dwc2 changes to not defer URB completion into BH.


Hmm Mauro's email bounced, updating with a valid address.



Thanks,
Davidlohr

8<---
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 60886a7464c3..4952a8fc1719 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1665,33 +1665,76 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
usb_put_urb(urb);
}

-static void usb_giveback_urb_bh(struct tasklet_struct *t)
+static void usb_hcd_rh_gb_urb(struct work_struct *work)
{
-   struct giveback_urb_bh *bh = from_tasklet(bh, t, bh);
-   struct list_head local_list;
+   struct giveback_urb *bh;
+   struct list_head urb_list;
+
+   bh = container_of(work, struct giveback_urb, rh_compl);

spin_lock_irq(>lock);
-   bh->running = true;
- restart:
-   list_replace_init(>head, _list);
+   list_replace_init(>rh_head, _list);
spin_unlock_irq(>lock);

-   while (!list_empty(_list)) {
+   while (!list_empty(_list)) {
struct urb *urb;

-   urb = list_entry(local_list.next, struct urb, urb_list);
+   urb = list_first_entry(_list, struct urb, urb_list);
list_del_init(>urb_list);
-   bh->completing_ep = urb->ep;
__usb_hcd_giveback_urb(urb);
-   bh->completing_ep = NULL;
+   }
+}
+
+#define URB_PRIO_HIGH  0
+#define URB_PRIO_LOW   1
+
+static irqreturn_t usb_hcd_gb_urb(int irq, void *__hcd)
+{
+   struct usb_hcd *hcd = __hcd;
+   struct giveback_urb *bh = >gb_urb;
+   struct list_head urb_list[2];
+   int i;
+
+   INIT_LIST_HEAD(_list[URB_PRIO_HIGH]);
+   INIT_LIST_HEAD(_list[URB_PRIO_LOW]);
+
+   spin_lock_irq(>lock);
+ restart:
+   list_splice_tail_init(>prio_hi_head, _list[URB_PRIO_HIGH]);
+   list_splice_tail_init(>prio_lo_head, _list[URB_PRIO_LOW]);
+   spin_unlock_irq(>lock);
+
+   for (i = 0; i < ARRAY_SIZE(urb_list); i++) {
+   while (!list_empty(_list[i])) {
+   struct urb *urb;
+
+   urb = list_first_entry(_list[i],
+  struct urb, urb_list);
+   list_del_init(>urb_list);
+   if (i == URB_PRIO_HIGH)
+   bh->completing_ep = urb->ep;
+
+   __usb_hcd_giveback_urb(urb);
+
+   if (i == URB_PRIO_HIGH)
+   bh->completing_ep = NULL;
+
+   if (i == URB_PRIO_LOW &&
+   !list_empty_careful(_list[URB_PRIO_HIGH])) {
+   spin_lock_irq(>lock);
+   goto restart;
+   }
+   }
}

/* check if there are new URBs to giveback */
spin_lock_irq(>lock);
-   if (!list_empty(>head))
+   if (!list_empty(>prio_hi_head) ||
+   !list_empty(>prio_lo_head))
goto restart;
-   bh->running = false;
spin_unlock_irq(>lock);
+
+   return IRQ_HANDLED;
}

/**
@@ -1717,37 +1760,34 @@ static void usb_giveback_urb_bh(struct tasklet_struct 
*t)
*/
void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status)
{
-   struct giveback_urb_bh *bh;
-   bool running, high_prio_bh;
+   struct giveback_urb *bh = >gb_urb;
+   struct list_head*lh;

/* pass status to tasklet via unlinked */
if (likely(!urb->unlinked))
urb->unlinked = status;

-   if (!hcd_giveback_urb_in_bh(hcd) && !is_root_hub(urb->dev)) {
-   __usb_hcd_giveback_urb(urb);
+   if (is_root_hub(urb->dev)) {
+   spin_lock(>lock);

Re: [PATCH 3/3] exec: Transform exec_update_mutex into a rw_semaphore

2020-12-04 Thread Davidlohr Bueso

On Fri, 04 Dec 2020, Linus Torvalds wrote:


On Fri, Dec 4, 2020 at 12:30 PM Bernd Edlinger
 wrote:



>perf_event_open  (exec_update_mutex -> ovl_i_mutex)


Side note: this one looks like it should be easy to fix.

Is there any real reason why exec_update_mutex is actually gotten that
early, and held for that long in the perf event code?


afaict just to validate the whole operation early. Per 79c9ce57eb2 the
mutex will guard the check and the perf_install_in_context vs exec.



I _think_ we could move the ptrace check to be much later, to _just_ before that

* This is the point on no return; we cannot fail hereafter.

point in the perf event install chain..


Peter had the idea of doing the ptrace_may_access() check twice: first
lockless and early, then under exec_update_mutex when it mattered right
before perf_install_in_context():

https://lore.kernel.org/linux-fsdevel/20200828123720.gz1362...@hirez.programming.kicks-ass.net/
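
The 'check twice' idea is the usual optimistic pattern; a generic sketch under
that assumption (made-up names, not the actual perf code): do a cheap lockless
check early so the common failure case bails out fast, then repeat the
authoritative check once the lock that makes it meaningful is held:

#include <linux/compiler.h>
#include <linux/mutex.h>
#include <linux/errno.h>

struct foo_obj {
	struct mutex lock;
	bool allowed;
};

static bool foo_access_allowed(struct foo_obj *o)
{
	return READ_ONCE(o->allowed);	/* cheap, advisory, may race */
}

static int foo_do_operation(struct foo_obj *o)
{
	int err = 0;

	if (!foo_access_allowed(o))	/* early, lockless: fail fast */
		return -EACCES;

	/* ... expensive setup that does not depend on the check ... */

	mutex_lock(&o->lock);
	if (!foo_access_allowed(o))	/* authoritative recheck under the lock */
		err = -EACCES;
	/* else commit, e.g. the perf_install_in_context() step */
	mutex_unlock(&o->lock);
	return err;
}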



I don't think it needs to be moved down even that much, I think it
would be sufficient to move it down below the "perf_event_alloc()",
but I didn't check very much.


Yeah we could just keep a single ptrace_may_access() check just further
down until it won't deadlock.

Thanks,
Davidlohr


Re: [RFC PATCH] usb: hcd: complete URBs in threaded-IRQ context instead of tasklet

2020-12-03 Thread Davidlohr Bueso

On Mon, 16 Apr 2018, Sebastian Andrzej Siewior wrote:


On 2018-03-08 10:57:39 [+0100], To Mauro Carvalho Chehab wrote:

On 2018-02-27 14:39:34 [-0300], Mauro Carvalho Chehab wrote:
> Hi Sebastian,
Hi Mauro,

> Sorry for taking some time to test it, has been busy those days...
:)

> Anyway, I tested it today. Didn't work. It keep losing data.

Okay, this was unexpected. What I learned from the thread is that you
use the dwc2 controller and once upgrade to a kernel which completes the
URBs in BH context then you starting losing data from your DVB-s USB
device. And it was assumed that this is because BH/ksoftirq is getting
"paused" if it is running for too long. If that is the case then a
revert of "let us complete the URB in BH context" should get it working
again. Is that so?


ping


I ran into this while looking at getting rid of tasklets in drivers/usb.

Mauro, were you ever able to try reverting 8add17cf8e4 like Sebastian suggested?
If not would you mind trying the below, please? Considering this thread is from
over two years ago, it's a rebase of Sebastian's patch to complete URBs in
process context, plus the dwc2 changes to not defer URB completion into BH.

Thanks,
Davidlohr

8<---
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 60886a7464c3..4952a8fc1719 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1665,33 +1665,76 @@ static void __usb_hcd_giveback_urb(struct urb *urb)
usb_put_urb(urb);
}

-static void usb_giveback_urb_bh(struct tasklet_struct *t)
+static void usb_hcd_rh_gb_urb(struct work_struct *work)
{
-   struct giveback_urb_bh *bh = from_tasklet(bh, t, bh);
-   struct list_head local_list;
+   struct giveback_urb *bh;
+   struct list_head urb_list;
+
+   bh = container_of(work, struct giveback_urb, rh_compl);

spin_lock_irq(>lock);
-   bh->running = true;
- restart:
-   list_replace_init(>head, _list);
+   list_replace_init(>rh_head, _list);
spin_unlock_irq(>lock);

-   while (!list_empty(_list)) {
+   while (!list_empty(_list)) {
struct urb *urb;

-   urb = list_entry(local_list.next, struct urb, urb_list);
+   urb = list_first_entry(_list, struct urb, urb_list);
list_del_init(>urb_list);
-   bh->completing_ep = urb->ep;
__usb_hcd_giveback_urb(urb);
-   bh->completing_ep = NULL;
+   }
+}
+
+#define URB_PRIO_HIGH  0
+#define URB_PRIO_LOW   1
+
+static irqreturn_t usb_hcd_gb_urb(int irq, void *__hcd)
+{
+   struct usb_hcd *hcd = __hcd;
+   struct giveback_urb *bh = >gb_urb;
+   struct list_head urb_list[2];
+   int i;
+
+   INIT_LIST_HEAD(_list[URB_PRIO_HIGH]);
+   INIT_LIST_HEAD(_list[URB_PRIO_LOW]);
+
+   spin_lock_irq(>lock);
+ restart:
+   list_splice_tail_init(>prio_hi_head, _list[URB_PRIO_HIGH]);
+   list_splice_tail_init(>prio_lo_head, _list[URB_PRIO_LOW]);
+   spin_unlock_irq(>lock);
+
+   for (i = 0; i < ARRAY_SIZE(urb_list); i++) {
+   while (!list_empty(_list[i])) {
+   struct urb *urb;
+
+   urb = list_first_entry(_list[i],
+  struct urb, urb_list);
+   list_del_init(>urb_list);
+   if (i == URB_PRIO_HIGH)
+   bh->completing_ep = urb->ep;
+
+   __usb_hcd_giveback_urb(urb);
+
+   if (i == URB_PRIO_HIGH)
+   bh->completing_ep = NULL;
+
+   if (i == URB_PRIO_LOW &&
+   !list_empty_careful(_list[URB_PRIO_HIGH])) {
+   spin_lock_irq(>lock);
+   goto restart;
+   }
+   }
}

/* check if there are new URBs to giveback */
spin_lock_irq(>lock);
-   if (!list_empty(>head))
+   if (!list_empty(>prio_hi_head) ||
+   !list_empty(>prio_lo_head))
goto restart;
-   bh->running = false;
spin_unlock_irq(>lock);
+
+   return IRQ_HANDLED;
}

/**
@@ -1717,37 +1760,34 @@ static void usb_giveback_urb_bh(struct tasklet_struct 
*t)
 */
void usb_hcd_giveback_urb(struct usb_hcd *hcd, struct urb *urb, int status)
{
-   struct giveback_urb_bh *bh;
-   bool running, high_prio_bh;
+   struct giveback_urb *bh = >gb_urb;
+   struct list_head*lh;

/* pass status to tasklet via unlinked */
if (likely(!urb->unlinked))
urb->unlinked = status;

-   if (!hcd_giveback_urb_in_bh(hcd) && !is_root_hub(urb->dev)) {
-   __usb_hcd_giveback_urb(urb);
+   if (is_root_hub(urb->dev)) {
+   spin_lock(>lock);
+   list_add_tail(>urb_list, >rh_head);
+   spin_unlock(>lock);
+

Re: [PATCH 0/3] exec: Transform exec_update_mutex into a rw_semaphore

2020-12-03 Thread Davidlohr Bueso

On Thu, 03 Dec 2020, Linus Torvalds wrote:


On Thu, Dec 3, 2020 at 12:10 PM Eric W. Biederman  wrote:


The simplest and most robust solution appears to be making
exec_update_mutex a read/write lock and having everything except for
exec take the lock for read.


Looks sane to me.

I'd like the locking people to look at the down_read_*() functions,
even if they look simple. Adding Waiman and Davidlohr to the cc just
in case they would look at that too, since they've worked on that
code.


rwsem changes look good to me - and 3/3 looks much nicer than the previous
alternatives for fixing the deadlock.

Acked-by: Davidlohr Bueso 
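
For anyone skimming the thread, the shape of the conversion being acked is
roughly the following (an illustrative sketch only, not the actual patch; the
rwsem and the callers are stand-ins):

static DECLARE_RWSEM(exec_update_lock);	/* stand-in for the converted mutex */

/* everything except exec: shared access */
static int peek_at_task(struct task_struct *task)
{
	int err = down_read_killable(&exec_update_lock);

	if (err)
		return err;
	/* ... look at mm/creds that a concurrent exec may replace ... */
	up_read(&exec_update_lock);
	return 0;
}

/* exec itself: exclusive access while swapping mm, creds, etc. */
static void exec_side(void)
{
	down_write(&exec_update_lock);
	/* ... the exec-time updates ... */
	up_write(&exec_update_lock);
}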


Re: [PATCH] media/siano: kill pointless kmutex definitions

2020-11-25 Thread Davidlohr Bueso

ping

On Sun, 01 Nov 2020, Davidlohr Bueso wrote:


Use the mutex api directly instead of this driver's renamed
kmutex wrappers.

Signed-off-by: Davidlohr Bueso 
---
This was found while auditing mutex semantics in drivers.

drivers/media/common/siano/smscoreapi.c  | 42 
drivers/media/common/siano/smscoreapi.h  |  5 ---
drivers/media/common/siano/smsdvb-main.c | 14 
3 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/drivers/media/common/siano/smscoreapi.c 
b/drivers/media/common/siano/smscoreapi.c
index c1511094fdc7..410cc3ac6f94 100644
--- a/drivers/media/common/siano/smscoreapi.c
+++ b/drivers/media/common/siano/smscoreapi.c
@@ -429,13 +429,13 @@ static struct smscore_registry_entry_t 
*smscore_find_registry(char *devpath)
struct smscore_registry_entry_t *entry;
struct list_head *next;

-   kmutex_lock(_smscore_registrylock);
+   mutex_lock(_smscore_registrylock);
for (next = g_smscore_registry.next;
 next != _smscore_registry;
 next = next->next) {
entry = (struct smscore_registry_entry_t *) next;
if (!strncmp(entry->devpath, devpath, sizeof(entry->devpath))) {
-   kmutex_unlock(_smscore_registrylock);
+   mutex_unlock(_smscore_registrylock);
return entry;
}
}
@@ -446,7 +446,7 @@ static struct smscore_registry_entry_t 
*smscore_find_registry(char *devpath)
list_add(>entry, _smscore_registry);
} else
pr_err("failed to create smscore_registry.\n");
-   kmutex_unlock(_smscore_registrylock);
+   mutex_unlock(_smscore_registrylock);
return entry;
}

@@ -527,7 +527,7 @@ int smscore_register_hotplug(hotplug_t hotplug)
struct list_head *next, *first;
int rc = 0;

-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
notifyee = kmalloc(sizeof(*notifyee), GFP_KERNEL);
if (notifyee) {
/* now notify callback about existing devices */
@@ -548,7 +548,7 @@ int smscore_register_hotplug(hotplug_t hotplug)
} else
rc = -ENOMEM;

-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);

return rc;
}
@@ -564,7 +564,7 @@ void smscore_unregister_hotplug(hotplug_t hotplug)
{
struct list_head *next, *first;

-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);

first = _smscore_notifyees;

@@ -579,7 +579,7 @@ void smscore_unregister_hotplug(hotplug_t hotplug)
}
}

-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
}
EXPORT_SYMBOL_GPL(smscore_unregister_hotplug);

@@ -732,9 +732,9 @@ int smscore_register_device(struct smsdevice_params_t 
*params,
smscore_registry_settype(dev->devpath, params->device_type);

/* add device to devices list */
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
list_add(>entry, _smscore_devices);
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);

*coredev = dev;

@@ -890,14 +890,14 @@ int smscore_start_device(struct smscore_device_t *coredev)
return rc;
}

-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);

rc = smscore_notify_callbacks(coredev, coredev->device, 1);
smscore_init_ir(coredev);

pr_debug("device %p started, rc %d\n", coredev, rc);

-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);

return rc;
}
@@ -1197,7 +1197,7 @@ void smscore_unregister_device(struct smscore_device_t 
*coredev)
int num_buffers = 0;
int retry = 0;

-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);

/* Release input device (IR) resources */
sms_ir_exit(coredev);
@@ -1224,9 +1224,9 @@ void smscore_unregister_device(struct smscore_device_t 
*coredev)

pr_debug("waiting for %d buffer(s)\n",
 coredev->num_buffers - num_buffers);
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
msleep(100);
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
}

pr_debug("freed %d buffers\n", num_buffers);
@@ -1245,7 +1245,7 @@ void smscore_unregister_device(struct smscore_device_t 
*coredev)
list_del(>entry);
kfree(coredev);

-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);

pr_debug("device %p destroyed\n", coredev);
}
@@ -2123,17 +2123,17 @@ static int __init smscore_module_init(void)
{
INIT_LIST_HEAD(_s

Re: [RFC PATCH 5/5] locking/rwsem: Remove reader optimistic spinning

2020-11-20 Thread Davidlohr Bueso

On Fri, 20 Nov 2020, David Laight wrote:

I got massive performance improvements from changing a driver
we have to use mutex instead of the old semaphores (the driver
was written a long time ago).

While these weren't 'rw' the same issue will apply.

The problem was that the semaphore/mutex was typically only held over
a few instructions (eg to add an item to a list).
But with semaphore if you got contention the process always slept.
OTOH mutex spin 'for a while' before sleeping so the code rarely slept.


The caveat here is if you are using trylock/unlock from irq, which
is the only reason why regular semaphores are still around today. If
not, indeed a mutex is better.
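
To make the caveat concrete, a minimal sketch (illustrative only, names made
up): a semaphore may be trylocked and released from irq context, whereas a
mutex must stay in process context but will spin briefly on contention before
sleeping.

#include <linux/semaphore.h>
#include <linux/mutex.h>
#include <linux/interrupt.h>

static DEFINE_SEMAPHORE(demo_sem);	/* binary semaphore, count = 1 */
static DEFINE_MUTEX(demo_mutex);

static irqreturn_t demo_irq_handler(int irq, void *data)
{
	/* legal: semaphores may be trylocked/released from irq context */
	if (!down_trylock(&demo_sem)) {
		/* ... touch the protected state ... */
		up(&demo_sem);
	}
	return IRQ_HANDLED;
}

static void demo_process_context(void)
{
	/* may spin a while before sleeping, but never usable from irq */
	mutex_lock(&demo_mutex);
	/* short critical section, e.g. adding an item to a list */
	mutex_unlock(&demo_mutex);
}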

Thanks,
Davidlohr


[PATCH v3] USB: serial: mos7720: defer state restore to a workqueue

2020-11-19 Thread Davidlohr Bueso
The parallel port restore operation currently defers writes
to a tasklet, if it sees a locked disconnect mutex. The
driver goes to a lot of trouble to ensure writes happen
in a non-blocking context, but things can be greatly
simplified if it's done in regular process context and
this is not a system performance critical path. As such,
instead of doing the state restore writes in irq context,
use a workqueue and just do regular synchronous writes.

In addition to the cleanup, this also imposes less on the
overall system, as tasklets have been deprecated because
of their softirq implications, potentially blocking a higher
priority task from running.

Signed-off-by: Davidlohr Bueso 
---

 drivers/usb/serial/mos7720.c | 234 ++-
 1 file changed, 35 insertions(+), 199 deletions(-)

diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 5a5d2a95070e..41ee2984a0df 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -79,14 +79,6 @@ MODULE_DEVICE_TABLE(usb, id_table);
 #define DCR_INIT_VAL   0x0c/* SLCTIN, nINIT */
 #define ECR_INIT_VAL   0x00/* SPP mode */
 
-struct urbtracker {
-   struct mos7715_parport  *mos_parport;
-   struct list_headurblist_entry;
-   struct kref ref_count;
-   struct urb  *urb;
-   struct usb_ctrlrequest  *setup;
-};
-
 enum mos7715_pp_modes {
SPP = 0<<5,
PS2 = 1<<5,  /* moschip calls this 'NIBBLE' mode */
@@ -96,12 +88,9 @@ enum mos7715_pp_modes {
 struct mos7715_parport {
struct parport  *pp;   /* back to containing struct */
struct kref ref_count; /* to instance of this struct */
-   struct list_headdeferred_urbs; /* list deferred async urbs */
-   struct list_headactive_urbs;   /* list async urbs in flight */
-   spinlock_t  listlock;  /* protects list access */
boolmsg_pending;   /* usb sync call pending */
struct completion   syncmsg_compl; /* usb sync call completed */
-   struct tasklet_struct   urb_tasklet;   /* for sending deferred urbs */
+   struct work_struct  work;  /* restore deferred writes */
struct usb_serial   *serial;   /* back to containing struct */
__u8shadowECR; /* parallel port regs... */
__u8shadowDCR;
@@ -265,174 +254,8 @@ static void destroy_mos_parport(struct kref *kref)
kfree(mos_parport);
 }
 
-static void destroy_urbtracker(struct kref *kref)
-{
-   struct urbtracker *urbtrack =
-   container_of(kref, struct urbtracker, ref_count);
-   struct mos7715_parport *mos_parport = urbtrack->mos_parport;
-
-   usb_free_urb(urbtrack->urb);
-   kfree(urbtrack->setup);
-   kfree(urbtrack);
-   kref_put(_parport->ref_count, destroy_mos_parport);
-}
-
 /*
- * This runs as a tasklet when sending an urb in a non-blocking parallel
- * port callback had to be deferred because the disconnect mutex could not be
- * obtained at the time.
- */
-static void send_deferred_urbs(struct tasklet_struct *t)
-{
-   int ret_val;
-   unsigned long flags;
-   struct mos7715_parport *mos_parport = from_tasklet(mos_parport, t,
-  urb_tasklet);
-   struct urbtracker *urbtrack, *tmp;
-   struct list_head *cursor, *next;
-   struct device *dev;
-
-   /* if release function ran, game over */
-   if (unlikely(mos_parport->serial == NULL))
-   return;
-
-   dev = _parport->serial->dev->dev;
-
-   /* try again to get the mutex */
-   if (!mutex_trylock(_parport->serial->disc_mutex)) {
-   dev_dbg(dev, "%s: rescheduling tasklet\n", __func__);
-   tasklet_schedule(_parport->urb_tasklet);
-   return;
-   }
-
-   /* if device disconnected, game over */
-   if (unlikely(mos_parport->serial->disconnected)) {
-   mutex_unlock(_parport->serial->disc_mutex);
-   return;
-   }
-
-   spin_lock_irqsave(_parport->listlock, flags);
-   if (list_empty(_parport->deferred_urbs)) {
-   spin_unlock_irqrestore(_parport->listlock, flags);
-   mutex_unlock(_parport->serial->disc_mutex);
-   dev_dbg(dev, "%s: deferred_urbs list empty\n", __func__);
-   return;
-   }
-
-   /* move contents of deferred_urbs list to active_urbs list and submit */
-   list_for_each_safe(cursor, next, _parport->deferred_urbs)
-   list_move_tail(cursor, _parport->active_urbs);
-   list_for_each_entry_safe(urbtrack, tmp, _parport->active_urbs,
-   urblist_entry) {
-   ret_val = us
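
For reference, the new restore path boils down to queueing a work item and
doing plain synchronous writes from the worker, roughly like this (a sketch
only; the blocking write helper and the register names are assumptions based
on the existing driver, not the exact patch):

static void parport_restore_work(struct work_struct *work)
{
	struct mos7715_parport *mos_parport =
		container_of(work, struct mos7715_parport, work);

	if (!mos_parport->serial)	/* release already ran */
		return;

	mutex_lock(&mos_parport->serial->disc_mutex);
	if (!mos_parport->serial->disconnected) {
		/* regular blocking writes, no urb tracking, no GFP_ATOMIC */
		write_parport_reg(mos_parport, MOS7720_DCR, mos_parport->shadowDCR);
		write_parport_reg(mos_parport, MOS7720_ECR, mos_parport->shadowECR);
	}
	mutex_unlock(&mos_parport->serial->disc_mutex);
}

/* restore_state() side: update the shadow registers and kick the worker */
static void parport_mos7715_restore_state(struct parport *pp,
					  struct parport_state *s)
{
	struct mos7715_parport *mos_parport = pp->private_data;

	if (!mos_parport)	/* device already released */
		return;

	mos_parport->shadowDCR = s->u.pc.ctr;
	mos_parport->shadowECR = s->u.pc.ecr;
	schedule_work(&mos_parport->work);
}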

Re: [RFC PATCH 5/5] locking/rwsem: Remove reader optimistic spinning

2020-11-17 Thread Davidlohr Bueso

On Tue, 17 Nov 2020, Waiman Long wrote:


The column "CS Load" represents the number of pause instructions issued
in the locking critical section. A CS load of 1 is extremely short and
is not likely in real situations. A load of 20 (moderate) and 100 (long)
are more realistic.

It can be seen that the previous patches in this series have reduced
performance in general except in highly contended cases with moderate
or long critical sections that performance improves a bit. This change
is mostly caused by the "Prevent potential lock starvation" patch that
reduces reader optimistic spinning and hence reduces reader fragmentation.

The patch that further limit reader optimistic spinning doesn't seem to
have too much impact on overall performance as shown in the benchmark
data.

The patch that disables reader optimistic spinning shows reduced
performance at lightly loaded cases, but comparable or slightly better
performance with heavier contention.


I'm not overly worried about the lightly loaded cases here as the users
(mostly thinking mmap_sem) most likely won't care for real workloads,
as opposed to, e.g., will-it-scale type things.

So at SUSE we also ran into this very same problem with reader optimistic
spinning and, considering the fragmentation, went with disabling it, much
like this patch - but without the reader optimistic lock stealing bits
you have. So far nothing has fallen out in our performance
automation. And per your data a single reader spinner does not seem to be
worth the added complexity of keeping reader spinning vs ripping it out.

Thanks,
Davidlohr


Re: [PATCH 4/5] locking/rwsem: Wake up all waiting readers if RWSEM_WAKE_READ_OWNED

2020-11-17 Thread Davidlohr Bueso

On Tue, 17 Nov 2020, Waiman Long wrote:


The rwsem wakeup logic has been modified by commit d3681e269fff
("locking/rwsem: Wake up almost all readers in wait queue") to wake up
all readers in the wait queue if the first waiter is a reader. In the
case of RWSEM_WAKE_READ_OWNED, not all readers can be woken up if the
first waiter happens to be a writer. Complete the logic by waking up
all readers even for this case.


While rwsems are certainly not fifo, I'm concerned this would give too
much priority to the readers by having the reader owned lock just skip
over the first waiter. And I'd say most users are more concerned about
the writer side. Basically this would affect the phase-fair properties.

Thanks,
Davidlohr


[PATCH v2] USB: serial: mos7720: defer state restore to a workqueue

2020-11-17 Thread Davidlohr Bueso

The parallel port restore operation currently defers writes
to a tasklet, if it sees a locked disconnect mutex. The
driver goes to a lot of trouble to ensure writes happen
in a non-blocking context, but things can be greatly
simplified if it's done in regular process context and
this is not a system performance critical path. As such,
instead of doing the state restore writes in irq context,
use a workqueue and just do regular synchronous writes.

In addition to the cleanup, this also imposes less on the
overall system, as tasklets have been deprecated because
of their BH implications, potentially blocking a higher
priority task from running. We also get rid of hacks
such as trylocking a mutex in irq, something which does
not play nice with priority boosting in PREEMPT_RT.

Signed-off-by: Davidlohr Bueso 
---
Changes from v1: remove bogus irq comment.

drivers/usb/serial/mos7720.c | 234 ++-
1 file changed, 35 insertions(+), 199 deletions(-)

diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 5a5d2a95070e..41ee2984a0df 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -79,14 +79,6 @@ MODULE_DEVICE_TABLE(usb, id_table);
#define DCR_INIT_VAL   0x0c /* SLCTIN, nINIT */
#define ECR_INIT_VAL   0x00 /* SPP mode */

-struct urbtracker {
-   struct mos7715_parport  *mos_parport;
-   struct list_headurblist_entry;
-   struct kref ref_count;
-   struct urb  *urb;
-   struct usb_ctrlrequest  *setup;
-};
-
enum mos7715_pp_modes {
SPP = 0<<5,
PS2 = 1<<5,  /* moschip calls this 'NIBBLE' mode */
@@ -96,12 +88,9 @@ enum mos7715_pp_modes {
struct mos7715_parport {
struct parport  *pp;   /* back to containing struct */
struct kref ref_count; /* to instance of this struct */
-   struct list_headdeferred_urbs; /* list deferred async urbs */
-   struct list_headactive_urbs;   /* list async urbs in flight */
-   spinlock_t  listlock;  /* protects list access */
boolmsg_pending;   /* usb sync call pending */
struct completion   syncmsg_compl; /* usb sync call completed */
-   struct tasklet_struct   urb_tasklet;   /* for sending deferred urbs */
+   struct work_struct  work;  /* restore deferred writes */
struct usb_serial   *serial;   /* back to containing struct */
__u8shadowECR; /* parallel port regs... */
__u8shadowDCR;
@@ -265,174 +254,8 @@ static void destroy_mos_parport(struct kref *kref)
kfree(mos_parport);
}

-static void destroy_urbtracker(struct kref *kref)
-{
-   struct urbtracker *urbtrack =
-   container_of(kref, struct urbtracker, ref_count);
-   struct mos7715_parport *mos_parport = urbtrack->mos_parport;
-
-   usb_free_urb(urbtrack->urb);
-   kfree(urbtrack->setup);
-   kfree(urbtrack);
-   kref_put(_parport->ref_count, destroy_mos_parport);
-}
-
/*
- * This runs as a tasklet when sending an urb in a non-blocking parallel
- * port callback had to be deferred because the disconnect mutex could not be
- * obtained at the time.
- */
-static void send_deferred_urbs(struct tasklet_struct *t)
-{
-   int ret_val;
-   unsigned long flags;
-   struct mos7715_parport *mos_parport = from_tasklet(mos_parport, t,
-  urb_tasklet);
-   struct urbtracker *urbtrack, *tmp;
-   struct list_head *cursor, *next;
-   struct device *dev;
-
-   /* if release function ran, game over */
-   if (unlikely(mos_parport->serial == NULL))
-   return;
-
-   dev = _parport->serial->dev->dev;
-
-   /* try again to get the mutex */
-   if (!mutex_trylock(_parport->serial->disc_mutex)) {
-   dev_dbg(dev, "%s: rescheduling tasklet\n", __func__);
-   tasklet_schedule(_parport->urb_tasklet);
-   return;
-   }
-
-   /* if device disconnected, game over */
-   if (unlikely(mos_parport->serial->disconnected)) {
-   mutex_unlock(_parport->serial->disc_mutex);
-   return;
-   }
-
-   spin_lock_irqsave(_parport->listlock, flags);
-   if (list_empty(_parport->deferred_urbs)) {
-   spin_unlock_irqrestore(_parport->listlock, flags);
-   mutex_unlock(_parport->serial->disc_mutex);
-   dev_dbg(dev, "%s: deferred_urbs list empty\n", __func__);
-   return;
-   }
-
-   /* move contents of deferred_urbs list to active_urbs list and submit */
-   list_for_each_safe(cursor, next, _parport->deferred_urbs)
-   list_move_tail(cursor, _parport->active_urbs);
-   list_f

Re: [PATCH] USB: serial: mos7720: defer state restore to a workqueue

2020-11-16 Thread Davidlohr Bueso

On Mon, 16 Nov 2020, Johan Hovold wrote:


On Fri, Nov 13, 2020 at 08:27:25PM -0800, Davidlohr Bueso wrote:

@@ -1883,21 +1724,17 @@ static void mos7720_release(struct usb_serial *serial)
if (mos_parport->msg_pending)
wait_for_completion_timeout(&mos_parport->syncmsg_compl,
msecs_to_jiffies(MOS_WDR_TIMEOUT));
+   /*
+* If delayed work is currently scheduled, wait for it to
+* complete. This also implies barriers that ensure the
+* below serial clearing is not hoisted above the ->work.
+*/
+   cancel_work_sync(&mos_parport->work);


As I mentioned, this needs to be done *after* deregistering the port or
you could theoretically end up with the work item being requeued.


Hmm sorry yes I forgot to mention this. I was counting on the private_data
already being null to prevent any new work being actually scheduled, so an
incoming restore state, for example, would be a nop.



Yes, the same applies for the "synchronous" requests, but that's a
preexisting issue.


Per the above I assumed the sync requests were also safe as-is.

But I can certainly re-order things, how about:

mos7720_release():
   mos_parport->pp->private_data = NULL;
   parport_remove_port(mos_parport->pp);

   wait_for_completion_timeout();
   cancel_work_sync();

   usb_set_serial_data(serial, NULL);
   mos_parport->serial = NULL;

   parport_del_port(mos_parport->pp);
   kref_put();
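
Spelled out (a sketch only, with the arguments filled in from the current code
and still untested):

static void mos7720_release(struct usb_serial *serial)
{
	struct mos7715_parport *mos_parport = usb_get_serial_data(serial);

	mos_parport->pp->private_data = NULL;
	parport_remove_port(mos_parport->pp);

	/* let any pending sync message and deferred restore finish */
	if (mos_parport->msg_pending)
		wait_for_completion_timeout(&mos_parport->syncmsg_compl,
					    msecs_to_jiffies(MOS_WDR_TIMEOUT));
	cancel_work_sync(&mos_parport->work);

	usb_set_serial_data(serial, NULL);
	mos_parport->serial = NULL;

	parport_del_port(mos_parport->pp);
	kref_put(&mos_parport->ref_count, destroy_mos_parport);
}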

Thanks,
Davidlohr


[PATCH] USB: serial: mos7720: defer state restore to a workqueue

2020-11-13 Thread Davidlohr Bueso

The parallel port restore operation currently defers writes
to a tasklet, if it sees a locked disconnect mutex. The
driver goes to a lot of trouble to ensure writes happen
in a non-blocking context, but things can be greatly
simplified if it's done in regular process context and
this is not a system performance critical path. As such,
instead of doing the async state restore writes in irq
context, use a workqueue and just do regular synchronous
writes.

In addition to the cleanup, this also imposes less on the
overall system, as tasklets have been deprecated because
of their BH implications, potentially blocking a higher
priority task from running. We also get rid of hacks
such as trylocking a mutex in irq, something which does
not play nice with priority boosting in PREEMPT_RT.

Signed-off-by: Davidlohr Bueso 
---
drivers/usb/serial/mos7720.c | 235 ++-
1 file changed, 36 insertions(+), 199 deletions(-)

diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 5a5d2a95070e..d36aaa4a13de 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -79,14 +79,6 @@ MODULE_DEVICE_TABLE(usb, id_table);
#define DCR_INIT_VAL   0x0c /* SLCTIN, nINIT */
#define ECR_INIT_VAL   0x00 /* SPP mode */

-struct urbtracker {
-   struct mos7715_parport  *mos_parport;
-   struct list_headurblist_entry;
-   struct kref ref_count;
-   struct urb  *urb;
-   struct usb_ctrlrequest  *setup;
-};
-
enum mos7715_pp_modes {
SPP = 0<<5,
PS2 = 1<<5,  /* moschip calls this 'NIBBLE' mode */
@@ -96,12 +88,9 @@ enum mos7715_pp_modes {
struct mos7715_parport {
struct parport  *pp;   /* back to containing struct */
struct kref ref_count; /* to instance of this struct */
-   struct list_headdeferred_urbs; /* list deferred async urbs */
-   struct list_headactive_urbs;   /* list async urbs in flight */
-   spinlock_t  listlock;  /* protects list access */
boolmsg_pending;   /* usb sync call pending */
struct completion   syncmsg_compl; /* usb sync call completed */
-   struct tasklet_struct   urb_tasklet;   /* for sending deferred urbs */
+   struct work_struct  work;  /* restore deferred writes */
struct usb_serial   *serial;   /* back to containing struct */
__u8shadowECR; /* parallel port regs... */
__u8shadowDCR;
@@ -265,174 +254,8 @@ static void destroy_mos_parport(struct kref *kref)
kfree(mos_parport);
}

-static void destroy_urbtracker(struct kref *kref)
-{
-   struct urbtracker *urbtrack =
-   container_of(kref, struct urbtracker, ref_count);
-   struct mos7715_parport *mos_parport = urbtrack->mos_parport;
-
-   usb_free_urb(urbtrack->urb);
-   kfree(urbtrack->setup);
-   kfree(urbtrack);
-   kref_put(_parport->ref_count, destroy_mos_parport);
-}
-
/*
- * This runs as a tasklet when sending an urb in a non-blocking parallel
- * port callback had to be deferred because the disconnect mutex could not be
- * obtained at the time.
- */
-static void send_deferred_urbs(struct tasklet_struct *t)
-{
-   int ret_val;
-   unsigned long flags;
-   struct mos7715_parport *mos_parport = from_tasklet(mos_parport, t,
-  urb_tasklet);
-   struct urbtracker *urbtrack, *tmp;
-   struct list_head *cursor, *next;
-   struct device *dev;
-
-   /* if release function ran, game over */
-   if (unlikely(mos_parport->serial == NULL))
-   return;
-
-   dev = _parport->serial->dev->dev;
-
-   /* try again to get the mutex */
-   if (!mutex_trylock(_parport->serial->disc_mutex)) {
-   dev_dbg(dev, "%s: rescheduling tasklet\n", __func__);
-   tasklet_schedule(_parport->urb_tasklet);
-   return;
-   }
-
-   /* if device disconnected, game over */
-   if (unlikely(mos_parport->serial->disconnected)) {
-   mutex_unlock(_parport->serial->disc_mutex);
-   return;
-   }
-
-   spin_lock_irqsave(_parport->listlock, flags);
-   if (list_empty(_parport->deferred_urbs)) {
-   spin_unlock_irqrestore(_parport->listlock, flags);
-   mutex_unlock(_parport->serial->disc_mutex);
-   dev_dbg(dev, "%s: deferred_urbs list empty\n", __func__);
-   return;
-   }
-
-   /* move contents of deferred_urbs list to active_urbs list and submit */
-   list_for_each_safe(cursor, next, _parport->deferred_urbs)
-   list_move_tail(cursor, _parport->active_urbs);
-   list_for_each_entry_safe(urbtrack, tmp, _parport-&

Re: [PATCH 1/8] epoll: check for events when removing a timed out thread from the wait queue

2020-11-09 Thread Davidlohr Bueso

On Fri, 06 Nov 2020, Soheil Hassas Yeganeh wrote:


From: Soheil Hassas Yeganeh 

After abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2)
timeout"), we break out of the ep_poll loop upon timeout, without checking
whether there is any new events available.  Prior to that patch-series we
always called ep_events_available() after exiting the loop.

This can cause races and missed wakeups.  For example, consider the
following scenario reported by Guantao Liu:

Suppose we have an eventfd added using EPOLLET to an epollfd.

Thread 1: Sleeps for just below 5ms and then writes to an eventfd.
Thread 2: Calls epoll_wait with a timeout of 5 ms. If it sees an
 event of the eventfd, it will write back on that fd.
Thread 3: Calls epoll_wait with a negative timeout.

Prior to abc610e01c66, it is guaranteed that Thread 3 will wake up either
by Thread 1 or Thread 2.  After abc610e01c66, Thread 3 can be blocked
indefinitely if Thread 2 sees a timeout right before the write to the
eventfd by Thread 1.  Thread 2 will be woken up from
schedule_hrtimeout_range and, with evail 0, it will not call
ep_send_events().

To fix this issue:
1) Simplify the timed_out case as suggested by Linus.
2) while holding the lock, recheck whether the thread was woken up
  after its time out has reached.

Note that (2) is different from Linus' original suggestion: It does not
set "eavail = ep_events_available(ep)" to avoid unnecessary contention
(when there are too many timed-out threads and a small number of events),
as well as races mentioned in the discussion thread.

This is the first patch in the series so that the backport to stable
releases is straightforward.

Link: 
https://lkml.kernel.org/r/CAHk-=wizk=oxuyqpbo8ms41w2pag1kniuv5wdd5qwl-gq1k...@mail.gmail.com
Fixes: abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2) timeout")
Tested-by: Guantao Liu 
Suggested-by: Linus Torvalds 
Signed-off-by: Soheil Hassas Yeganeh 
Reported-by: Guantao Liu 
Reviewed-by: Eric Dumazet 
Reviewed-by: Willem de Bruijn 
Reviewed-by: Khazhismel Kumykov 


Thanks for providing the fix and a testcase.

Reviewed-by: Davidlohr Bueso 
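
For readers following the race, the heart of the fix is the recheck under
ep->lock once the wait has timed out - roughly like this (a sketch of the
idea only, not the exact hunk):

	/* ep_poll(), after schedule_hrtimeout_range() has returned */
	__set_current_state(TASK_RUNNING);

	if (!list_empty_careful(&wait.entry)) {
		write_lock_irq(&ep->lock);
		/*
		 * If we timed out but our wait entry is already gone, a
		 * wakeup raced with the timeout; report events instead of
		 * returning 0 and potentially losing them.
		 */
		if (timed_out)
			eavail = list_empty(&wait.entry);
		__remove_wait_queue(&ep->wq, &wait);
		write_unlock_irq(&ep->lock);
	}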


Re: [PATCH] usb/mos7720: process deferred urbs in a workqueue

2020-11-09 Thread Davidlohr Bueso

On Mon, 09 Nov 2020, Oliver Neukum wrote:


On Thursday, 05.11.2020 at 22:17 -0800, Davidlohr Bueso wrote:

@@ -1888,16 +1732,8 @@ static void mos7720_release(struct usb_serial *serial)
usb_set_serial_data(serial, NULL);
mos_parport->serial = NULL;

-   /* if tasklet currently scheduled, wait for it to complete */
-   tasklet_kill(&mos_parport->urb_tasklet);
-
-   /* unlink any urbs sent by the tasklet  */
-   spin_lock_irqsave(&mos_parport->listlock, flags);
-   list_for_each_entry(urbtrack,
-   &mos_parport->active_urbs,
-   urblist_entry)
-   usb_unlink_urb(urbtrack->urb);
-   spin_unlock_irqrestore(&mos_parport->listlock, flags);
+   /* if work is currently scheduled, wait for it to complete */
+   cancel_work_sync(&mos_parport->work);
parport_del_port(mos_parport->pp);

kref_put(&mos_parport->ref_count, destroy_mos_parport);


Hi,

do you really want to cancel as opposed to wait for work in release()?


Well I tried to maintain the current semantics here. tasklet_kill() is
equivalent to cancel_work_sync() in that they both wait for the delayed
execution to finish running and guarantee that it is no longer queued.

Thanks,
Davidlohr


Re: [PATCH 0/8] simplify ep_poll

2020-11-09 Thread Davidlohr Bueso

On Sat, 07 Nov 2020, Soheil Hassas Yeganeh wrote:

FWIW, I also stress-tested the patch series applied on top of
linux-next/master for a couple of hours.


Out of curiosity, what exactly did you use for testing?

Thanks,
Davidlohr


Re: [PATCH tip/core/rcu 03/28] locktorture: Track time of last ->writeunlock()

2020-11-05 Thread Davidlohr Bueso

On Thu, 05 Nov 2020, paul...@kernel.org wrote:


From: "Paul E. McKenney" 

This commit adds a last_lock_release variable that tracks the time of
the last ->writeunlock() call, which allows easier diagnosing of lock
hangs when using a kernel debugger.


This makes sense to have.

Acked-by: Davidlohr Bueso 



Cc: Davidlohr Bueso 
Signed-off-by: Paul E. McKenney 
---
kernel/locking/locktorture.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 62d215b..316531d 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -60,6 +60,7 @@ static struct task_struct **reader_tasks;

static bool lock_is_write_held;
static bool lock_is_read_held;
+static unsigned long last_lock_release;

struct lock_stress_stats {
long n_lock_fail;
@@ -632,6 +633,7 @@ static int lock_torture_writer(void *arg)
lwsp->n_lock_acquired++;
cxt.cur_ops->write_delay();
lock_is_write_held = false;
+   WRITE_ONCE(last_lock_release, jiffies);
cxt.cur_ops->writeunlock();

stutter_wait("lock_torture_writer");
--
2.9.5



Re: [PATCH] usb/mos7720: process deferred urbs in a workqueue

2020-11-05 Thread Davidlohr Bueso

On Thu, 05 Nov 2020, Johan Hovold wrote:

On Wed, Nov 04, 2020 at 04:13:07PM -0800, Davidlohr Bueso wrote:

Also, but not strictly related to this. What do you think of deferring all
work in write_parport_reg_nonblock() unconditionally? I'd like to avoid
that mutex_trylock() because eventually I'll be re-adding a warn in the
locking code, but that would also simplify the code done here in the
nonblocking irq write. I'm not at all familiar with parport, but I would
think that restore_state context would not care.


Sounds good to me. As long as the state is restored before submitting
further requests we should be fine. That would even allow getting rid of
write_parport_reg_nonblock() as you can restore the state using
synchronous calls from the worker thread. Should simplify things quite a
bit.


What about something like the below (probably buggy)? I avoided messing with
the completion in the work callback, like what the prologue/epilogue does for
all other synchronous calls, because when releasing we sync the work anyway and
we need to account for scenarios where the work is scheduled but not yet
running, so the completion would not be the best fit. This also makes
flush_work() always wait for both writes; otherwise I had the thread locklessly
busy-waiting on a flag that was only cleared once both
write_parport_reg_nonblock() calls returned, so that all potentially scheduled
work was observed before the flush. Unless I missed something, the cleanup is
considerable.

diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 5a5d2a95070e..8a9408b94cb0 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -79,14 +79,6 @@ MODULE_DEVICE_TABLE(usb, id_table);
#define DCR_INIT_VAL   0x0c /* SLCTIN, nINIT */
#define ECR_INIT_VAL   0x00 /* SPP mode */

-struct urbtracker {
-   struct mos7715_parport  *mos_parport;
-   struct list_headurblist_entry;
-   struct kref ref_count;
-   struct urb  *urb;
-   struct usb_ctrlrequest  *setup;
-};
-
enum mos7715_pp_modes {
SPP = 0<<5,
PS2 = 1<<5,  /* moschip calls this 'NIBBLE' mode */
@@ -96,12 +88,9 @@ enum mos7715_pp_modes {
struct mos7715_parport {
struct parport  *pp;   /* back to containing struct */
struct kref ref_count; /* to instance of this struct */
-   struct list_headdeferred_urbs; /* list deferred async urbs */
-   struct list_headactive_urbs;   /* list async urbs in flight */
-   spinlock_t  listlock;  /* protects list access */
boolmsg_pending;   /* usb sync call pending */
struct completion   syncmsg_compl; /* usb sync call completed */
-   struct tasklet_struct   urb_tasklet;   /* for sending deferred urbs */
+   struct work_struct  work;  /* restore deferred writes */
struct usb_serial   *serial;   /* back to containing struct */
__u8shadowECR; /* parallel port regs... */
__u8shadowDCR;
@@ -265,172 +254,6 @@ static void destroy_mos_parport(struct kref *kref)
kfree(mos_parport);
}

-static void destroy_urbtracker(struct kref *kref)
-{
-   struct urbtracker *urbtrack =
-   container_of(kref, struct urbtracker, ref_count);
-   struct mos7715_parport *mos_parport = urbtrack->mos_parport;
-
-   usb_free_urb(urbtrack->urb);
-   kfree(urbtrack->setup);
-   kfree(urbtrack);
-   kref_put(_parport->ref_count, destroy_mos_parport);
-}
-
-/*
- * This runs as a tasklet when sending an urb in a non-blocking parallel
- * port callback had to be deferred because the disconnect mutex could not be
- * obtained at the time.
- */
-static void send_deferred_urbs(struct tasklet_struct *t)
-{
-   int ret_val;
-   unsigned long flags;
-   struct mos7715_parport *mos_parport = from_tasklet(mos_parport, t,
-  urb_tasklet);
-   struct urbtracker *urbtrack, *tmp;
-   struct list_head *cursor, *next;
-   struct device *dev;
-
-   /* if release function ran, game over */
-   if (unlikely(mos_parport->serial == NULL))
-   return;
-
-   dev = _parport->serial->dev->dev;
-
-   /* try again to get the mutex */
-   if (!mutex_trylock(_parport->serial->disc_mutex)) {
-   dev_dbg(dev, "%s: rescheduling tasklet\n", __func__);
-   tasklet_schedule(_parport->urb_tasklet);
-   return;
-   }
-
-   /* if device disconnected, game over */
-   if (unlikely(mos_parport->serial->disconnected)) {
-   mutex_unlock(_parport->serial->disc_mutex);
-   return;
-   }
-
-   spin_lock_irqsave(_parport->listlock, flags);
-   if (list_empty(_parport->deferred_

Re: [PATCH] usb/mos7720: process deferred urbs in a workqueue

2020-11-04 Thread Davidlohr Bueso

On Wed, 04 Nov 2020, Johan Hovold wrote:


Hmm. I took at closer look at the parport code and it seems the current
implementation is already racy but that removing the tasklet is going to
widen that that window.

Those register writes in restore() should be submitted before any
later requests. Perhaps setting a flag and flushing the work in
parport_prologue() could work?


Ah, I see and agree. Considering work is only deferred from restore_state(),
I don't even think we need a flag, no? We can let parport_prologue()
just flush_work() unconditionally (right before taking the disc_mutex);
the work will be idle most of the time anyway. The flush_work() also becomes
saner now that we'll stop rescheduling work in send_deferred_urbs().

Also, but not strictly related to this. What do you think of deferring all
work in write_parport_reg_nonblock() unconditionally? I'd like to avoid
that mutex_trylock() because eventually I'll be re-adding a warn in the
locking code, but that would also simplify the code done here in the
nonblocking irq write. I'm not at all familiar with parport, but I would
think that restore_state context would not care.
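
Concretely, I'm thinking of something along these lines in the prologue (a
rough sketch assuming the work item from the workqueue conversion, untested):

static int parport_prologue(struct parport *pp)
{
	struct mos7715_parport *mos_parport = pp->private_data;

	if (unlikely(mos_parport == NULL))
		return -1;	/* release fn already ran */

	/* pick up any deferred restore writes before issuing new requests */
	flush_work(&mos_parport->work);

	mutex_lock(&mos_parport->serial->disc_mutex);
	if (mos_parport->serial->disconnected) {
		mutex_unlock(&mos_parport->serial->disc_mutex);
		return -1;
	}
	kref_get(&mos_parport->ref_count);
	return 0;
}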


On the other hand, the restore() implementation looks broken in that it
doesn't actually restore the provided state. I'll go fix that up.


How did this thing ever work?

Thanks,
Davidlohr


Re: [PATCH] usb/mos7720: process deferred urbs in a workqueue

2020-11-03 Thread Davidlohr Bueso

On Mon, 02 Nov 2020, Bueso wrote:


There is
also no need anymore for atomic allocations.


Bleh this is a brain fart - obviously not true as usb_submit_urb() is
called under mos_parport->listlock. I'll send a v2 unless you have
any objections.

Thanks,
Davidlohr


[PATCH] usb/mos7720: process deferred urbs in a workqueue

2020-11-02 Thread Davidlohr Bueso
Tasklets have long been deprecated as being too heavy on the
system by running in irq context - and this is not a performance
critical path. If a higher priority process wants to run, it
must wait for the tasklet to finish before doing so. In addition,
mutex_trylock() is not supposed to be used in irq context because
it can confuse priority boosting in PREEMPT_RT, although in this
case the lock is held and released in the same context.

This conversion from tasklet to workqueue allows us to avoid
playing games with the disconnect mutex and having to reschedule
in the callback; now we just take the mutex normally. There is
also no need anymore for atomic allocations.

Signed-off-by: Davidlohr Bueso 
---
Compile tested only.

 drivers/usb/serial/mos7720.c | 38 
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 5eed1078fac8..6982800e61d4 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -101,7 +101,7 @@ struct mos7715_parport {
spinlock_t  listlock;  /* protects list access */
boolmsg_pending;   /* usb sync call pending */
struct completion   syncmsg_compl; /* usb sync call completed */
-   struct tasklet_struct   urb_tasklet;   /* for sending deferred urbs */
+   struct work_struct  urb_wq;/* for sending deferred urbs */
struct usb_serial   *serial;   /* back to containing struct */
__u8shadowECR; /* parallel port regs... */
__u8shadowDCR;
@@ -278,32 +278,28 @@ static void destroy_urbtracker(struct kref *kref)
 }
 
 /*
- * This runs as a tasklet when sending an urb in a non-blocking parallel
- * port callback had to be deferred because the disconnect mutex could not be
- * obtained at the time.
+ * This runs as a workqueue (process context) when sending a urb from a
+ * non-blocking parallel port callback which had to be deferred because
+ * the disconnect mutex could not be obtained at the time.
  */
-static void send_deferred_urbs(struct tasklet_struct *t)
+static void send_deferred_urbs(struct work_struct *work)
 {
int ret_val;
unsigned long flags;
-   struct mos7715_parport *mos_parport = from_tasklet(mos_parport, t,
-  urb_tasklet);
+   struct mos7715_parport *mos_parport;
struct urbtracker *urbtrack, *tmp;
struct list_head *cursor, *next;
struct device *dev;
 
+   mos_parport = container_of(work, struct mos7715_parport, urb_wq);
+
/* if release function ran, game over */
if (unlikely(mos_parport->serial == NULL))
return;
 
dev = _parport->serial->dev->dev;
 
-   /* try again to get the mutex */
-   if (!mutex_trylock(_parport->serial->disc_mutex)) {
-   dev_dbg(dev, "%s: rescheduling tasklet\n", __func__);
-   tasklet_schedule(_parport->urb_tasklet);
-   return;
-   }
+   mutex_lock(_parport->serial->disc_mutex);
 
/* if device disconnected, game over */
if (unlikely(mos_parport->serial->disconnected)) {
@@ -324,7 +320,7 @@ static void send_deferred_urbs(struct tasklet_struct *t)
list_move_tail(cursor, _parport->active_urbs);
list_for_each_entry_safe(urbtrack, tmp, _parport->active_urbs,
urblist_entry) {
-   ret_val = usb_submit_urb(urbtrack->urb, GFP_ATOMIC);
+   ret_val = usb_submit_urb(urbtrack->urb, GFP_KERNEL);
dev_dbg(dev, "%s: urb submitted\n", __func__);
if (ret_val) {
dev_err(dev, "usb_submit_urb() failed: %d\n", ret_val);
@@ -394,15 +390,15 @@ static int write_parport_reg_nonblock(struct 
mos7715_parport *mos_parport,
 
/*
 * get the disconnect mutex, or add tracker to the deferred_urbs list
-* and schedule a tasklet to try again later
+* and schedule a workqueue to process it later
 */
if (!mutex_trylock(>disc_mutex)) {
spin_lock_irqsave(_parport->listlock, flags);
list_add_tail(>urblist_entry,
  _parport->deferred_urbs);
spin_unlock_irqrestore(_parport->listlock, flags);
-   tasklet_schedule(_parport->urb_tasklet);
-   dev_dbg(>dev, "tasklet scheduled\n");
+   schedule_work(_parport->urb_wq);
+   dev_dbg(>dev, "workqueue scheduled\n");
return 0;
}
 
@@ -717,7 +713,7 @@ static int mos7715_parport_init(struct usb_serial *serial)
INIT_LIST_HEAD(_parport->deferred_urbs);
usb_set_serial_data(serial, mos_parport); /* hi

[PATCH] media/siano: kill pointless kmutex definitions

2020-11-01 Thread Davidlohr Bueso
Use the mutex api directly instead of this driver's renamed
kmutex wrappers.

Signed-off-by: Davidlohr Bueso 
---
This was found while auditing mutex semantics in drivers.

 drivers/media/common/siano/smscoreapi.c  | 42 
 drivers/media/common/siano/smscoreapi.h  |  5 ---
 drivers/media/common/siano/smsdvb-main.c | 14 
 3 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/drivers/media/common/siano/smscoreapi.c 
b/drivers/media/common/siano/smscoreapi.c
index c1511094fdc7..410cc3ac6f94 100644
--- a/drivers/media/common/siano/smscoreapi.c
+++ b/drivers/media/common/siano/smscoreapi.c
@@ -429,13 +429,13 @@ static struct smscore_registry_entry_t 
*smscore_find_registry(char *devpath)
struct smscore_registry_entry_t *entry;
struct list_head *next;
 
-   kmutex_lock(_smscore_registrylock);
+   mutex_lock(_smscore_registrylock);
for (next = g_smscore_registry.next;
 next != _smscore_registry;
 next = next->next) {
entry = (struct smscore_registry_entry_t *) next;
if (!strncmp(entry->devpath, devpath, sizeof(entry->devpath))) {
-   kmutex_unlock(_smscore_registrylock);
+   mutex_unlock(_smscore_registrylock);
return entry;
}
}
@@ -446,7 +446,7 @@ static struct smscore_registry_entry_t 
*smscore_find_registry(char *devpath)
list_add(>entry, _smscore_registry);
} else
pr_err("failed to create smscore_registry.\n");
-   kmutex_unlock(_smscore_registrylock);
+   mutex_unlock(_smscore_registrylock);
return entry;
 }
 
@@ -527,7 +527,7 @@ int smscore_register_hotplug(hotplug_t hotplug)
struct list_head *next, *first;
int rc = 0;
 
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
notifyee = kmalloc(sizeof(*notifyee), GFP_KERNEL);
if (notifyee) {
/* now notify callback about existing devices */
@@ -548,7 +548,7 @@ int smscore_register_hotplug(hotplug_t hotplug)
} else
rc = -ENOMEM;
 
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
 
return rc;
 }
@@ -564,7 +564,7 @@ void smscore_unregister_hotplug(hotplug_t hotplug)
 {
struct list_head *next, *first;
 
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
 
first = _smscore_notifyees;
 
@@ -579,7 +579,7 @@ void smscore_unregister_hotplug(hotplug_t hotplug)
}
}
 
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
 }
 EXPORT_SYMBOL_GPL(smscore_unregister_hotplug);
 
@@ -732,9 +732,9 @@ int smscore_register_device(struct smsdevice_params_t 
*params,
smscore_registry_settype(dev->devpath, params->device_type);
 
/* add device to devices list */
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
list_add(>entry, _smscore_devices);
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
 
*coredev = dev;
 
@@ -890,14 +890,14 @@ int smscore_start_device(struct smscore_device_t *coredev)
return rc;
}
 
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
 
rc = smscore_notify_callbacks(coredev, coredev->device, 1);
smscore_init_ir(coredev);
 
pr_debug("device %p started, rc %d\n", coredev, rc);
 
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
 
return rc;
 }
@@ -1197,7 +1197,7 @@ void smscore_unregister_device(struct smscore_device_t 
*coredev)
int num_buffers = 0;
int retry = 0;
 
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
 
/* Release input device (IR) resources */
sms_ir_exit(coredev);
@@ -1224,9 +1224,9 @@ void smscore_unregister_device(struct smscore_device_t 
*coredev)
 
pr_debug("waiting for %d buffer(s)\n",
 coredev->num_buffers - num_buffers);
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
msleep(100);
-   kmutex_lock(_smscore_deviceslock);
+   mutex_lock(_smscore_deviceslock);
}
 
pr_debug("freed %d buffers\n", num_buffers);
@@ -1245,7 +1245,7 @@ void smscore_unregister_device(struct smscore_device_t 
*coredev)
list_del(>entry);
kfree(coredev);
 
-   kmutex_unlock(_smscore_deviceslock);
+   mutex_unlock(_smscore_deviceslock);
 
pr_debug("device %p destroyed\n", coredev);
 }
@@ -2123,17 +2123,17 @@ static int __init smscore_module_init(void)
 {
INIT_LIST_HEAD(_smscore_notifyees);
  

[PATCH v2] fs/dcache: optimize start_dir_add()

2020-10-29 Thread Davidlohr Bueso

Considering both end_dir_add() and d_alloc_parallel(), the
dir->i_dir_seq wants acquire/release semantics, therefore
micro-optimize for ll/sc archs. Also add READ_ONCE around
the variable mostly for documentation purposes - either
the successful cmpxchg or the pause will avoid the tearing.

Signed-off-by: Davidlohr Bueso 
---
fs/dcache.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index ea0485861d93..9177f0d08a5a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2504,8 +2504,8 @@ static inline unsigned start_dir_add(struct inode *dir)
{

for (;;) {
-   unsigned n = dir->i_dir_seq;
-   if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+   unsigned n = READ_ONCE(dir->i_dir_seq);
+   if (!(n & 1) && cmpxchg_acquire(&dir->i_dir_seq, n, n + 1) == n)
return n;
cpu_relax();
}
--
2.26.2
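
For context, the release counterpart that this acquire pairs with is
end_dir_add(), which is roughly (as in fs/dcache.c):

static inline void end_dir_add(struct inode *dir, unsigned n)
{
	/* publish the even value; pairs with the acquire in start_dir_add() */
	smp_store_release(&dir->i_dir_seq, n + 2);
}

The odd value taken in start_dir_add() marks an in-progress directory add, and
the release store above is what d_alloc_parallel()'s readers synchronize
against.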


Re: [PATCH 3/3] sched: Add cond_resched_rwlock

2020-10-27 Thread Davidlohr Bueso

On Tue, 27 Oct 2020, Sean Christopherson wrote:


On Tue, Oct 27, 2020 at 09:49:50AM -0700, Ben Gardon wrote:

Rescheduling while holding a spin lock is essential for keeping long
running kernel operations running smoothly. Add the facility to
cond_resched rwlocks.


Nit: I would start the paragraph with 'Safely rescheduling ...'
While obvious when reading the code, 'Rescheduling while holding
a spin lock' can throw the reader off.



This adds two new exports and two new macros without any in-tree users, which
is generally frowned upon.  You and I know these will be used by KVM's new
TDP MMU, but the non-KVM folks, and more importantly the maintainers of this
code, are undoubtedly going to ask "why".  I.e. these patches probably belong
in the KVM series to switch to a rwlock for the TDP MMU.

Regarding the code, it's all copy-pasted from the spinlock code and darn near
identical.  It might be worth adding builder macros for these.


Agreed, all three could be nicely consolidated - something along the lines of
the sketch below. Otherwise this series looks sane, feel free to add my:

Acked-by: Davidlohr Bueso 
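
For the record, the kind of builder macro I have in mind is roughly the
following (a sketch; should_resched() and preempt_schedule_common() are the
helpers the existing __cond_resched_lock() uses, rwlock_needbreak() is from
this series, and the macro itself is made up):

#define DEFINE_COND_RESCHED_LOCK(name, type, needbreak, unlock, relock)	\
int __cond_resched_##name(type *lock)					\
{									\
	int resched = should_resched(PREEMPT_LOCK_OFFSET);		\
	int ret = 0;							\
									\
	if (needbreak(lock) || resched) {				\
		unlock(lock);						\
		if (resched)						\
			preempt_schedule_common();			\
		else							\
			cpu_relax();					\
		ret = 1;						\
		relock(lock);						\
	}								\
	return ret;							\
}									\
EXPORT_SYMBOL(__cond_resched_##name)

DEFINE_COND_RESCHED_LOCK(lock, spinlock_t, spin_needbreak, spin_unlock, spin_lock);
DEFINE_COND_RESCHED_LOCK(rwlock_read, rwlock_t, rwlock_needbreak, read_unlock, read_lock);
DEFINE_COND_RESCHED_LOCK(rwlock_write, rwlock_t, rwlock_needbreak, write_unlock, write_lock);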


[tip: timers/core] timekeeping: Convert jiffies_seq to seqcount_raw_spinlock_t

2020-10-26 Thread tip-bot2 for Davidlohr Bueso
The following commit has been merged into the timers/core branch of tip:

Commit-ID: 1a2b85f1e2a93a3f84243e654d225e4088735336
Gitweb:
https://git.kernel.org/tip/1a2b85f1e2a93a3f84243e654d225e4088735336
Author: Davidlohr Bueso 
AuthorDate: Wed, 21 Oct 2020 12:07:49 -07:00
Committer: Thomas Gleixner 
CommitterDate: Mon, 26 Oct 2020 11:04:14 +01:00

timekeeping: Convert jiffies_seq to seqcount_raw_spinlock_t

Use the new api and associate the seqcounter to the jiffies_lock enabling
lockdep support - although for this particular case the write-side locking
and non-preemptibility are quite obvious.

Signed-off-by: Davidlohr Bueso 
Signed-off-by: Thomas Gleixner 
Link: https://lore.kernel.org/r/20201021190749.19363-1-d...@stgolabs.net

---
 kernel/time/jiffies.c | 3 ++-
 kernel/time/timekeeping.h | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index eddcf49..a5cffe2 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -59,7 +59,8 @@ static struct clocksource clocksource_jiffies = {
 };
 
 __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock);
-__cacheline_aligned_in_smp seqcount_t jiffies_seq;
+__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq =
+   SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock);
 
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 099737f..6c2cbd9 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -26,7 +26,7 @@ extern void do_timer(unsigned long ticks);
 extern void update_wall_time(void);
 
 extern raw_spinlock_t jiffies_lock;
-extern seqcount_t jiffies_seq;
+extern seqcount_raw_spinlock_t jiffies_seq;
 
#define CS_NAME_LEN	32
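
For illustration, the write side that lockdep can now sanity check looks
roughly like this (modeled on the tick code, simplified; not part of the
patch):

static void demo_do_timer(unsigned long ticks)
{
	raw_spin_lock(&jiffies_lock);
	write_seqcount_begin(&jiffies_seq);	/* lockdep: jiffies_lock must be held */
	jiffies_64 += ticks;
	write_seqcount_end(&jiffies_seq);
	raw_spin_unlock(&jiffies_lock);
}

Readers keep doing the usual read_seqcount_begin()/read_seqcount_retry() loop
on jiffies_seq, e.g. in get_jiffies_64() above.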
 


[PATCH] fs/dcache: optimize start_dir_add()

2020-10-22 Thread Davidlohr Bueso
Considering both end_dir_add() and d_alloc_parallel(), the
dir->i_dir_seq wants acquire/release semantics, therefore
micro-optimize for ll/sc archs and use finer grained barriers
to provide (load)-ACQUIRE ordering (L->S + L->L). This comes
at no additional cost for most of x86, as sane tso models will
have a nop for smp_rmb/smp_acquire__after_ctrl_dep.

Signed-off-by: Davidlohr Bueso 
---
Alternatively I guess we could just use cmpxchg_acquire().

 fs/dcache.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index ea0485861d93..22738daccb9c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2502,13 +2502,18 @@ EXPORT_SYMBOL(d_rehash);
 
 static inline unsigned start_dir_add(struct inode *dir)
 {
+   unsigned n;
 
for (;;) {
-   unsigned n = dir->i_dir_seq;
-   if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
-   return n;
+   n = READ_ONCE(dir->i_dir_seq);
+   if (!(n & 1) && cmpxchg_relaxed(&dir->i_dir_seq, n, n + 1) == n)
+   break;
cpu_relax();
}
+
+   /* create (load)-ACQUIRE ordering */
+   smp_acquire__after_ctrl_dep();
+   return n;
 }
 
 static inline void end_dir_add(struct inode *dir, unsigned n)
-- 
2.26.2



[PATCH] btrfs: convert data_seqcount to seqcount_mutex_t

2020-10-21 Thread Davidlohr Bueso
By doing so we can associate the sequence counter to the chunk_mutex
for lockdep purposes (compiled-out otherwise) and also avoid
explicitly disabling preemption around the write region as it will now
be done automatically by the seqcount machinery based on the lock type.

Signed-off-by: Davidlohr Bueso 
---
 fs/btrfs/volumes.c |  2 +-
 fs/btrfs/volumes.h | 10 --
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 58b9c419a2b6..b6ee92ddc624 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -431,7 +431,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
-   btrfs_device_data_ordered_init(dev);
+   btrfs_device_data_ordered_init(dev, fs_info);
INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
extent_io_tree_init(fs_info, &dev->alloc_state,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index bf27ac07d315..ff3ec11bb9d2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -39,10 +39,10 @@ struct btrfs_io_geometry {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 #include 
 #define __BTRFS_NEED_DEVICE_DATA_ORDERED
-#define btrfs_device_data_ordered_init(device) \
-   seqcount_init(&device->data_seqcount)
+#define btrfs_device_data_ordered_init(device, info) \
+   seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex)
 #else
-#define btrfs_device_data_ordered_init(device) do { } while (0)
+#define btrfs_device_data_ordered_init(device, info) do { } while (0)
 #endif
 
 #define BTRFS_DEV_STATE_WRITEABLE  (0)
@@ -71,7 +71,7 @@ struct btrfs_device {
blk_status_t last_flush_error;
 
 #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
-   seqcount_t data_seqcount;
+   seqcount_mutex_t data_seqcount;
 #endif
 
/* the internal btrfs device id */
@@ -162,11 +162,9 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \
 static inline void \
 btrfs_device_set_##name(struct btrfs_device *dev, u64 size)\
 {  \
-   preempt_disable();  \
write_seqcount_begin(&dev->data_seqcount);  \
dev->name = size;   \
write_seqcount_end(&dev->data_seqcount);    \
-   preempt_enable();   \
 }
 #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
#define BTRFS_DEVICE_GETSET_FUNCS(name) \
--
2.26.2
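
As a standalone illustration of the pattern (not btrfs code, just a sketch
assuming the seqcount_mutex_t API used above): the writer must hold the
associated mutex, which lockdep can now verify, and no explicit
preempt_disable() is needed around the write section.

static DEFINE_MUTEX(demo_lock);
static seqcount_mutex_t demo_seq = SEQCNT_MUTEX_ZERO(demo_seq, &demo_lock);
static u64 demo_size;

static void demo_set_size(u64 size)
{
	mutex_lock(&demo_lock);
	write_seqcount_begin(&demo_seq);	/* no explicit preempt_disable() */
	demo_size = size;
	write_seqcount_end(&demo_seq);
	mutex_unlock(&demo_lock);
}

static u64 demo_get_size(void)
{
	unsigned int seq;
	u64 size;

	do {
		seq = read_seqcount_begin(&demo_seq);
		size = demo_size;
	} while (read_seqcount_retry(&demo_seq, seq));

	return size;
}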


