BTT lanes serialize access to per-lane metadata and workspace state
during BTT I/O. The btt-check unit test reports data mismatches during
BTT writes due to a race in lane acquisition that can lead to silent
data corruption.
The existing lane model uses a spinlock together with a per-CPU
recursion count. That recursion model stopped being valid after BTT
lanes became preemptible: another task can run on the same CPU,
observe a non-zero recursion count, bypass locking, and use the same
lane concurrently.
BTT lanes are also held across arena_write_bytes() calls. That path
reaches nsio_rw_bytes(), which flushes writes with nvdimm_flush().
Some provider flush callbacks can sleep, making a spinlock the wrong
primitive for the lane lifetime.
Replace the spinlock-based recursion model with a dynamically
allocated per-lane mutex array and take the lane lock
unconditionally.
Add might_sleep() to catch any future atomic-context caller.
Found with the ndctl unit test btt-check.sh.
Fixes: 36c75ce3bd29 ("nd_btt: Make BTT lanes preemptible")
Assisted-by: Claude Sonnet 4.5
Signed-off-by: Alison Schofield <[email protected]>
---
Changes in v5:
- Align lane mutex entries to cachelines in SMP builds (Sashiko AI)
- Add sparse lock annotations for lane mutexes (DaveJ)
- s/spinlock/mutexes in the driver-api doc btt.rst
Changes in v4:
- Replace per-CPU lane storage w dynamically allocated mutex array (Sashiko AI)
- Remove the recursion fast path and take the lane lock unconditionally
- Update commit log
Changes in v3:
Replace spinlock with a per-lane mutex (Arboorva)
Changes in v2:
Use spin_(un)lock_bh() (Sashiko AI)
Update commit log per softirq re-enty and spinlock change
A new unit test to stress this is under review here:
https://lore.kernel.org/nvdimm/[email protected]/
Documentation/driver-api/nvdimm/btt.rst | 4 +-
drivers/nvdimm/nd.h | 7 ++-
drivers/nvdimm/region_devs.c | 64 ++++++++-----------------
3 files changed, 25 insertions(+), 50 deletions(-)
diff --git a/Documentation/driver-api/nvdimm/btt.rst
b/Documentation/driver-api/nvdimm/btt.rst
index 2d8269f834bd..e3218863ec96 100644
--- a/Documentation/driver-api/nvdimm/btt.rst
+++ b/Documentation/driver-api/nvdimm/btt.rst
@@ -162,8 +162,8 @@ process::
A lane number is obtained at the start of any IO, and is used for indexing into
all the on-disk and in-memory data structures for the duration of the IO. If
-there are more CPUs than the max number of available lanes, than lanes are
-protected by spinlocks.
+there are more CPUs than the max number of available lanes, then lanes are
+protected by mutexes.
d. In-memory data structure: Read Tracking Table (RTT)
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index b199eea3260e..263b7dde0f87 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -366,9 +366,8 @@ unsigned sizeof_namespace_label(struct nvdimm_drvdata *ndd);
res; res = next, next = next ? next->sibling : NULL)
struct nd_percpu_lane {
- int count;
- spinlock_t lock;
-};
+ struct mutex lock; /* serialize lane access */
+} ____cacheline_aligned_in_smp;
enum nd_label_flags {
ND_LABEL_REAP,
@@ -420,7 +419,7 @@ struct nd_region {
struct kernfs_node *bb_state;
struct badblocks bb;
struct nd_interleave_set *nd_set;
- struct nd_percpu_lane __percpu *lane;
+ struct nd_percpu_lane *lane;
int (*flush)(struct nd_region *nd_region, struct bio *bio);
struct nd_mapping mapping[] __counted_by(ndr_mappings);
};
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e35c2e18518f..9f5a34181cf5 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -192,7 +192,7 @@ static void nd_region_release(struct device *dev)
put_device(&nvdimm->dev);
}
- free_percpu(nd_region->lane);
+ kfree(nd_region->lane);
if (!test_bit(ND_REGION_CXL, &nd_region->flags))
memregion_free(nd_region->id);
kfree(nd_region);
@@ -904,52 +904,30 @@ void nd_region_advance_seeds(struct nd_region *nd_region,
struct device *dev)
* nd_region_acquire_lane - allocate and lock a lane
* @nd_region: region id and number of lanes possible
*
- * A lane correlates to a BLK-data-window and/or a log slot in the BTT.
- * We optimize for the common case where there are 256 lanes, one
- * per-cpu. For larger systems we need to lock to share lanes. For now
- * this implementation assumes the cost of maintaining an allocator for
- * free lanes is on the order of the lock hold time, so it implements a
- * static lane = cpu % num_lanes mapping.
+ * A lane correlates to a log slot in the BTT. Lanes are shared across
+ * CPUs using a static lane = cpu % num_lanes mapping, with a per-lane
+ * mutex to serialize access.
*
- * In the case of a BTT instance on top of a BLK namespace a lane may be
- * acquired recursively. We lock on the first instance.
- *
- * In the case of a BTT instance on top of PMEM, we only acquire a lane
- * for the BTT metadata updates.
+ * Callers must be in sleepable context. The only in-tree caller is
+ * BTT's ->submit_bio handler (btt_read_pg / btt_write_pg).
*/
unsigned int nd_region_acquire_lane(struct nd_region *nd_region)
+ __acquires(&nd_region->lane[lane].lock)
{
- unsigned int cpu, lane;
+ unsigned int lane;
- migrate_disable();
- cpu = smp_processor_id();
- if (nd_region->num_lanes < nr_cpu_ids) {
- struct nd_percpu_lane *ndl_lock, *ndl_count;
-
- lane = cpu % nd_region->num_lanes;
- ndl_count = per_cpu_ptr(nd_region->lane, cpu);
- ndl_lock = per_cpu_ptr(nd_region->lane, lane);
- if (ndl_count->count++ == 0)
- spin_lock(&ndl_lock->lock);
- } else
- lane = cpu;
+ might_sleep();
+ lane = raw_smp_processor_id() % nd_region->num_lanes;
+ mutex_lock(&nd_region->lane[lane].lock);
return lane;
}
EXPORT_SYMBOL(nd_region_acquire_lane);
void nd_region_release_lane(struct nd_region *nd_region, unsigned int lane)
+ __releases(&nd_region->lane[lane].lock)
{
- if (nd_region->num_lanes < nr_cpu_ids) {
- unsigned int cpu = smp_processor_id();
- struct nd_percpu_lane *ndl_lock, *ndl_count;
-
- ndl_count = per_cpu_ptr(nd_region->lane, cpu);
- ndl_lock = per_cpu_ptr(nd_region->lane, lane);
- if (--ndl_count->count == 0)
- spin_unlock(&ndl_lock->lock);
- }
- migrate_enable();
+ mutex_unlock(&nd_region->lane[lane].lock);
}
EXPORT_SYMBOL(nd_region_release_lane);
@@ -1019,17 +997,16 @@ static struct nd_region *nd_region_create(struct
nvdimm_bus *nvdimm_bus,
goto err_id;
}
- nd_region->lane = alloc_percpu(struct nd_percpu_lane);
+ nd_region->num_lanes = ndr_desc->num_lanes;
+ if (!nd_region->num_lanes)
+ goto err_percpu;
+ nd_region->lane = kcalloc(nd_region->num_lanes,
+ sizeof(*nd_region->lane), GFP_KERNEL);
if (!nd_region->lane)
goto err_percpu;
- for (i = 0; i < nr_cpu_ids; i++) {
- struct nd_percpu_lane *ndl;
-
- ndl = per_cpu_ptr(nd_region->lane, i);
- spin_lock_init(&ndl->lock);
- ndl->count = 0;
- }
+ for (i = 0; i < nd_region->num_lanes; i++)
+ mutex_init(&nd_region->lane[i].lock);
for (i = 0; i < ndr_desc->num_mappings; i++) {
struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
@@ -1046,7 +1023,6 @@ static struct nd_region *nd_region_create(struct
nvdimm_bus *nvdimm_bus,
}
nd_region->provider_data = ndr_desc->provider_data;
nd_region->nd_set = ndr_desc->nd_set;
- nd_region->num_lanes = ndr_desc->num_lanes;
nd_region->flags = ndr_desc->flags;
nd_region->ro = ro;
nd_region->numa_node = ndr_desc->numa_node;
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
--
2.37.3