Re: [PATCH v7] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-09-20 Thread Waiman Long
On 09/20/2017 01:35 PM, Christoph Hellwig wrote:
>> +/*
>> + * When reading or writing the blktrace sysfs files, the references to the
>> + * opened sysfs or device files should prevent the underlying block device
>> + * from being removed. So no further delete protection is really needed.
>> + *
>> + * Protection from multiple readers and writers accessing blktrace data
>> + * concurrently is still required. The bd_mutex was used for this purpose.
>> + * That could lead to deadlock with concurrent block device deletion and
>> + * sysfs access. As a result, a new blk_trace_mutex is now added to be
>> + * used solely by the blktrace code.
>> + */
> Comments about previous locking schemes really have no business
> in the code - those are remarks for the commit logs.  And in general
> please explain the locking scheme near the data that it protects,
> as locks should always protect data, not code, and the comments
> should follow that.

It seems to be general practice that we don't put detailed comments in
header files. The comment was placed above the function containing the
first use of blk_trace_mutex. Yes, I agree that recounting past history
may not be appropriate here. I will keep that in mind in the
future.
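
For illustration, documenting the lock next to the field it protects could
look roughly like the sketch below. The field names are taken from the v7
patch; the comment wording is only an example, not final text:

#ifdef CONFIG_BLK_DEV_IO_TRACE
	/*
	 * blk_trace_mutex serializes setup and teardown of the blk_trace
	 * structure below as well as the sysfs/ioctl accesses to it.
	 */
	struct blk_trace	*blk_trace;
	struct mutex		blk_trace_mutex;
#endif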

Thanks,
Longman




[PATCH v7] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-09-20 Thread Waiman Long
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete a
partition in a block device and another task (CPU0) is accessing
tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that
partition.

The s_active isn't an actual lock. It is a reference count (kn->count)
on the sysfs (kernfs) file. Removal of a sysfs file, however, requires
waiting until all the references are gone. The reference count is
treated like an rwsem by the lockdep instrumentation code.
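
As a rough illustration of why lockdep reports it this way, a reference
count can be paired with a lockdep map so that taking a reference looks
like a read lock and draining the count looks like a write lock. The
sketch below is illustrative only and is not the actual kernfs code; the
struct and function names are made up:

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>

struct sketch_node {
	atomic_t		refcnt;
	struct lockdep_map	dep_map;	/* reported as "s_active" */
};

static bool sketch_get(struct sketch_node *n)
{
	/* Fail if removal has already driven the count negative. */
	if (!atomic_inc_unless_negative(&n->refcnt))
		return false;
	/* Tell lockdep this is equivalent to taking a read lock. */
	rwsem_acquire_read(&n->dep_map, 0, 1, _RET_IP_);
	return true;
}

static void sketch_put(struct sketch_node *n)
{
	rwsem_release(&n->dep_map, 1, _RET_IP_);
	atomic_dec(&n->refcnt);
}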

The fact that a thread is in the sysfs callback method or in the
ioctl call means there is a reference to the opened sysfs or device
file. That should prevent the underlying block device structure from
being removed.

Instead of using bd_mutex in the block_device structure, a new
blk_trace_mutex is now added to the request_queue structure to protect
access to the blk_trace structure.

Suggested-by: Christoph Hellwig <h...@infradead.org>
Signed-off-by: Waiman Long <long...@redhat.com>
---
 v7:
  - Add a new blk_trace_mutex in request_queue structure for blk_trace
protection.

 v6:
  - Add a second patch to rename the bd_fsfreeze_mutex to
bd_fsfreeze_blktrace_mutex.

 v5:
  - Overload the bd_fsfreeze_mutex in block_device structure for
blktrace protection.

 v4:
  - Use blktrace_mutex in blk_trace_ioctl() as well.

 v3:
  - Use a global blktrace_mutex to serialize sysfs attribute accesses
instead of the bd_mutex.

 v2:
  - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
  - Check for signal in the mutex_trylock loops.
  - Use usleep() instead of schedule() for RT tasks.

 block/blk-core.c        |  3 +++
 include/linux/blkdev.h  |  1 +
 kernel/trace/blktrace.c | 24 ++++++++++++++++++------
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index aebe676..048be4a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -854,6 +854,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	mutex_init(&q->blk_trace_mutex);
+#endif
 	mutex_init(&q->sysfs_lock);
 	spin_lock_init(&q->__queue_lock);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 460294b..02fa42d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -551,6 +551,7 @@ struct request_queue {
 	int			node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace	*blk_trace;
+	struct mutex		blk_trace_mutex;
 #endif
 	/*
 	 * for flush operations
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2a685b4..d5cef05 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -648,6 +648,18 @@ int blk_trace_startstop(struct request_queue *q, int start)
 }
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ *
+ * Protection from multiple readers and writers accessing blktrace data
+ * concurrently is still required. The bd_mutex was used for this purpose.
+ * That could lead to deadlock with concurrent block device deletion and
+ * sysfs access. As a result, a new blk_trace_mutex is now added to be
+ * used solely by the blktrace code.
+ */
+
 /**
  * blk_trace_ioctl: - handle the ioctls associated with tracing
  * @bdev:  the block device
@@ -665,7 +677,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&q->blk_trace_mutex);
 
switch (cmd) {
case BLKTRACESETUP:
@@ -691,7 +703,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		break;
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&q->blk_trace_mutex);
return ret;
 }
 
@@ -1727,7 +1739,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&q->blk_trace_mutex);
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1746,7 +1758,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&q->blk_trace_mutex);

Re: [PATCH v6 1/2] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-09-19 Thread Waiman Long
On 09/19/2017 10:38 AM, Christoph Hellwig wrote:
> On Tue, Sep 19, 2017 at 08:49:12AM -0400, Waiman Long wrote:
>> On 09/18/2017 08:01 PM, Christoph Hellwig wrote:
>>> Taking a look at this it seems like using a lock in struct block_device
>>> isn't the right thing to do anyway - all the action is on fields in
>>> struct blk_trace, so having a lock inside that would make a lot more
>>> sense.
>>>
>>> It would also help to document what exactly we're actually protecting.
>> I think I documented in the patch that the lock has to protect changes
>> in the blktrace structure as well as the allocation and destruction of
>> it. Because of that, it can't be put inside the blktrace structure. The
>> original code used the bd_mutex of the block_device structure. I just
>> changed the code to use the bd_fsfreeze_mutex in the same structure.
> Either way it has absolutely nothing to do with struct block_device,
> given that struct blk_trace hangs off the request_queue.
>
> Reusing a mutex just because it happens to live in a structure also
> generally is a bad idea if the protected data is in no way related.

I was trying not to add a new mutex to a structure just for blktrace,
as it is an optional feature that is enabled only if the
CONFIG_BLK_DEV_IO_TRACE config option is defined, and the lock will
only need to be taken occasionally.

As filesystem freeze looks orthogonal to blktrace and the mutex also
looks to be used sparingly, I think it is a good match to overload it to
control blktrace as well.

I could modify the patch to use a mutex in the request_queue structure.
The current sysfs_lock mutex has about 74 references, so I am not
totally sure it is safe to reuse. The only option, then, is to add
something like

#ifdef CONFIG_BLK_DEV_IO_TRACE
struct mutex blktrace_mutex;
#endif

to the request_queue structure. That structure is large enough that
adding a mutex won't increase the size much percentage-wise.

I would like to keep the current patch as is as I don't see any problem
with it. However, I can revise the patch as discussed above if you guys
prefer that alternative.

Cheers,
Longman





Re: [PATCH v6 1/2] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-09-19 Thread Waiman Long
On 09/18/2017 08:01 PM, Christoph Hellwig wrote:
> Taking a look at this it seems like using a lock in struct block_device
> isn't the right thing to do anyway - all the action is on fields in
> struct blk_trace, so having a lock inside that would make a lot more
> sense.
>
> It would also help to document what exactly we're actually protecting.

I think I documented in the patch that the lock has to protect changes
in the blktrace structure as well as the allocation and destruction of
it. Because of that, it can't be put inside the blktrace structure. The
original code used the bd_mutex of the block_device structure. I just
changed the code to use the bd_fsfreeze_mutex in the same structure.

In an earlier patch version, I used a global blktrace mutex. That was
deemed not scalable enough, so I now use the bd_fsfreeze_mutex
instead.

Cheers,
Longman



Re: [PATCH v6 2/2] block_dev: Rename bd_fsfreeze_mutex

2017-09-19 Thread Waiman Long
On 09/18/2017 07:47 PM, Christoph Hellwig wrote:
> Don't rename it to a way too long name.  Either add a separate mutex
> for your purpose (unless there is interaction between freezing and
> blktrace, which I doubt), or properly comment the usage.

I would agree with you if the long name made the expressions hard to
read. In this particular case, it is just the single parameter to the
mutex_lock() and mutex_unlock() functions. There is no confusion and no
overly long lines, so I think it is OK. In fact, I have gotten the
opposite advice in the past: some people prefer long descriptive names
over short and cryptic ones.

Cheers,
Longman



[PATCH v6 1/2] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-09-18 Thread Waiman Long
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete a
partition in a block device and another task (CPU0) is accessing
tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that
partition.

The s_active isn't an actual lock. It is a reference count (kn->count)
on the sysfs (kernfs) file. Removal of a sysfs file, however, requires
waiting until all the references are gone. The reference count is
treated like an rwsem by the lockdep instrumentation code.

The fact that a thread is in the sysfs callback method or in the
ioctl call means there is a reference to the opened sysfs or device
file. That should prevent the underlying block device structure from
being removed.

Instead of using bd_mutex in the block_device structure, the other
bd_fsfreeze_mutex mutex in the block_device structure is now overloaded
to protect against concurrent blktrace data access in the blktrace.c
file. There is no point in adding one more mutex to the block_device
structure just for blktrace.

Signed-off-by: Waiman Long <long...@redhat.com>
---
 include/linux/fs.h      |  2 +-
 kernel/trace/blktrace.c | 26 ++++++++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 339e737..330b572 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -448,7 +448,7 @@ struct block_device {
 
 	/* The counter of freeze processes */
 	int			bd_fsfreeze_count;
-	/* Mutex for freeze */
+	/* Mutex for freeze and blktrace */
 	struct mutex		bd_fsfreeze_mutex;
 } __randomize_layout;
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2a685b4..7cd5d1d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -648,6 +648,20 @@ int blk_trace_startstop(struct request_queue *q, int start)
 }
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ *
+ * Protection from multiple readers and writers accessing blktrace data
+ * concurrently is still required. The bd_mutex was used for this purpose.
+ * That could lead to deadlock with concurrent block device deletion and
+ * sysfs access. Instead, the block device bd_fsfreeze_mutex is now
+ * overloaded for blktrace data protection. Like freeze/thaw, blktrace
+ * functionality is not frequently used. There is no point in adding
+ * one more mutex to the block_device structure just for blktrace.
+ */
+
 /**
  * blk_trace_ioctl: - handle the ioctls associated with tracing
  * @bdev:  the block device
@@ -665,7 +679,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
switch (cmd) {
case BLKTRACESETUP:
@@ -691,7 +705,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		break;
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
return ret;
 }
 
@@ -1727,7 +1741,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1746,7 +1760,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 out_bdput:
bdput(bdev);
 out:
@@ -1788,7 +1802,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
 	if (attr == &dev_attr_enable) {
 		if (value)
@@ -1814,7 +1828,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	}
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 out_bdput:
bdput(bdev);
 out:
-- 
1.8.3.1



[PATCH v6 2/2] block_dev: Rename bd_fsfreeze_mutex

2017-09-18 Thread Waiman Long
As the bd_fsfreeze_mutex is used by the blktrace subsystem as well,
it is now renamed to bd_fsfreeze_blktrace_mutex to better reflect
its purpose.

Signed-off-by: Waiman Long <long...@redhat.com>
---
 fs/block_dev.c          | 14 +++++++-------
 fs/gfs2/ops_fstype.c    |  6 +++---
 fs/nilfs2/super.c       |  6 +++---
 fs/super.c              |  6 +++---
 include/linux/fs.h      |  5 +++--
 kernel/trace/blktrace.c | 14 +++++++-------
 6 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 93d088f..3dea006 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -504,7 +504,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
struct super_block *sb;
int error = 0;
 
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_blktrace_mutex);
if (++bdev->bd_fsfreeze_count > 1) {
/*
 * We don't even need to grab a reference - the first call
@@ -514,7 +514,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
sb = get_super(bdev);
if (sb)
drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		mutex_unlock(&bdev->bd_fsfreeze_blktrace_mutex);
return sb;
}
 
@@ -528,13 +528,13 @@ struct super_block *freeze_bdev(struct block_device *bdev)
if (error) {
deactivate_super(sb);
bdev->bd_fsfreeze_count--;
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		mutex_unlock(&bdev->bd_fsfreeze_blktrace_mutex);
return ERR_PTR(error);
}
deactivate_super(sb);
  out:
sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_blktrace_mutex);
return sb;  /* thaw_bdev releases s->s_umount */
 }
 EXPORT_SYMBOL(freeze_bdev);
@@ -550,7 +550,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 {
int error = -EINVAL;
 
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_blktrace_mutex);
if (!bdev->bd_fsfreeze_count)
goto out;
 
@@ -568,7 +568,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
if (error)
bdev->bd_fsfreeze_count++;
 out:
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_blktrace_mutex);
return error;
 }
 EXPORT_SYMBOL(thaw_bdev);
@@ -767,7 +767,7 @@ static void init_once(void *foo)
 	bdev->bd_bdi = &noop_backing_dev_info;
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
-	mutex_init(&bdev->bd_fsfreeze_mutex);
+	mutex_init(&bdev->bd_fsfreeze_blktrace_mutex);
 }
 
 static void bdev_evict_inode(struct inode *inode)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a3711f5..5664905 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1269,14 +1269,14 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 * will protect the lockfs code from trying to start a snapshot
 * while we are mounting
 */
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_blktrace_mutex);
if (bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		mutex_unlock(&bdev->bd_fsfreeze_blktrace_mutex);
error = -EBUSY;
goto error_bdev;
}
s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_blktrace_mutex);
error = PTR_ERR(s);
if (IS_ERR(s))
goto error_bdev;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 4fc018d..931b455 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1306,15 +1306,15 @@ static int nilfs_test_bdev_super(struct super_block *s, void *data)
 * will protect the lockfs code from trying to start a snapshot
 * while we are mounting
 */
-	mutex_lock(&sd.bdev->bd_fsfreeze_mutex);
+	mutex_lock(&sd.bdev->bd_fsfreeze_blktrace_mutex);
if (sd.bdev->bd_fsfreeze_count > 0) {
-		mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+		mutex_unlock(&sd.bdev->bd_fsfreeze_blktrace_mutex);
err = -EBUSY;
goto failed;
}
s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags,
 sd.bdev);
-	mutex_unlock(&sd.bdev->bd_fsfreeze_mutex);
+	mutex_unlock(&sd.bdev->bd_fsfreeze_blktrace_mutex);
if (IS_ERR(s)) {
err = PTR_ERR(s);
goto failed;
diff --git a/fs/super.c b/fs/super.c
index 166c4ee..079890f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1083,15 +1083,15 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	 * will protect the lockfs code from trying to start a snapshot

[PATCH v6 0/2] blktrace: Fix deadlock problem

2017-09-18 Thread Waiman Long
 v6:
  - Add a second patch to rename the bd_fsfreeze_mutex to
bd_fsfreeze_blktrace_mutex.

 v5:
  - Overload the bd_fsfreeze_mutex in block_device structure for
blktrace protection.

 v4:
  - Use blktrace_mutex in blk_trace_ioctl() as well.

 v3:
  - Use a global blktrace_mutex to serialize sysfs attribute accesses
instead of the bd_mutex.

 v2:
  - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
  - Check for signal in the mutex_trylock loops.
  - Use usleep() instead of schedule() for RT tasks.

This patchset fixes a potential blktrace deadlock problem between
block device deletion and sysfs operations.

Waiman Long (2):
  blktrace: Fix potentail deadlock between delete & sysfs ops
  block_dev: Rename bd_fsfreeze_mutex

 fs/block_dev.c          | 14 +++++++-------
 fs/gfs2/ops_fstype.c    |  6 +++---
 fs/nilfs2/super.c       |  6 +++---
 fs/super.c              |  6 +++---
 include/linux/fs.h      |  5 +++--
 kernel/trace/blktrace.c | 26 ++++++++++++++++++++------
 6 files changed, 39 insertions(+), 24 deletions(-)

-- 
1.8.3.1



[PATCH v5] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-09-16 Thread Waiman Long
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete a
partition in a block device and another task (CPU0) is accessing
tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that
partition.

The s_active isn't an actual lock. It is a reference count (kn->count)
on the sysfs (kernfs) file. Removal of a sysfs file, however, requires
waiting until all the references are gone. The reference count is
treated like an rwsem by the lockdep instrumentation code.

The fact that a thread is in the sysfs callback method or in the
ioctl call means there is a reference to the opened sysfs or device
file. That should prevent the underlying block device structure from
being removed.

Instead of using bd_mutex in the block_device structure, the other
bd_fsfreeze_mutex mutex in the block_device structure is now overloaded
to protect against concurrent blktrace data access in the blktrace.c
file. There is no point in adding one more mutex to the block_device
structure just for blktrace.

Signed-off-by: Waiman Long <long...@redhat.com>
---
 v5:
  - Overload the bd_fsfreeze_mutex in block_device structure for
blktrace protection.

 v4:
  - Use blktrace_mutex in blk_trace_ioctl() as well.

 v3:
  - Use a global blktrace_mutex to serialize sysfs attribute accesses
instead of the bd_mutex.

 v2:
  - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
  - Check for signal in the mutex_trylock loops.
  - Use usleep() instead of schedule() for RT tasks.

 include/linux/fs.h      |  2 +-
 kernel/trace/blktrace.c | 26 ++++++++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 339e737..330b572 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -448,7 +448,7 @@ struct block_device {
 
 	/* The counter of freeze processes */
 	int			bd_fsfreeze_count;
-	/* Mutex for freeze */
+	/* Mutex for freeze and blktrace */
 	struct mutex		bd_fsfreeze_mutex;
 } __randomize_layout;
 
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2a685b4..7cd5d1d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -648,6 +648,20 @@ int blk_trace_startstop(struct request_queue *q, int start)
 }
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ *
+ * Protection from multiple readers and writers accessing blktrace data
+ * concurrently is still required. The bd_mutex was used for this purpose.
+ * That could lead to deadlock with concurrent block device deletion and
+ * sysfs access. Instead, the block device bd_fsfreeze_mutex is now
+ * overloaded for blktrace data protection. Like freeze/thaw, blktrace
+ * functionality is not frequently used. There is no point in adding
+ * one more mutex to the block_device structure just for blktrace.
+ */
+
 /**
  * blk_trace_ioctl: - handle the ioctls associated with tracing
  * @bdev:  the block device
@@ -665,7 +679,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
switch (cmd) {
case BLKTRACESETUP:
@@ -691,7 +705,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		break;
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
return ret;
 }
 
@@ -1727,7 +1741,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1746,7 +1760,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 out_bdput:
bdput(bdev);
 out:
@@ -1788,7 +1802,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
 	if (attr == &dev_attr_enable) {
 		if (value)
@@ -1814,7 +1828,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,

Re: [PATCH v4] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-18 Thread Waiman Long
On 08/18/2017 04:18 PM, Bart Van Assche wrote:
> On Fri, 2017-08-18 at 16:01 -0400, Waiman Long wrote:
>> [ ... ]
>> Instead of using bd_mutex, a new global blktrace mutex is now used
>> to protect against concurrent access, creation and destruction of the
>> blk_trace structure that is used only in the blktrace.c file. As
>> blktrace files will not be frequently accessed, using a global mutex
>> should not cause any performance problem.
>> [ ... ]
>> +/*
>> + * The bd_mutex was used previously for protecting blk_trace structure.
>> + * That could lead to deadlock with concurrent block device deletion and
>> + * sysfs access. So a global blktrace_mutex is now used instead for
>> + * protecting the blk_trace structure.
>> + *
>> + * The references to the opened sysfs or device files should prevent the
>> + * underlying block device from being removed.
>> + */
>> +static DEFINE_MUTEX(blktrace_mutex);
> Hello Waiman,
>
> Thanks for having addressed my previous comment. Regarding this patch: sorry
> but I don't think it's a good idea to use a global mutex for serializing
> accesses to tracing data of a single block device. Global mutexes create
> unwanted lock dependencies between different block devices. Additionally, on
> multiprocessor systems global mutexes can cause cache line ping-pong between
> processors and hence can cause a severe slowdown. Please make blktrace_mutex
> per block device instead of global.

I fully understand the problem with a global lock. The main reason for
using a global lock here is that the blktrace APIs are not in a
performance-critical path. In fact, I think they are used primarily for
debugging purposes. Activating blktrace will certainly slow things down
no matter what kind of lock is used.

I also don't believe that the blktrace APIs will be used at a high
enough frequency to cause a performance issue. Please let me
know if I am wrong in my assumptions.

Cheers,
Longman



[PATCH v4] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-18 Thread Waiman Long
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete a
partition in a block device and another task (CPU0) is accessing
tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that
partition.

The s_active isn't an actual lock. It is a reference count (kn->count)
on the sysfs (kernfs) file. Removal of a sysfs file, however, requires
waiting until all the references are gone. The reference count is
treated like an rwsem by the lockdep instrumentation code.

The fact that a thread is in the sysfs callback method or in the
ioctl call means there is a reference to the opened sysfs or device
file. That should prevent the underlying block device structure from
being removed.

Instead of using bd_mutex, a new global blktrace mutex is now used
to protect against concurrent access, creation and destruction of the
blk_trace structure that is used only in the blktrace.c file. As
blktrace files will not be frequently accessed, using a global mutex
should not cause any performance problem.

Signed-off-by: Waiman Long <long...@redhat.com>
---

 v4:
  - Use blktrace_mutex in blk_trace_ioctl() as well.

 v3:
  - Use a global blktrace_mutex to serialize sysfs attribute accesses
instead of the bd_mutex. 

 v2:
  - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
  - Check for signal in the mutex_trylock loops.
  - Use usleep() instead of schedule() for RT tasks.

 kernel/trace/blktrace.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..ec5a919 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -624,6 +624,17 @@ int blk_trace_startstop(struct request_queue *q, int start)
 }
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
+/*
+ * The bd_mutex was used previously for protecting blk_trace structure.
+ * That could lead to deadlock with concurrent block device deletion and
+ * sysfs access. So a global blktrace_mutex is now used instead for
+ * protecting the blk_trace structure.
+ *
+ * The references to the opened sysfs or device files should prevent the
+ * underlying block device from being removed.
+ */
+static DEFINE_MUTEX(blktrace_mutex);
+
 /**
  * blk_trace_ioctl: - handle the ioctls associated with tracing
  * @bdev:  the block device
@@ -641,7 +652,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	if (!q)
 		return -ENXIO;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&blktrace_mutex);
 
switch (cmd) {
case BLKTRACESETUP:
@@ -667,7 +678,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 		break;
 	}
 
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&blktrace_mutex);
return ret;
 }
 
@@ -1622,7 +1633,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&blktrace_mutex);
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1641,7 +1652,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&blktrace_mutex);
 out_bdput:
bdput(bdev);
 out:
@@ -1683,7 +1694,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&blktrace_mutex);
 
 	if (attr == &dev_attr_enable) {
 		if (value)
@@ -1709,7 +1720,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	}
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&blktrace_mutex);
 out_bdput:
bdput(bdev);
 out:
-- 
1.8.3.1



Re: [PATCH v3] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-18 Thread Waiman Long
On 08/18/2017 02:07 PM, Bart Van Assche wrote:
> On Fri, 2017-08-18 at 13:54 -0400, Waiman Long wrote:
>> Instead, a global blktrace
>> mutex will be used to serialize the read/write of the blktrace sysfs
>> attributes.
> Hello Waiman,
>
> Using a mutex to serialize code is wrong. What is needed is exact
> documentation of what data structures and/or member variables are
> protected by bdev->bd_mutex and by blktrace_mutex.

Yes, I missed the blk_trace_ioctl(). I should have modified it to use
blktrace_mutex as well.

> Also, please spell "potential" correctly in the subject of this patch.

Will do.

Thanks,
Longman



[PATCH v3] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-18 Thread Waiman Long
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete a
partition in a block device and another task (CPU0) is accessing
tracing sysfs file (e.g. /sys/block/dm-1/trace/act_mask) in that
partition.

The s_active isn't an actual lock. It is a reference count (kn->count)
on the sysfs (kernfs) file. Removal of a sysfs file, however, requires
waiting until all the references are gone. The reference count is
treated like an rwsem by the lockdep instrumentation code.

The fact that a thread is in the sysfs callback method will guarantee
that the underlying block device structure won't go away as device
deletion requires all the sysfs references to be gone. Therefore,
there is no need to take the bd_mutex. Instead, a global blktrace
mutex will be used to serialize the read/write of the blktrace sysfs
attributes.

Signed-off-by: Waiman Long <long...@redhat.com>
---
 v3:
  - Use a global blktrace_mutex to serialize sysfs attribute accesses
instead of the bd_mutex. 

 v2:
  - Use READ_ONCE() and smp_store_mb() to read and write bd_deleting.
  - Check for signal in the mutex_trylock loops.
  - Use usleep() instead of schedule() for RT tasks.

 kernel/trace/blktrace.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..e5901c6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1605,6 +1605,15 @@ static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
return bdev_get_queue(bdev);
 }
 
+/*
+ * Read/write to the blk_trace sysfs files requires taking references to
+ * the underlying kernfs_node structure which will guarantee that the block
+ * device won't go away as the device deletion code will wait until all the
+ * sysfs references are gone. For serialization of read/write accesses to
+ * the sysfs attributes, a global blk_trace mutex is used.
+ */
+static DEFINE_MUTEX(blktrace_mutex);
+
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 struct device_attribute *attr,
 char *buf)
@@ -1622,7 +1631,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&blktrace_mutex);
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1641,7 +1650,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&blktrace_mutex);
 out_bdput:
bdput(bdev);
 out:
@@ -1683,7 +1692,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 
-	mutex_lock(&bdev->bd_mutex);
+	mutex_lock(&blktrace_mutex);
 
 	if (attr == &dev_attr_enable) {
 		if (value)
@@ -1709,7 +1718,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	}
 
 out_unlock_bdev:
-	mutex_unlock(&bdev->bd_mutex);
+	mutex_unlock(&blktrace_mutex);
 out_bdput:
bdput(bdev);
 out:
-- 
1.8.3.1



Re: [PATCH v2] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-18 Thread Waiman Long
On 08/18/2017 12:21 PM, Bart Van Assche wrote:
> On Fri, 2017-08-18 at 09:55 -0400, Waiman Long wrote:
>> On 08/17/2017 05:30 PM, Steven Rostedt wrote:
>>> On Thu, 17 Aug 2017 17:10:07 -0400
>>> Steven Rostedt <rost...@goodmis.org> wrote:
>>>> Instead of playing games with taking the lock, the only way this race
>>>> is hit, is if the partition is being deleted and the sysfs attribute is
>>>> being read at the same time, correct? In that case, just return
>>>> -ENODEV, and be done with it.
>>> Nevermind that wont work. Too bad there's not a mutex_lock_timeout()
>>> that we could use in a loop. It would solve the issue of forward
>>> progress with RT tasks, and will break after a timeout in case of
>>> deadlock.
>> I think it will be useful to have mutex_timed_lock(). RT-mutex does have
>> a timed version, so I guess it shouldn't be hard to implement one for
>> mutex. I can take a shot at trying to do that.
> (just caught up with the entire e-mail thread)
>
> Sorry Waiman but personally I thoroughly detest loops around mutex_trylock()
> or mutex_timed_lock() because such loops are usually used to paper over a
> problem instead of fixing the root cause. What I understood from the comment
> in v1 of your patch is that bd_mutex is not only held during block device
> creation and removal but additionally that bd_mutex is obtained inside sysfs
> attribute callback methods? That pattern is guaranteed to lead to deadlocks.
> Since the block device removal code waits until all sysfs callback methods
> have finished there is no need to protect against block device removal inside
> the sysfs callback methods. My proposal

You are right. We don't really need to take the bd_mutex, as the fact
that we are inside a sysfs callback method guarantees that the block
device won't go away.

> is to split bd_mutex: one global mutex that serializes block device creation
> and removal and one mutex per block device that serializes changes to a
> single block device. Obtaining the global mutex from inside a block device
> sysfs callback function is not safe but obtaining the per-block-device mutex
> from inside a sysfs callback function is safe.
>
> Bart.

The bd_mutex we are talking about here is already per block device. I am
thinking about having a global blktrace mutex that is used to serialize
the read and write of blktrace attributes. Since blktrace sysfs files
are not supposed to be frequently accessed, having a global lock
shouldn't cause any problem.

Thanks,
Longman




Re: [PATCH v2] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-18 Thread Waiman Long
On 08/17/2017 05:30 PM, Steven Rostedt wrote:
> On Thu, 17 Aug 2017 17:10:07 -0400
> Steven Rostedt  wrote:
>
>
>> Instead of playing games with taking the lock, the only way this race
>> is hit, is if the partition is being deleted and the sysfs attribute is
>> being read at the same time, correct? In that case, just return
>> -ENODEV, and be done with it.
> Nevermind that wont work. Too bad there's not a mutex_lock_timeout()
> that we could use in a loop. It would solve the issue of forward
> progress with RT tasks, and will break after a timeout in case of
> deadlock.
>
> -- Steve

I think it will be useful to have mutex_timed_lock(). RT-mutex does have
a timed version, so I guess it shouldn't be hard to implement one for
mutex. I can take a shot at trying to do that.
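
A minimal sketch of what a timeout-bounded lock attempt could look like
today, using only existing primitives (mutex_trylock(), msleep(), jiffies)
rather than a real mutex_timed_lock() API; the function name and timeout
policy below are just placeholders:

#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/mutex.h>
#include <linux/sched/signal.h>

/* Placeholder name, not an existing kernel API. */
static int blktrace_mutex_lock_timeout(struct mutex *lock, unsigned int timeout_ms)
{
	unsigned long deadline = jiffies + msecs_to_jiffies(timeout_ms);

	while (!mutex_trylock(lock)) {
		if (time_after(jiffies, deadline))
			return -ETIMEDOUT;	/* give up rather than deadlock */
		if (signal_pending(current))
			return -EINTR;		/* let the caller bail out */
		/* sleep rather than schedule() so an RT task cannot livelock */
		msleep(1);
	}
	return 0;
}

Callers would then have to handle a nonzero return, much like the -ENODEV
idea above.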

Thanks,
Longman




Re: [PATCH v2] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-17 Thread Waiman Long
On 08/17/2017 05:10 PM, Steven Rostedt wrote:
> On Thu, 17 Aug 2017 12:24:39 -0400
> Waiman Long <long...@redhat.com> wrote:
>
>>>> + * sysfs file and then acquiring the bd_mutex. Deleting a block device
>>>> + * requires acquiring the bd_mutex and then waiting for all the sysfs
>>>> + * references to be gone. This can lead to deadlock if both operations
>>>> + * happen simultaneously. To avoid this problem, read/write to the
>>>> + * the tracing sysfs files can now fail if the bd_mutex cannot be
>>>> + * acquired while a deletion operation is in progress.
>>>> + *
>>>> + * A mutex trylock loop is used assuming that tracing sysfs operations  
>>> A mutex trylock loop is not enough to stop a deadlock. But I'm guessing
>>> the undocumented bd_deleting may prevent that.  
>> Yes, that is what the bd_deleting flag is for.
>>
>> I was thinking about failing the sysfs operation after a certain number
>> of trylock attempts, but then it will require changes to user space code
>> to handle the occasional failures. Finally I decided to fail it only
>> when a delete operation is in progress as all hopes are lost in this case.
> Actually, why not just fail the attr read on deletion? If it is being
> deleted, and one is reading the attribute, perhaps -ENODEV is the
> proper return value?
>
>> The root cause is the lock inversion under this circumstance. I think
>> modifying the blk_trace code has the least impact overall. I agree that
>> the code is ugly. If you have a better suggestion, I will certainly like
>> to hear it.
> Instead of playing games with taking the lock, the only way this race
> is hit, is if the partition is being deleted and the sysfs attribute is
> being read at the same time, correct? In that case, just return
> -ENODEV, and be done with it.
>
> -- Steve

That is actually what the patch is trying to do by checking for the
deletion flag in the mutex_trylock loop. Please note that a mutex does
not guarantee FIFO ordering of lock acquisition. As a result, cpu1 may
call mutex_lock() and wait for it while cpu2 sets the deletion flag
later and still gets the mutex before cpu1. So checking for the deletion
flag before taking the mutex isn't enough.

Cheers,
Longman




[PATCH] blktrace: Fix potentail deadlock between delete & sysfs ops

2017-08-10 Thread Waiman Long
The lockdep code had reported the following unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(s_active#228);
                               lock(&bdev->bd_mutex/1);
                               lock(s_active#228);
  lock(&bdev->bd_mutex);

 *** DEADLOCK ***

The deadlock may happen when one task (CPU1) is trying to delete
a partition in a block device and another task (CPU0) is accessing
tracing sysfs file in that partition.

To avoid that, accessing tracing sysfs file will now use a mutex
trylock loop and the operation will fail if a delete operation is
in progress.

Signed-off-by: Waiman Long <long...@redhat.com>
---
 block/ioctl.c           |  3 +++
 include/linux/fs.h      |  1 +
 kernel/trace/blktrace.c | 24 ++++++++++++++++++++++--
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 0de02ee..7211608 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -86,12 +86,15 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
return -EBUSY;
}
/* all seems OK */
+   bdev->bd_deleting = 1;
fsync_bdev(bdevp);
invalidate_bdev(bdevp);
 
 		mutex_lock_nested(&bdev->bd_mutex, 1);
 		delete_partition(disk, partno);
 		mutex_unlock(&bdev->bd_mutex);
+		bdev->bd_deleting = 0;
+
 		mutex_unlock(&bdevp->bd_mutex);
bdput(bdevp);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7b5d681..5d4ae9d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -427,6 +427,7 @@ struct block_device {
 #endif
 	struct block_device *	bd_contains;
 	unsigned		bd_block_size;
+	int			bd_deleting;
 	struct hd_struct *	bd_part;
 	/* number of times partitions within this device have been opened. */
 	unsigned		bd_part_count;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..715e77e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1605,6 +1605,18 @@ static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
return bdev_get_queue(bdev);
 }
 
+/*
+ * Read/write to the tracing sysfs file requires taking references to the
+ * sysfs file and then acquiring the bd_mutex. Deleting a block device
+ * requires acquiring the bd_mutex and then waiting for all the sysfs
+ * references to be gone. This can lead to deadlock if both operations
+ * happen simultaneously. To avoid this problem, read/write to the
+ * tracing sysfs files can now fail if the bd_mutex cannot be
+ * acquired while a deletion operation is in progress.
+ *
+ * A mutex trylock loop is used assuming that tracing sysfs operations
+ * aren't frequent enough to cause any contention problem.
+ */
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 struct device_attribute *attr,
 char *buf)
@@ -1622,7 +1634,11 @@ static ssize_t sysfs_blk_trace_attr_show(struct device 
*dev,
if (q == NULL)
goto out_bdput;
 
-   mutex_lock(>bd_mutex);
+   while (!mutex_trylock(>bd_mutex)) {
+   if (bdev->bd_deleting)
+   goto out_bdput;
+   schedule();
+   }
 
 	if (attr == &dev_attr_enable) {
 		ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1683,7 +1699,11 @@ static ssize_t sysfs_blk_trace_attr_store(struct device 
*dev,
if (q == NULL)
goto out_bdput;
 
-   mutex_lock(>bd_mutex);
+   while (!mutex_trylock(>bd_mutex)) {
+   if (bdev->bd_deleting)
+   goto out_bdput;
+   schedule();
+   }
 
 	if (attr == &dev_attr_enable) {
 		if (value)
-- 
1.8.3.1