Re: [PATCH 4/9] virtio_pci: simplify MSI-X setup

2017-02-02 Thread Jason Wang



On 2017年01月27日 16:16, Christoph Hellwig wrote:

Try to grab the MSI-X vectors early and fall back to the shared one
before doing lots of allocations.

Signed-off-by: Christoph Hellwig 
---


Reviewed-by: Jason Wang 


  drivers/virtio/virtio_pci_common.c | 58 +++---
  1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 9c4ad7d3f..74ff74c0 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -142,32 +142,39 @@ void vp_del_vqs(struct virtio_device *vdev)
  }
  
  static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,

- struct virtqueue *vqs[],
- vq_callback_t *callbacks[],
- const char * const names[],
- bool per_vq_vectors)
+   struct virtqueue *vqs[], vq_callback_t *callbacks[],
+   const char * const names[])
  {
struct virtio_pci_device *vp_dev = to_vp_device(vdev);
const char *name = dev_name(_dev->vdev.dev);
int i, err = -ENOMEM, allocated_vectors, nvectors;
+   bool shared = false;
u16 msix_vec;
  
-	if (per_vq_vectors) {

-   /* Best option: one for change interrupt, one per vq. */
-   nvectors = 1;
-   for (i = 0; i < nvqs; ++i)
-   if (callbacks[i])
-   ++nvectors;
-   } else {
-   /* Second best: one for change, shared for all vqs. */
+   nvectors = 1;
+   for (i = 0; i < nvqs; i++) {
+   if (callbacks[i])
+   nvectors++;
+   }
+
+   /* Try one vector per queue first. */
+   err = pci_alloc_irq_vectors(vp_dev->pci_dev, nvectors, nvectors,
+   PCI_IRQ_MSIX);
+   if (err < 0) {
+   /* Fallback to one vector for config, one shared for queues. */
+   shared = true;
nvectors = 2;
+   err = pci_alloc_irq_vectors(vp_dev->pci_dev, nvectors, nvectors,
+   PCI_IRQ_MSIX);
+   if (err < 0)
+   return err;
}
  
  	vp_dev->msix_vectors = nvectors;

vp_dev->msix_names = kmalloc_array(nvectors,
sizeof(*vp_dev->msix_names), GFP_KERNEL);
if (!vp_dev->msix_names)
-   return err;
+   goto out_free_irq_vectors;
  
  	vp_dev->msix_affinity_masks = kcalloc(nvectors,

sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL);
@@ -180,18 +187,13 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, 
unsigned nvqs,
goto out_free_msix_affinity_masks;
}
  
-	err = pci_alloc_irq_vectors(vp_dev->pci_dev, nvectors, nvectors,

-   PCI_IRQ_MSIX);
-   if (err < 0)
-   goto out_free_msix_affinity_masks;
-
/* Set the vector used for configuration */
snprintf(vp_dev->msix_names[0], sizeof(*vp_dev->msix_names),
 "%s-config", name);
err = request_irq(pci_irq_vector(vp_dev->pci_dev, 0), vp_config_changed,
0, vp_dev->msix_names[0], vp_dev);
if (err)
-   goto out_free_irq_vectors;
+   goto out_free_msix_affinity_masks;
  
  	/* Verify we had enough resources to assign the vector */

if (vp_dev->config_vector(vp_dev, 0) == VIRTIO_MSI_NO_VECTOR) {
@@ -241,7 +243,11 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, 
unsigned nvqs,
}
vp_dev->msix_vector_map[i] = msix_vec;
  
-		if (per_vq_vectors)

+   /*
+* Use a different vector for each queue if they are available,
+* else share the same vector for all VQs.
+*/
+   if (!shared)
allocated_vectors++;
}
  
@@ -254,8 +260,6 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,

vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR);
  out_free_config_irq:
free_irq(pci_irq_vector(vp_dev->pci_dev, 0), vp_dev);
-out_free_irq_vectors:
-   pci_free_irq_vectors(vp_dev->pci_dev);
  out_free_msix_affinity_masks:
for (i = 0; i < nvectors; i++) {
if (vp_dev->msix_affinity_masks[i])
@@ -264,6 +268,8 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, 
unsigned nvqs,
kfree(vp_dev->msix_affinity_masks);
  out_free_msix_names:
kfree(vp_dev->msix_names);
+out_free_irq_vectors:
+   pci_free_irq_vectors(vp_dev->pci_dev);
return err;
  }
  
@@ -308,15 +314,9 @@ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,

  {
int err;
  
-	/* Try MSI-X with one vector per queue. */

-   err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, 

[REGRESSION v4.10-rc1] blkdev_issue_zeroout() returns -EREMOTEIO on the first call for SCSI device that doesn't support WRITE SAME

2017-02-02 Thread Junichi Nomura
I found following ext4 error occurs on a certain storage since v4.10-rc1:
  EXT4-fs (sdc1): Delayed block allocation failed for inode 12 at logical 
offset 100 with max blocks 2 with error 121
  EXT4-fs (sdc1): This should not happen!! Data will be lost

Error 121 (EREMOTEIO) was returned from blkdev_issue_zeroout().
That came from sd driver because WRITE SAME was sent to the device
which didn't support it.

The problem was introduced by commit e73c23ff736e ("block: add async
variant of blkdev_issue_zeroout"). Before the commit, blkdev_issue_zeroout
fell back to normal zero writing when WRITE SAME failed and it seems
sd driver's heuristics depends on that behaviour.

Below is a band-aid fix to restore the fallback behaviour for sd. Although
there should be better fix as retrying blindly is not a good idea...

v4.10-rc6:
  # cat /sys/block/sdc/queue/write_same_max_bytes
  33553920
  # fallocate -v -z -l 512 /dev/sdc1
  fallocate: fallocate failed: Remote I/O error
  # cat /sys/block/sdc/queue/write_same_max_bytes
  0
  # fallocate -v -z -l 512 /dev/sdc1
  # echo $?
  0

v4.9 or v4.10-rc6 + this patch:
  # grep . /sys/block/sdc/queue/write_same_max_bytes
  33553920
  # fallocate -v -z -l 512 /dev/sdc1
  # echo $?
  0
  # grep . /sys/block/sdc/queue/write_same_max_bytes
  0

diff --git a/block/blk-lib.c b/block/blk-lib.c
index f8c82a9..8e53474 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -360,6 +360,7 @@ int blkdev_issue_zeroout(struct block_device *bdev, 
sector_t sector,
 sector_t nr_sects, gfp_t gfp_mask, bool discard)
 {
int ret;
+   int pass = 0;
struct bio *bio = NULL;
struct blk_plug plug;
 
@@ -369,6 +370,7 @@ int blkdev_issue_zeroout(struct block_device *bdev, 
sector_t sector,
return 0;
}
 
+  retry_other_method:
blk_start_plug();
ret = __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
, discard);
@@ -378,6 +380,11 @@ int blkdev_issue_zeroout(struct block_device *bdev, 
sector_t sector,
}
blk_finish_plug();
 
+   if (ret && pass++ == 0) {
+   bio = NULL;
+   goto retry_other_method;
+   }
+
return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
-- 
Jun'ichi Nomura, NEC Corporation / NEC Solution Innovators, Ltd.--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/9] virtio_pci: don't duplicate the msix_enable flag in struct pci_dev

2017-02-02 Thread Jason Wang



On 2017年01月27日 16:16, Christoph Hellwig wrote:

Signed-off-by: Christoph Hellwig 
---


Reviewed-by: Jason Wang 


  drivers/virtio/virtio_pci_common.c | 5 ++---
  drivers/virtio/virtio_pci_common.h | 2 --
  drivers/virtio/virtio_pci_legacy.c | 2 +-
  drivers/virtio/virtio_pci_modern.c | 2 +-
  include/uapi/linux/virtio_pci.h| 2 +-
  5 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 5880e86..9c4ad7d3f 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -125,7 +125,7 @@ void vp_del_vqs(struct virtio_device *vdev)
  
  	vp_remove_vqs(vdev);
  
-	if (vp_dev->msix_enabled) {

+   if (vp_dev->pci_dev->msix_enabled) {
for (i = 0; i < vp_dev->msix_vectors; i++)
free_cpumask_var(vp_dev->msix_affinity_masks[i]);
  
@@ -245,7 +245,6 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs,

allocated_vectors++;
}
  
-	vp_dev->msix_enabled = 1;

return 0;
  
  out_remove_vqs:

@@ -341,7 +340,7 @@ int vp_set_vq_affinity(struct virtqueue *vq, int cpu)
if (!vq->callback)
return -EINVAL;
  
-	if (vp_dev->msix_enabled) {

+   if (vp_dev->pci_dev->msix_enabled) {
int vec = vp_dev->msix_vector_map[vq->index];
struct cpumask *mask = vp_dev->msix_affinity_masks[vec];
unsigned int irq = pci_irq_vector(vp_dev->pci_dev, vec);
diff --git a/drivers/virtio/virtio_pci_common.h 
b/drivers/virtio/virtio_pci_common.h
index 8559386..217ca87 100644
--- a/drivers/virtio/virtio_pci_common.h
+++ b/drivers/virtio/virtio_pci_common.h
@@ -64,8 +64,6 @@ struct virtio_pci_device {
/* the IO mapping for the PCI config space */
void __iomem *ioaddr;
  
-	/* MSI-X support */

-   int msix_enabled;
cpumask_var_t *msix_affinity_masks;
/* Name strings for interrupts. This size should be enough,
 * and I'm too lazy to allocate each name separately. */
diff --git a/drivers/virtio/virtio_pci_legacy.c 
b/drivers/virtio/virtio_pci_legacy.c
index 47292da..2ab6aee 100644
--- a/drivers/virtio/virtio_pci_legacy.c
+++ b/drivers/virtio/virtio_pci_legacy.c
@@ -165,7 +165,7 @@ static void del_vq(struct virtqueue *vq)
  
  	iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
  
-	if (vp_dev->msix_enabled) {

+   if (vp_dev->pci_dev->msix_enabled) {
iowrite16(VIRTIO_MSI_NO_VECTOR,
  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
/* Flush the write out to device */
diff --git a/drivers/virtio/virtio_pci_modern.c 
b/drivers/virtio/virtio_pci_modern.c
index 00e6fc1..e5ce310 100644
--- a/drivers/virtio/virtio_pci_modern.c
+++ b/drivers/virtio/virtio_pci_modern.c
@@ -412,7 +412,7 @@ static void del_vq(struct virtqueue *vq)
  
  	vp_iowrite16(vq->index, _dev->common->queue_select);
  
-	if (vp_dev->msix_enabled) {

+   if (vp_dev->pci_dev->msix_enabled) {
vp_iowrite16(VIRTIO_MSI_NO_VECTOR,
 _dev->common->queue_msix_vector);
/* Flush the write out to device */
diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h
index 90007a1..15b4385 100644
--- a/include/uapi/linux/virtio_pci.h
+++ b/include/uapi/linux/virtio_pci.h
@@ -79,7 +79,7 @@
   * configuration space */
  #define VIRTIO_PCI_CONFIG_OFF(msix_enabled)   ((msix_enabled) ? 24 : 20)
  /* Deprecated: please use VIRTIO_PCI_CONFIG_OFF instead */
-#define VIRTIO_PCI_CONFIG(dev) VIRTIO_PCI_CONFIG_OFF((dev)->msix_enabled)
+#define VIRTIO_PCI_CONFIG(dev) 
VIRTIO_PCI_CONFIG_OFF((dev)->pci_dev->msix_enabled)
  
  /* Virtio ABI version, this must match exactly */

  #define VIRTIO_PCI_ABI_VERSION0


--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/9] virtio_pci: remove struct virtio_pci_vq_info

2017-02-02 Thread Jason Wang



On 2017年01月27日 16:16, Christoph Hellwig wrote:

We don't really need struct virtio_pci_vq_info, as most field in there
are redundant:

  - the vq backpointer is not strictly neede to start with
  - the entry in the vqs list is not needed - the generic virtqueue already
has list, we only need to check if it has a callback to get the same
semantics
  - we can use a simple array to look up the MSI-X vec if needed.
  - That simple array now also duoble serves to replace the per_vq_vectors
flag

Signed-off-by: Christoph Hellwig 
---
  drivers/virtio/virtio_pci_common.c | 117 +++--
  drivers/virtio/virtio_pci_common.h |  25 +---
  drivers/virtio/virtio_pci_legacy.c |   6 +-
  drivers/virtio/virtio_pci_modern.c |   6 +-
  4 files changed, 39 insertions(+), 115 deletions(-)

diff --git a/drivers/virtio/virtio_pci_common.c 
b/drivers/virtio/virtio_pci_common.c
index 186cbab..a3376731 100644
--- a/drivers/virtio/virtio_pci_common.c
+++ b/drivers/virtio/virtio_pci_common.c
@@ -62,16 +62,13 @@ static irqreturn_t vp_config_changed(int irq, void *opaque)
  static irqreturn_t vp_vring_interrupt(int irq, void *opaque)
  {
struct virtio_pci_device *vp_dev = opaque;
-   struct virtio_pci_vq_info *info;
irqreturn_t ret = IRQ_NONE;
-   unsigned long flags;
+   struct virtqueue *vq;
  
-	spin_lock_irqsave(_dev->lock, flags);

-   list_for_each_entry(info, _dev->virtqueues, node) {
-   if (vring_interrupt(irq, info->vq) == IRQ_HANDLED)
+   list_for_each_entry(vq, _dev->vdev.vqs, list) {
+   if (vq->callback && vring_interrupt(irq, vq) == IRQ_HANDLED)


The check of vq->callback seems redundant, we will check it soon in 
vring_interrupt().


Thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/9] virtio_pci: use shared interrupts for virtqueues

2017-02-02 Thread Jason Wang



On 2017年01月27日 16:16, Christoph Hellwig wrote:

+   snprintf(vp_dev->msix_names[i + 1],
+sizeof(*vp_dev->msix_names), "%s-%s",
 dev_name(_dev->vdev.dev), names[i]);
err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec),
- vring_interrupt, 0,
- vp_dev->msix_names[msix_vec],
- vqs[i]);
+ vring_interrupt, IRQF_SHARED,
+ vp_dev->msix_names[i + 1], vqs[i]);


Do we need to check per_vq_vectors before dereferencing msix_names[i + 1] ?

Thanks
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: block for-next: zram build error

2017-02-02 Thread Jens Axboe
On 02/02/2017 04:40 PM, Bart Van Assche wrote:
> Hello Jens,
> 
> I noticed accidentally that with the for-next branch of the block
> git repository that the zram driver doesn't build anymore:
> 
> 
> $ make M=drivers/block/zram
>   LD  drivers/block/zram/built-in.o
>   CC [M]  drivers/block/zram/zcomp.o
>   CC [M]  drivers/block/zram/zram_drv.o
> drivers/block/zram/zram_drv.c: In function ‘zram_revalidate_disk’:
> drivers/block/zram/zram_drv.c:120:37: error: 
> ‘zram->disk->queue->backing_dev_info’ is a pointer; did you mean to use ‘->’?
>   zram->disk->queue->backing_dev_info.capabilities |=
>  ^
>  ->
> make[1]: *** [scripts/Makefile.build:294: drivers/block/zram/zram_drv.o] 
> Error 1
> make: *** [Makefile:1490: _module_drivers/block/zram] Error 2
> 
> Compilation exited abnormally with code 2 at Thu Feb  2 15:37:19
> 
> 
> I have not yet tried to figure out which commit broke the build.

Thanks for catching that, Bart. I fixed it here after your report:

http://git.kernel.dk/cgit/linux-block/commit/?h=for-4.11/next=e17354961bb50931ec7b33f59c0713dcf98ac7d2

It's caused by commit:

commit dc3b17cc8bf21307c7e076e7c778d5db756f7871
Author: Jan Kara 
Date:   Thu Feb 2 15:56:50 2017 +0100

block: Use pointer to backing_dev_info from request_queue

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] blk-mq-sched: bypass the scheduler for flushes entirely

2017-02-02 Thread Omar Sandoval
From: Omar Sandoval 

There's a weird inconsistency that flushes are mostly hidden from the
scheduler, but it needs to be aware of them in ->insert_requests().
Instead of having every scheduler call blk_mq_sched_bypass_insert(),
let's do it in the common framework.

Signed-off-by: Omar Sandoval 
---
Patch is based on block/for-next.

 block/blk-mq-sched.c | 25 +++--
 block/blk-mq-sched.h |  1 -
 block/mq-deadline.c  |  3 ---
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 114814ec3d49..3ec52f494094 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -289,7 +289,8 @@ void blk_mq_sched_request_inserted(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
 
-bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
+  struct request *rq)
 {
if (rq->tag == -1) {
rq->rq_flags |= RQF_SORTED;
@@ -305,7 +306,6 @@ bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, 
struct request *rq)
spin_unlock(>lock);
return true;
 }
-EXPORT_SYMBOL_GPL(blk_mq_sched_bypass_insert);
 
 static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
@@ -363,6 +363,9 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
return;
}
 
+   if (e && blk_mq_sched_bypass_insert(hctx, rq))
+   goto run;
+
if (e && e->type->ops.mq.insert_requests) {
LIST_HEAD(list);
 
@@ -374,6 +377,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool 
at_head,
spin_unlock(>lock);
}
 
+run:
if (run_queue)
blk_mq_run_hw_queue(hctx, async);
 }
@@ -385,6 +389,23 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
struct elevator_queue *e = hctx->queue->elevator;
 
+   if (e) {
+   struct request *rq, *next;
+
+   /*
+* We bypass requests that already have a driver tag assigned,
+* which should only be flushes. Flushes are only ever inserted
+* as single requests, so we shouldn't ever hit the
+* WARN_ON_ONCE() below (but let's handle it just in case).
+*/
+   list_for_each_entry_safe(rq, next, list, queuelist) {
+   if (WARN_ON_ONCE(rq->tag != -1)) {
+   list_del_init(>queuelist);
+   blk_mq_sched_bypass_insert(hctx, rq);
+   }
+   }
+   }
+
if (e && e->type->ops.mq.insert_requests)
e->type->ops.mq.insert_requests(hctx, list, false);
else
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 9478aaeb48c5..add5f090a8cd 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -15,7 +15,6 @@ struct request *blk_mq_sched_get_request(struct request_queue 
*q, struct bio *bi
 void blk_mq_sched_put_request(struct request *rq);
 
 void blk_mq_sched_request_inserted(struct request *rq);
-bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, struct request 
*rq);
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request 
*rq);
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 49583536698c..8f91f21e8663 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -395,9 +395,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, 
struct request *rq,
 
blk_mq_sched_request_inserted(rq);
 
-   if (blk_mq_sched_bypass_insert(hctx, rq))
-   return;
-
if (at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(>queuelist, >dispatch);
-- 
2.11.0

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


block for-next: zram build error

2017-02-02 Thread Bart Van Assche
Hello Jens,

I noticed accidentally that with the for-next branch of the block
git repository that the zram driver doesn't build anymore:


$ make M=drivers/block/zram
  LD  drivers/block/zram/built-in.o
  CC [M]  drivers/block/zram/zcomp.o
  CC [M]  drivers/block/zram/zram_drv.o
drivers/block/zram/zram_drv.c: In function ‘zram_revalidate_disk’:
drivers/block/zram/zram_drv.c:120:37: error: 
‘zram->disk->queue->backing_dev_info’ is a pointer; did you mean to use ‘->’?
  zram->disk->queue->backing_dev_info.capabilities |=
 ^
 ->
make[1]: *** [scripts/Makefile.build:294: drivers/block/zram/zram_drv.o] Error 1
make: *** [Makefile:1490: _module_drivers/block/zram] Error 2

Compilation exited abnormally with code 2 at Thu Feb  2 15:37:19


I have not yet tried to figure out which commit broke the build.

Bart.

Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Bart Van Assche
On Thu, 2017-02-02 at 16:04 -0500, Mike Snitzer wrote:
> Any progress on getting this to work without requiring infiniband HW?

Hello Mike,

Intructions for running these tests over SoftRoCE have been added to
the README.md file in https://github.com/bvanassche/srp-test. However,
I'm not sure the SoftRoCE driver is already stable enough to run these
tests on top of that driver.

Bart.--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-02-02 Thread Jens Axboe
On 02/02/2017 02:15 PM, Paolo Valente wrote:
> 
>> Il giorno 02 feb 2017, alle ore 16:30, Jens Axboe  ha scritto:
>>
>> On 02/02/2017 02:19 AM, Paolo Valente wrote:
>>> The scheme is clear.  One comment, in case it could make sense and
>>> avoid more complexity: since put_rq_priv is invoked in two different
>>> contexts, process or interrupt, I didn't feel so confusing that, when
>>> put_rq_priv is invoked in the context where the lock cannot be held
>>> (unless one is willing to pay with irq disabling all the times), the
>>> lock is not held, while, when invoked in the context where the lock
>>> can be held, the lock is actually held, or must be taken.
>>
>> If you grab the same lock from put_rq_priv, yes, you must make it IRQ
>> disabling in all contexts, and use _irqsave() from put_rq_priv. If it's
>> just freeing resources, you could potentially wait and do that when
>> someone else needs them, since that part will come from proces context.
>> That would need two locks, though.
>>
>> As I said above, I would not worry about the IRQ disabling lock.
>>
> 
> I'm sorry, I focused only on the IRQ-disabling consequence of grabbing
> a scheduler lock also in IRQ context.  I thought it was a serious
> enough issue to avoid this option.  Yet there is also a deadlock
> problem related to this option.  In fact, the IRQ handler may preempt
> some process-context code that already holds some other locks, and, if
> some of these locks are already held by another process, which is
> executing on another CPU and which then tries to take the scheduler
> lock, or which happens to be preempted by an IRQ handler trying to
> grab the scheduler lock, then a deadlock occurs.  This is not just a
> speculation, but a problem that did occur before I moved to a
> deferred-work solution, and that can be readily reproduced.  Before
> moving to a deferred work solution, I tried various code manipulations
> to avoid these deadlocks without resorting to deferred work, but at no
> avail.

There are two important rules here:

1) If a lock is ever used in interrupt context, anyone acquiring it must
   ensure that interrupts gets disabled.

2) If multiple locks are needed, they need to be acquired in the right
   order.

Instead of talking in hypotheticals, be more specific. With the latest
code, the scheduler lock should now be fine, there should be no cases
where you are being invoked with it held. I'm assuming you are running
with lockdep enabled on your kernel? Post the stack traces from your
problem (and your code...), then we can take a look.

Don't punt to deferring work from your put_rq_private() function, that's
a suboptimal work around. It needs to be fixed for real.

> At any rate, bfq seems now to work, so I can finally move from just
> asking questions endlessly, to proposing actual code to discuss on.
> I'm about to: port this version of bfq to your improved/fixed
> blk-mq-sched version in for-4.11 (port postponed, to avoid introducing
> further changes in code that did not yet wok), run more extensive
> tests, polish commits a little bit, and finally share a branch.

Post the code sooner rather than later. There are bound to be things
that need to be improved or fixed up, let's start this process now. The
framework is pretty much buttoned up at this point, so there's time to
shift the attention a bit to a consumer of it.

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-02-02 Thread Paolo Valente

> Il giorno 02 feb 2017, alle ore 16:30, Jens Axboe  ha scritto:
> 
> On 02/02/2017 02:19 AM, Paolo Valente wrote:
>> The scheme is clear.  One comment, in case it could make sense and
>> avoid more complexity: since put_rq_priv is invoked in two different
>> contexts, process or interrupt, I didn't feel so confusing that, when
>> put_rq_priv is invoked in the context where the lock cannot be held
>> (unless one is willing to pay with irq disabling all the times), the
>> lock is not held, while, when invoked in the context where the lock
>> can be held, the lock is actually held, or must be taken.
> 
> If you grab the same lock from put_rq_priv, yes, you must make it IRQ
> disabling in all contexts, and use _irqsave() from put_rq_priv. If it's
> just freeing resources, you could potentially wait and do that when
> someone else needs them, since that part will come from proces context.
> That would need two locks, though.
> 
> As I said above, I would not worry about the IRQ disabling lock.
> 

I'm sorry, I focused only on the IRQ-disabling consequence of grabbing
a scheduler lock also in IRQ context.  I thought it was a serious
enough issue to avoid this option.  Yet there is also a deadlock
problem related to this option.  In fact, the IRQ handler may preempt
some process-context code that already holds some other locks, and, if
some of these locks are already held by another process, which is
executing on another CPU and which then tries to take the scheduler
lock, or which happens to be preempted by an IRQ handler trying to
grab the scheduler lock, then a deadlock occurs.  This is not just a
speculation, but a problem that did occur before I moved to a
deferred-work solution, and that can be readily reproduced.  Before
moving to a deferred work solution, I tried various code manipulations
to avoid these deadlocks without resorting to deferred work, but at no
avail.

At any rate, bfq seems now to work, so I can finally move from just
asking questions endlessly, to proposing actual code to discuss on.
I'm about to: port this version of bfq to your improved/fixed
blk-mq-sched version in for-4.11 (port postponed, to avoid introducing
further changes in code that did not yet wok), run more extensive
tests, polish commits a little bit, and finally share a branch.

Thanks,
Paolo

> -- 
> Jens Axboe
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Mike Snitzer
On Thu, Feb 02 2017 at  4:04pm -0500,
Mike Snitzer  wrote:

> On Thu, Feb 02 2017 at  2:46pm -0500,
> Bart Van Assche  wrote:
> 
> > On Thu, 2017-02-02 at 14:13 -0500, Mike Snitzer wrote:
> > > On Thu, Feb 02 2017 at  1:43pm -0500, Bart Van Assche 
> > >  wrote:
> > > > On Thu, 2017-02-02 at 13:33 -0500, Mike Snitzer wrote:
> > > > > I'll go back over hch's changes to see if I can spot anything.  But is
> > > > > this testing using dm_mod.use_bk_mq=Y or are you testing old 
> > > > > .request_fn
> > > > > dm-multipath?
> > > > 
> > > > The srp-test software tests multiple configurations: dm-mq on scsi-mq, 
> > > > dm-sq
> > > > on scsi-mq and dm-sq on scsi-sq. I have not yet checked which of these
> > > > three configurations triggers the kernel crash.
> > > 
> > > OK, such info is important to provide for crashes like this.  Please let
> > > me know once you do.
> > 
> > Hello Mike,
> > 
> > Apparently it's the large I/O test (using dm-mq on scsi-mq) that triggers 
> > the
> > crash:
> 
> I've gone over Christoph's "dm: always defer request allocation to the
> owner of the request_queue" commit yet again.  Most of that commit's
> changes are just mechanical.  I didn't see any problems.
> 
> In general, dm_start_request() calls dm_get(md) to take a reference on
> the mapped_device.  And rq_completed() calls dm_put(md) to drop the
> reference.  The DM device's request_queue (md->queue) should _not_ ever
> be torn down before all references on the md have been dropped. But I'll
> have to look closer on how/if that is enforced anywhere by coordinating
> with block core.
> 
> In any case, the crash you reported was that the mapped_device was being
> dereferenced after it was freed (at line 187's md->queue).  Which seems
> to imply a dm_get/dm_put reference count regression.  But I'm not seeing
> where at this point.

Maybe it isn't a regression but something about Christoph's changes
causes a race to present itself?

Care to try moving the dm_get(md) at the end of dm_start_request() to
the beginning of dm_start_request() and report back on whether it helps
at all?

Thanks,
Mike
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Mike Snitzer
On Thu, Feb 02 2017 at  2:46pm -0500,
Bart Van Assche  wrote:

> On Thu, 2017-02-02 at 14:13 -0500, Mike Snitzer wrote:
> > On Thu, Feb 02 2017 at  1:43pm -0500, Bart Van Assche 
> >  wrote:
> > > On Thu, 2017-02-02 at 13:33 -0500, Mike Snitzer wrote:
> > > > I'll go back over hch's changes to see if I can spot anything.  But is
> > > > this testing using dm_mod.use_bk_mq=Y or are you testing old .request_fn
> > > > dm-multipath?
> > > 
> > > The srp-test software tests multiple configurations: dm-mq on scsi-mq, 
> > > dm-sq
> > > on scsi-mq and dm-sq on scsi-sq. I have not yet checked which of these
> > > three configurations triggers the kernel crash.
> > 
> > OK, such info is important to provide for crashes like this.  Please let
> > me know once you do.
> 
> Hello Mike,
> 
> Apparently it's the large I/O test (using dm-mq on scsi-mq) that triggers the
> crash:

I've gone over Christoph's "dm: always defer request allocation to the
owner of the request_queue" commit yet again.  Most of that commit's
changes are just mechanical.  I didn't see any problems.

In general, dm_start_request() calls dm_get(md) to take a reference on
the mapped_device.  And rq_completed() calls dm_put(md) to drop the
reference.  The DM device's request_queue (md->queue) should _not_ ever
be torn down before all references on the md have been dropped. But I'll
have to look closer on how/if that is enforced anywhere by coordinating
with block core.

In any case, the crash you reported was that the mapped_device was being
dereferenced after it was freed (at line 187's md->queue).  Which seems
to imply a dm_get/dm_put reference count regression.  But I'm not seeing
where at this point.

> # ~bart/software/infiniband/srp-test/run_tests -r 10
> [ ... ]
> Test /home/bart/software/infiniband/srp-test/tests/02-sq-on-mq succeeded
> Running test /home/bart/software/infiniband/srp-test/tests/03 ...
> Test large transfer sizes with cmd_sg_entries=255
> removing /dev/mapper/mpatht: [ CRASH ]
> 
> The source code of the test I ran is available at
> https://github.com/bvanassche/srp-test.

Any progress on getting this to work without requiring infiniband HW?
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 22/24] ubifs: Convert to separately allocated bdi

2017-02-02 Thread Richard Weinberger
Jan,

Am 02.02.2017 um 18:34 schrieb Jan Kara:
> Allocate struct backing_dev_info separately instead of embedding it
> inside the superblock. This unifies handling of bdi among users.
> 
> CC: Richard Weinberger 
> CC: Artem Bityutskiy 
> CC: Adrian Hunter 
> CC: linux-...@lists.infradead.org
> Signed-off-by: Jan Kara 

Is this series available at some git tree, please?

Thanks,
//richard
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Bart Van Assche
On Thu, 2017-02-02 at 14:13 -0500, Mike Snitzer wrote:
> On Thu, Feb 02 2017 at  1:43pm -0500, Bart Van Assche 
>  wrote:
> > On Thu, 2017-02-02 at 13:33 -0500, Mike Snitzer wrote:
> > > I'll go back over hch's changes to see if I can spot anything.  But is
> > > this testing using dm_mod.use_bk_mq=Y or are you testing old .request_fn
> > > dm-multipath?
> > 
> > The srp-test software tests multiple configurations: dm-mq on scsi-mq, dm-sq
> > on scsi-mq and dm-sq on scsi-sq. I have not yet checked which of these
> > three configurations triggers the kernel crash.
> 
> OK, such info is important to provide for crashes like this.  Please let
> me know once you do.

Hello Mike,

Apparently it's the large I/O test (using dm-mq on scsi-mq) that triggers the
crash:

# ~bart/software/infiniband/srp-test/run_tests -r 10
[ ... ]
Test /home/bart/software/infiniband/srp-test/tests/02-sq-on-mq succeeded
Running test /home/bart/software/infiniband/srp-test/tests/03 ...
Test large transfer sizes with cmd_sg_entries=255
removing /dev/mapper/mpatht: [ CRASH ]

The source code of the test I ran is available at
https://github.com/bvanassche/srp-test.

Bart.--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 04/24] fs: Provide infrastructure for dynamic BDIs in filesystems

2017-02-02 Thread Liu Bo
Hi,

On Thu, Feb 02, 2017 at 06:34:02PM +0100, Jan Kara wrote:
> Provide helper functions for setting up dynamically allocated
> backing_dev_info structures for filesystems and cleaning them up on
> superblock destruction.

Just one concern, will this cause problems for multiple superblock cases
like nfs with nosharecache?

Thanks,

-liubo

> 
> CC: linux-...@lists.infradead.org
> CC: linux-...@vger.kernel.org
> CC: Petr Vandrovec 
> CC: linux-ni...@vger.kernel.org
> CC: cluster-de...@redhat.com
> CC: osd-...@open-osd.org
> CC: codal...@coda.cs.cmu.edu
> CC: linux-...@lists.infradead.org
> CC: ecryp...@vger.kernel.org
> CC: linux-c...@vger.kernel.org
> CC: ceph-de...@vger.kernel.org
> CC: linux-bt...@vger.kernel.org
> CC: v9fs-develo...@lists.sourceforge.net
> CC: lustre-de...@lists.lustre.org
> Signed-off-by: Jan Kara 
> ---
>  fs/super.c   | 49 
> 
>  include/linux/backing-dev-defs.h |  2 +-
>  include/linux/fs.h   |  6 +
>  3 files changed, 56 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index ea662b0e5e78..31dc4c6450ef 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -446,6 +446,11 @@ void generic_shutdown_super(struct super_block *sb)
>   hlist_del_init(>s_instances);
>   spin_unlock(_lock);
>   up_write(>s_umount);
> + if (sb->s_iflags & SB_I_DYNBDI) {
> + bdi_put(sb->s_bdi);
> + sb->s_bdi = _backing_dev_info;
> + sb->s_iflags &= ~SB_I_DYNBDI;
> + }
>  }
>  
>  EXPORT_SYMBOL(generic_shutdown_super);
> @@ -1249,6 +1254,50 @@ mount_fs(struct file_system_type *type, int flags, 
> const char *name, void *data)
>  }
>  
>  /*
> + * Setup private BDI for given superblock. I gets automatically cleaned up
> + * in generic_shutdown_super().
> + */
> +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
> +{
> + struct backing_dev_info *bdi;
> + int err;
> + va_list args;
> +
> + bdi = bdi_alloc(GFP_KERNEL);
> + if (!bdi)
> + return -ENOMEM;
> +
> + bdi->name = sb->s_type->name;
> +
> + va_start(args, fmt);
> + err = bdi_register_va(bdi, NULL, fmt, args);
> + va_end(args);
> + if (err) {
> + bdi_put(bdi);
> + return err;
> + }
> + WARN_ON(sb->s_bdi != _backing_dev_info);
> + sb->s_bdi = bdi;
> + sb->s_iflags |= SB_I_DYNBDI;
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(super_setup_bdi_name);
> +
> +/*
> + * Setup private BDI for given superblock. I gets automatically cleaned up
> + * in generic_shutdown_super().
> + */
> +int super_setup_bdi(struct super_block *sb)
> +{
> + static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
> +
> + return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
> + atomic_long_inc_return(_seq));
> +}
> +EXPORT_SYMBOL(super_setup_bdi);
> +
> +/*
>   * This is an internal function, please use sb_end_{write,pagefault,intwrite}
>   * instead.
>   */
> diff --git a/include/linux/backing-dev-defs.h 
> b/include/linux/backing-dev-defs.h
> index 2ecafc8a2d06..70080b4217f4 100644
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -143,7 +143,7 @@ struct backing_dev_info {
>   congested_fn *congested_fn; /* Function pointer if device is md/dm */
>   void *congested_data;   /* Pointer to aux data for congested func */
>  
> - char *name;
> + const char *name;
>  
>   struct kref refcnt; /* Reference counter for the structure */
>   unsigned int registered:1;  /* Is bdi registered? */
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c930cbc19342..8ed8b6d1bc54 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1267,6 +1267,9 @@ struct mm_struct;
>  /* sb->s_iflags to limit user namespace mounts */
>  #define SB_I_USERNS_VISIBLE  0x0010 /* fstype already mounted */
>  
> +/* Temporary flag until all filesystems are converted to dynamic bdis */
> +#define SB_I_DYNBDI  0x0100
> +
>  /* Possible states of 'frozen' field */
>  enum {
>   SB_UNFROZEN = 0,/* FS is unfrozen */
> @@ -2103,6 +2106,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
>  extern int freeze_super(struct super_block *super);
>  extern int thaw_super(struct super_block *super);
>  extern bool our_mnt(struct vfsmount *mnt);
> +extern __printf(2, 3)
> +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
> +extern int super_setup_bdi(struct super_block *sb);
>  
>  extern int current_umask(void);
>  
> -- 
> 2.10.2
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Mike Snitzer
On Thu, Feb 02 2017 at  1:43pm -0500,
Bart Van Assche  wrote:

> On Thu, 2017-02-02 at 13:33 -0500, Mike Snitzer wrote:
> > I'll go back over hch's changes to see if I can spot anything.  But is
> > this testing using dm_mod.use_bk_mq=Y or are you testing old .request_fn
> > dm-multipath?
> 
> Hello Mike,
> 
> The srp-test software tests multiple configurations: dm-mq on scsi-mq, dm-sq
> on scsi-mq and dm-sq on scsi-sq. I have not yet checked which of these
> three configurations triggers the kernel crash.

OK, such info is important to provide for crashes like this.  Please let
me know once you do.

Thanks,
Mike
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Bart Van Assche
On Thu, 2017-02-02 at 13:33 -0500, Mike Snitzer wrote:
> I'll go back over hch's changes to see if I can spot anything.  But is
> this testing using dm_mod.use_bk_mq=Y or are you testing old .request_fn
> dm-multipath?

Hello Mike,

The srp-test software tests multiple configurations: dm-mq on scsi-mq, dm-sq
on scsi-mq and dm-sq on scsi-sq. I have not yet checked which of these
three configurations triggers the kernel crash.

Bart.--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: split scsi passthrough fields out of struct request V2

2017-02-02 Thread Mike Snitzer
On Thu, Feb 02 2017 at 12:27pm -0500,
Bart Van Assche  wrote:

> On Wed, 2017-02-01 at 22:01 +, Bart Van Assche wrote:
> > However, a new issue shows up sporadically, an issue that I had not yet seen
> > during any test with a kernel tree from Linus:
> >
> > [  227.613440] general protection fault:  [#1] SMP
> > [  227.613495] Modules linked in: dm_service_time ib_srp scsi_transport_srp 
> > target_core_user uio target_core_pscsi target_core_file ib_srpt 
> > target_core_iblock target_core_mod brd netconsole xt_CHECKSUM 
> > iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat 
> > nf_nat_ipv4 nf_nat libcrc32c nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack 
> > nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc 
> > ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter ip_tables 
> > x_tables af_packet ib_ipoib msr rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm 
> > configfs ib_cm iw_cm mlx4_ib ib_core sb_edac edac_core x86_pkg_temp_thermal 
> > intel_powerclamp coretemp kvm_intel ipmi_ssif kvm irqbypass 
> > crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel pcbc tg3 
> > aesni_intel iTCO_wdt mlx4_core ptp iTCO_vendor_support dcdbas aes_x86_64 
> > crypto_simd glue_helper pps_core cryptd pcspkr devlink ipmi_si libphy 
> > ipmi_devintf fjes ipmi_msghandler tpm_tis tpm_tis_core lpc_ich mei_me 
> > mfd_core mei shpchp wmi tpm button hid_generic usbhid mgag200 i2c_algo_bit 
> > drm_kms_helper syscopyarea sysfillrect sr_mod sysimgblt fb_sys_fops cdrom 
> > ttm drm ehci_pci ehci_hcd usbcore usb_common sg dm_multipath dm_mod 
> > scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
> > [  227.613774] CPU: 3 PID: 28 Comm: ksoftirqd/3 Not tainted 4.10.0-rc5-dbg+ 
> > #1
> > [  227.613840] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 
> > 11/17/2014
> > [  227.613893] task: 880172a249c0 task.stack: c90001aa8000
> > [  227.613932] RIP: 0010:rq_completed+0x12/0x90 [dm_mod]
> > [  227.613965] RSP: 0018:c90001aabda8 EFLAGS: 00010246
> > [  227.614006] RAX:  RBX: 6b6b6b6b6b6b6b6b RCX: 
> > 
> > [  227.614043] RDX:  RSI:  RDI: 
> > 6b6b6b6b6b6b6b6b
> > [  227.614074] RBP: c90001aabdc0 R08: 8803825f4c38 R09: 
> > 
> > [  227.614105] R10:  R11:  R12: 
> > 
> > [  227.614137] R13:  R14: 81c05120 R15: 
> > 0004
> > [  227.614170] FS:  () GS:88046f2c() 
> > knlGS:
> > [  227.614209] CS:  0010 DS:  ES:  CR0: 80050033
> > [  227.614239] CR2: 557e28bc20d0 CR3: 00038594e000 CR4: 
> > 001406e0
> > [  227.614268] Call Trace:
> > [  227.614301]  dm_softirq_done+0xe6/0x1e0 [dm_mod]
> > [  227.614337]  blk_done_softirq+0x88/0xa0
> > [  227.614369]  __do_softirq+0xba/0x4c0
> > [  227.614470]  run_ksoftirqd+0x1a/0x50
> > [  227.614499]  smpboot_thread_fn+0x123/0x1e0
> > [  227.614529]  kthread+0x107/0x140
> > [  227.614624]  ret_from_fork+0x2e/0x40
> > [  227.614648] Code: ff ff 31 f6 48 89 c7 e8 cd 0e 2f e1 5d c3 90 66 2e 0f 
> > 1f 84 00 00 00 00 00 55 48 63 f6 48 89 e5 41 55 41 89 d5 41 54 53 48 89 fb 
> > <4c> 8b a7 88 02 00 00 f0 ff 8c b7 50 03 00 00 e8 ba 43 ff ff 85 
> > [  227.614738] RIP: rq_completed+0x12/0x90 [dm_mod] RSP: c90001aabda8
> > 
> > (gdb) list *(rq_completed+0x12)
> > 0xdd12 is in rq_completed (drivers/md/dm-rq.c:187).
> > 182  * the md may be freed in dm_put() at the end of this function.
> > 183  * Or do dm_get() before calling this function and dm_put() later.
> > 184  */
> > 185 static void rq_completed(struct mapped_device *md, int rw, bool 
> > run_queue)
> > 186 {
> > 187 struct request_queue *q = md->queue;
> > 188 unsigned long flags;
> > 189
> > 190 atomic_dec(>pending[rw]);
> > 191
> > 
> > (gdb) disas rq_completed
> > Dump of assembler code for function rq_completed:
> >0xdd00 <+0>: push   %rbp
> >0xdd01 <+1>: movslq %esi,%rsi
> >0xdd04 <+4>: mov%rsp,%rbp
> >0xdd07 <+7>: push   %r13
> >0xdd09 <+9>: mov%edx,%r13d
> >0xdd0c <+12>:push   %r12
> >0xdd0e <+14>:push   %rbx
> >0xdd0f <+15>:mov%rdi,%rbx
> >0xdd12 <+18>:mov0x288(%rdi),%r12
> >0xdd19 <+25>:lock decl 0x350(%rdi,%rsi,4)
> > 
> > So this was caused by an attempt to dereference %rdi = 0x6b6b6b6b6b6b6b6b.
> > Hence this is probably a use-after-free of struct mapped_device.
> 
> Hello Christoph and Mike,
> 
> The above crash occurs with Jens' for-next branch but not with Jens'
> for-4.11/block branch. Sorry but I think this means that the SCSI
> passthrough refactoring code is not yet ready for prime time.

I somehow missed your 

[PATCH 01/24] block: Provide bdi_alloc()

2017-02-02 Thread Jan Kara
Provide bdi_alloc() forsimple allocation of a BDI that can be used by
filesystems that don't need anything fancy. We use this function when
converting filesystems from embedded struct backing_dev_info into a
dynamically allocated one.

Signed-off-by: Jan Kara 
---
 include/linux/backing-dev.h |  1 +
 mm/backing-dev.c| 15 +++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c52a48cb9a66..81c07ade4305 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -37,6 +37,7 @@ void bdi_unregister(struct backing_dev_info *bdi);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_destroy(struct backing_dev_info *bdi);
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
+struct backing_dev_info *bdi_alloc(gfp_t gfp_mask);
 
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
bool range_cyclic, enum wb_reason reason);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 28ce6cf7b2ff..7a5ba4163656 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -809,6 +809,21 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, 
int node_id)
return bdi;
 }
 
+struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
+{
+   struct backing_dev_info *bdi;
+
+   bdi = kmalloc(sizeof(struct backing_dev_info), gfp_mask | __GFP_ZERO);
+   if (!bdi)
+   return NULL;
+   if (bdi_init(bdi)) {
+   kfree(bdi);
+   return NULL;
+   }
+   return bdi;
+}
+EXPORT_SYMBOL(bdi_alloc);
+
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...)
 {
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/24 RFC] fs: Convert all embedded bdis into separate ones

2017-02-02 Thread Jan Kara
Hello,

this patch series converts all embedded occurences of struct backing_dev_info
to use standalone dynamically allocated structures. This makes bdi handling
unified across all bdi users and generally removes some boilerplate code from
filesystems setting up their own bdi. It also allows us to remove some code
from generic bdi implementation.

The patches were only compile-tested for most filesystems (I've tested
mounting only for NFS & btrfs) so fs maintainers please have a look whether
the changes look sound to you.

This series is based on top of bdi fixes that were merged into linux-block
git tree.

Honza
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/24] ceph: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside client structure. This unifies handling of bdi among users.

CC: Ilya Dryomov 
CC: "Yan, Zheng" 
CC: Sage Weil 
CC: ceph-de...@vger.kernel.org
Signed-off-by: Jan Kara 
---
 fs/ceph/addr.c|  6 +++---
 fs/ceph/debugfs.c |  2 +-
 fs/ceph/super.c   | 32 +++-
 fs/ceph/super.h   |  2 --
 4 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9cd0c0ea7cdb..f83d00cf3e66 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -576,7 +576,7 @@ static int writepage_nounlock(struct page *page, struct 
writeback_control *wbc)
writeback_stat = atomic_long_inc_return(>writeback_count);
if (writeback_stat >
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
-   set_bdi_congested(>backing_dev_info, BLK_RW_ASYNC);
+   set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -698,7 +698,7 @@ static void writepages_finish(struct ceph_osd_request *req)
if (atomic_long_dec_return(>writeback_count) <
 CONGESTION_OFF_THRESH(
fsc->mount_options->congestion_kb))
-   clear_bdi_congested(>backing_dev_info,
+   clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC);
 
if (rc < 0)
@@ -977,7 +977,7 @@ static int ceph_writepages_start(struct address_space 
*mapping,
if (atomic_long_inc_return(>writeback_count) >
CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) {
-   set_bdi_congested(>backing_dev_info,
+   set_bdi_congested(inode_to_bdi(inode),
  BLK_RW_ASYNC);
}
 
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 39ff678e567f..5da595c0edf1 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -251,7 +251,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
goto out;
 
snprintf(name, sizeof(name), "../../bdi/%s",
-dev_name(fsc->backing_dev_info.dev));
+dev_name(fsc->sb->s_bdi->dev));
fsc->debugfs_bdi =
debugfs_create_symlink("bdi",
   fsc->client->debugfs_dir,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6bd20d707bfd..ecc411fa7c06 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -579,10 +579,6 @@ static struct ceph_fs_client *create_fs_client(struct 
ceph_mount_options *fsopt,
 
atomic_long_set(>writeback_count, 0);
 
-   err = bdi_init(>backing_dev_info);
-   if (err < 0)
-   goto fail_client;
-
err = -ENOMEM;
/*
 * The number of concurrent works can be high but they don't need
@@ -590,7 +586,7 @@ static struct ceph_fs_client *create_fs_client(struct 
ceph_mount_options *fsopt,
 */
fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
if (fsc->wb_wq == NULL)
-   goto fail_bdi;
+   goto fail_client;
fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
if (fsc->pg_inv_wq == NULL)
goto fail_wb_wq;
@@ -624,8 +620,6 @@ static struct ceph_fs_client *create_fs_client(struct 
ceph_mount_options *fsopt,
destroy_workqueue(fsc->pg_inv_wq);
 fail_wb_wq:
destroy_workqueue(fsc->wb_wq);
-fail_bdi:
-   bdi_destroy(>backing_dev_info);
 fail_client:
ceph_destroy_client(fsc->client);
 fail:
@@ -643,8 +637,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq);
 
-   bdi_destroy(>backing_dev_info);
-
mempool_destroy(fsc->wb_pagevec_pool);
 
destroy_mount_options(fsc->mount_options);
@@ -938,25 +930,23 @@ static int ceph_compare_super(struct super_block *sb, 
void *data)
  */
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-static int ceph_register_bdi(struct super_block *sb,
-struct ceph_fs_client *fsc)
+static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc)
 {
int err;
 
+   err = super_setup_bdi_name(sb, "ceph-%ld",
+  atomic_long_inc_return(_seq));
+   if (err)
+   return err;
+
/* set ra_pages based on rasize mount option? */
if (fsc->mount_options->rasize >= PAGE_SIZE)
-   fsc->backing_dev_info.ra_pages =
+   sb->s_bdi->ra_pages =

[PATCH 11/24] ecryptfs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Tyler Hicks 
CC: ecryp...@vger.kernel.org
Signed-off-by: Jan Kara 
---
 fs/ecryptfs/ecryptfs_kernel.h | 1 -
 fs/ecryptfs/main.c| 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 599a29237cfe..e93444a4c4b1 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat {
 struct ecryptfs_sb_info {
struct super_block *wsi_sb;
struct ecryptfs_mount_crypt_stat mount_crypt_stat;
-   struct backing_dev_info bdi;
 };
 
 /* file private data. */
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 151872dcc1f4..9014479d0160 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct 
file_system_type *fs_type, int flags
goto out;
}
 
-   rc = bdi_setup_and_register(>bdi, "ecryptfs");
+   rc = super_setup_bdi(s);
if (rc)
goto out1;
 
ecryptfs_set_superblock_private(s, sbi);
-   s->s_bdi = >bdi;
 
/* ->kill_sb() will take care of sbi after that point */
sbi = NULL;
@@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block 
*sb)
if (!sb_info)
return;
ecryptfs_destroy_mount_crypt_stat(_info->mount_crypt_stat);
-   bdi_destroy(_info->bdi);
kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 }
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 13/24] orangefs: Remove orangefs_backing_dev_info

2017-02-02 Thread Jan Kara
It is not used anywhere.

CC: Mike Marshall 
Signed-off-by: Jan Kara 
---
 fs/orangefs/inode.c   |  6 --
 fs/orangefs/orangefs-kernel.h |  1 -
 fs/orangefs/orangefs-mod.c| 12 +---
 3 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 551bc74ed2b8..5cd617980fbf 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -136,12 +136,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
return -EINVAL;
 }
 
-struct backing_dev_info orangefs_backing_dev_info = {
-   .name = "orangefs",
-   .ra_pages = 0,
-   .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
-};
-
 /** ORANGEFS2 implementation of address space operations */
 const struct address_space_operations orangefs_address_operations = {
.readpage = orangefs_readpage,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 3bf803d732c5..70355a9a2596 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -529,7 +529,6 @@ extern spinlock_t orangefs_htable_ops_in_progress_lock;
 extern int hash_table_size;
 
 extern const struct address_space_operations orangefs_address_operations;
-extern struct backing_dev_info orangefs_backing_dev_info;
 extern const struct inode_operations orangefs_file_inode_operations;
 extern const struct file_operations orangefs_file_operations;
 extern const struct inode_operations orangefs_symlink_inode_operations;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 4113eb0495bf..c1b5174cb5a9 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -80,11 +80,6 @@ static int __init orangefs_init(void)
int ret = -1;
__u32 i = 0;
 
-   ret = bdi_init(_backing_dev_info);
-
-   if (ret)
-   return ret;
-
if (op_timeout_secs < 0)
op_timeout_secs = 0;
 
@@ -94,7 +89,7 @@ static int __init orangefs_init(void)
/* initialize global book keeping data structures */
ret = op_cache_initialize();
if (ret < 0)
-   goto err;
+   goto out;
 
ret = orangefs_inode_cache_initialize();
if (ret < 0)
@@ -181,9 +176,6 @@ static int __init orangefs_init(void)
 cleanup_op:
op_cache_finalize();
 
-err:
-   bdi_destroy(_backing_dev_info);
-
 out:
return ret;
 }
@@ -207,8 +199,6 @@ static void __exit orangefs_exit(void)
 
kfree(orangefs_htable_ops_in_progress);
 
-   bdi_destroy(_backing_dev_info);
-
pr_info("orangefs: module version %s unloaded\n", ORANGEFS_VERSION);
 }
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 14/24] mtd: Convert to dynamically allocated bdi infrastructure

2017-02-02 Thread Jan Kara
MTD already allocates backing_dev_info dynamically. Convert it to use
generic infrastructure for this including proper refcounting. We drop
mtd->backing_dev_info as its only use was to pass mtd_bdi pointer from
one file into another and if we wanted to keep that in a clean way, we'd
have to make mtd hold and drop bdi reference as needed which seems
pointless for passing one global pointer...

CC: David Woodhouse 
CC: Brian Norris 
CC: linux-...@lists.infradead.org
Signed-off-by: Jan Kara 
---
 drivers/mtd/mtdcore.c   | 23 ---
 drivers/mtd/mtdsuper.c  |  7 ++-
 include/linux/mtd/mtd.h |  5 -
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c
index 052772f7caef..a05063de362f 100644
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -46,7 +46,7 @@
 
 #include "mtdcore.h"
 
-static struct backing_dev_info *mtd_bdi;
+struct backing_dev_info *mtd_bdi;
 
 #ifdef CONFIG_PM_SLEEP
 
@@ -496,11 +496,9 @@ int add_mtd_device(struct mtd_info *mtd)
 * mtd_device_parse_register() multiple times on the same master MTD,
 * especially with CONFIG_MTD_PARTITIONED_MASTER=y.
 */
-   if (WARN_ONCE(mtd->backing_dev_info, "MTD already registered\n"))
+   if (WARN_ONCE(mtd->dev.type, "MTD already registered\n"))
return -EEXIST;
 
-   mtd->backing_dev_info = mtd_bdi;
-
BUG_ON(mtd->writesize == 0);
mutex_lock(_table_mutex);
 
@@ -1775,13 +1773,18 @@ static struct backing_dev_info * __init 
mtd_bdi_init(char *name)
struct backing_dev_info *bdi;
int ret;
 
-   bdi = kzalloc(sizeof(*bdi), GFP_KERNEL);
+   bdi = bdi_alloc(GFP_KERNEL);
if (!bdi)
return ERR_PTR(-ENOMEM);
 
-   ret = bdi_setup_and_register(bdi, name);
+   bdi->name = name;
+   /*
+* We put '-0' suffix to the name to get the same name format as we
+* used to get. Since this is called only once, we get a unique name. 
+*/
+   ret = bdi_register(bdi, NULL, "%.28s-0", name);
if (ret)
-   kfree(bdi);
+   bdi_put(bdi);
 
return ret ? ERR_PTR(ret) : bdi;
 }
@@ -1813,8 +1816,7 @@ static int __init init_mtd(void)
 out_procfs:
if (proc_mtd)
remove_proc_entry("mtd", NULL);
-   bdi_destroy(mtd_bdi);
-   kfree(mtd_bdi);
+   bdi_put(mtd_bdi);
 err_bdi:
class_unregister(_class);
 err_reg:
@@ -1828,8 +1830,7 @@ static void __exit cleanup_mtd(void)
if (proc_mtd)
remove_proc_entry("mtd", NULL);
class_unregister(_class);
-   bdi_destroy(mtd_bdi);
-   kfree(mtd_bdi);
+   bdi_put(mtd_bdi);
idr_destroy(_idr);
 }
 
diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 20c02a3b7417..e69e7855e31f 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * compare superblocks to see if they're equivalent
@@ -38,6 +39,8 @@ static int get_sb_mtd_compare(struct super_block *sb, void 
*_mtd)
return 0;
 }
 
+extern struct backing_dev_info *mtd_bdi;
+
 /*
  * mark the superblock by the MTD device it is using
  * - set the device number to be the correct MTD block device for pesuperstence
@@ -49,7 +52,9 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
 
sb->s_mtd = mtd;
sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
-   sb->s_bdi = mtd->backing_dev_info;
+   sb->s_bdi = bdi_get(mtd_bdi);
+   sb->s_iflags |= SB_I_DYNBDI;
+
return 0;
 }
 
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 13f8052b9ff9..de64f87abbe0 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -332,11 +332,6 @@ struct mtd_info {
int (*_get_device) (struct mtd_info *mtd);
void (*_put_device) (struct mtd_info *mtd);
 
-   /* Backing device capabilities for this device
-* - provides mmap capabilities
-*/
-   struct backing_dev_info *backing_dev_info;
-
struct notifier_block reboot_notifier;  /* default mode before reboot */
 
/* ECC status information */
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 12/24] afs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: David Howells 
CC: linux-...@lists.infradead.org
Signed-off-by: Jan Kara 
---
 fs/afs/internal.h | 1 -
 fs/afs/super.c| 4 +++-
 fs/afs/volume.c   | 7 ---
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 535a38d2c1d0..f8d52c36e6ec 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -310,7 +310,6 @@ struct afs_volume {
unsigned short  rjservers;  /* number of servers discarded 
due to -ENOMEDIUM */
struct afs_server   *servers[8];/* servers on which volume 
resides (ordered) */
struct rw_semaphore server_sem; /* lock for accessing current 
server */
-   struct backing_dev_info bdi;
 };
 
 /*
diff --git a/fs/afs/super.c b/fs/afs/super.c
index fbdb022b75a2..3bae29cd277f 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -319,7 +319,9 @@ static int afs_fill_super(struct super_block *sb,
sb->s_blocksize_bits= PAGE_SHIFT;
sb->s_magic = AFS_FS_MAGIC;
sb->s_op= _super_ops;
-   sb->s_bdi   = >volume->bdi;
+   ret = super_setup_bdi(sb);
+   if (ret)
+   return ret;
strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id));
 
/* allocate the root inode and dentry */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index d142a2449e65..db73d6dad02b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,10 +106,6 @@ struct afs_volume *afs_volume_lookup(struct 
afs_mount_params *params)
volume->cell= params->cell;
volume->vid = vlocation->vldb.vid[params->type];
 
-   ret = bdi_setup_and_register(>bdi, "afs");
-   if (ret)
-   goto error_bdi;
-
init_rwsem(>server_sem);
 
/* look up all the applicable server records */
@@ -155,8 +151,6 @@ struct afs_volume *afs_volume_lookup(struct 
afs_mount_params *params)
return ERR_PTR(ret);
 
 error_discard:
-   bdi_destroy(>bdi);
-error_bdi:
up_write(>cell->vl_sem);
 
for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -206,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume)
for (loop = volume->nservers - 1; loop >= 0; loop--)
afs_put_server(volume->servers[loop]);
 
-   bdi_destroy(>bdi);
kfree(volume);
 
_leave(" [destroyed]");
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/24] lustre: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside superblock. This unifies handling of bdi among users.

CC: Oleg Drokin 
CC: Andreas Dilger 
CC: James Simmons 
CC: lustre-de...@lists.lustre.org
Signed-off-by: Jan Kara 
---
 .../staging/lustre/lustre/include/lustre_disk.h|  4 
 drivers/staging/lustre/lustre/llite/llite_lib.c| 24 +++---
 2 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h 
b/drivers/staging/lustre/lustre/include/lustre_disk.h
index 8886458748c1..a676bccabd43 100644
--- a/drivers/staging/lustre/lustre/include/lustre_disk.h
+++ b/drivers/staging/lustre/lustre/include/lustre_disk.h
@@ -133,13 +133,9 @@ struct lustre_sb_info {
struct obd_export*lsi_osd_exp;
char  lsi_osd_type[16];
char  lsi_fstype[16];
-   struct backing_dev_info   lsi_bdi; /* each client mountpoint needs
-   * own backing_dev_info
-   */
 };
 
 #define LSI_UMOUNT_FAILOVER  0x0020
-#define LSI_BDI_INITIALIZED  0x0040
 
 #define s2lsi(sb)  ((struct lustre_sb_info *)((sb)->s_fs_info))
 #define s2lsi_nocast(sb) ((sb)->s_fs_info)
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c 
b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 25f5aed97f63..4f07d2e60d40 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -861,15 +861,6 @@ void ll_lli_init(struct ll_inode_info *lli)
mutex_init(>lli_layout_mutex);
 }
 
-static inline int ll_bdi_register(struct backing_dev_info *bdi)
-{
-   static atomic_t ll_bdi_num = ATOMIC_INIT(0);
-
-   bdi->name = "lustre";
-   return bdi_register(bdi, NULL, "lustre-%d",
-   atomic_inc_return(_bdi_num));
-}
-
 int ll_fill_super(struct super_block *sb, struct vfsmount *mnt)
 {
struct lustre_profile *lprof = NULL;
@@ -879,6 +870,7 @@ int ll_fill_super(struct super_block *sb, struct vfsmount 
*mnt)
char  *profilenm = get_profile_name(sb);
struct config_llog_instance *cfg;
interr;
+   static atomic_t ll_bdi_num = ATOMIC_INIT(0);
 
CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb);
 
@@ -901,16 +893,11 @@ int ll_fill_super(struct super_block *sb, struct vfsmount 
*mnt)
if (err)
goto out_free;
 
-   err = bdi_init(>lsi_bdi);
-   if (err)
-   goto out_free;
-   lsi->lsi_flags |= LSI_BDI_INITIALIZED;
-   lsi->lsi_bdi.capabilities = 0;
-   err = ll_bdi_register(>lsi_bdi);
+   err = super_setup_bdi_name(sb, "lustre-%d",
+  atomic_inc_return(_bdi_num));
if (err)
goto out_free;
 
-   sb->s_bdi = >lsi_bdi;
/* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */
sb->s_d_op = _d_ops;
 
@@ -1031,11 +1018,6 @@ void ll_put_super(struct super_block *sb)
if (profilenm)
class_del_profile(profilenm);
 
-   if (lsi->lsi_flags & LSI_BDI_INITIALIZED) {
-   bdi_destroy(>lsi_bdi);
-   lsi->lsi_flags &= ~LSI_BDI_INITIALIZED;
-   }
-
ll_free_sbi(sb);
lsi->lsi_llsbi = NULL;
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 15/24] coda: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Jan Harkes 
CC: c...@cs.cmu.edu
CC: codal...@coda.cs.cmu.edu
Signed-off-by: Jan Kara 
---
 fs/coda/inode.c| 11 ---
 include/linux/coda_psdev.h |  1 -
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 71dbe7e287ce..b72eb96f3f96 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void 
*data, int silent)
goto unlock_out;
}
 
-   error = bdi_setup_and_register(>bdi, "coda");
-   if (error)
-   goto unlock_out;
-
vc->vc_sb = sb;
mutex_unlock(>vc_mutex);
 
@@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void 
*data, int silent)
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = _super_operations;
sb->s_d_op = _dentry_operations;
-   sb->s_bdi = >bdi;
+
+   error = super_setup_bdi(sb);
+   if (error)
+   goto error;
 
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, );
@@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void 
*data, int silent)
 
 error:
mutex_lock(>vc_mutex);
-   bdi_destroy(>bdi);
vc->vc_sb = NULL;
sb->s_fs_info = NULL;
 unlock_out:
@@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb)
 {
struct venus_comm *vcp = coda_vcp(sb);
mutex_lock(>vc_mutex);
-   bdi_destroy(>bdi);
vcp->vc_sb = NULL;
sb->s_fs_info = NULL;
mutex_unlock(>vc_mutex);
diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 5b8721efa948..31e4e1f1547c 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -15,7 +15,6 @@ struct venus_comm {
struct list_headvc_processing;
int vc_inuse;
struct super_block *vc_sb;
-   struct backing_dev_info bdi;
struct mutexvc_mutex;
 };
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 24/24] block: Remove unused functions

2017-02-02 Thread Jan Kara
Now that all backing_dev_info structure are allocated separately, we can
drop some unused functions.

Signed-off-by: Jan Kara 
---
 include/linux/backing-dev.h |  5 -
 mm/backing-dev.c| 54 +
 2 files changed, 5 insertions(+), 54 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 6865b1c8b122..f39822a06305 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -17,8 +17,6 @@
 #include 
 #include 
 
-int __must_check bdi_init(struct backing_dev_info *bdi);
-
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
kref_get(>refcnt);
@@ -32,12 +30,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device 
*parent,
const char *fmt, ...);
 int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, va_list args);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
 
-int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_destroy(struct backing_dev_info *bdi);
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
 struct backing_dev_info *bdi_alloc(gfp_t gfp_mask);
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 82fee0f52d06..38b1197f7479 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
 #include 
 #include 
 
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
 struct backing_dev_info noop_backing_dev_info = {
.name   = "noop",
.capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
 }
 postcore_initcall(bdi_class_init);
 
+static int bdi_init(struct backing_dev_info *bdi);
+
 static int __init default_bdi_init(void)
 {
int err;
@@ -771,7 +771,7 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) 
{ }
 
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
 {
int ret;
 
@@ -791,7 +791,6 @@ int bdi_init(struct backing_dev_info *bdi)
 
return ret;
 }
-EXPORT_SYMBOL(bdi_init);
 
 struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
 {
@@ -864,12 +863,6 @@ int bdi_register(struct backing_dev_info *bdi, struct 
device *parent,
 }
 EXPORT_SYMBOL(bdi_register);
 
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-{
-   return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
-}
-EXPORT_SYMBOL(bdi_register_dev);
-
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
 {
int rc;
@@ -923,19 +916,14 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
 }
 
-static void bdi_exit(struct backing_dev_info *bdi)
-{
-   WARN_ON_ONCE(bdi->dev);
-   wb_exit(>wb);
-}
-
 static void release_bdi(struct kref *ref)
 {
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
 
bdi_unregister(bdi);
-   bdi_exit(bdi);
+   WARN_ON_ONCE(bdi->dev);
+   wb_exit(>wb);
kfree(bdi);
 }
 
@@ -944,38 +932,6 @@ void bdi_put(struct backing_dev_info *bdi)
kref_put(>refcnt, release_bdi);
 }
 
-void bdi_destroy(struct backing_dev_info *bdi)
-{
-   bdi_unregister(bdi);
-   bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
-   int err;
-
-   bdi->name = name;
-   bdi->capabilities = 0;
-   err = bdi_init(bdi);
-   if (err)
-   return err;
-
-   err = bdi_register(bdi, NULL, "%.28s-%ld", name,
-  atomic_long_inc_return(_seq));
-   if (err) {
-   bdi_destroy(bdi);
-   return err;
-   }
-
-   return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
-
 static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/24] cifs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside superblock. This unifies handling of bdi among users.

CC: Steve French 
CC: linux-c...@vger.kernel.org
Signed-off-by: Jan Kara 
---
 fs/cifs/cifs_fs_sb.h |  1 -
 fs/cifs/cifsfs.c |  7 ++-
 fs/cifs/connect.c| 10 --
 3 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 07ed81cf1552..cbd216b57239 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -68,7 +68,6 @@ struct cifs_sb_info {
umode_t mnt_dir_mode;
unsigned int mnt_cifs_flags;
char   *mountdata; /* options received at mount time or via DFS refs */
-   struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
char *prepath;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 70f4e65fced2..8dcf1c2555bf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -138,7 +138,12 @@ cifs_read_super(struct super_block *sb)
sb->s_magic = CIFS_MAGIC_NUMBER;
sb->s_op = _super_ops;
sb->s_xattr = cifs_xattr_handlers;
-   sb->s_bdi = _sb->bdi;
+   rc = super_setup_bdi(sb);
+   if (rc)
+   goto out_no_root;
+   /* tune readahead according to rsize */
+   sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE;
+
sb->s_blocksize = CIFS_MAX_MSGSIZE;
sb->s_blocksize_bits = 14;  /* default 2**14 = CIFS_MAX_MSGSIZE */
inode = cifs_root_iget(sb);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 35ae49ed1f76..6c1dfe56589d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3650,10 +3650,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol 
*volume_info)
int referral_walks_count = 0;
 #endif
 
-   rc = bdi_setup_and_register(_sb->bdi, "cifs");
-   if (rc)
-   return rc;
-
 #ifdef CONFIG_CIFS_DFS_UPCALL
 try_mount_again:
/* cleanup activities if we're chasing a referral */
@@ -3681,7 +3677,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol 
*volume_info)
server = cifs_get_tcp_session(volume_info);
if (IS_ERR(server)) {
rc = PTR_ERR(server);
-   bdi_destroy(_sb->bdi);
goto out;
}
if ((volume_info->max_credits < 20) ||
@@ -3735,9 +3730,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol 
*volume_info)
cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info);
cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info);
 
-   /* tune readahead according to rsize */
-   cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE;
-
 remote_path_check:
 #ifdef CONFIG_CIFS_DFS_UPCALL
/*
@@ -3854,7 +3846,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol 
*volume_info)
cifs_put_smb_ses(ses);
else
cifs_put_tcp_session(server, 0);
-   bdi_destroy(_sb->bdi);
}
 
 out:
@@ -4057,7 +4048,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
}
spin_unlock(_sb->tlink_tree_lock);
 
-   bdi_destroy(_sb->bdi);
kfree(cifs_sb->mountdata);
kfree(cifs_sb->prepath);
call_rcu(_sb->rcu, delayed_free);
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 17/24] fuse: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Miklos Szeredi 
CC: linux-fsde...@vger.kernel.org
Signed-off-by: Jan Kara 
---
 fs/fuse/dev.c|  8 
 fs/fuse/fuse_i.h |  3 ---
 fs/fuse/inode.c  | 42 +-
 3 files changed, 17 insertions(+), 36 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 70ea57c7b6bb..1912164d57e9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -382,8 +382,8 @@ static void request_end(struct fuse_conn *fc, struct 
fuse_req *req)
 
if (fc->num_background == fc->congestion_threshold &&
fc->connected && fc->bdi_initialized) {
-   clear_bdi_congested(>bdi, BLK_RW_SYNC);
-   clear_bdi_congested(>bdi, BLK_RW_ASYNC);
+   clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+   clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
fc->num_background--;
fc->active_background--;
@@ -570,8 +570,8 @@ void fuse_request_send_background_locked(struct fuse_conn 
*fc,
fc->blocked = 1;
if (fc->num_background == fc->congestion_threshold &&
fc->bdi_initialized) {
-   set_bdi_congested(>bdi, BLK_RW_SYNC);
-   set_bdi_congested(>bdi, BLK_RW_ASYNC);
+   set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC);
+   set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC);
}
list_add_tail(>list, >bg_queue);
flush_bg_queue(fc);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91307940c8ac..effab9e9607f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -631,9 +631,6 @@ struct fuse_conn {
/** Negotiated minor version */
unsigned minor;
 
-   /** Backing dev info */
-   struct backing_dev_info bdi;
-
/** Entry on the fuse_conn_list */
struct list_head entry;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6fe6a88ecb4a..90bacbc87fb3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -386,12 +386,6 @@ static void fuse_send_destroy(struct fuse_conn *fc)
}
 }
 
-static void fuse_bdi_destroy(struct fuse_conn *fc)
-{
-   if (fc->bdi_initialized)
-   bdi_destroy(>bdi);
-}
-
 static void fuse_put_super(struct super_block *sb)
 {
struct fuse_conn *fc = get_fuse_conn_super(sb);
@@ -403,7 +397,6 @@ static void fuse_put_super(struct super_block *sb)
list_del(>entry);
fuse_ctl_remove_conn(fc);
mutex_unlock(_mutex);
-   fuse_bdi_destroy(fc);
 
fuse_conn_put(fc);
 }
@@ -928,7 +921,8 @@ static void process_init_reply(struct fuse_conn *fc, struct 
fuse_req *req)
fc->no_flock = 1;
}
 
-   fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+   fc->sb->s_bdi->ra_pages =
+   min(fc->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
fc->max_write = max_t(unsigned, 4096, fc->max_write);
@@ -944,7 +938,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct 
fuse_req *req)
 
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
-   arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE;
+   arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
@@ -976,27 +970,20 @@ static void fuse_free_conn(struct fuse_conn *fc)
 static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 {
int err;
+   char *suffix = "";
 
-   fc->bdi.name = "fuse";
-   fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
-   /* fuse does it's own writeback accounting */
-   fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
-
-   err = bdi_init(>bdi);
+   if (sb->s_bdev)
+   suffix = "-fuseblk";
+   err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev),
+  MINOR(fc->dev), suffix);
if (err)
return err;
 
-   fc->bdi_initialized = 1;
-
-   if (sb->s_bdev) {
-   err =  bdi_register(>bdi, NULL, "%u:%u-fuseblk",
-   MAJOR(fc->dev), MINOR(fc->dev));
-   } else {
-   err = bdi_register_dev(>bdi, fc->dev);
-   }
+   sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
+   /* fuse does it's own writeback accounting */
+   sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
 
-   if (err)
-   

[PATCH 22/24] ubifs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Richard Weinberger 
CC: Artem Bityutskiy 
CC: Adrian Hunter 
CC: linux-...@lists.infradead.org
Signed-off-by: Jan Kara 
---
 fs/ubifs/super.c | 23 +++
 fs/ubifs/ubifs.h |  3 ---
 2 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e08aa04fc835..34810eb52b22 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb)
}
 
ubifs_umount(c);
-   bdi_destroy(>bdi);
ubi_close_volume(c->ubi);
mutex_unlock(>umount_mutex);
 }
@@ -2019,29 +2018,23 @@ static int ubifs_fill_super(struct super_block *sb, 
void *data, int silent)
goto out;
}
 
+   err = ubifs_parse_options(c, data, 0);
+   if (err)
+   goto out_close;
+
/*
 * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
 * UBIFS, I/O is not deferred, it is done immediately in readpage,
 * which means the user would have to wait not just for their own I/O
 * but the read-ahead I/O as well i.e. completely pointless.
 *
-* Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+* Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0.
 */
-   c->bdi.name = "ubifs",
-   c->bdi.capabilities = 0;
-   err  = bdi_init(>bdi);
+   err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num,
+  c->vi.vol_id);
if (err)
goto out_close;
-   err = bdi_register(>bdi, NULL, "ubifs_%d_%d",
-  c->vi.ubi_num, c->vi.vol_id);
-   if (err)
-   goto out_bdi;
-
-   err = ubifs_parse_options(c, data, 0);
-   if (err)
-   goto out_bdi;
 
-   sb->s_bdi = >bdi;
sb->s_fs_info = c;
sb->s_magic = UBIFS_SUPER_MAGIC;
sb->s_blocksize = UBIFS_BLOCK_SIZE;
@@ -2080,8 +2073,6 @@ static int ubifs_fill_super(struct super_block *sb, void 
*data, int silent)
ubifs_umount(c);
 out_unlock:
mutex_unlock(>umount_mutex);
-out_bdi:
-   bdi_destroy(>bdi);
 out_close:
ubi_close_volume(c->ubi);
 out:
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ca72382ce6cc..41b42a425b42 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -968,7 +968,6 @@ struct ubifs_debug_info;
  * struct ubifs_info - UBIFS file-system description data structure
  * (per-superblock).
  * @vfs_sb: VFS @struct super_block object
- * @bdi: backing device info object to make VFS happy and disable read-ahead
  *
  * @highest_inum: highest used inode number
  * @max_sqnum: current global sequence number
@@ -1216,7 +1215,6 @@ struct ubifs_debug_info;
  */
 struct ubifs_info {
struct super_block *vfs_sb;
-   struct backing_dev_info bdi;
 
ino_t highest_inum;
unsigned long long max_sqnum;
@@ -1457,7 +1455,6 @@ extern const struct inode_operations 
ubifs_file_inode_operations;
 extern const struct file_operations ubifs_dir_operations;
 extern const struct inode_operations ubifs_dir_inode_operations;
 extern const struct inode_operations ubifs_symlink_inode_operations;
-extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
 /* io.c */
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/24] block: Unregister bdi on last reference drop

2017-02-02 Thread Jan Kara
Most users will want to unregister bdi when dropping last reference to a
bdi. Only a few users (like block devices) want to play more complex
tricks with bdi registration and unregistration. So unregister bdi when
the last reference to bdi is dropped and just make sure we don't
unregister the bdi the second time if it is already unregistered.

Signed-off-by: Jan Kara 
---
 include/linux/backing-dev-defs.h |  3 ++-
 mm/backing-dev.c | 10 ++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index ad955817916d..2ecafc8a2d06 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -146,7 +146,8 @@ struct backing_dev_info {
char *name;
 
struct kref refcnt; /* Reference counter for the structure */
-   unsigned int capabilities; /* Device capabilities */
+   unsigned int registered:1;  /* Is bdi registered? */
+   unsigned int capabilities:31;   /* Device capabilities */
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d59571023df7..82fee0f52d06 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -843,6 +843,7 @@ int bdi_register_va(struct backing_dev_info *bdi, struct 
device *parent,
 
spin_lock_bh(_lock);
list_add_tail_rcu(>bdi_list, _list);
+   bdi->registered = 1;
spin_unlock_bh(_lock);
 
trace_writeback_bdi_register(bdi);
@@ -897,6 +898,14 @@ static void bdi_remove_from_list(struct backing_dev_info 
*bdi)
 
 void bdi_unregister(struct backing_dev_info *bdi)
 {
+   spin_lock_bh(_lock);
+   if (!bdi->registered) {
+   spin_unlock_bh(_lock);
+   return;
+   }
+   bdi->registered = 0;
+   spin_unlock_bh(_lock);
+
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(>wb);
@@ -925,6 +934,7 @@ static void release_bdi(struct kref *ref)
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
 
+   bdi_unregister(bdi);
bdi_exit(bdi);
kfree(bdi);
 }
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 21/24] nfs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Trond Myklebust 
CC: Anna Schumaker 
CC: linux-...@vger.kernel.org
Signed-off-by: Jan Kara 
---
 fs/nfs/client.c   | 10 --
 fs/nfs/internal.h |  6 +++---
 fs/nfs/super.c| 34 +++---
 fs/nfs/write.c| 13 ++---
 include/linux/nfs_fs_sb.h |  1 -
 5 files changed, 28 insertions(+), 36 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 91a8d610ba0f..479afae529d8 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -738,9 +738,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->rsize = NFS_MAX_FILE_IO_SIZE;
server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-   server->backing_dev_info.name = "nfs";
-   server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
-
if (server->wsize > max_rpc_payload)
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
@@ -894,12 +891,6 @@ struct nfs_server *nfs_alloc_server(void)
return NULL;
}
 
-   if (bdi_init(>backing_dev_info)) {
-   nfs_free_iostats(server->io_stats);
-   kfree(server);
-   return NULL;
-   }
-
ida_init(>openowner_id);
ida_init(>lockowner_id);
pnfs_init_server(server);
@@ -930,7 +921,6 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(>lockowner_id);
ida_destroy(>openowner_id);
nfs_free_iostats(server->io_stats);
-   bdi_destroy(>backing_dev_info);
kfree(server);
nfs_release_automount_timer();
dprintk("<-- nfs_free_server()\n");
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 09ca5095c04e..55591c06b5d0 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -139,7 +139,7 @@ struct nfs_mount_request {
 };
 
 struct nfs_mount_info {
-   void (*fill_super)(struct super_block *, struct nfs_mount_info *);
+   int (*fill_super)(struct super_block *, struct nfs_mount_info *);
int (*set_security)(struct super_block *, struct dentry *, struct 
nfs_mount_info *);
struct nfs_parsed_mount_data *parsed;
struct nfs_clone_mount *cloned;
@@ -405,7 +405,7 @@ struct dentry *nfs_fs_mount(struct file_system_type *, int, 
const char *, void *
 struct dentry * nfs_xdev_mount_common(struct file_system_type *, int,
const char *, struct nfs_mount_info *);
 void nfs_kill_super(struct super_block *);
-void nfs_fill_super(struct super_block *, struct nfs_mount_info *);
+int nfs_fill_super(struct super_block *, struct nfs_mount_info *);
 
 extern struct rpc_stat nfs_rpcstat;
 
@@ -456,7 +456,7 @@ extern void nfs_read_prepare(struct rpc_task *task, void 
*calldata);
 extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
 
 /* super.c */
-void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
+int nfs_clone_super(struct super_block *, struct nfs_mount_info *);
 void nfs_umount_begin(struct super_block *);
 int  nfs_statfs(struct dentry *, struct kstatfs *);
 int  nfs_show_options(struct seq_file *, struct dentry *);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 6bca17883b93..16f4d92a96ec 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2322,18 +2322,17 @@ inline void nfs_initialise_sb(struct super_block *sb)
sb->s_blocksize = nfs_block_bits(server->wsize,
 >s_blocksize_bits);
 
-   sb->s_bdi = >backing_dev_info;
-
nfs_super_set_maxbytes(sb, server->maxfilesize);
 }
 
 /*
  * Finish setting up an NFS2/3 superblock
  */
-void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 {
struct nfs_parsed_mount_data *data = mount_info->parsed;
struct nfs_server *server = NFS_SB(sb);
+   int ret;
 
sb->s_blocksize_bits = 0;
sb->s_blocksize = 0;
@@ -2351,13 +2350,21 @@ void nfs_fill_super(struct super_block *sb, struct 
nfs_mount_info *mount_info)
}
 
nfs_initialise_sb(sb);
+
+   ret = super_setup_bdi_name(sb, "%u:%u", MAJOR(server->s_dev),
+  MINOR(server->s_dev));
+   if (ret)
+   return ret;
+   sb->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD;
+   return 0;
+
 }
 EXPORT_SYMBOL_GPL(nfs_fill_super);
 
 /*
  * Finish setting up a cloned NFS2/3/4 superblock
  */
-void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
+int nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
 {
const struct super_block *old_sb = mount_info->cloned->sb;
struct 

[PATCH 04/24] fs: Provide infrastructure for dynamic BDIs in filesystems

2017-02-02 Thread Jan Kara
Provide helper functions for setting up dynamically allocated
backing_dev_info structures for filesystems and cleaning them up on
superblock destruction.

CC: linux-...@lists.infradead.org
CC: linux-...@vger.kernel.org
CC: Petr Vandrovec 
CC: linux-ni...@vger.kernel.org
CC: cluster-de...@redhat.com
CC: osd-...@open-osd.org
CC: codal...@coda.cs.cmu.edu
CC: linux-...@lists.infradead.org
CC: ecryp...@vger.kernel.org
CC: linux-c...@vger.kernel.org
CC: ceph-de...@vger.kernel.org
CC: linux-bt...@vger.kernel.org
CC: v9fs-develo...@lists.sourceforge.net
CC: lustre-de...@lists.lustre.org
Signed-off-by: Jan Kara 
---
 fs/super.c   | 49 
 include/linux/backing-dev-defs.h |  2 +-
 include/linux/fs.h   |  6 +
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/fs/super.c b/fs/super.c
index ea662b0e5e78..31dc4c6450ef 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -446,6 +446,11 @@ void generic_shutdown_super(struct super_block *sb)
hlist_del_init(>s_instances);
spin_unlock(_lock);
up_write(>s_umount);
+   if (sb->s_iflags & SB_I_DYNBDI) {
+   bdi_put(sb->s_bdi);
+   sb->s_bdi = _backing_dev_info;
+   sb->s_iflags &= ~SB_I_DYNBDI;
+   }
 }
 
 EXPORT_SYMBOL(generic_shutdown_super);
@@ -1249,6 +1254,50 @@ mount_fs(struct file_system_type *type, int flags, const 
char *name, void *data)
 }
 
 /*
+ * Setup private BDI for given superblock. I gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
+{
+   struct backing_dev_info *bdi;
+   int err;
+   va_list args;
+
+   bdi = bdi_alloc(GFP_KERNEL);
+   if (!bdi)
+   return -ENOMEM;
+
+   bdi->name = sb->s_type->name;
+
+   va_start(args, fmt);
+   err = bdi_register_va(bdi, NULL, fmt, args);
+   va_end(args);
+   if (err) {
+   bdi_put(bdi);
+   return err;
+   }
+   WARN_ON(sb->s_bdi != _backing_dev_info);
+   sb->s_bdi = bdi;
+   sb->s_iflags |= SB_I_DYNBDI;
+
+   return 0;
+}
+EXPORT_SYMBOL(super_setup_bdi_name);
+
+/*
+ * Setup private BDI for given superblock. I gets automatically cleaned up
+ * in generic_shutdown_super().
+ */
+int super_setup_bdi(struct super_block *sb)
+{
+   static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
+
+   return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name,
+   atomic_long_inc_return(_seq));
+}
+EXPORT_SYMBOL(super_setup_bdi);
+
+/*
  * This is an internal function, please use sb_end_{write,pagefault,intwrite}
  * instead.
  */
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 2ecafc8a2d06..70080b4217f4 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -143,7 +143,7 @@ struct backing_dev_info {
congested_fn *congested_fn; /* Function pointer if device is md/dm */
void *congested_data;   /* Pointer to aux data for congested func */
 
-   char *name;
+   const char *name;
 
struct kref refcnt; /* Reference counter for the structure */
unsigned int registered:1;  /* Is bdi registered? */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c930cbc19342..8ed8b6d1bc54 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1267,6 +1267,9 @@ struct mm_struct;
 /* sb->s_iflags to limit user namespace mounts */
 #define SB_I_USERNS_VISIBLE0x0010 /* fstype already mounted */
 
+/* Temporary flag until all filesystems are converted to dynamic bdis */
+#define SB_I_DYNBDI0x0100
+
 /* Possible states of 'frozen' field */
 enum {
SB_UNFROZEN = 0,/* FS is unfrozen */
@@ -2103,6 +2106,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
 extern bool our_mnt(struct vfsmount *mnt);
+extern __printf(2, 3)
+int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
+extern int super_setup_bdi(struct super_block *sb);
 
 extern int current_umask(void);
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 16/24] exofs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside the superblock. This unifies handling of bdi among users.

CC: Boaz Harrosh 
CC: Benny Halevy 
CC: osd-...@open-osd.org
Signed-off-by: Jan Kara 
---
 fs/exofs/exofs.h |  1 -
 fs/exofs/super.c | 17 ++---
 2 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2e86086bc940..5dc392404559 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -64,7 +64,6 @@ struct exofs_dev {
  * our extension to the in-memory superblock
  */
 struct exofs_sb_info {
-   struct backing_dev_info bdi;/* register our bdi with VFS  */
struct exofs_sb_stats s_ess;/* Written often, pre-allocate*/
int s_timeout;  /* timeout for OSD operations */
uint64_ts_nextid;   /* highest object ID used */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 1076a4233b39..819624cfc8da 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb)
sbi->one_comp.obj.partition);
 
exofs_sysfs_sb_del(sbi);
-   bdi_destroy(>bdi);
exofs_free_sbi(sbi);
sb->s_fs_info = NULL;
 }
@@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void 
*data, int silent)
__sbi_read_stats(sbi);
 
/* set up operation vectors */
-   sbi->bdi.ra_pages = __ra_pages(>layout);
-   sb->s_bdi = >bdi;
+   ret = super_setup_bdi(sb);
+   if (ret) {
+   EXOFS_DBGMSG("Failed to super_setup_bdi\n");
+   goto free_sbi;
+   }
+   sb->s_bdi->ra_pages = __ra_pages(>layout);
sb->s_fs_info = sbi;
sb->s_op = _sops;
sb->s_export_op = _export_ops;
@@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void 
*data, int silent)
goto free_sbi;
}
 
-   ret = bdi_setup_and_register(>bdi, "exofs");
-   if (ret) {
-   EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
-   dput(sb->s_root);
-   sb->s_root = NULL;
-   goto free_sbi;
-   }
-
exofs_sysfs_dbg_print();
_exofs_print_device("Mounting", opts->dev_name,
ore_comp_dev(>oc, 0),
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/24] bdi: Provide bdi_register_va()

2017-02-02 Thread Jan Kara
Add function that registers bdi and takes va_list instead of variable
number of arguments.

Signed-off-by: Jan Kara 
---
 include/linux/backing-dev.h |  2 ++
 mm/backing-dev.c| 20 +++-
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 81c07ade4305..6865b1c8b122 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -30,6 +30,8 @@ void bdi_put(struct backing_dev_info *bdi);
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
const char *fmt, ...);
+int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
+   const char *fmt, va_list args);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);
 void bdi_unregister(struct backing_dev_info *bdi);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7a5ba4163656..d59571023df7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -824,18 +824,15 @@ struct backing_dev_info *bdi_alloc(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bdi_alloc);
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-   const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, struct device *parent,
+   const char *fmt, va_list args)
 {
-   va_list args;
struct device *dev;
 
if (bdi->dev)   /* The driver needs to use separate queues per device */
return 0;
 
-   va_start(args, fmt);
dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, 
args);
-   va_end(args);
if (IS_ERR(dev))
return PTR_ERR(dev);
 
@@ -851,6 +848,19 @@ int bdi_register(struct backing_dev_info *bdi, struct 
device *parent,
trace_writeback_bdi_register(bdi);
return 0;
 }
+EXPORT_SYMBOL(bdi_register_va);
+
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+   const char *fmt, ...)
+{
+   va_list args;
+   int ret;
+
+   va_start(args, fmt);
+   ret = bdi_register_va(bdi, parent, fmt, args);
+   va_end(args);
+   return ret;
+}
 EXPORT_SYMBOL(bdi_register);
 
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/24] fs: Get proper reference for s_bdi

2017-02-02 Thread Jan Kara
So far we just relied on block device to hold a bdi reference for us
while the filesystem is mounted. While that works perfectly fine, it is
a bit awkward that we have a pointer to a refcounted structure in the
superblock without proper reference. So make s_bdi hold a proper
reference to block device's BDI. No filesystem using mount_bdev()
actually changes s_bdi so this is safe and will make bdev filesystems
work the same way as filesystems needing to set up their private bdi.

Signed-off-by: Jan Kara 
---
 fs/super.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 31dc4c6450ef..dfb95ccd4351 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1047,12 +1047,9 @@ static int set_bdev_super(struct super_block *s, void 
*data)
 {
s->s_bdev = data;
s->s_dev = s->s_bdev->bd_dev;
+   s->s_bdi = bdi_get(s->s_bdev->bd_bdi);
+   s->s_iflags |= SB_I_DYNBDI;
 
-   /*
-* We set the bdi here to the queue backing, file systems can
-* overwrite this in ->fill_super()
-*/
-   s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
return 0;
 }
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/24] btrfs: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside superblock. This unifies handling of bdi among users.

CC: Chris Mason 
CC: Josef Bacik 
CC: David Sterba 
CC: linux-bt...@vger.kernel.org
Signed-off-by: Jan Kara 
---
 fs/btrfs/ctree.h   |  1 -
 fs/btrfs/disk-io.c | 36 +++-
 fs/btrfs/super.c   |  7 +++
 3 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6a823719b6c5..1dc06f66dfcf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -801,7 +801,6 @@ struct btrfs_fs_info {
struct btrfs_super_block *super_for_commit;
struct super_block *sb;
struct inode *btree_inode;
-   struct backing_dev_info bdi;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 37a31b12bb0c..b25723e729c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1810,21 +1810,6 @@ static int btrfs_congested_fn(void *congested_data, int 
bdi_bits)
return ret;
 }
 
-static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
-{
-   int err;
-
-   err = bdi_setup_and_register(bdi, "btrfs");
-   if (err)
-   return err;
-
-   bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
-   bdi->congested_fn   = btrfs_congested_fn;
-   bdi->congested_data = info;
-   bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
-   return 0;
-}
-
 /*
  * called by the kthread helper functions to finally call the bio end_io
  * functions.  This is where read checksum verification actually happens
@@ -2598,16 +2583,10 @@ int open_ctree(struct super_block *sb,
goto fail;
}
 
-   ret = setup_bdi(fs_info, _info->bdi);
-   if (ret) {
-   err = ret;
-   goto fail_srcu;
-   }
-
ret = percpu_counter_init(_info->dirty_metadata_bytes, 0, 
GFP_KERNEL);
if (ret) {
err = ret;
-   goto fail_bdi;
+   goto fail_srcu;
}
fs_info->dirty_metadata_batch = PAGE_SIZE *
(1 + ilog2(nr_cpu_ids));
@@ -2715,7 +2694,6 @@ int open_ctree(struct super_block *sb,
 
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
-   sb->s_bdi = _info->bdi;
 
btrfs_init_btree_inode(fs_info);
 
@@ -2912,9 +2890,12 @@ int open_ctree(struct super_block *sb,
goto fail_sb_buffer;
}
 
-   fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
-   fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
-   SZ_4M / PAGE_SIZE);
+   sb->s_bdi->congested_fn = btrfs_congested_fn;
+   sb->s_bdi->congested_data = fs_info;
+   sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
+   sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
+   sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
+   sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
 
sb->s_blocksize = sectorsize;
sb->s_blocksize_bits = blksize_bits(sectorsize);
@@ -3282,8 +3263,6 @@ int open_ctree(struct super_block *sb,
percpu_counter_destroy(_info->delalloc_bytes);
 fail_dirty_metadata_bytes:
percpu_counter_destroy(_info->dirty_metadata_bytes);
-fail_bdi:
-   bdi_destroy(_info->bdi);
 fail_srcu:
cleanup_srcu_struct(_info->subvol_srcu);
 fail:
@@ -4010,7 +3989,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(_info->dirty_metadata_bytes);
percpu_counter_destroy(_info->delalloc_bytes);
percpu_counter_destroy(_info->bio_counter);
-   bdi_destroy(_info->bdi);
cleanup_srcu_struct(_info->subvol_srcu);
 
btrfs_free_stripe_hash_table(fs_info);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b5ae7d3d1896..08ef08b63132 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1133,6 +1133,13 @@ static int btrfs_fill_super(struct super_block *sb,
 #endif
sb->s_flags |= MS_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
+
+   err = super_setup_bdi(sb);
+   if (err) {
+   btrfs_err(fs_info, "super_setup_bdi failed");
+   return err;
+   }
+
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/24] 9p: Convert to separately allocated bdi

2017-02-02 Thread Jan Kara
Allocate struct backing_dev_info separately instead of embedding it
inside session. This unifies handling of bdi among users.

CC: Eric Van Hensbergen 
CC: Ron Minnich 
CC: Latchesar Ionkov 
CC: v9fs-develo...@lists.sourceforge.net
Signed-off-by: Jan Kara 
---
 fs/9p/v9fs.c  | 10 +-
 fs/9p/v9fs.h  |  1 -
 fs/9p/vfs_super.c | 15 ---
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 072e7599583a..0898a1a774fa 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -332,10 +332,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info 
*v9ses,
goto err_names;
init_rwsem(>rename_sem);
 
-   rc = bdi_setup_and_register(>bdi, "9p");
-   if (rc)
-   goto err_names;
-
v9ses->uid = INVALID_UID;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
@@ -344,7 +340,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info 
*v9ses,
if (IS_ERR(v9ses->clnt)) {
rc = PTR_ERR(v9ses->clnt);
p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
-   goto err_bdi;
+   goto err_names;
}
 
v9ses->flags = V9FS_ACCESS_USER;
@@ -414,8 +410,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info 
*v9ses,
 
 err_clnt:
p9_client_destroy(v9ses->clnt);
-err_bdi:
-   bdi_destroy(>bdi);
 err_names:
kfree(v9ses->uname);
kfree(v9ses->aname);
@@ -444,8 +438,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
kfree(v9ses->uname);
kfree(v9ses->aname);
 
-   bdi_destroy(>bdi);
-
spin_lock(_sessionlist_lock);
list_del(>slist);
spin_unlock(_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 443d12e02043..76eaf49abd3a 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -114,7 +114,6 @@ struct v9fs_session_info {
kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */
struct p9_client *clnt; /* 9p client */
struct list_head slist; /* list of sessions registered with v9fs */
-   struct backing_dev_info bdi;
struct rw_semaphore rename_sem;
 };
 
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index de3ed8629196..a0965fb587a5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data)
  *
  */
 
-static void
+static int
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
int flags, void *data)
 {
+   int ret;
+
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
sb->s_blocksize = 1 << sb->s_blocksize_bits;
@@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct 
v9fs_session_info *v9ses,
sb->s_xattr = v9fs_xattr_handlers;
} else
sb->s_op = _super_ops;
-   sb->s_bdi = >bdi;
+
+   ret = super_setup_bdi(sb);
+   if (ret)
+   return ret;
+
if (v9ses->cache)
sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
 
@@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct 
v9fs_session_info *v9ses,
 #endif
 
save_mount_options(sb, data);
+   return 0;
 }
 
 /**
@@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type 
*fs_type, int flags,
retval = PTR_ERR(sb);
goto clunk_fid;
}
-   v9fs_fill_super(sb, v9ses, flags, data);
+   retval = v9fs_fill_super(sb, v9ses, flags, data);
+   if (retval)
+   goto release_sb;
 
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = _cached_dentry_operations;
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [dm-devel] split scsi passthrough fields out of struct request V2

2017-02-02 Thread Bart Van Assche
On Wed, 2017-02-01 at 22:01 +, Bart Van Assche wrote:
> However, a new issue shows up sporadically, an issue that I had not yet seen
> during any test with a kernel tree from Linus:
>
> [  227.613440] general protection fault:  [#1] SMP
> [  227.613495] Modules linked in: dm_service_time ib_srp scsi_transport_srp 
> target_core_user uio target_core_pscsi target_core_file ib_srpt 
> target_core_iblock target_core_mod brd netconsole xt_CHECKSUM iptable_mangle 
> ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat 
> libcrc32c nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack 
> ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc ebtable_filter 
> ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables 
> af_packet ib_ipoib msr rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm configfs 
> ib_cm iw_cm mlx4_ib ib_core sb_edac edac_core x86_pkg_temp_thermal 
> intel_powerclamp coretemp kvm_intel ipmi_ssif kvm irqbypass crct10dif_pclmul 
> crc32_pclmul crc32c_intel ghash_clmulni_intel pcbc tg3 aesni_intel iTCO_wdt 
> mlx4_core ptp iTCO_vendor_support dcdbas aes_x86_64 crypto_simd glue_helper 
> pps_core cryptd pcspkr devlink ipmi_si libphy ipmi_devintf fjes 
> ipmi_msghandler tpm_tis tpm_tis_core lpc_ich mei_me mfd_core mei shpchp wmi 
> tpm button hid_generic usbhid mgag200 i2c_algo_bit drm_kms_helper syscopyarea 
> sysfillrect sr_mod sysimgblt fb_sys_fops cdrom ttm drm ehci_pci ehci_hcd 
> usbcore usb_common sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc 
> scsi_dh_alua autofs4
> [  227.613774] CPU: 3 PID: 28 Comm: ksoftirqd/3 Not tainted 4.10.0-rc5-dbg+ #1
> [  227.613840] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.0.2 
> 11/17/2014
> [  227.613893] task: 880172a249c0 task.stack: c90001aa8000
> [  227.613932] RIP: 0010:rq_completed+0x12/0x90 [dm_mod]
> [  227.613965] RSP: 0018:c90001aabda8 EFLAGS: 00010246
> [  227.614006] RAX:  RBX: 6b6b6b6b6b6b6b6b RCX: 
> 
> [  227.614043] RDX:  RSI:  RDI: 
> 6b6b6b6b6b6b6b6b
> [  227.614074] RBP: c90001aabdc0 R08: 8803825f4c38 R09: 
> 
> [  227.614105] R10:  R11:  R12: 
> 
> [  227.614137] R13:  R14: 81c05120 R15: 
> 0004
> [  227.614170] FS:  () GS:88046f2c() 
> knlGS:
> [  227.614209] CS:  0010 DS:  ES:  CR0: 80050033
> [  227.614239] CR2: 557e28bc20d0 CR3: 00038594e000 CR4: 
> 001406e0
> [  227.614268] Call Trace:
> [  227.614301]  dm_softirq_done+0xe6/0x1e0 [dm_mod]
> [  227.614337]  blk_done_softirq+0x88/0xa0
> [  227.614369]  __do_softirq+0xba/0x4c0
> [  227.614470]  run_ksoftirqd+0x1a/0x50
> [  227.614499]  smpboot_thread_fn+0x123/0x1e0
> [  227.614529]  kthread+0x107/0x140
> [  227.614624]  ret_from_fork+0x2e/0x40
> [  227.614648] Code: ff ff 31 f6 48 89 c7 e8 cd 0e 2f e1 5d c3 90 66 2e 0f 1f 
> 84 00 00 00 00 00 55 48 63 f6 48 89 e5 41 55 41 89 d5 41 54 53 48 89 fb <4c> 
> 8b a7 88 02 00 00 f0 ff 8c b7 50 03 00 00 e8 ba 43 ff ff 85 
> [  227.614738] RIP: rq_completed+0x12/0x90 [dm_mod] RSP: c90001aabda8
> 
> (gdb) list *(rq_completed+0x12)
> 0xdd12 is in rq_completed (drivers/md/dm-rq.c:187).
> 182  * the md may be freed in dm_put() at the end of this function.
> 183  * Or do dm_get() before calling this function and dm_put() later.
> 184  */
> 185 static void rq_completed(struct mapped_device *md, int rw, bool 
> run_queue)
> 186 {
> 187 struct request_queue *q = md->queue;
> 188 unsigned long flags;
> 189
> 190 atomic_dec(>pending[rw]);
> 191
> 
> (gdb) disas rq_completed
> Dump of assembler code for function rq_completed:
>0xdd00 <+0>: push   %rbp
>0xdd01 <+1>: movslq %esi,%rsi
>0xdd04 <+4>: mov%rsp,%rbp
>0xdd07 <+7>: push   %r13
>0xdd09 <+9>: mov%edx,%r13d
>0xdd0c <+12>:push   %r12
>0xdd0e <+14>:push   %rbx
>0xdd0f <+15>:mov%rdi,%rbx
>0xdd12 <+18>:mov0x288(%rdi),%r12
>0xdd19 <+25>:lock decl 0x350(%rdi,%rsi,4)
> 
> So this was caused by an attempt to dereference %rdi = 0x6b6b6b6b6b6b6b6b.
> Hence this is probably a use-after-free of struct mapped_device.

Hello Christoph and Mike,

The above crash occurs with Jens' for-next branch but not with Jens'
for-4.11/block branch. Sorry but I think this means that the SCSI
passthrough refactoring code is not yet ready for prime time.

Bart.--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] block: fix blk-mq debugfs vs. blktrace

2017-02-02 Thread Jens Axboe
On 01/31/2017 03:53 PM, Omar Sandoval wrote:
> From: Omar Sandoval 
> 
> When I moved the blk-mq debugging information to debugfs, I didn't
> realize that blktrace also created directories in debugfs that
> conflicted with the blk-mq directories. This series fixes that.
> 
> Patch 1 adds a new debugfs helper needed for patch 6. Greg, could I get
> an ack on that if it makes sense? Jens and I went back and forth on this
> for a little while, but patch 6 has more of the rationale on why we
> decided that this approach was the cleanest.
> 
> Patches 2 and 3 are cleanups.
> 
> Patch 4 is the first part of the fix, making blk-mq and blktrace use the
> same top-level "block" directory.
> 
> Patches 5 and 6 make blk-mq and blktrace play nicely with each other
> w.r.t. the debugfs "block/$dev" directories.
> 
> I tested this with multiple configurations, so hopefully I didn't mess
> that up this time.
> 
> Applies to for-4.11/block.

Applied with Greg's ack on the first patch, thanks Omar.

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] block: fix blk-mq debugfs vs. blktrace

2017-02-02 Thread Greg Kroah-Hartman
On Thu, Feb 02, 2017 at 09:01:45AM -0800, Omar Sandoval wrote:
> On Thu, Feb 02, 2017 at 11:58:53AM +0100, Greg Kroah-Hartman wrote:
> > On Wed, Feb 01, 2017 at 12:31:15AM -0800, Omar Sandoval wrote:
> > > On Wed, Feb 01, 2017 at 09:16:08AM +0100, Greg Kroah-Hartman wrote:
> > > > On Tue, Jan 31, 2017 at 02:53:16PM -0800, Omar Sandoval wrote:
> > > > > From: Omar Sandoval 
> > > > > 
> > > > > When I moved the blk-mq debugging information to debugfs, I didn't
> > > > > realize that blktrace also created directories in debugfs that
> > > > > conflicted with the blk-mq directories. This series fixes that.
> > > > > 
> > > > > Patch 1 adds a new debugfs helper needed for patch 6. Greg, could I 
> > > > > get
> > > > > an ack on that if it makes sense? Jens and I went back and forth on 
> > > > > this
> > > > > for a little while, but patch 6 has more of the rationale on why we
> > > > > decided that this approach was the cleanest.
> > > > 
> > > > I can't find patch 6, you only cc:ed me on the first patch :(
> > > > 
> > > > Care to bounce them all to me?
> > > > 
> > > > thanks,
> > > > 
> > > > greg k-h
> > > 
> > > Gah, I forgot --cc-cover to git-send-email. The series is all here:
> > > http://marc.info/?l=linux-block=1=201701=2. I can also send the
> > > patches directly to you if you prefer that.
> > 
> > I don't understand the problem here.  How do you not know if you have
> > created the debugfs file or not?  You have the structure, with the
> > correct name, how could it have been created?  Can't you save the dentry
> > to the debugfs file in the structure that has the name?
> > 
> > thanks,
> > 
> > greg k-h
> 
> Hey, Greg,
> 
> So here's the alternative to doing the lookup:
> 
> diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
> index 38052f625a0f..79ef6b9d1f96 100644
> --- a/kernel/trace/blktrace.c
> +++ b/kernel/trace/blktrace.c
> @@ -470,12 +470,15 @@ static int do_blk_trace_setup(struct request_queue *q, 
> char *name, dev_t dev,
>   if (!blk_debugfs_root)
>   goto err;
>  
> - dir = debugfs_create_dir(buts->name, blk_debugfs_root);
> -
> +#ifdef CONFIG_BLK_DEBUG_FS
> + if (q->mq_ops && !bdev->bd_part.partno)
> + dir = q->debugfs_dir;
> +#endif

Eeek, no #ifdefs please :)

The lookup patch is fine, please take it.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] block: fix blk-mq debugfs vs. blktrace

2017-02-02 Thread Greg Kroah-Hartman
On Thu, Feb 02, 2017 at 08:17:31AM -0700, Jens Axboe wrote:
> On 02/02/2017 03:58 AM, Greg Kroah-Hartman wrote:
> > On Wed, Feb 01, 2017 at 12:31:15AM -0800, Omar Sandoval wrote:
> >> On Wed, Feb 01, 2017 at 09:16:08AM +0100, Greg Kroah-Hartman wrote:
> >>> On Tue, Jan 31, 2017 at 02:53:16PM -0800, Omar Sandoval wrote:
>  From: Omar Sandoval 
> 
>  When I moved the blk-mq debugging information to debugfs, I didn't
>  realize that blktrace also created directories in debugfs that
>  conflicted with the blk-mq directories. This series fixes that.
> 
>  Patch 1 adds a new debugfs helper needed for patch 6. Greg, could I get
>  an ack on that if it makes sense? Jens and I went back and forth on this
>  for a little while, but patch 6 has more of the rationale on why we
>  decided that this approach was the cleanest.
> >>>
> >>> I can't find patch 6, you only cc:ed me on the first patch :(
> >>>
> >>> Care to bounce them all to me?
> >>>
> >>> thanks,
> >>>
> >>> greg k-h
> >>
> >> Gah, I forgot --cc-cover to git-send-email. The series is all here:
> >> http://marc.info/?l=linux-block=1=201701=2. I can also send the
> >> patches directly to you if you prefer that.
> > 
> > I don't understand the problem here.  How do you not know if you have
> > created the debugfs file or not?  You have the structure, with the
> > correct name, how could it have been created?  Can't you save the dentry
> > to the debugfs file in the structure that has the name?
> 
> The problem is that blktrace registers a trace name directory, which
> can be either whole device or partition, depending on what you trace.
> For the blk-mq debug parts, we always just register the whole device
> name. There's no way to save the partition dentry, and imho, why even
> would you when you can just look it up. It's a file system...

I agree, it is a file system, but usually that debugfs file is
associated with some sort of data you want to keep track of outside of a
filesystem :)

Anyway, if it's such a big pain, then it's fine, add the function, no
objection from me anymore.

Acked-by: Greg Kroah-Hartman 

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] block: fix blk-mq debugfs vs. blktrace

2017-02-02 Thread Omar Sandoval
On Thu, Feb 02, 2017 at 11:58:53AM +0100, Greg Kroah-Hartman wrote:
> On Wed, Feb 01, 2017 at 12:31:15AM -0800, Omar Sandoval wrote:
> > On Wed, Feb 01, 2017 at 09:16:08AM +0100, Greg Kroah-Hartman wrote:
> > > On Tue, Jan 31, 2017 at 02:53:16PM -0800, Omar Sandoval wrote:
> > > > From: Omar Sandoval 
> > > > 
> > > > When I moved the blk-mq debugging information to debugfs, I didn't
> > > > realize that blktrace also created directories in debugfs that
> > > > conflicted with the blk-mq directories. This series fixes that.
> > > > 
> > > > Patch 1 adds a new debugfs helper needed for patch 6. Greg, could I get
> > > > an ack on that if it makes sense? Jens and I went back and forth on this
> > > > for a little while, but patch 6 has more of the rationale on why we
> > > > decided that this approach was the cleanest.
> > > 
> > > I can't find patch 6, you only cc:ed me on the first patch :(
> > > 
> > > Care to bounce them all to me?
> > > 
> > > thanks,
> > > 
> > > greg k-h
> > 
> > Gah, I forgot --cc-cover to git-send-email. The series is all here:
> > http://marc.info/?l=linux-block=1=201701=2. I can also send the
> > patches directly to you if you prefer that.
> 
> I don't understand the problem here.  How do you not know if you have
> created the debugfs file or not?  You have the structure, with the
> correct name, how could it have been created?  Can't you save the dentry
> to the debugfs file in the structure that has the name?
> 
> thanks,
> 
> greg k-h

Hey, Greg,

So here's the alternative to doing the lookup:

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 38052f625a0f..79ef6b9d1f96 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -470,12 +470,15 @@ static int do_blk_trace_setup(struct request_queue *q, 
char *name, dev_t dev,
if (!blk_debugfs_root)
goto err;
 
-   dir = debugfs_create_dir(buts->name, blk_debugfs_root);
-
+#ifdef CONFIG_BLK_DEBUG_FS
+   if (q->mq_ops && !bdev->bd_part.partno)
+   dir = q->debugfs_dir;
+#endif
+   if (!dir)
+   bt->dir = dir = debugfs_create_dir(buts->name, 
blk_debugfs_root);
if (!dir)
goto err;
 
-   bt->dir = dir;
bt->dev = dev;
atomic_set(>dropped, 0);
INIT_LIST_HEAD(>running_list);


So we could figure out if it exists, but it's very special-cased and
fragile. And then there's this further up:


/*
 * some device names have larger paths - convert the slashes
 * to underscores for this to work as expected
 */
strreplace(buts->name, '/', '_');


which I'm not sure applies anymore but would also break this. Doing the
lookup is just more foolproof.

Thanks,
Omar
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] block: free merged request in the caller

2017-02-02 Thread Jens Axboe
If we end up doing a request-to-request merge when we have completed
a bio-to-request merge, we free the request from deep down in that
path. For blk-mq-sched, the merge path has to hold the appropriate
lock, but we don't need it for freeing the request. And in fact
holding the lock is problematic, since we are now calling the
mq sched put_rq_private() hook with the lock held. Other call paths
do not hold this lock.

Fix this inconsistency by ensuring that the caller frees a merged
request. Then we can do it outside of the lock, making it both more
efficient and fixing the blk-mq-sched problem of invoking parts of
the scheduler with an unknown lock state.

Reported-by: Paolo Valente 
Signed-off-by: Jens Axboe 
---
 block/blk-core.c | 12 +---
 block/blk-merge.c| 15 ---
 block/blk-mq-sched.c |  9 ++---
 block/blk-mq-sched.h |  3 ++-
 block/mq-deadline.c  |  8 ++--
 5 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a5726e01f839..00c90f8cd682 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1591,7 +1591,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
 {
struct blk_plug *plug;
int el_ret, where = ELEVATOR_INSERT_SORT;
-   struct request *req;
+   struct request *req, *free;
unsigned int request_count = 0;
unsigned int wb_acct;
 
@@ -1632,15 +1632,21 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
struct bio *bio)
if (el_ret == ELEVATOR_BACK_MERGE) {
if (bio_attempt_back_merge(q, req, bio)) {
elv_bio_merged(q, req, bio);
-   if (!attempt_back_merge(q, req))
+   free = attempt_back_merge(q, req);
+   if (!free)
elv_merged_request(q, req, el_ret);
+   else
+   __blk_put_request(q, free);
goto out_unlock;
}
} else if (el_ret == ELEVATOR_FRONT_MERGE) {
if (bio_attempt_front_merge(q, req, bio)) {
elv_bio_merged(q, req, bio);
-   if (!attempt_front_merge(q, req))
+   free = attempt_front_merge(q, req);
+   if (!free)
elv_merged_request(q, req, el_ret);
+   else
+   __blk_put_request(q, free);
goto out_unlock;
}
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3826fc32b72c..a373416dbc9a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -733,9 +733,11 @@ static struct request *attempt_merge(struct request_queue 
*q,
if (blk_rq_cpu_valid(next))
req->cpu = next->cpu;
 
-   /* owner-ship of bio passed from next to req */
+   /*
+* ownership of bio passed from next to req, return 'next' for
+* the caller to free
+*/
next->bio = NULL;
-   __blk_put_request(q, next);
return next;
 }
 
@@ -763,12 +765,19 @@ int blk_attempt_req_merge(struct request_queue *q, struct 
request *rq,
  struct request *next)
 {
struct elevator_queue *e = q->elevator;
+   struct request *free;
 
if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
return 0;
 
-   return attempt_merge(q, rq, next) != NULL;
+   free = attempt_merge(q, rq, next);
+   if (free) {
+   __blk_put_request(q, free);
+   return 1;
+   }
+
+   return 0;
 }
 
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 114814ec3d49..d93b56d53c4e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -234,7 +234,8 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx 
*hctx,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_move_to_dispatch);
 
-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
+   struct request **merged_request)
 {
struct request *rq;
int ret;
@@ -244,7 +245,8 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct 
bio *bio)
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
if (bio_attempt_back_merge(q, rq, bio)) {
-   if (!attempt_back_merge(q, rq))
+   *merged_request = attempt_back_merge(q, rq);
+   if (!*merged_request)
elv_merged_request(q, rq, ret);
return true;
}
@@ -252,7 +254,8 @@ bool 

[PATCH 0/2] blk-mq-sched: fix put_rq_private() lock inconsistency

2017-02-02 Thread Jens Axboe
I tested the patch I sent to Paolo yesterday, and it seems to work
fine. I broke it up into two pieces, so the functional change is
restricted to patch #2.

Basically this fixes the case where we can invoke the blk-mq-sched
put request functions in an inconsistent state. Most of the time we
invoke them without any locks held, but for the case where we get
a successful request-to-request merge on the back of a bio-to-request
merge, we can invoke it with whatever lock the scheduler held when
it called blk_mq_sched_try_merge().

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] blk-merge: return the merged request

2017-02-02 Thread Jens Axboe
When we attempt to merge request-to-request, we return a 0/1 if we
ended up merging or not. Change that to return the pointer to the
request that we freed. We will use this to move the freeing of
that request out of the merge logic, so that callers can drop
locks before freeing the request.

There should be no functional changes in this patch.

Signed-off-by: Jens Axboe 
---
 block/blk-merge.c | 31 ---
 block/blk.h   |  4 ++--
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6aa43dec5af4..3826fc32b72c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -659,31 +659,32 @@ static void blk_account_io_merge(struct request *req)
 }
 
 /*
- * Has to be called with the request spinlock acquired
+ * For non-mq, this has to be called with the request spinlock acquired.
+ * For mq with scheduling, the appropriate queue wide lock should be held.
  */
-static int attempt_merge(struct request_queue *q, struct request *req,
- struct request *next)
+static struct request *attempt_merge(struct request_queue *q,
+struct request *req, struct request *next)
 {
if (!rq_mergeable(req) || !rq_mergeable(next))
-   return 0;
+   return NULL;
 
if (req_op(req) != req_op(next))
-   return 0;
+   return NULL;
 
/*
 * not contiguous
 */
if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
-   return 0;
+   return NULL;
 
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk
|| req_no_special_merge(next))
-   return 0;
+   return NULL;
 
if (req_op(req) == REQ_OP_WRITE_SAME &&
!blk_write_same_mergeable(req->bio, next->bio))
-   return 0;
+   return NULL;
 
/*
 * If we are allowed to merge, then append bio list
@@ -692,7 +693,7 @@ static int attempt_merge(struct request_queue *q, struct 
request *req,
 * counts here.
 */
if (!ll_merge_requests_fn(q, req, next))
-   return 0;
+   return NULL;
 
/*
 * If failfast settings disagree or any of the two is already
@@ -735,27 +736,27 @@ static int attempt_merge(struct request_queue *q, struct 
request *req,
/* owner-ship of bio passed from next to req */
next->bio = NULL;
__blk_put_request(q, next);
-   return 1;
+   return next;
 }
 
-int attempt_back_merge(struct request_queue *q, struct request *rq)
+struct request *attempt_back_merge(struct request_queue *q, struct request *rq)
 {
struct request *next = elv_latter_request(q, rq);
 
if (next)
return attempt_merge(q, rq, next);
 
-   return 0;
+   return NULL;
 }
 
-int attempt_front_merge(struct request_queue *q, struct request *rq)
+struct request *attempt_front_merge(struct request_queue *q, struct request 
*rq)
 {
struct request *prev = elv_former_request(q, rq);
 
if (prev)
return attempt_merge(q, prev, rq);
 
-   return 0;
+   return NULL;
 }
 
 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -767,7 +768,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct 
request *rq,
if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
return 0;
 
-   return attempt_merge(q, rq, next);
+   return attempt_merge(q, rq, next) != NULL;
 }
 
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
diff --git a/block/blk.h b/block/blk.h
index c1bd4bf9e645..918cea38d51e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -204,8 +204,8 @@ int ll_back_merge_fn(struct request_queue *q, struct 
request *req,
 struct bio *bio);
 int ll_front_merge_fn(struct request_queue *q, struct request *req, 
  struct bio *bio);
-int attempt_back_merge(struct request_queue *q, struct request *rq);
-int attempt_front_merge(struct request_queue *q, struct request *rq);
+struct request *attempt_back_merge(struct request_queue *q, struct request 
*rq);
+struct request *attempt_front_merge(struct request_queue *q, struct request 
*rq);
 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
 void blk_recalc_rq_segments(struct request *rq);
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-02-02 Thread Jens Axboe
On 02/02/2017 02:19 AM, Paolo Valente wrote:
> The scheme is clear.  One comment, in case it could make sense and
> avoid more complexity: since put_rq_priv is invoked in two different
> contexts, process or interrupt, I didn't feel so confusing that, when
> put_rq_priv is invoked in the context where the lock cannot be held
> (unless one is willing to pay with irq disabling all the times), the
> lock is not held, while, when invoked in the context where the lock
> can be held, the lock is actually held, or must be taken.

If you grab the same lock from put_rq_priv, yes, you must make it IRQ
disabling in all contexts, and use _irqsave() from put_rq_priv. If it's
just freeing resources, you could potentially wait and do that when
someone else needs them, since that part will come from proces context.
That would need two locks, though.

As I said above, I would not worry about the IRQ disabling lock.

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] block: fix blk-mq debugfs vs. blktrace

2017-02-02 Thread Jens Axboe
On 02/02/2017 03:58 AM, Greg Kroah-Hartman wrote:
> On Wed, Feb 01, 2017 at 12:31:15AM -0800, Omar Sandoval wrote:
>> On Wed, Feb 01, 2017 at 09:16:08AM +0100, Greg Kroah-Hartman wrote:
>>> On Tue, Jan 31, 2017 at 02:53:16PM -0800, Omar Sandoval wrote:
 From: Omar Sandoval 

 When I moved the blk-mq debugging information to debugfs, I didn't
 realize that blktrace also created directories in debugfs that
 conflicted with the blk-mq directories. This series fixes that.

 Patch 1 adds a new debugfs helper needed for patch 6. Greg, could I get
 an ack on that if it makes sense? Jens and I went back and forth on this
 for a little while, but patch 6 has more of the rationale on why we
 decided that this approach was the cleanest.
>>>
>>> I can't find patch 6, you only cc:ed me on the first patch :(
>>>
>>> Care to bounce them all to me?
>>>
>>> thanks,
>>>
>>> greg k-h
>>
>> Gah, I forgot --cc-cover to git-send-email. The series is all here:
>> http://marc.info/?l=linux-block=1=201701=2. I can also send the
>> patches directly to you if you prefer that.
> 
> I don't understand the problem here.  How do you not know if you have
> created the debugfs file or not?  You have the structure, with the
> correct name, how could it have been created?  Can't you save the dentry
> to the debugfs file in the structure that has the name?

The problem is that blktrace registers a trace name directory, which
can be either whole device or partition, depending on what you trace.
For the blk-mq debug parts, we always just register the whole device
name. There's no way to save the partition dentry, and imho, why even
would you when you can just look it up. It's a file system...

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/5 v3] BDI lifetime fix

2017-02-02 Thread Jens Axboe
On 02/02/2017 07:56 AM, Jan Kara wrote:
> Hello,
> 
> this is the third version of the patch series that attempts to solve the
> problems with the life time of a backing_dev_info structure. Currently it 
> lives
> inside request_queue structure and thus it gets destroyed as soon as request
> queue goes away. However the block device inode still stays around and thus
> inode_to_bdi() call on that inode (e.g. from flusher worker) may happen after
> request queue has been destroyed resulting in oops.
> 
> This patch set tries to solve these problems by making backing_dev_info
> independent structure referenced from block device inode. That makes sure
> inode_to_bdi() cannot ever oops. I gave some basic testing to the patches in
> KVM and on a real machine, Dan was running them with libnvdimm test suite 
> which
> was previously triggering the oops and things look good. So patches should be
> reasonably healthy.
> 
> Changes since v2:
> * Added Reviewed-by tags
> * Removed slab cache for backing_dev_info
> * Added patch to remove blkdev_get_backing_dev_info()
> 
> Changes since v1:
> * Use kref instead of atomic_t for refcount
> * Get rid of free_on_put flag

Added for 4.11, thanks Jan!

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] block: Dynamically allocate and refcount backing_dev_info

2017-02-02 Thread Jan Kara
Instead of storing backing_dev_info inside struct request_queue,
allocate it dynamically, reference count it, and free it when the last
reference is dropped. Currently only request_queue holds the reference
but in the following patch we add other users referencing
backing_dev_info.

Signed-off-by: Jan Kara 
---
 block/blk-core.c | 12 +---
 block/blk-sysfs.c|  2 +-
 include/linux/backing-dev-defs.h |  2 ++
 include/linux/backing-dev.h  | 10 +-
 include/linux/blkdev.h   |  1 -
 mm/backing-dev.c | 34 +-
 6 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a9ff1b919ae7..545ccb4b96f3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -693,7 +693,6 @@ static void blk_rq_timed_out_timer(unsigned long data)
 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
struct request_queue *q;
-   int err;
 
q = kmem_cache_alloc_node(blk_requestq_cachep,
gfp_mask | __GFP_ZERO, node_id);
@@ -708,17 +707,16 @@ struct request_queue *blk_alloc_queue_node(gfp_t 
gfp_mask, int node_id)
if (!q->bio_split)
goto fail_id;
 
-   q->backing_dev_info = >_backing_dev_info;
+   q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
+   if (!q->backing_dev_info)
+   goto fail_split;
+
q->backing_dev_info->ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->backing_dev_info->name = "block";
q->node = node_id;
 
-   err = bdi_init(q->backing_dev_info);
-   if (err)
-   goto fail_split;
-
setup_timer(>backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
setup_timer(>timeout, blk_rq_timed_out_timer, (unsigned long) q);
@@ -769,7 +767,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, 
int node_id)
 fail_ref:
percpu_ref_exit(>q_usage_counter);
 fail_bdi:
-   bdi_destroy(q->backing_dev_info);
+   bdi_put(q->backing_dev_info);
 fail_split:
bioset_free(q->bio_split);
 fail_id:
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 64fb54c6b41c..4cbaa519ec2d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -799,7 +799,7 @@ static void blk_release_queue(struct kobject *kobj)
container_of(kobj, struct request_queue, kobj);
 
wbt_exit(q);
-   bdi_exit(q->backing_dev_info);
+   bdi_put(q->backing_dev_info);
blkcg_exit_queue(q);
 
if (q->elevator) {
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index e850e76acaaf..ad955817916d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct page;
 struct device;
@@ -144,6 +145,7 @@ struct backing_dev_info {
 
char *name;
 
+   struct kref refcnt; /* Reference counter for the structure */
unsigned int capabilities; /* Device capabilities */
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 43b93a947e61..efb6ca992d05 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,7 +18,14 @@
 #include 
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
-void bdi_exit(struct backing_dev_info *bdi);
+
+static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
+{
+   kref_get(>refcnt);
+   return bdi;
+}
+
+void bdi_put(struct backing_dev_info *bdi);
 
 __printf(3, 4)
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -29,6 +36,7 @@ void bdi_unregister(struct backing_dev_info *bdi);
 
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
 void bdi_destroy(struct backing_dev_info *bdi);
+struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id);
 
 void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
bool range_cyclic, enum wb_reason reason);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 069e4a102a73..de85701cc699 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -433,7 +433,6 @@ struct request_queue {
struct delayed_work delay_work;
 
struct backing_dev_info *backing_dev_info;
-   struct backing_dev_info _backing_dev_info;
 
/*
 * The queue owner gets to use this for whatever they like.
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 3bfed5ab2475..28ce6cf7b2ff 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -237,6 +237,7 @@ static __init int bdi_class_init(void)
 
bdi_class->dev_groups = bdi_dev_groups;

[PATCH 0/5 v3] BDI lifetime fix

2017-02-02 Thread Jan Kara
Hello,

this is the third version of the patch series that attempts to solve the
problems with the life time of a backing_dev_info structure. Currently it lives
inside request_queue structure and thus it gets destroyed as soon as request
queue goes away. However the block device inode still stays around and thus
inode_to_bdi() call on that inode (e.g. from flusher worker) may happen after
request queue has been destroyed resulting in oops.

This patch set tries to solve these problems by making backing_dev_info
independent structure referenced from block device inode. That makes sure
inode_to_bdi() cannot ever oops. I gave some basic testing to the patches in
KVM and on a real machine, Dan was running them with libnvdimm test suite which
was previously triggering the oops and things look good. So patches should be
reasonably healthy.

Changes since v2:
* Added Reviewed-by tags
* Removed slab cache for backing_dev_info
* Added patch to remove blkdev_get_backing_dev_info()

Changes since v1:
* Use kref instead of atomic_t for refcount
* Get rid of free_on_put flag

Honza
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] block: Unhash block device inodes on gendisk destruction

2017-02-02 Thread Jan Kara
Currently, block device inodes stay around after corresponding gendisk
hash died until memory reclaim finds them and frees them. Since we will
make block device inode pin the bdi, we want to free the block device
inode as soon as the device goes away so that bdi does not stay around
unnecessarily. Furthermore we need to avoid issues when new device with
the same major,minor pair gets created since reusing the bdi structure
would be rather difficult in this case.

Unhashing block device inode on gendisk destruction nicely deals with
these problems. Once last block device inode reference is dropped (which
may be directly in del_gendisk()), the inode gets evicted. Furthermore if
the major,minor pair gets reallocated, we are guaranteed to get new
block device inode even if old block device inode is not yet evicted and
thus we avoid issues with possible reuse of bdi.

Reviewed-by: Christoph Hellwig 
Signed-off-by: Jan Kara 
---
 block/genhd.c  |  2 ++
 fs/block_dev.c | 15 +++
 include/linux/fs.h |  1 +
 3 files changed, 18 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index fcd6d4fae657..f2f22d0e8e14 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -648,6 +648,8 @@ void del_gendisk(struct gendisk *disk)
disk_part_iter_init(, disk,
 DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next())) {
+   bdev_unhash_inode(MKDEV(disk->major,
+   disk->first_minor + part->partno));
invalidate_partition(disk, part->partno);
delete_partition(disk, part->partno);
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 5db5d1340d69..ed6a34be7a1e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -954,6 +954,21 @@ static int bdev_set(struct inode *inode, void *data)
 
 static LIST_HEAD(all_bdevs);
 
+/*
+ * If there is a bdev inode for this device, unhash it so that it gets evicted
+ * as soon as last inode reference is dropped.
+ */
+void bdev_unhash_inode(dev_t dev)
+{
+   struct inode *inode;
+
+   inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, );
+   if (inode) {
+   remove_inode_hash(inode);
+   iput(inode);
+   }
+}
+
 struct block_device *bdget(dev_t dev)
 {
struct block_device *bdev;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba074328894..702cb6c50194 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2342,6 +2342,7 @@ extern struct kmem_cache *names_cachep;
 #ifdef CONFIG_BLOCK
 extern int register_blkdev(unsigned int, const char *);
 extern void unregister_blkdev(unsigned int, const char *);
+extern void bdev_unhash_inode(dev_t dev);
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] block: Make blk_get_backing_dev_info() safe without open bdev

2017-02-02 Thread Jan Kara
Currenly blk_get_backing_dev_info() is not safe to be called when the
block device is not open as bdev->bd_disk is NULL in that case. However
inode_to_bdi() uses this function and may be call called from flusher
worker or other writeback related functions without bdev being open
which leads to crashes such as:

[113031.075540] Unable to handle kernel paging request for data at address 
0x
[113031.075614] Faulting instruction address: 0xc03692e0
0:mon> t
[c000fb65f900] c036cb6c writeback_sb_inodes+0x30c/0x590
[c000fb65fa10] c036ced4 __writeback_inodes_wb+0xe4/0x150
[c000fb65fa70] c036d33c wb_writeback+0x30c/0x450
[c000fb65fb40] c036e198 wb_workfn+0x268/0x580
[c000fb65fc50] c00f3470 process_one_work+0x1e0/0x590
[c000fb65fce0] c00f38c8 worker_thread+0xa8/0x660
[c000fb65fd80] c00fc4b0 kthread+0x110/0x130
[c000fb65fe30] c00098f0 ret_from_kernel_thread+0x5c/0x6c
--- Exception: 0  at 
0:mon> e
cpu 0x0: Vector: 300 (Data Access) at [c000fb65f620]
pc: c03692e0: locked_inode_to_wb_and_lock_list+0x50/0x290
lr: c036cb6c: writeback_sb_inodes+0x30c/0x590
sp: c000fb65f8a0
   msr: 80010280b033
   dar: 0
 dsisr: 4000
  current = 0xc001d69be400
  paca= 0xc348   softe: 0irq_happened: 0x01
pid   = 18689, comm = kworker/u16:10

Fix the problem by grabbing reference to bdi on first open of the block
device and drop the reference only once the inode is evicted from
memory. This pins struct backing_dev_info in memory and thus fixes the
crashes.

Reviewed-by: Christoph Hellwig 
Reported-and-tested-by: Dan Williams 
Reported-by: Laurent Dufour 
Signed-off-by: Jan Kara 
---
 block/blk-core.c   | 8 +++-
 fs/block_dev.c | 7 +++
 include/linux/fs.h | 1 +
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 545ccb4b96f3..84fabb51714a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -109,14 +109,12 @@ void blk_queue_congestion_threshold(struct request_queue 
*q)
  * @bdev:  device
  *
  * Locates the passed device's request queue and returns the address of its
- * backing_dev_info.  This function can only be called if @bdev is opened
- * and the return value is never NULL.
+ * backing_dev_info. The return value is never NULL however we may return
+ * _backing_dev_info if the bdev is not currently open.
  */
 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
 {
-   struct request_queue *q = bdev_get_queue(bdev);
-
-   return q->backing_dev_info;
+   return bdev->bd_bdi;
 }
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ed6a34be7a1e..601b71b76d7f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -884,6 +884,8 @@ static void bdev_evict_inode(struct inode *inode)
spin_lock(_lock);
list_del_init(>bd_list);
spin_unlock(_lock);
+   if (bdev->bd_bdi != _backing_dev_info)
+   bdi_put(bdev->bd_bdi);
 }
 
 static const struct super_operations bdev_sops = {
@@ -986,6 +988,7 @@ struct block_device *bdget(dev_t dev)
bdev->bd_contains = NULL;
bdev->bd_super = NULL;
bdev->bd_inode = inode;
+   bdev->bd_bdi = _backing_dev_info;
bdev->bd_block_size = (1 << inode->i_blkbits);
bdev->bd_part_count = 0;
bdev->bd_invalidated = 0;
@@ -1542,6 +1545,8 @@ static int __blkdev_get(struct block_device *bdev, 
fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
+   if (bdev->bd_bdi == _backing_dev_info)
+   bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
 
if (!partno) {
ret = -ENXIO;
@@ -1637,6 +1642,8 @@ static int __blkdev_get(struct block_device *bdev, 
fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
+   bdi_put(bdev->bd_bdi);
+   bdev->bd_bdi = _backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 702cb6c50194..c930cbc19342 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -423,6 +423,7 @@ struct block_device {
int bd_invalidated;
struct gendisk *bd_disk;
struct request_queue *  bd_queue;
+   struct backing_dev_info *bd_bdi;
struct list_headbd_list;
/*
 * Private data.  You must have bd_claim'ed the block_device
-- 
2.10.2

--
To unsubscribe from this list: send the line 

[PATCH 5/5] block: Get rid of blk_get_backing_dev_info()

2017-02-02 Thread Jan Kara
blk_get_backing_dev_info() is now a simple dereference. Remove that
function and simplify some code around that.

Signed-off-by: Jan Kara 
---
 block/blk-core.c| 14 --
 block/compat_ioctl.c|  7 ++-
 block/ioctl.c   |  7 ++-
 fs/btrfs/disk-io.c  |  2 +-
 fs/btrfs/volumes.c  |  2 +-
 fs/xfs/xfs_buf.c|  3 +--
 fs/xfs/xfs_buf.h|  1 -
 include/linux/backing-dev.h |  2 +-
 include/linux/blkdev.h  |  1 -
 9 files changed, 8 insertions(+), 31 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 84fabb51714a..47104f6a398b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -104,20 +104,6 @@ void blk_queue_congestion_threshold(struct request_queue 
*q)
q->nr_congestion_off = nr;
 }
 
-/**
- * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
- * @bdev:  device
- *
- * Locates the passed device's request queue and returns the address of its
- * backing_dev_info. The return value is never NULL however we may return
- * _backing_dev_info if the bdev is not currently open.
- */
-struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
-{
-   return bdev->bd_bdi;
-}
-EXPORT_SYMBOL(blk_get_backing_dev_info);
-
 void blk_rq_init(struct request_queue *q, struct request *rq)
 {
memset(rq, 0, sizeof(*rq));
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 556826ac7cb4..570021a0dc1c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -661,7 +661,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, 
unsigned long arg)
struct block_device *bdev = inode->i_bdev;
struct gendisk *disk = bdev->bd_disk;
fmode_t mode = file->f_mode;
-   struct backing_dev_info *bdi;
loff_t size;
unsigned int max_sectors;
 
@@ -708,9 +707,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, 
unsigned long arg)
case BLKFRAGET:
if (!arg)
return -EINVAL;
-   bdi = blk_get_backing_dev_info(bdev);
return compat_put_long(arg,
-  (bdi->ra_pages * PAGE_SIZE) / 512);
+  (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
case BLKROGET: /* compatible */
return compat_put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -728,8 +726,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, 
unsigned long arg)
case BLKFRASET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
-   bdi = blk_get_backing_dev_info(bdev);
-   bdi->ra_pages = (arg * 512) / PAGE_SIZE;
+   bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
diff --git a/block/ioctl.c b/block/ioctl.c
index be7f4de3eb3c..7b88820b93d9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -505,7 +505,6 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t 
mode,
 int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
unsigned long arg)
 {
-   struct backing_dev_info *bdi;
void __user *argp = (void __user *)arg;
loff_t size;
unsigned int max_sectors;
@@ -532,8 +531,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, 
unsigned cmd,
case BLKFRAGET:
if (!arg)
return -EINVAL;
-   bdi = blk_get_backing_dev_info(bdev);
-   return put_long(arg, (bdi->ra_pages * PAGE_SIZE) / 512);
+   return put_long(arg, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512);
case BLKROGET:
return put_int(arg, bdev_read_only(bdev) != 0);
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -560,8 +558,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, 
unsigned cmd,
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
-   bdi = blk_get_backing_dev_info(bdev);
-   bdi->ra_pages = (arg * 512) / PAGE_SIZE;
+   bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKBSZSET:
return blkdev_bszset(bdev, mode, argp);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 18004169552c..37a31b12bb0c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1800,7 +1800,7 @@ static int btrfs_congested_fn(void *congested_data, int 
bdi_bits)
list_for_each_entry_rcu(device, >fs_devices->devices, dev_list) {
if (!device->bdev)
continue;
-   bdi = blk_get_backing_dev_info(device->bdev);
+   bdi = device->bdev->bd_bdi;

[PATCH 2/5] block: Use pointer to backing_dev_info from request_queue

2017-02-02 Thread Jan Kara
We will want to have struct backing_dev_info allocated separately from
struct request_queue. As the first step add pointer to backing_dev_info
to request_queue and convert all users touching it. No functional
changes in this patch.

Reviewed-by: Christoph Hellwig 
Signed-off-by: Jan Kara 
---
 block/blk-cgroup.c |  6 +++---
 block/blk-core.c   | 27 ++-
 block/blk-integrity.c  |  4 ++--
 block/blk-settings.c   |  2 +-
 block/blk-sysfs.c  |  8 
 block/blk-wbt.c|  8 
 block/genhd.c  |  2 +-
 drivers/block/aoe/aoeblk.c |  4 ++--
 drivers/block/drbd/drbd_main.c |  6 +++---
 drivers/block/drbd/drbd_nl.c   | 12 +++-
 drivers/block/drbd/drbd_proc.c |  2 +-
 drivers/block/drbd/drbd_req.c  |  2 +-
 drivers/block/pktcdvd.c|  4 ++--
 drivers/block/rbd.c|  2 +-
 drivers/md/bcache/request.c| 10 +-
 drivers/md/bcache/super.c  |  8 
 drivers/md/dm-cache-target.c   |  2 +-
 drivers/md/dm-era-target.c |  2 +-
 drivers/md/dm-table.c  |  2 +-
 drivers/md/dm-thin.c   |  2 +-
 drivers/md/dm.c|  6 +++---
 drivers/md/linear.c|  2 +-
 drivers/md/md.c|  6 +++---
 drivers/md/multipath.c |  2 +-
 drivers/md/raid0.c |  6 +++---
 drivers/md/raid1.c |  4 ++--
 drivers/md/raid10.c| 10 +-
 drivers/md/raid5.c | 12 ++--
 fs/gfs2/ops_fstype.c   |  2 +-
 fs/nilfs2/super.c  |  2 +-
 fs/super.c |  2 +-
 include/linux/blkdev.h |  3 ++-
 mm/page-writeback.c|  4 ++--
 33 files changed, 90 insertions(+), 86 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 8ba0af780e88..d673a69b61b4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -184,7 +184,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
 
-   wb_congested = wb_congested_get_create(>backing_dev_info,
+   wb_congested = wb_congested_get_create(q->backing_dev_info,
   blkcg->css.id,
   GFP_NOWAIT | __GFP_NOWARN);
if (!wb_congested) {
@@ -469,8 +469,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state 
*css,
 const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
/* some drivers (floppy) instantiate a queue w/o disk registered */
-   if (blkg->q->backing_dev_info.dev)
-   return dev_name(blkg->q->backing_dev_info.dev);
+   if (blkg->q->backing_dev_info->dev)
+   return dev_name(blkg->q->backing_dev_info->dev);
return NULL;
 }
 EXPORT_SYMBOL_GPL(blkg_dev_name);
diff --git a/block/blk-core.c b/block/blk-core.c
index 61ba08c58b64..a9ff1b919ae7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -74,7 +74,7 @@ static void blk_clear_congested(struct request_list *rl, int 
sync)
 * flip its congestion state for events on other blkcgs.
 */
if (rl == >q->root_rl)
-   clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+   clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
 #endif
 }
 
@@ -85,7 +85,7 @@ static void blk_set_congested(struct request_list *rl, int 
sync)
 #else
/* see blk_clear_congested() */
if (rl == >q->root_rl)
-   set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+   set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
 #endif
 }
 
@@ -116,7 +116,7 @@ struct backing_dev_info *blk_get_backing_dev_info(struct 
block_device *bdev)
 {
struct request_queue *q = bdev_get_queue(bdev);
 
-   return >backing_dev_info;
+   return q->backing_dev_info;
 }
 EXPORT_SYMBOL(blk_get_backing_dev_info);
 
@@ -584,7 +584,7 @@ void blk_cleanup_queue(struct request_queue *q)
blk_flush_integrity();
 
/* @q won't process any more request, flush async actions */
-   del_timer_sync(>backing_dev_info.laptop_mode_wb_timer);
+   del_timer_sync(>backing_dev_info->laptop_mode_wb_timer);
blk_sync_queue(q);
 
if (q->mq_ops)
@@ -596,7 +596,7 @@ void blk_cleanup_queue(struct request_queue *q)
q->queue_lock = >__queue_lock;
spin_unlock_irq(lock);
 
-   bdi_unregister(>backing_dev_info);
+   bdi_unregister(q->backing_dev_info);
 
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
@@ -708,17 +708,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t 
gfp_mask, int node_id)
if (!q->bio_split)
goto fail_id;
 
-   q->backing_dev_info.ra_pages =
+   q->backing_dev_info = >_backing_dev_info;
+   q->backing_dev_info->ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_SIZE;
-   

Re: [PATCH 3/4] block: Dynamically allocate and refcount backing_dev_info

2017-02-02 Thread Jan Kara
On Wed 01-02-17 14:45:20, Jens Axboe wrote:
> On 02/01/2017 04:22 AM, Jan Kara wrote:
> > On Wed 01-02-17 01:50:07, Christoph Hellwig wrote:
> >> On Tue, Jan 31, 2017 at 01:54:28PM +0100, Jan Kara wrote:
> >>> Instead of storing backing_dev_info inside struct request_queue,
> >>> allocate it dynamically, reference count it, and free it when the last
> >>> reference is dropped. Currently only request_queue holds the reference
> >>> but in the following patch we add other users referencing
> >>> backing_dev_info.
> >>
> >> Do we really need the separate slab cache?  Otherwise this looks
> >> fine to me.
> > 
> > Yeah, probably it is not worth it. I'll remove it.
> 
> I agree on that, it should not be a hot path. Will you respin the series
> after making this change? Would be great to get this queued up.

Yes, will send it later today. I was just waiting whether someone else does
not have more comments to the series.

Honza
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/6] block: fix blk-mq debugfs vs. blktrace

2017-02-02 Thread Greg Kroah-Hartman
On Wed, Feb 01, 2017 at 12:31:15AM -0800, Omar Sandoval wrote:
> On Wed, Feb 01, 2017 at 09:16:08AM +0100, Greg Kroah-Hartman wrote:
> > On Tue, Jan 31, 2017 at 02:53:16PM -0800, Omar Sandoval wrote:
> > > From: Omar Sandoval 
> > > 
> > > When I moved the blk-mq debugging information to debugfs, I didn't
> > > realize that blktrace also created directories in debugfs that
> > > conflicted with the blk-mq directories. This series fixes that.
> > > 
> > > Patch 1 adds a new debugfs helper needed for patch 6. Greg, could I get
> > > an ack on that if it makes sense? Jens and I went back and forth on this
> > > for a little while, but patch 6 has more of the rationale on why we
> > > decided that this approach was the cleanest.
> > 
> > I can't find patch 6, you only cc:ed me on the first patch :(
> > 
> > Care to bounce them all to me?
> > 
> > thanks,
> > 
> > greg k-h
> 
> Gah, I forgot --cc-cover to git-send-email. The series is all here:
> http://marc.info/?l=linux-block=1=201701=2. I can also send the
> patches directly to you if you prefer that.

I don't understand the problem here.  How do you not know if you have
created the debugfs file or not?  You have the structure, with the
correct name, how could it have been created?  Can't you save the dentry
to the debugfs file in the structure that has the name?

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Doubt related to immutable biovecs

2017-02-02 Thread Suraj Choudhari
Thanks very much Ming and all !

I referred to multipage bvec patches link and it was indeed good & insightful.

I have few more query related to immutable bvecs & multipage bvecs, in
particular related to 4.4 kernel --

1) My understanding is bi_vcnt can't be used for a cloned BIO because
'bio_clone_fast' does NOT update the bi_vcnt field in the cloned BIO.
Is my understanding correct ?

2)  For a block device driver, is it be safe to use the 'bi_vcnt' of a
BIO that's received from the upper layers, in order to access the
bvecs array, particularly in the '4.4' kernel ?
 - I am thinking it may be safe to access bi_vcnt as long as the
block driver has NOT received a CLONE bio from upper layer.  The
reason I think can be bvecs are not multipage in 4.4. Is my
understanding right ?

Thanks & Regards,
Suraj




On Fri, Jan 20, 2017 at 9:27 PM, Ming Lei  wrote:
> Hi,
>
> On Fri, Jan 20, 2017 at 9:41 PM, Suraj Choudhari
>  wrote:
>> Hello,
>>
>> I've some queries related to accessing the 'bio_vec' and 'bi_vcnt'
>> members in the BIO structure after the kernel changes implemented for
>> the 'immutable biovecs' -
>>
>> Background -
>> -  From the changes done for the immutable biovecs, I understand that
>> the driver code now no longer needs to reference the 'bi_vcnt' and
>> 'bi_io_vec' fields directly. Instead we can use the 'bvec_iter'
>> iterator in order to to access the bio_vec.
>> -  We've iterator functions bio_iter_iovec() and bio_advance_iter() as
>> well, which return us literal 'struct biovecs' taking into account the
>> bi_bvec_done and bi_size values.
>>
>> Doubts  -
>> 1)  Few functions in the block layer and some drivers still directly
>> refer to the 'bi_vcnt' and 'bio_vec' members of the BIO structure
>> [instead of accessing bio_vecs using the bvec_iter iterator].
>> Would there be some changes in the functions to compulsorily use the
>> 'bvec_iter' in order to access bio_vecs?  If yes, are such changes
>
> Generally speaking, yes, it is always better to use iterator helpers.
>
>> related to the 'multipage biovec' feature ?
>
> Yes, it is a must for supporting multipage bvec.
>
>>
>> 2)  Can there be some problem if a driver directly accesses the
>> bi_io_vec [bio_vec] in the bio structure based on the values of bi_idx
>> and bi_vcnt ? [instead of using the bvec_iter iterator].
>
> The direct access can't work any more once switching to multipage bvec,
> and you can find some hint in the following link:
>
> https://lkml.org/lkml/2017/1/15/266
>
> Also .bi_vcnt can't be used for BIO_CLONED bio, and it is very easy
> to make mistakes to access the table directly if someone don't understand
> the iterator details enough.
>
>
> Thanks,
> Ming Lei
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-02-02 Thread Paolo Valente

> Il giorno 02 feb 2017, alle ore 06:19, Jens Axboe  ha scritto:
> 
> On 02/01/2017 04:11 AM, Paolo Valente wrote:
>>> +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
>>> +{
>>> +   struct request_queue *q = hctx->queue;
>>> +   struct deadline_data *dd = q->elevator->elevator_data;
>>> +   int ret;
>>> +
>>> +   spin_lock(>lock);
>>> +   ret = blk_mq_sched_try_merge(q, bio);
>>> +   spin_unlock(>lock);
>>> +
>> 
>> Hi Jens,
>> first, good news, bfq is passing my first sanity checks.  Still, I
>> need a little more help for the following issue.  There is a case that
>> would be impossible to handle without modifying code outside bfq.  But
>> so far such a case never occurred, and I hope that it can never occur.
>> I'll try to briefly list all relevant details on this concern of mine,
>> so that you can quickly confirm my hope, or highlight where or what I
>> am missing.
> 
> Remember my earlier advice - it's not a problem to change anything in
> the core, in fact I would be surprised if you did not need to. My
> foresight isn't THAT good! It's much better to fix up an inconsistency
> there, rather than work around it in the consumer of that API.
> 
>> First, as done above for mq-deadline, invoking blk_mq_sched_try_merge
>> with the scheduler lock held is of course necessary (for example, to
>> protect q->last_merge).  This may lead to put_rq_private invoked
>> with the lock held, in case of successful merge.
> 
> Right, or some other lock with the same scope, as per my other email.
> 
>> As a consequence, put_rq_private may be invoked:
>> (1) in IRQ context, no scheduler lock held, because of a completion:
>> can be handled by deferring work and lock grabbing, because the
>> completed request is not queued in the scheduler any more;
>> (2) in process context, scheduler lock held, because of the above
>> successful merge: must be handled immediately, for consistency,
>> because the request is still queued in the scheduler;
>> (3) in process context, no scheduler lock held, for some other reason:
>> some path apparently may lead to this case, although I've never seen
>> it to happen.  Immediate handling, and hence locking, may be needed,
>> depending on whether the request is still queued in the scheduler.
>> 
>> So, my main question is: is case (3) actually impossible?  Should it
>> be possible, I guess we would have a problem, because of the
>> different lock state with respect to (2).
> 
> I agree, there's some inconsistency there, if you potentially need to
> grab the lock in your put_rq_private handler. The problem case is #2,
> when we have the merge. I would probably suggest that the best way to
> handle that is to pass back the dropped request so we can put it outside
> of holding the lock.
> 
> Let me see if I can come up with a good solution for this. We have to be
> consistent in how we invoke the scheduler functions, we can't have hooks
> that are called in unknown lock states. I also don't want you to have to
> add defer work handling in that kind of path, that will impact your
> performance and overhead.
> 

I'll try to learn from your solution, because, as of now, I don't see
how to avoid deferred work for the case where put_rq_private is
invoked in interrupt context.  In fact, for this case, we cannot grab
the lock, unless we turn all spin_lock into spin_lock_irq*.

>> Finally, I hope that it is certainly impossible to have a case (4): in
>> IRQ context, no lock held, but with the request in the scheduler.
> 
> That should not be possible.
> 
> Edit: since I'm on a flight and email won't send, I had a few minutes to
> hack this up. Totally untested, but something like the below should do
> it. Not super pretty... I'll play with this a bit more tomorrow.
> 
> 

The scheme is clear.  One comment, in case it could make sense and
avoid more complexity: since put_rq_priv is invoked in two different
contexts, process or interrupt, I didn't feel so confusing that, when
put_rq_priv is invoked in the context where the lock cannot be held
(unless one is willing to pay with irq disabling all the times), the
lock is not held, while, when invoked in the context where the lock
can be held, the lock is actually held, or must be taken.

Thanks,
Paolo

> diff --git a/block/blk-core.c b/block/blk-core.c
> index c142de090c41..530a9a3f60c9 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1609,7 +1609,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, 
> struct bio *bio)
> {
>   struct blk_plug *plug;
>   int el_ret, where = ELEVATOR_INSERT_SORT;
> - struct request *req;
> + struct request *req, *free;
>   unsigned int request_count = 0;
>   unsigned int wb_acct;
> 
> @@ -1650,15 +1650,21 @@ static blk_qc_t blk_queue_bio(struct request_queue 
> *q, struct bio *bio)
>   if (el_ret == ELEVATOR_BACK_MERGE) {
>   if (bio_attempt_back_merge(q, req, bio)) {
>   elv_bio_merged(q, req, bio);
> -   

Re: [PATCH v3] scsi, block: fix duplicate bdi name registration crashes

2017-02-02 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig 
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html