Re: [LSF/MM TOPIC] Badblocks checking/representation in filesystems

2017-01-20 Thread Kani, Toshimitsu
On Fri, 2017-01-20 at 18:24 +0900, Yasunori Goto wrote:
 :
> > 
> > Like mentioned before, this discussion is more about presentation
> > of errors in a known consumable format, rather than recovering from
> > errors. While recovering from errors is interesting, we already
> > have layers like RAID for that, and they are as applicable to
> > NVDIMM backed storage as they have been for disk/SSD based storage.
> 
> I have one question here.
> 
> Certainly, a user can use LVM mirroring for the storage mode of NVDIMM.
> However, NVDIMM also has a DAX mode.
> Can a user use LVM mirroring for NVDIMM DAX mode?
> I could not find any information that LVM supports DAX.

dm-linear and dm-stripe support DAX.  This is done by mapping block
allocations to LVM physical devices.  Once blocks are allocated, all
DAX I/Os are direct and do not go through the device-mapper layer.  We
may be able to change it for read/write paths, but it remains true for
mmap.  So, I do not think DAX can be supported with LVM mirroring. 
This does not preclude hardware mirroring, though.

-Toshi

[PATCH 4/4] nbd: add a nbd-control interface

2017-01-20 Thread Josef Bacik
This patch mirrors the loopback device behavior with a few changes.  First,
there is no DEL operation, as NBD doesn't see as much churn as loop devices do.
Second, the GET_NEXT operation can optionally create a new NBD device or not.
Our infrastructure people do not want NBD to create new devices, as it
causes problems for them in containers.  However, this is left optional, as
things like the OSS NBD client probably don't care and would just like to be
given a device to use.

Signed-off-by: Josef Bacik 
---
 drivers/block/nbd.c  | 129 ++-
 include/uapi/linux/nbd.h |  10 
 2 files changed, 125 insertions(+), 14 deletions(-)
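
For context, a rough userspace sketch of how the new control node could be
driven (this assumes NBD_CTL_GET_FREE is a plain ioctl number exported by the
uapi/linux/nbd.h hunk, which is not shown in this digest):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/nbd.h>

int main(void)
{
	int fd, index;

	fd = open("/dev/nbd-control", O_RDWR);
	if (fd < 0) {
		perror("open /dev/nbd-control");
		return 1;
	}
	/* parm == 1: allow creating a new device if no free one exists */
	index = ioctl(fd, NBD_CTL_GET_FREE, 1);
	if (index < 0)
		perror("NBD_CTL_GET_FREE");
	else
		printf("free device: /dev/nbd%d\n", index);
	return 0;
}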

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 96f7681..c06ed36 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -59,6 +60,7 @@ struct nbd_device {
unsigned long runtime_flags;
struct nbd_sock **socks;
int magic;
+   int index;
 
struct blk_mq_tag_set tag_set;
 
@@ -1041,6 +1043,7 @@ static int nbd_dev_add(int index)
if (err < 0)
goto out_free_disk;
 
+   nbd->index = index;
nbd->disk = disk;
nbd->tag_set.ops = &nbd_mq_ops;
nbd->tag_set.nr_hw_queues = 1;
@@ -1097,20 +1100,109 @@ static int nbd_dev_add(int index)
return err;
 }
 
-/*
- * And here should be modules and kernel interface 
- *  (Just smiley confuses emacs :-)
- */
+static int find_free_cb(int id, void *ptr, void *data)
+{
+   struct nbd_device *nbd = ptr;
+   struct nbd_device **ret = data;
+
+   if (nbd->num_connections == 0) {
+   *ret = nbd;
+   return 1;
+   }
+   if (*ret == NULL || (*ret)->index < nbd->index)
+   *ret = nbd;
+   return 0;
+}
+
+static int nbd_dev_lookup(struct nbd_device **ret, int index)
+{
+   struct nbd_device *nbd = NULL;
+
+   if (index < 0) {
+   int err = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
+   if (err != 1) {
+   if (nbd != NULL) {
+   *ret = NULL;
+   return nbd->index + 1;
+   } else {
+   return 0;
+   }
+   }
+   } else
+   nbd = idr_find(&nbd_index_idr, index);
+   if (nbd) {
+   *ret = nbd;
+   return nbd->index;
+   }
+   return -ENODEV;
+}
+
+static long nbd_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long parm)
+{
+   struct nbd_device *nbd = NULL;
+   int ret = -ENOSYS;
+
+   mutex_lock(&nbd_index_mutex);
+   switch (cmd) {
+   case NBD_CTL_ADD:
+   ret = nbd_dev_lookup(&nbd, parm);
+   if (ret >= 0) {
+   ret = -EEXIST;
+   break;
+   }
+   ret = nbd_dev_add(parm);
+   break;
+   case NBD_CTL_GET_FREE:
+   ret = nbd_dev_lookup(&nbd, -1);
+   if (ret < 0)
+   break;
+   if (parm && !nbd) {
+   int index = ret;
+
+   ret = nbd_dev_add(ret);
+   if (ret < 0)
+   break;
+   ret = index;
+   }
+   break;
+   default:
+   break;
+   }
+   mutex_unlock(&nbd_index_mutex);
+   return ret;
+}
+
+static const struct file_operations nbd_ctl_fops = {
+   .open   = nonseekable_open,
+   .unlocked_ioctl = nbd_control_ioctl,
+   .compat_ioctl   = nbd_control_ioctl,
+   .owner  = THIS_MODULE,
+   .llseek = noop_llseek,
+};
+
+static struct miscdevice nbd_misc = {
+   .minor  = NBD_CTRL_MINOR,
+   .name   = "nbd-control",
+   .fops   = &nbd_ctl_fops,
+};
+
+MODULE_ALIAS_MISCDEV(NBD_CTRL_MINOR);
+MODULE_ALIAS("devname:nbd-control");
 
 static int __init nbd_init(void)
 {
-   int i;
+   int i, ret;
 
BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
+   ret = misc_register(&nbd_misc);
+   if (ret < 0)
+   return ret;
 
+   ret = -EINVAL;
if (max_part < 0) {
printk(KERN_ERR "nbd: max_part must be >= 0\n");
-   return -EINVAL;
+   goto out_misc;
}
 
part_shift = 0;
@@ -1129,17 +1221,20 @@ static int __init nbd_init(void)
}
 
if ((1UL << part_shift) > DISK_MAX_PARTS)
-   return -EINVAL;
-
+   goto out_misc;
if (nbds_max > 1UL << (MINORBITS - part_shift))
-   return -EINVAL;
+   goto out_misc;
+
recv_workqueue = alloc_workqueue("knbd-recv",
 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
-   if (!recv_workqueue)
-   

[PATCH 3/4] nbd: use an idr to keep track of nbd devices

2017-01-20 Thread Josef Bacik
To prepare for dynamically adding new nbd devices to the system, switch
from using an array for the nbd devices to using an idr.  This copies
what loop does for keeping track of its devices.

Signed-off-by: Josef Bacik 
---
 drivers/block/nbd.c | 213 
 1 file changed, 115 insertions(+), 98 deletions(-)
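
In miniature, the loop-style pattern adopted here is a global IDR plus a
mutex in place of the fixed nbd_dev[] array; an illustrative sketch only
(the real conversion is in the hunks below):

#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/gfp.h>

static DEFINE_IDR(example_idr);
static DEFINE_MUTEX(example_mutex);

static int example_add(void *dev, int index)
{
	int id;

	mutex_lock(&example_mutex);
	if (index >= 0)
		/* caller wants this specific index: allocate in [index, index + 1) */
		id = idr_alloc(&example_idr, dev, index, index + 1, GFP_KERNEL);
	else
		/* any free id: end == 0 means "no upper limit" */
		id = idr_alloc(&example_idr, dev, 0, 0, GFP_KERNEL);
	mutex_unlock(&example_mutex);

	return id;	/* new id on success, -ENOSPC/-ENOMEM on failure */
}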

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9fe9763..96f7681 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -41,6 +41,9 @@
 
 #include 
 
+static DEFINE_IDR(nbd_index_idr);
+static DEFINE_MUTEX(nbd_index_mutex);
+
 struct nbd_sock {
struct socket *sock;
struct mutex tx_lock;
@@ -89,9 +92,9 @@ static struct dentry *nbd_dbg_dir;
 #define NBD_MAGIC 0x68797548
 
 static unsigned int nbds_max = 16;
-static struct nbd_device *nbd_dev;
 static int max_part;
 static struct workqueue_struct *recv_workqueue;
+static int part_shift;
 
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
@@ -997,6 +1000,103 @@ static struct blk_mq_ops nbd_mq_ops = {
.timeout= nbd_xmit_timeout,
 };
 
+static void nbd_dev_remove(struct nbd_device *nbd)
+{
+   struct gendisk *disk = nbd->disk;
+   nbd->magic = 0;
+   if (disk) {
+   del_gendisk(disk);
+   blk_cleanup_queue(disk->queue);
+   blk_mq_free_tag_set(&nbd->tag_set);
+   put_disk(disk);
+   }
+   kfree(nbd);
+}
+
+static int nbd_dev_add(int index)
+{
+   struct nbd_device *nbd;
+   struct gendisk *disk;
+   struct request_queue *q;
+   int err = -ENOMEM;
+
+   nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
+   if (!nbd)
+   goto out;
+
+   disk = alloc_disk(1 << part_shift);
+   if (!disk)
+   goto out_free_nbd;
+
+   if (index >= 0) {
+   err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
+   GFP_KERNEL);
+   if (err == -ENOSPC)
+   err = -EEXIST;
+   } else {
+   err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
+   if (err >= 0)
+   index = err;
+   }
+   if (err < 0)
+   goto out_free_disk;
+
+   nbd->disk = disk;
+   nbd->tag_set.ops = &nbd_mq_ops;
+   nbd->tag_set.nr_hw_queues = 1;
+   nbd->tag_set.queue_depth = 128;
+   nbd->tag_set.numa_node = NUMA_NO_NODE;
+   nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
+   nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
+   BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+   nbd->tag_set.driver_data = nbd;
+
+   err = blk_mq_alloc_tag_set(&nbd->tag_set);
+   if (err)
+   goto out_free_idr;
+
+   q = blk_mq_init_queue(&nbd->tag_set);
+   if (IS_ERR(q)) {
+   err = PTR_ERR(q);
+   goto out_free_tags;
+   }
+   disk->queue = q;
+
+   /*
+* Tell the block layer that we are not a rotational device
+*/
+   queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
+   queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+   disk->queue->limits.discard_granularity = 512;
+   blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
+   disk->queue->limits.discard_zeroes_data = 0;
+   blk_queue_max_hw_sectors(disk->queue, 65536);
+   disk->queue->limits.max_sectors = 256;
+
+   nbd->magic = NBD_MAGIC;
+   mutex_init(&nbd->config_lock);
+   disk->major = NBD_MAJOR;
+   disk->first_minor = index << part_shift;
+   disk->fops = &nbd_fops;
+   disk->private_data = nbd;
+   sprintf(disk->disk_name, "nbd%d", index);
+   init_waitqueue_head(&nbd->recv_wq);
+   nbd_reset(nbd);
+   add_disk(disk);
+   return index;
+
+out_free_tags:
+   blk_mq_free_tag_set(&nbd->tag_set);
+out_free_idr:
+   idr_remove(&nbd_index_idr, index);
+out_free_disk:
+   put_disk(disk);
+out_free_nbd:
+   kfree(nbd);
+out:
+   return err;
+}
+
 /*
  * And here should be modules and kernel interface 
  *  (Just smiley confuses emacs :-)
@@ -1004,9 +1104,7 @@ static struct blk_mq_ops nbd_mq_ops = {
 
 static int __init nbd_init(void)
 {
-   int err = -ENOMEM;
int i;
-   int part_shift;
 
BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
 
@@ -1040,114 +1138,33 @@ static int __init nbd_init(void)
if (!recv_workqueue)
return -ENOMEM;
 
-   nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
-   if (!nbd_dev) {
-   destroy_workqueue(recv_workqueue);
-   return -ENOMEM;
-   }
-
-   for (i = 0; i < nbds_max; i++) {
-   struct request_queue *q;
-   struct gendisk *disk = alloc_disk(1 << part_shift);
-   if (!disk)
-   goto out;
-   nbd_dev[i].disk = disk;
-
-   nbd_dev[i].tag_set.ops = &nbd_mq_ops;
-   

[PATCH 2/4] miscdevice: add a minor number for nbd-control

2017-01-20 Thread Josef Bacik
NBD is moving to a loop-control-esque way of managing its device
creation and deletion.  Reserve a minor number for the new device.

Signed-off-by: Josef Bacik 
---
 include/linux/miscdevice.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index ed30d5d..7e5c140 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -44,6 +44,7 @@
 #define MISC_MCELOG_MINOR  227
 #define HPET_MINOR 228
 #define FUSE_MINOR 229
+#define NBD_CTRL_MINOR 230
 #define KVM_MINOR  232
 #define BTRFS_MINOR234
 #define AUTOFS_MINOR   235
-- 
2.7.4



[PATCH 1/4] nbd: use our own workqueue for recv threads

2017-01-20 Thread Josef Bacik
Since we are in the memory reclaim path we need our recv work to be on a
workqueue that has WQ_MEM_RECLAIM set so we can avoid deadlocks.  Also
set WQ_HIGHPRI since we are in the completion path for IO.

Signed-off-by: Josef Bacik 
---
 drivers/block/nbd.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 9fd06ee..9fe9763 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -91,6 +91,7 @@ static struct dentry *nbd_dbg_dir;
 static unsigned int nbds_max = 16;
 static struct nbd_device *nbd_dev;
 static int max_part;
+static struct workqueue_struct *recv_workqueue;
 
 static inline struct device *nbd_to_dev(struct nbd_device *nbd)
 {
@@ -785,7 +786,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct 
nbd_device *nbd,
INIT_WORK(&args[i].work, recv_work);
args[i].nbd = nbd;
args[i].index = i;
-   queue_work(system_long_wq, &args[i].work);
+   queue_work(recv_workqueue, &args[i].work);
}
wait_event_interruptible(nbd->recv_wq,
 atomic_read(&nbd->recv_threads) == 0);
@@ -1034,10 +1035,16 @@ static int __init nbd_init(void)
 
if (nbds_max > 1UL << (MINORBITS - part_shift))
return -EINVAL;
+   recv_workqueue = alloc_workqueue("knbd-recv",
+WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+   if (!recv_workqueue)
+   return -ENOMEM;
 
nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
-   if (!nbd_dev)
+   if (!nbd_dev) {
+   destroy_workqueue(recv_workqueue);
return -ENOMEM;
+   }
 
for (i = 0; i < nbds_max; i++) {
struct request_queue *q;
@@ -1117,6 +1124,7 @@ static int __init nbd_init(void)
put_disk(nbd_dev[i].disk);
}
kfree(nbd_dev);
+   destroy_workqueue(recv_workqueue);
return err;
 }
 
@@ -1136,6 +1144,7 @@ static void __exit nbd_cleanup(void)
put_disk(disk);
}
}
+   destroy_workqueue(recv_workqueue);
unregister_blkdev(NBD_MAJOR, "nbd");
kfree(nbd_dev);
printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
-- 
2.7.4



[GIT PULL] Block fixes for 4.10-rc

2017-01-20 Thread Jens Axboe
Hi Linus,

Just two small fixes for this -rc. One, from Keith, just kills an unused
variable; the other fixes a performance regression for nbd in this
series, where we inadvertently flipped the condition for when we set
MSG_MORE on outgoing data.

Please pull!


  git://git.kernel.dk/linux-block.git for-linus



Josef Bacik (1):
  nbd: only set MSG_MORE when we have more to send

Keith Busch (1):
  blk-mq: Remove unused variable

 block/blk-mq.c  | 1 -
 drivers/block/nbd.c | 6 ++
 2 files changed, 2 insertions(+), 5 deletions(-)

-- 
Jens Axboe



Re: [lkp-robot] [rcu] b332151a29: kernel_BUG_at_mm/slab.c

2017-01-20 Thread Sebastian Andrzej Siewior
On 2017-01-20 08:32:37 [-0800], Jens Axboe wrote:
> That's alright, sounds like it's not a -next regression, but rather something
> that is already broken. I can reproduce a lot of breakage if I enable
> CONFIG_DEBUG_TEST_DRIVER_REMOVE, in fact my system doesn't boot at all. This
> is the first bug:
> 
> [   18.247895] [ cut here ]
> [   18.247907] WARNING: CPU: 21 PID: 2223 at drivers/ata/libata-core.c:6522 
> ata_host_detach+0x11b]
> [   18.247908] Modules linked in: igb(+) ahci(+) libahci i2c_algo_bit dca 
> libata nvme(+) nvme_core
> [   18.247917] CPU: 21 PID: 2223 Comm: systemd-udevd Tainted: GW  
>  4.10.0-rc4+ #30
> [   18.247919] Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 
> 11/09/2016
> [   18.247919] Call Trace:
> [   18.247928]  dump_stack+0x68/0x93
> [   18.247934]  __warn+0xc6/0xe0
> [   18.247937]  warn_slowpath_null+0x18/0x20
> [   18.247943]  ata_host_detach+0x11b/0x120 [libata]
…

> and it's even more downhill from there. That option is marked unstable, are we
> expecting it to work right now?

Well, as per 248ff0216543 ("driver core: Make Kconfig text for
DEBUG_TEST_DRIVER_REMOVE stronger"):

|   The current state of driver removal is not great.
|   CONFIG_DEBUG_TEST_DRIVER_REMOVE finds lots of errors. The help text
|   currently undersells exactly how many errors this option will find. Add
|   a bit more description to indicate this option shouldn't be turned on
|   unless you actually want to debug driver removal. The text can be
|   changed later when more drivers are fixed up.

so it looks like the option is working but it uncovers bugs. I've put
you in To: because the breakage in the kvm test went away after I
disabled the MQ support in SCSI. So I *assumed* that MQ was not doing
something right in the removal path. I don't know whether this
libata-core backtrace is a false positive or not.

Sebastian


Re: [lkp-robot] [rcu] b332151a29: kernel_BUG_at_mm/slab.c

2017-01-20 Thread Sebastian Andrzej Siewior
On 2017-01-20 08:09:36 [-0800], Jens Axboe wrote:
> Is there a full trace of this?

[3.654003] scsi host0: scsi_debug: version 1.86 [20160430]
[3.654003]   dev_size_mb=8, opts=0x0, submit_queues=1, statistics=0
[3.660755] scsi 0:0:0:0: Direct-Access Linuxscsi_debug   0186 
PQ: 0 ANSI: 7
[3.711231] sd 0:0:0:0: [sda] 16384 512-byte logical blocks: (8.39 MB/8.00 
MiB)
[3.716202] sd 0:0:0:0: [sda] Write Protect is off
[3.717244] sd 0:0:0:0: [sda] Mode Sense: 73 00 10 08
[3.725059] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[3.795093] sd 0:0:0:0: [sda] Attached SCSI disk
[3.796686] sd 0:0:0:0: [sda] Synchronizing SCSI cache
[3.804770] sd 0:0:0:0: [sda] 16384 512-byte logical blocks: (8.39 MB/8.00 
MiB)
[3.809748] sd 0:0:0:0: [sda] Write Protect is off
[3.810806] sd 0:0:0:0: [sda] Mode Sense: 73 00 10 08
[3.818599] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[3.830894] kobject (be01a5fc): tried to init an initialized object, 
something is seriously wrong.
[3.832820] CPU: 6 PID: 6 Comm: kworker/u14:0 Not tainted 4.9.0 #86
[3.834172] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.10.1-1 04/01/2014
[3.835886] Workqueue: events_unbound async_run_entry_fn
[3.837028]  80079da8 83a5f6cc be01a5fc be01a5fc 80079dc4 83a61e33 842a883c 
be01a5fc
[3.838802]   84397488 be01a108 80079dec 83a46afa be01a5d8 840670a0 
be01a5d8
[3.840570]  be01a108 bd983468 be01a108 bd983470 be01a5d8 80079e14 83a3c857 
be01a5d8
[3.842350] Call Trace:
[3.842884]  [<83a5f6cc>] dump_stack+0x58/0x7c
[3.843834]  [<83a61e33>] kobject_init+0x73/0x80
[3.844828]  [<83a46afa>] blk_mq_register_dev+0x2a/0x110
[3.845971]  [<83a3c857>] blk_register_queue+0x87/0x140
[3.847085]  [<83a49c7e>] device_add_disk+0x1ce/0x470
[3.848170]  [<83c3d328>] sd_probe_async+0xe8/0x1b0
[3.849207]  [<83672347>] async_run_entry_fn+0x37/0xe0
[3.850313]  [<836694f7>] process_one_work+0x1b7/0x3e0
[3.851414]  [<8366947c>] ? process_one_work+0x13c/0x3e0
[3.852552]  [<83669759>] worker_thread+0x39/0x460
[3.853569]  [<83669720>] ? process_one_work+0x3e0/0x3e0
[3.854717]  [<8366f394>] kthread+0xb4/0xd0
[3.855611]  [<83f8cb4d>] ? _raw_spin_unlock_irq+0x2d/0x50
[3.856775]  [<8366f2e0>] ? __kthread_create_on_node+0x160/0x160
[3.858066]  [<8366f2e0>] ? __kthread_create_on_node+0x160/0x160
[3.859340]  [<83f8d363>] ret_from_fork+0x1b/0x28
[3.863951] kobject (ff0ca36c): tried to init an initialized object, 
something is seriously wrong.
[3.865875] CPU: 6 PID: 6 Comm: kworker/u14:0 Not tainted 4.9.0 #86
[3.867202] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.10.1-1 04/01/2014
[3.868924] Workqueue: events_unbound async_run_entry_fn
[3.870079]  80079da8 83a5f6cc ff0ca36c be01a5fc 80079dc4 83a61e33 842a883c 
ff0ca36c
[3.871846]  80079dc4 84397474 be01a108 80079dec 83a46b1c be01a5d8 840670a0 
be01a5d8
[3.873605]  be01a108 bd983468 be01a108 bd983470 be01a5d8 80079e14 83a3c857 
be01a5d8
[3.875395] Call Trace:
[3.875928]  [<83a5f6cc>] dump_stack+0x58/0x7c
[3.876878]  [<83a61e33>] kobject_init+0x73/0x80
[3.877884]  [<83a46b1c>] blk_mq_register_dev+0x4c/0x110
[3.879018]  [<83a3c857>] blk_register_queue+0x87/0x140
[3.880136]  [<83a49c7e>] device_add_disk+0x1ce/0x470
[3.881227]  [<83c3d328>] sd_probe_async+0xe8/0x1b0
[3.882283]  [<83672347>] async_run_entry_fn+0x37/0xe0
[3.883381]  [<836694f7>] process_one_work+0x1b7/0x3e0
[3.884481]  [<8366947c>] ? process_one_work+0x13c/0x3e0
[3.885613]  [<83669759>] worker_thread+0x39/0x460
[3.886658]  [<83669720>] ? process_one_work+0x3e0/0x3e0
[3.887791]  [<8366f394>] kthread+0xb4/0xd0
[3.888680]  [<83f8cb4d>] ? _raw_spin_unlock_irq+0x2d/0x50
[3.889881]  [<8366f2e0>] ? __kthread_create_on_node+0x160/0x160
[3.891189]  [<8366f2e0>] ? __kthread_create_on_node+0x160/0x160
[3.892506]  [<83f8d363>] ret_from_fork+0x1b/0x28
[3.893563] kobject (ff21b36c): tried to init an initialized object, 
something is seriously wrong.
[3.895559] CPU: 6 PID: 6 Comm: kworker/u14:0 Not tainted 4.9.0 #86
[3.896938] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
1.10.1-1 04/01/2014
[3.898741] Workqueue: events_unbound async_run_entry_fn
[3.899911]  80079da8 83a5f6cc ff21b36c be01a5fc 80079dc4 83a61e33 842a883c 
ff21b36c
[3.901739]  80079dc4 84397474 be01a108 80079dec 83a46b1c be01a5d8 840670a0 
be01a5d8
[3.903558]  be01a108 bd983468 be01a108 bd983470 be01a5d8 80079e14 83a3c857 
be01a5d8
[3.905381] Call Trace:
[3.905936]  [<83a5f6cc>] dump_stack+0x58/0x7c
[3.906913]  [<83a61e33>] kobject_init+0x73/0x80
[3.907931]  [<83a46b1c>] blk_mq_register_dev+0x4c/0x110
[3.909108]  [<83a3c857>] blk_register_queue+0x87/0x140
[3.910267]  [<83a49c7e>] device_add_disk+0x1ce/0x470
[

Re: [lkp-robot] [rcu] b332151a29: kernel_BUG_at_mm/slab.c

2017-01-20 Thread Jens Axboe
On 01/20/2017 08:23 AM, Sebastian Andrzej Siewior wrote:
>>> yes. With and without the patch there is a lot of wrong stuff like
>>> complaints about a kobject being initialized again. This leads to a
>>> double free at some point.
>>
>> And what patch are we talking about? I don't mind being CC'ed into a thread,
>> but some context and background would be immensely helpful here...
> 
> The patch is irrelevant. lkp-robot found a bug which was there before
> the patch in question but the pattern changed so it blamed the Author.
> It triggers even v4.9 with
>   CONFIG_SCSI_DEBUG
>   CONFIG_DEBUG_TEST_DRIVER_REMOVE
>   CONFIG_SCSI_MQ_DEFAULT
> enabled and CONFIG_SCSI_DEBUG is simply a SCSI host controller which is
> always there. I can send you a complete config against current HEAD
> which boots in kvm if you want.

That's alright, sounds like it's not a -next regression, but rather something
that is already broken. I can reproduce a lot of breakage if I enable
CONFIG_DEBUG_TEST_DRIVER_REMOVE, in fact my system doesn't boot at all. This
is the first bug:

[   18.247895] [ cut here ]
[   18.247907] WARNING: CPU: 21 PID: 2223 at drivers/ata/libata-core.c:6522 
ata_host_detach+0x11b]
[   18.247908] Modules linked in: igb(+) ahci(+) libahci i2c_algo_bit dca 
libata nvme(+) nvme_core
[   18.247917] CPU: 21 PID: 2223 Comm: systemd-udevd Tainted: GW   
4.10.0-rc4+ #30
[   18.247919] Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 
11/09/2016
[   18.247919] Call Trace:
[   18.247928]  dump_stack+0x68/0x93
[   18.247934]  __warn+0xc6/0xe0
[   18.247937]  warn_slowpath_null+0x18/0x20
[   18.247943]  ata_host_detach+0x11b/0x120 [libata]
[   18.247950]  ata_pci_remove_one+0x10/0x20 [libata]
[   18.247955]  ahci_remove_one+0x10/0x20 [ahci]
[   18.247958]  pci_device_remove+0x34/0xb0
[   18.247966]  driver_probe_device+0xd0/0x370
[   18.247969]  __driver_attach+0x9a/0xa0
[   18.247971]  ? driver_probe_device+0x370/0x370
[   18.247973]  bus_for_each_dev+0x5d/0x90
[   18.247975]  driver_attach+0x19/0x20
[   18.247977]  bus_add_driver+0x11f/0x220
[   18.247980]  driver_register+0x5b/0xd0
[   18.247982]  __pci_register_driver+0x58/0x60
[   18.247984]  ? 0xa00d9000
[   18.247988]  ahci_pci_driver_init+0x1e/0x20 [ahci]
[   18.247992]  do_one_initcall+0x3e/0x170
[   18.247997]  ? rcu_read_lock_sched_held+0x45/0x80
[   18.248001]  ? kmem_cache_alloc_trace+0x22e/0x290
[   18.248004]  do_init_module+0x5a/0x1cb
[   18.248007]  load_module+0x1e60/0x2570
[   18.248008]  ? __symbol_put+0x70/0x70
[   18.248010]  ? show_coresize+0x30/0x30
[   18.248013]  ? kernel_read_file+0x19e/0x1c0
[   18.248015]  ? kernel_read_file_from_fd+0x44/0x70
[   18.248016]  SYSC_finit_module+0xba/0xc0
[   18.248018]  SyS_finit_module+0x9/0x10
[   18.248021]  entry_SYSCALL_64_fastpath+0x18/0xad
[   18.248022] RIP: 0033:0x7f49c5a645b9
[   18.248023] RSP: 002b:7ffccf512658 EFLAGS: 0246 ORIG_RAX: 
0139
[   18.248025] RAX: ffda RBX: 7f49c61659dd RCX: 7f49c5a645b9
[   18.248026] RDX:  RSI: 7f49c53152c7 RDI: 0009
[   18.248026] RBP: 0002 R08:  R09: 
[   18.248027] R10: 0009 R11: 0246 R12: 555737e82b30
[   18.248028] R13: 555737e71200 R14: 555737e82b30 R15: 
[   18.248030] ---[ end trace b0ae5eae3430d5d6 ]---

and it's even more downhill from there. That option is marked unstable, are we
expecting it to work right now?

-- 
Jens Axboe



Re: [lkp-robot] [rcu] b332151a29: kernel_BUG_at_mm/slab.c

2017-01-20 Thread Jens Axboe
On 01/20/2017 08:01 AM, Sebastian Andrzej Siewior wrote:
> On 2017-01-19 09:02:16 [+0800], kernel test robot wrote:
>> test-description: Trinity is a linux system call fuzz tester.
> 
> you don't even get to fire up trinity. With and without the patch you
> crash very early.
> 
>> +-+++
>> | | d5f6ab9c11 | 
>> b332151a29 |
>> +-+++
>> | boot_successes  | 0  | 0   
>>|
>> | boot_failures   | 6  | 8   
>>|
>> | WARNING:at_include/linux/kref.h:#kobject_get| 6  | 8   
>>|
>> | WARNING:at_arch/x86/mm/dump_pagetables.c:#note_page | 2  | 2   
>>|
>> | kernel_BUG_at_mm/slab.c | 0  | 4   
>>|
>> | invalid_opcode:#[##]PREEMPT_SMP | 0  | 4   
>>|
>> | Kernel_panic-not_syncing:Fatal_exception| 0  | 6   
>>|
>> | BUG:unable_to_handle_kernel | 0  | 2   
>>|
>> | Oops| 0  | 2   
>>|
>> +-+++
> 
> There is no successful boot. The pattern changes with patch in question
> applied.
> 
>> [8.044624] sd 0:0:0:0: [sda] Synchronizing SCSI cache
>> [8.055721] slab: double free detected in cache 'kmalloc-32', objp 
>> 8af558c0
>> [8.057138] [ cut here ]
>> [8.058085] kernel BUG at mm/slab.c:2624!
>> [8.059255] invalid opcode:  [#1] PREEMPT SMP

Is there a full trace of this?

> yes. With and without the patch there is a lot of wrong stuff like
> complaints about a kobject being initialized again. This leads to a
> double free at some point.

And what patch are we talking about? I don't mind being CC'ed into a thread,
but some context and background would be immensely helpful here...

-- 
Jens Axboe



Re: Doubt related to immutable biovecs

2017-01-20 Thread Ming Lei
Hi,

On Fri, Jan 20, 2017 at 9:41 PM, Suraj Choudhari
 wrote:
> Hello,
>
> I've some queries related to accessing the 'bio_vec' and 'bi_vcnt'
> members in the BIO structure after the kernel changes implemented for
> the 'immutable biovecs' -
>
> Background -
> -  From the changes done for the immutable biovecs, I understand that
> the driver code now no longer needs to reference the 'bi_vcnt' and
> 'bi_io_vec' fields directly. Instead we can use the 'bvec_iter'
> iterator in order to to access the bio_vec.
> -  We've iterator functions bio_iter_iovec() and bio_advance_iter() as
> well, which return us literal 'struct biovecs' taking into account the
> bi_bvec_done and bi_size values.
>
> Doubts  -
> 1)  Few functions in the block layer and some drivers still directly
> refer to the 'bi_vcnt' and 'bio_vec' members of the BIO structure
> [instead of accessing bio_vecs using the bvec_iter iterator].
> Would there be some changes in the functions to compulsorily use the
> 'bvec_iter' in order to access bio_vecs?  If yes, are such changes

Generally speaking, yes, it is always better to use iterator helpers.

> related to the 'multipage biovec' feature ?

Yes, it is a must for supporting multipage bvec.

>
> 2)  Can there be some problem if a driver directly accesses the
> bi_io_vec [bio_vec] in the bio structure based on the values of bi_idx
> and bi_vcnt ? [instead of using the bvec_iter iterator].

The direct access can't work any more once we switch to multipage
bvecs, and you can find some hints in the following link:

https://lkml.org/lkml/2017/1/15/266

Also, .bi_vcnt can't be used for a BIO_CLONED bio, and it is very easy
to make mistakes when accessing the table directly if someone doesn't
understand the iterator details well enough.
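
As a small illustration (not tied to any particular driver), walking a
bio with the helpers looks like the sketch below; it keeps working for
cloned and split bios because the iterator accounts for bi_bvec_done
and bi_size:

#include <linux/bio.h>

/* Count the payload bytes of a bio using the iterator helpers instead
 * of touching bi_io_vec/bi_vcnt directly.  bio_for_each_segment() hands
 * back a struct bio_vec by value, already adjusted by the bvec_iter.
 */
static unsigned int count_bio_bytes(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	unsigned int bytes = 0;

	bio_for_each_segment(bvec, bio, iter)
		bytes += bvec.bv_len;

	return bytes;
}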


Thanks,
Ming Lei


Re: [lkp-robot] [rcu] b332151a29: kernel_BUG_at_mm/slab.c

2017-01-20 Thread Sebastian Andrzej Siewior
On 2017-01-19 09:02:16 [+0800], kernel test robot wrote:
> test-description: Trinity is a linux system call fuzz tester.

you don't even get to fire up trinity. With and without the patch you
crash very early.

> +-+++
> | | d5f6ab9c11 | 
> b332151a29 |
> +-+++
> | boot_successes  | 0  | 0
>   |
> | boot_failures   | 6  | 8
>   |
> | WARNING:at_include/linux/kref.h:#kobject_get| 6  | 8
>   |
> | WARNING:at_arch/x86/mm/dump_pagetables.c:#note_page | 2  | 2
>   |
> | kernel_BUG_at_mm/slab.c | 0  | 4
>   |
> | invalid_opcode:#[##]PREEMPT_SMP | 0  | 4
>   |
> | Kernel_panic-not_syncing:Fatal_exception| 0  | 6
>   |
> | BUG:unable_to_handle_kernel | 0  | 2
>   |
> | Oops| 0  | 2
>   |
> +-+++

There is no successful boot. The pattern changes with patch in question
applied.

> [8.044624] sd 0:0:0:0: [sda] Synchronizing SCSI cache
> [8.055721] slab: double free detected in cache 'kmalloc-32', objp 8af558c0
> [8.057138] [ cut here ]
> [8.058085] kernel BUG at mm/slab.c:2624!
> [8.059255] invalid opcode:  [#1] PREEMPT SMP

yes. With and without the patch there is a lot of wrong stuff like
complaints about a kobject being initialized again. This leads to a
double free at some point.

What happens is the following: CONFIG_SCSI_DEBUG is enabled, which adds a
dummy host controller with a dummy disk. This gets probed during boot.
Since you also enabled CONFIG_DEBUG_TEST_DRIVER_REMOVE, it gets removed
and re-added. The request_queue in the genhd disk is re-used while the
disk is added for the second time:

[1.314404] scsi host0: scsi_debug: version 1.86 [20160430]
[1.314404]   dev_size_mb=8, opts=0x0, submit_queues=1, statistics=0
[1.315994] scsi 0:0:0:0: Direct-Access Linuxscsi_debug   0186 
PQ: 0 ANSI: 7
[1.351052] sd 0:0:0:0: [sda] 16384 512-byte logical blocks: (8.39 MB/8.00 
MiB)
[1.355916] sd 0:0:0:0: [sda] Write Protect is off
[1.356838] sd 0:0:0:0: [sda] Mode Sense: 73 00 10 08
[1.364455] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[1.437642] sd 0:0:0:0: [sda] Attached SCSI disk
[1.438413] sd 0:0:0:0: [sda] Synchronizing SCSI cache
[1.445868] sd 0:0:0:0: [sda] 16384 512-byte logical blocks: (8.39 MB/8.00 
MiB)
[1.450819] sd 0:0:0:0: [sda] Write Protect is off
[1.451853] sd 0:0:0:0: [sda] Mode Sense: 73 00 10 08
[1.459636] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, 
supports DPO and FUA
[1.471446] kobject (beb87d44): tried to init an initialized object, 
something is seriously wrong.

Since you also need CONFIG_SCSI_MQ_DEFAULT enabled, I assume the MQ block
code is buggy here.
But commit b332151a29 in Paul's tree is innocent.

Sebastian


Re: [PATCH RFC 0/4] dm thin: support blk-throttle on data and metadata device

2017-01-20 Thread Vivek Goyal
On Fri, Jan 20, 2017 at 10:19:22AM -0500, Jeff Moyer wrote:
> Hou Tao  writes:
> 
> > Hi all,
> >
> > We need to throttle the O_DIRECT IO on data and metadata device
> > of a dm-thin pool and encounter some problems. If we set the
> > limitation on the root blkcg, the throttle works. If we set the
> > limitation on a child blkcg, the throttle doesn't work well.
> >
> > The reason why the throttle doesn't work is that dm-thin defers
> > the process of bio when the physical block of bio has not been
> > allocated. The bio will be submitted by the pool worker, and the
> > blkcg of the bio will be the blkcg of the pool worker, namely,
> > the root blkcg instead of the blkcg of the original IO thread.
> > We only set a limitation on the blkcg of the original IO thread,
> > so the blk-throttle doesn't work well.
> >
> > In order to handle the situation, we add a "keep_bio_blkcg" feature
> > to dm-thin. If the feature is enabled, the original blkcg of bio
> > will be saved at thin_map() and will be used during blk-throttle.
> 
> Why is this even an option?  I would think that you would always want
> this behavior.

Agreed that this should not be an optional feature.

One thing which always bothers me is that we often have dependencies
on these bios finishing. So if we assign bios to their original
cgroups and we create a cgroup with a very low limit, then it's
possible that everything slows down.

I guess the answer to that is that it's a configuration issue: don't
create groups with too small a limit, and don't allow unprivileged
users to specify group limits.

Vivek


Re: [PATCH RFC 0/4] dm thin: support blk-throttle on data and metadata device

2017-01-20 Thread Mike Snitzer
On Fri, Jan 20 2017 at 10:19am -0500,
Jeff Moyer  wrote:

> Hou Tao  writes:
> 
> > Hi all,
> >
> > We need to throttle the O_DIRECT IO on data and metadata device
> > of a dm-thin pool and encounter some problems. If we set the
> > limitation on the root blkcg, the throttle works. If we set the
> > limitation on a child blkcg, the throttle doesn't work well.
> >
> > The reason why the throttle doesn't work is that dm-thin defers
> > the process of bio when the physical block of bio has not been
> > allocated. The bio will be submitted by the pool worker, and the
> > blkcg of the bio will be the blkcg of the pool worker, namely,
> > the root blkcg instead of the blkcg of the original IO thread.
> > We only set a limitation on the blkcg of the original IO thread,
> > so the blk-throttle doesn't work well.
> >
> > In order to handle the situation, we add a "keep_bio_blkcg" feature
> > to dm-thin. If the feature is enabled, the original blkcg of bio
> > will be saved at thin_map() and will be used during blk-throttle.
> 
> Why is this even an option?  I would think that you would always want
> this behavior.

Right, shouldn't be an optional feature.

Also, this implementation is very dm-thin specific.  I still have this
line of work on my TODO because there should be a more generic way to
wire up these associations in either block core or DM core.

Now that there are both dm-crypt and dm-thin specific RFC patches to fix
this, I'll see about finding a solution that works for both but is more
generic.

Not sure how quickly I'll get to this but I'll do my best.


Re: [Lsf-pc] [LSF/MM TOPIC] Badblocks checking/representation in filesystems

2017-01-20 Thread Dan Williams
On Fri, Jan 20, 2017 at 1:47 AM, Jan Kara  wrote:
> On Thu 19-01-17 14:17:19, Vishal Verma wrote:
>> On 01/18, Jan Kara wrote:
>> > On Tue 17-01-17 15:37:05, Vishal Verma wrote:
>> > 2) PMEM is exposed for DAX aware filesystem. This seems to be what you are
>> > mostly interested in. We could possibly do something more efficient than
>> > what NVDIMM driver does however the complexity would be relatively high and
>> > frankly I'm far from convinced this is really worth it. If there are so
>> > many badblocks this would matter, the HW has IMHO bigger problems than
>> > performance.
>>
>> Correct, and Dave was of the opinion that once at least XFS has reverse
>> mapping support (which it does now), adding badblocks information to
>> that should not be a hard lift, and should be a better solution. I
>> suppose we should try to benchmark how much of a penalty the current
>> badblock checking in the NVDIMM driver imposes. The penalty is not because there
>> may be a large number of badblocks, but just due to the fact that we
>> have to do this check for every IO, in fact, every 'bvec' in a bio.
>
> Well, letting filesystem know is certainly good from error reporting quality
> POV. I guess I'll leave it upto XFS guys to tell whether they can be more
> efficient in checking whether current IO overlaps with any of given bad
> blocks.
>
>> > Now my question: Why do we bother with badblocks at all? In cases 1) and 2)
>> > if the platform can recover from MCE, we can just always access persistent
>> > memory using memcpy_mcsafe(), if that fails, return -EIO. Actually that
>> > seems to already happen so we just need to make sure all places handle
>> > returned errors properly (e.g. fs/dax.c does not seem to) and we are done.
>> > No need for bad blocks list at all, no slow down unless we hit a bad cell
>> > and in that case who cares about performance when the data is gone...
>>
>> Even when we have MCE recovery, we cannot do away with the badblocks
>> list:
>> 1. My understanding is that the hardware's ability to do MCE recovery is
>> limited/best-effort, and is not guaranteed. There can be circumstances
>> that cause a "Processor Context Corrupt" state, which is unrecoverable.
>
> Well, then they have to work on improving the hardware. Because having HW
> that just sometimes gets stuck instead of reporting bad storage is simply
> not acceptable. And no matter how hard you try you cannot avoid MCEs from
> OS when accessing persistent memory so OS just has no way to avoid that
> risk.
>
>> 2. We still need to maintain a badblocks list so that we know what
>> blocks need to be cleared (via the ACPI method) on writes.
>
> Well, why cannot we just do the write, see whether we got CMCI and if yes,
> clear the error via the ACPI method?

I would need to check if you get the address reported in the CMCI, but
it would only fire if the write triggered a read-modify-write cycle. I
suspect most copies to pmem, through something like
arch_memcpy_to_pmem(), are not triggering any reads. It also triggers
asynchronously, so what data do you write after clearing the error?
There may have been more writes while the CMCI was being delivered.


Re: [PATCH RFC 0/4] dm thin: support blk-throttle on data and metadata device

2017-01-20 Thread Jeff Moyer
Hou Tao  writes:

> Hi all,
>
> We need to throttle the O_DIRECT IO on data and metadata device
> of a dm-thin pool and encounter some problems. If we set the
> limitation on the root blkcg, the throttle works. If we set the
> limitation on a child blkcg, the throttle doesn't work well.
>
> The reason why the throttle doesn't work is that dm-thin defers
> the process of bio when the physical block of bio has not been
> allocated. The bio will be submitted by the pool worker, and the
> blkcg of the bio will be the blkcg of the pool worker, namely,
> the root blkcg instead of the blkcg of the original IO thread.
> We only set a limitation on the blkcg of the original IO thread,
> so the blk-throttle doesn't work well.
>
> In order to handle the situation, we add a "keep_bio_blkcg" feature
> to dm-thin. If the feature is enabled, the original blkcg of bio
> will be saved at thin_map() and will be used during blk-throttle.

Why is this even an option?  I would think that you would always want
this behavior.

-Jeff


Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-01-20 Thread Jens Axboe
On Fri, Jan 20 2017, Paolo Valente wrote:
> 
> > Il giorno 20 gen 2017, alle ore 14:14, Paolo Valente 
> >  ha scritto:
> > 
> >> 
> >> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe  ha 
> >> scritto:
> >> 
> >> This is basically identical to deadline-iosched, except it registers
> >> as a MQ capable scheduler. This is still a single queue design.
> >> 
> > 
> > Jens,
> > no spin_lock_irq* in the code.  So, also request dispatches are
> > guaranteed to never be executed in IRQ context?
> 
> Or maybe the opposite? That is, all scheduler functions invoked in IRQ 
> context?

Nope

-- 
Jens Axboe



Doubt related to immutable biovecs

2017-01-20 Thread Suraj Choudhari
Hello,

I've some queries related to accessing the 'bio_vec' and 'bi_vcnt'
members in the BIO structure after the kernel changes implemented for
the 'immutable biovecs' -

Background -
-  From the changes done for the immutable biovecs, I understand that
the driver code no longer needs to reference the 'bi_vcnt' and
'bi_io_vec' fields directly. Instead we can use the 'bvec_iter'
iterator in order to access the bio_vec.
-  We also have the iterator functions bio_iter_iovec() and
bio_advance_iter(), which return literal 'struct bio_vec's taking into
account the bi_bvec_done and bi_size values.

Doubts  -
1)  A few functions in the block layer and some drivers still directly
refer to the 'bi_vcnt' and 'bio_vec' members of the BIO structure
[instead of accessing bio_vecs using the bvec_iter iterator].
Will these functions be changed so that they must use 'bvec_iter' in
order to access bio_vecs?  If yes, are such changes related to the
'multipage biovec' feature?

2)  Can there be a problem if a driver directly accesses the
bi_io_vec [bio_vec] array in the bio structure based on the values of
bi_idx and bi_vcnt [instead of using the bvec_iter iterator]?

Thanks & Regards,
Suraj


Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-01-20 Thread Paolo Valente

> Il giorno 20 gen 2017, alle ore 14:14, Paolo Valente 
>  ha scritto:
> 
>> 
>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe  ha scritto:
>> 
>> This is basically identical to deadline-iosched, except it registers
>> as a MQ capable scheduler. This is still a single queue design.
>> 
> 
> Jens,
> no spin_lock_irq* in the code.  So, also request dispatches are
> guaranteed to never be executed in IRQ context?

Or maybe the opposite? That is, all scheduler functions invoked in IRQ context?

Thanks,
Paolo

>  I'm asking this
> question to understand whether I'm missing something that, even in
> BFQ, would somehow allow me to not disable irqs in critical sections,
> even if there is the slice_idle-expiration handler.  Be patient with
> my ignorance.
> 
> Thanks,
> Paolo
> 
>> Signed-off-by: Jens Axboe 
>> ---
>> block/Kconfig.iosched |   6 +
>> block/Makefile|   1 +
>> block/mq-deadline.c   | 649 
>> ++
>> 3 files changed, 656 insertions(+)
>> create mode 100644 block/mq-deadline.c
>> 
>> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
>> index 421bef9c4c48..490ef2850fae 100644
>> --- a/block/Kconfig.iosched
>> +++ b/block/Kconfig.iosched
>> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
>> 
>>This is the default I/O scheduler.
>> 
>> +config MQ_IOSCHED_DEADLINE
>> +tristate "MQ deadline I/O scheduler"
>> +default y
>> +---help---
>> +  MQ version of the deadline IO scheduler.
>> +
>> config CFQ_GROUP_IOSCHED
>>  bool "CFQ Group Scheduling support"
>>  depends on IOSCHED_CFQ && BLK_CGROUP
>> diff --git a/block/Makefile b/block/Makefile
>> index 2eee9e1bb6db..3ee0abd7205a 100644
>> --- a/block/Makefile
>> +++ b/block/Makefile
>> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)   += blk-throttle.o
>> obj-$(CONFIG_IOSCHED_NOOP)   += noop-iosched.o
>> obj-$(CONFIG_IOSCHED_DEADLINE)   += deadline-iosched.o
>> obj-$(CONFIG_IOSCHED_CFQ)+= cfq-iosched.o
>> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)   += mq-deadline.o
>> 
>> obj-$(CONFIG_BLOCK_COMPAT)   += compat_ioctl.o
>> obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
>> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
>> new file mode 100644
>> index ..3cb9de21ab21
>> --- /dev/null
>> +++ b/block/mq-deadline.c
>> @@ -0,0 +1,649 @@
>> +/*
>> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
>> + *  for the blk-mq scheduling framework
>> + *
>> + *  Copyright (C) 2016 Jens Axboe 
>> + */
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#include "blk.h"
>> +#include "blk-mq.h"
>> +#include "blk-mq-tag.h"
>> +#include "blk-mq-sched.h"
>> +
>> +static unsigned int queue_depth = 256;
>> +module_param(queue_depth, uint, 0644);
>> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue 
>> depth");
>> +
>> +/*
>> + * See Documentation/block/deadline-iosched.txt
>> + */
>> +static const int read_expire = HZ / 2;  /* max time before a read is 
>> submitted. */
>> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits 
>> are SOFT! */
>> +static const int writes_starved = 2;/* max times reads can starve a 
>> write */
>> +static const int fifo_batch = 16;   /* # of sequential requests treated 
>> as one
>> + by the above parameters. For throughput. */
>> +
>> +struct deadline_data {
>> +/*
>> + * run time data
>> + */
>> +
>> +/*
>> + * requests (deadline_rq s) are present on both sort_list and fifo_list
>> + */
>> +struct rb_root sort_list[2];
>> +struct list_head fifo_list[2];
>> +
>> +/*
>> + * next in sort order. read, write or both are NULL
>> + */
>> +struct request *next_rq[2];
>> +unsigned int batching;  /* number of sequential requests made */
>> +unsigned int starved;   /* times reads have starved writes */
>> +
>> +/*
>> + * settings that change how the i/o scheduler behaves
>> + */
>> +int fifo_expire[2];
>> +int fifo_batch;
>> +int writes_starved;
>> +int front_merges;
>> +
>> +spinlock_t lock;
>> +struct list_head dispatch;
>> +struct blk_mq_tags *tags;
>> +atomic_t wait_index;
>> +};
>> +
>> +static inline struct rb_root *
>> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
>> +{
>> +return &dd->sort_list[rq_data_dir(rq)];
>> +}
>> +
>> +/*
>> + * get the request after `rq' in sector-sorted order
>> + */
>> +static inline struct request *
>> +deadline_latter_request(struct request *rq)
>> +{
>> +struct rb_node *node = rb_next(&rq->rb_node);
>> +
>> +if (node)
>> +return rb_entry_rq(node);
>> +
>> +return NULL;
>> +}
>> +
>> +static void
>> +deadline_add_rq_rb(struct deadline_data *dd, struct 

Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-01-20 Thread Paolo Valente

> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe  ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 

Jens,
no spin_lock_irq* in the code.  So, also request dispatches are
guaranteed to never be executed in IRQ context?  I'm asking this
question to understand whether I'm missing something that, even in
BFQ, would somehow allow me to not disable irqs in critical sections,
even if there is the slice_idle-expiration handler.  Be patient with
my ignorance.

Thanks,
Paolo

> Signed-off-by: Jens Axboe 
> ---
> block/Kconfig.iosched |   6 +
> block/Makefile|   1 +
> block/mq-deadline.c   | 649 ++
> 3 files changed, 656 insertions(+)
> create mode 100644 block/mq-deadline.c
> 
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 421bef9c4c48..490ef2850fae 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
> 
> This is the default I/O scheduler.
> 
> +config MQ_IOSCHED_DEADLINE
> + tristate "MQ deadline I/O scheduler"
> + default y
> + ---help---
> +   MQ version of the deadline IO scheduler.
> +
> config CFQ_GROUP_IOSCHED
>   bool "CFQ Group Scheduling support"
>   depends on IOSCHED_CFQ && BLK_CGROUP
> diff --git a/block/Makefile b/block/Makefile
> index 2eee9e1bb6db..3ee0abd7205a 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)+= blk-throttle.o
> obj-$(CONFIG_IOSCHED_NOOP)+= noop-iosched.o
> obj-$(CONFIG_IOSCHED_DEADLINE)+= deadline-iosched.o
> obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)+= mq-deadline.o
> 
> obj-$(CONFIG_BLOCK_COMPAT)+= compat_ioctl.o
> obj-$(CONFIG_BLK_CMDLINE_PARSER)  += cmdline-parser.o
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> new file mode 100644
> index ..3cb9de21ab21
> --- /dev/null
> +++ b/block/mq-deadline.c
> @@ -0,0 +1,649 @@
> +/*
> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
> + *  for the blk-mq scheduling framework
> + *
> + *  Copyright (C) 2016 Jens Axboe 
> + */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "blk.h"
> +#include "blk-mq.h"
> +#include "blk-mq-tag.h"
> +#include "blk-mq-sched.h"
> +
> +static unsigned int queue_depth = 256;
> +module_param(queue_depth, uint, 0644);
> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
> +
> +/*
> + * See Documentation/block/deadline-iosched.txt
> + */
> +static const int read_expire = HZ / 2;  /* max time before a read is 
> submitted. */
> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits 
> are SOFT! */
> +static const int writes_starved = 2;/* max times reads can starve a 
> write */
> +static const int fifo_batch = 16;   /* # of sequential requests treated 
> as one
> +  by the above parameters. For throughput. */
> +
> +struct deadline_data {
> + /*
> +  * run time data
> +  */
> +
> + /*
> +  * requests (deadline_rq s) are present on both sort_list and fifo_list
> +  */
> + struct rb_root sort_list[2];
> + struct list_head fifo_list[2];
> +
> + /*
> +  * next in sort order. read, write or both are NULL
> +  */
> + struct request *next_rq[2];
> + unsigned int batching;  /* number of sequential requests made */
> + unsigned int starved;   /* times reads have starved writes */
> +
> + /*
> +  * settings that change how the i/o scheduler behaves
> +  */
> + int fifo_expire[2];
> + int fifo_batch;
> + int writes_starved;
> + int front_merges;
> +
> + spinlock_t lock;
> + struct list_head dispatch;
> + struct blk_mq_tags *tags;
> + atomic_t wait_index;
> +};
> +
> +static inline struct rb_root *
> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
> +{
> + return &dd->sort_list[rq_data_dir(rq)];
> +}
> +
> +/*
> + * get the request after `rq' in sector-sorted order
> + */
> +static inline struct request *
> +deadline_latter_request(struct request *rq)
> +{
> + struct rb_node *node = rb_next(&rq->rb_node);
> +
> + if (node)
> + return rb_entry_rq(node);
> +
> + return NULL;
> +}
> +
> +static void
> +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> + struct rb_root *root = deadline_rb_root(dd, rq);
> +
> + elv_rb_add(root, rq);
> +}
> +
> +static inline void
> +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> + const int data_dir = rq_data_dir(rq);
> +
> + if (dd->next_rq[data_dir] == rq)
> + 

Re: [LSF/MM TOPIC][LSF/MM ATTEND] NAPI polling for block drivers

2017-01-20 Thread Johannes Thumshirn
On Tue, Jan 17, 2017 at 05:45:53PM +0200, Sagi Grimberg wrote:
> 
> >--
> >[1]
> >queue = b'nvme0q1'
> > usecs   : count distribution
> > 0 -> 1  : 7310 ||
> > 2 -> 3  : 11   |  |
> > 4 -> 7  : 10   |  |
> > 8 -> 15 : 20   |  |
> >16 -> 31 : 0|  |
> >32 -> 63 : 0|  |
> >64 -> 127: 1|  |
> >
> >[2]
> >queue = b'nvme0q1'
> > usecs   : count distribution
> > 0 -> 1  : 7309 ||
> > 2 -> 3  : 14   |  |
> > 4 -> 7  : 7|  |
> > 8 -> 15 : 17   |  |
> >
> 
> Rrr, email made the histograms look funky (tabs vs. spaces...)
> The count is what's important anyways...
> 
> Just adding that I used an Intel P3500 nvme device.
> 
> >We can see that most of the time our latency is pretty good (<1ns) but with
> >huge tail latencies (some 8-15 ns and even one in 32-63 ns).
> 
> Obviously is micro-seconds and not nano-seconds (I wish...)

So to share yesterday's (and today's) findings:

On AHCI I see only one completion polled as well.

This probably is because, in contrast to networking (with NAPI), in the
block layer we do have a link between submission and completion, whereas
in networking RX and TX are decoupled. So if we're sending out one
request, we get the completion for it.

What we'd need is a way to know "we've sent 10 requests out, now poll for
the 10 completions after the 1st IRQ". So basically what NVMe already did
by calling __nvme_process_cq() after submission. Maybe we should even
disable IRQs when submitting and re-enable them after submitting, so the
submission path doesn't get preempted by a completion.
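
Roughly the shape I have in mind (all of the names below are made up for
illustration, not a real driver API):

#include <linux/atomic.h>
#include <linux/interrupt.h>
#include <linux/types.h>

/* Hypothetical sketch: remember how many requests are in flight and,
 * once the first completion interrupt fires, keep polling the
 * completion queue until the whole batch has been reaped, instead of
 * taking one interrupt per completion.
 */
struct my_hw_queue {
	atomic_t inflight;		/* incremented on each submission */
	/* ... hardware queue state ... */
};

/* driver_poll_one() is assumed to reap at most one completion and
 * return true if it found one. */
bool driver_poll_one(struct my_hw_queue *q);

static irqreturn_t my_queue_irq(int irq, void *data)
{
	struct my_hw_queue *q = data;

	while (atomic_read(&q->inflight) > 0 && driver_poll_one(q))
		atomic_dec(&q->inflight);

	return IRQ_HANDLED;
}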

Does this make sense?

Byte,
Johannes

-- 
Johannes Thumshirn  Storage
jthumsh...@suse.de+49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850


[PATCH RFC 4/4] dm thin: associate bio with current task if keep_bio_blkcg is enabled

2017-01-20 Thread Hou Tao
If keep_bio_blkcg is enabled, assign the io_context and the blkcg of
the current task to the bio before processing it.

Signed-off-by: Hou Tao 
---
 drivers/md/dm-thin.c |  5 +
 drivers/md/dm-thin.h | 17 +
 2 files changed, 22 insertions(+)
 create mode 100644 drivers/md/dm-thin.h

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 140cdae..0efbdbe 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 
+#include "dm-thin.h"
+
 #defineDM_MSG_PREFIX   "thin"
 
 /*
@@ -2629,6 +2631,9 @@ static int thin_bio_map(struct dm_target *ti, struct bio 
*bio)
struct dm_bio_prison_cell *virt_cell, *data_cell;
struct dm_cell_key key;
 
+   if (keep_pool_bio_blkcg(tc->pool))
+   thin_keep_bio_blkcg(bio);
+
thin_hook_bio(tc, bio);
 
if (tc->requeue_mode) {
diff --git a/drivers/md/dm-thin.h b/drivers/md/dm-thin.h
new file mode 100644
index 000..09e920a
--- /dev/null
+++ b/drivers/md/dm-thin.h
@@ -0,0 +1,17 @@
+#ifndef DM_THIN_H
+#define DM_THIN_H
+
+#include 
+
+#ifdef CONFIG_BLK_CGROUP
+static inline void thin_keep_bio_blkcg(struct bio *bio)
+{
+   if (!bio->bi_css)
+   bio_associate_current(bio);
+}
+#else
+static inline void thin_keep_bio_blkcg(struct bio *bio) {}
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif
+
-- 
2.5.0



[PATCH RFC 0/4] dm thin: support blk-throttle on data and metadata device

2017-01-20 Thread Hou Tao
Hi all,

We need to throttle the O_DIRECT IO on the data and metadata devices
of a dm-thin pool and have encountered some problems. If we set the
limitation on the root blkcg, the throttle works. If we set the
limitation on a child blkcg, the throttle doesn't work well.

The reason why the throttle doesn't work is that dm-thin defers
the processing of a bio when the physical block for the bio has not
been allocated. The bio will be submitted by the pool worker, and the
blkcg of the bio will be the blkcg of the pool worker, namely,
the root blkcg, instead of the blkcg of the original IO thread.
We only set a limitation on the blkcg of the original IO thread,
so blk-throttle doesn't work well.

In order to handle this situation, we add a "keep_bio_blkcg" feature
to dm-thin. If the feature is enabled, the original blkcg of a bio
will be saved at thin_map() and will be used during blk-throttle.
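
The essence of the change (patch 4/4 earlier in this digest) is just to
tag the bio with the submitter's blkcg at map time; a minimal sketch of
that pattern, assuming CONFIG_BLK_CGROUP and using a made-up
example_map() rather than the real thin_bio_map():

#include <linux/bio.h>
#include <linux/device-mapper.h>

static int example_map(struct dm_target *ti, struct bio *bio)
{
	if (!bio->bi_css)			/* not already associated */
		bio_associate_current(bio);	/* saves io_context + blkcg of current */

	/* ... normal target mapping logic would continue here ... */
	return DM_MAPIO_REMAPPED;
}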

Tao

Hou Tao (4):
  dm thin: add a pool feature "keep_bio_blkcg"
  dm thin: parse "keep_bio_blkcg" from userspace tools
  dm thin: show the enabled status of keep_bio_blkcg feature
  dm thin: associate bio with current task if keep_bio_blkcg is enabled

 drivers/md/dm-thin.c | 26 --
 drivers/md/dm-thin.h | 17 +
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 drivers/md/dm-thin.h

-- 
2.5.0



[PATCH RFC 3/4] dm thin: show the enabled status of keep_bio_blkcg feature

2017-01-20 Thread Hou Tao
If the keep_bio_blkcg feature is enabled, this can be confirmed via
the STATUSTYPE_TABLE or STATUSTYPE_INFO command.

Signed-off-by: Hou Tao 
---
 drivers/md/dm-thin.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 57d6202..140cdae 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3760,7 +3760,7 @@ static void emit_flags(struct pool_features *pf, char *result,
 {
unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
-   pf->error_if_no_space;
+   pf->error_if_no_space + pf->keep_bio_blkcg;
DMEMIT("%u ", count);
 
if (!pf->zero_new_blocks)
@@ -3777,6 +3777,9 @@ static void emit_flags(struct pool_features *pf, char *result,
 
if (pf->error_if_no_space)
DMEMIT("error_if_no_space ");
+
+   if (pf->keep_bio_blkcg)
+   DMEMIT("keep_bio_blkcg ");
 }
 
 /*
@@ -3885,6 +3888,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
 
+   if (pool->pf.keep_bio_blkcg)
+   DMEMIT("keep_bio_blkcg ");
+
if (dm_pool_metadata_needs_check(pool->pmd))
DMEMIT("needs_check ");
else
-- 
2.5.0



[PATCH RFC 2/4] dm thin: parse "keep_bio_blkcg" from userspace tools

2017-01-20 Thread Hou Tao
The keep_bio_blkcg feature is off by default; it can be turned on by
passing the "keep_bio_blkcg" pool feature argument.
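
For example (the devices, sizes and low-water mark below are placeholders),
the argument is passed as one of the optional feature args at the end of
the thin-pool table line:

  dmsetup create pool --table \
    "0 20971520 thin-pool /dev/mapper/vg-meta /dev/mapper/vg-data 128 32768 1 keep_bio_blkcg"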

Signed-off-by: Hou Tao 
---
 drivers/md/dm-thin.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 8178ee8..57d6202 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2824,6 +2824,7 @@ static void pool_features_init(struct pool_features *pf)
pf->discard_enabled = true;
pf->discard_passdown = true;
pf->error_if_no_space = false;
+   pf->keep_bio_blkcg = false;
 }
 
 static void __pool_destroy(struct pool *pool)
@@ -3053,7 +3054,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
const char *arg_name;
 
static struct dm_arg _args[] = {
-   {0, 4, "Invalid number of pool feature arguments"},
+   {0, 5, "Invalid number of pool feature arguments"},
};
 
/*
@@ -3085,6 +3086,9 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
else if (!strcasecmp(arg_name, "error_if_no_space"))
pf->error_if_no_space = true;
 
+   else if (!strcasecmp(arg_name, "keep_bio_blkcg"))
+   pf->keep_bio_blkcg = true;
+
else {
ti->error = "Unrecognised pool feature requested";
r = -EINVAL;
-- 
2.5.0



Re: [PATCH 7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

2017-01-20 Thread Paolo Valente

> On 17 Jan 2017, at 03:47, Jens Axboe  wrote:
> 
> On 12/22/2016 09:49 AM, Paolo Valente wrote:
>> 
>>> On 17 Dec 2016, at 01:12, Jens Axboe  wrote:
>>> 
>>> This is basically identical to deadline-iosched, except it registers
>>> as a MQ capable scheduler. This is still a single queue design.
>>> 
>> 
>> One last question (for today ...): in mq-deadline there are no
>> "schedule dispatch" or "unplug work" functions.  In blk, CFQ and BFQ
>> do these schedules/unplugs in a lot of cases.  What's the right
>> replacement?  Just doing nothing?
> 
> You just use blk_mq_run_hw_queue() or variants thereof to kick off queue
> runs.
> 

Hi Jens,
I'm working on this right now.  I have a pair of quick questions about
performance.

In the blk version of bfq, if the in-service bfq_queue happens to have
no more budget when the bfq dispatch function is invoked, then bfq:
returns no request (NULL), immediately expires the in-service
bfq_queue, and schedules a new dispatch.  The third step is taken so
that, if other bfq_queues have requests, a new in-service bfq_queue
will be selected on the upcoming dispatch and a new request will be
provided right away.

My questions are: is this schedule-dispatch step still needed with
blk-mq, to avoid a stall?  And if it is not needed to avoid a stall,
would it still be useful to boost throughput, because it forces an
immediate next dispatch?
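
To make the question more concrete, here is a minimal sketch of what I
have in mind for the dispatch hook.  It is not actual code: every bfq_*
name below is a placeholder for bfq-mq internals, and only
blk_mq_run_hw_queue() is the real blk-mq API you pointed to.

static struct request *bfq_mq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
	struct bfq_queue *bfqq = bfqd->in_service_queue;

	if (bfqq && bfq_bfqq_budget_exhausted(bfqq)) {
		/* step 2: expire the in-service queue */
		bfq_bfqq_expire(bfqd, bfqq);
		/*
		 * Step 3 in blk was "schedule a new dispatch"; with blk-mq
		 * this would become an async queue run, so that the next
		 * dispatch selects a new in-service queue right away.
		 */
		blk_mq_run_hw_queue(hctx, true);
		/* step 1: no request on this dispatch */
		return NULL;
	}

	/* normal path: hand out the next request of the in-service queue */
	return bfq_next_request(bfqd, bfqq);
}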

BTW, bfq-mq survived its first request completion.  I will provide you
with a link to a github branch as soon as bfq-mq seems able to stand
up with a minimal workload.

Thanks,
Paolo

> -- 
> Jens Axboe
> 



Re: [LSF/MM TOPIC] Badblocks checking/representation in filesystems

2017-01-20 Thread Yasunori Goto
Hello, Vishal-san.

First of all, your discussion is quite interesting to me. Thanks.

> > 
> > If DUE does happen and is flagged to the file system via MCE (somehow...),
> > and the fs finds that the error corrupts its allocated data page, or
> > metadata, now if the fs wants to recover its data the intuition is that
> > there needs to be a stronger error correction mechanism to correct the
> > hardware-uncorrectable errors. So knowing the hardware ECC baseline is
> > helpful for the file system to understand how severe are the faults in
> > badblocks, and develop its recovery methods.
> 
> Like mentioned before, this discussion is more about presentation of
> errors in a known consumable format, rather than recovering from errors.
> While recovering from errors is interesting, we already have layers
> like RAID for that, and they are as applicable to NVDIMM backed storage
> as they have been for disk/SSD based storage.

I have one question here.

Certainly, a user can use LVM mirroring for the storage mode of NVDIMM.
However, NVDIMM also has a DAX mode.
Can a user use LVM mirroring in NVDIMM DAX mode?
I could not find any information that LVM supports DAX.

In addition, the current NVDIMM specs (*) only define an interleave
feature for NVDIMMs. They do not mention a mirroring feature.
So, I don't understand how mirroring could be used with DAX.

(*) "NVDIMM Namespace Specification" , "NVDIMM Block Window Driver Writer’s 
Guide",
   and "ACPI 6.1"

Regards,
---
Yasunori Goto



Re: [Lsf-pc] [LSF/MM TOPIC] Badblocks checking/representation in filesystems

2017-01-20 Thread Jan Kara
On Thu 19-01-17 14:17:19, Vishal Verma wrote:
> On 01/18, Jan Kara wrote:
> > On Tue 17-01-17 15:37:05, Vishal Verma wrote:
> > 2) PMEM is exposed for DAX aware filesystem. This seems to be what you are
> > mostly interested in. We could possibly do something more efficient than
> > what NVDIMM driver does however the complexity would be relatively high and
> > frankly I'm far from convinced this is really worth it. If there are so
> > many badblocks this would matter, the HW has IMHO bigger problems than
> > performance.
> 
> Correct, and Dave was of the opinion that once at least XFS has reverse
> mapping support (which it does now), adding badblocks information to
> that should not be a hard lift, and should be a better solution. I
> suppose we should try to benchmark how much of a penalty the current badblock
> checking in the NVDIMM driver imposes. The penalty is not because there
> may be a large number of badblocks, but just due to the fact that we
> have to do this check for every IO, in fact, every 'bvec' in a bio.

Well, letting the filesystem know is certainly good from an error-reporting
POV. I guess I'll leave it up to the XFS guys to tell whether they can be
more efficient at checking whether the current IO overlaps with any of the
given bad blocks.
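
For context, the per-IO cost being discussed is roughly the following kind
of lookup. This is a simplified sketch, not the actual pmem driver code:
badblocks_check() is the real block-layer helper, while the wrapper function
itself is only for illustration.

#include <linux/bio.h>
#include <linux/badblocks.h>

/* scan each bvec of a bio against the device's badblocks list */
static bool bio_overlaps_badblocks(struct badblocks *bb, struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	sector_t first_bad;
	int num_bad;

	bio_for_each_segment(bvec, bio, iter) {
		if (badblocks_check(bb, iter.bi_sector, bvec.bv_len >> 9,
				    &first_bad, &num_bad))
			return true;
	}
	return false;
}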
 
> > Now my question: Why do we bother with badblocks at all? In cases 1) and 2)
> > if the platform can recover from MCE, we can just always access persistent
> > memory using memcpy_mcsafe(), if that fails, return -EIO. Actually that
> > seems to already happen so we just need to make sure all places handle
> > returned errors properly (e.g. fs/dax.c does not seem to) and we are done.
> > No need for bad blocks list at all, no slow down unless we hit a bad cell
> > and in that case who cares about performance when the data is gone...
> 
> Even when we have MCE recovery, we cannot do away with the badblocks
> list:
> 1. My understanding is that the hardware's ability to do MCE recovery is
> limited/best-effort, and is not guaranteed. There can be circumstances
> that cause a "Processor Context Corrupt" state, which is unrecoverable.

Well, then they have to work on improving the hardware, because having HW
that sometimes just gets stuck instead of reporting bad storage is simply
not acceptable. And no matter how hard you try, the OS cannot avoid MCEs
when accessing persistent memory, so it simply has no way to avoid that
risk.

> 2. We still need to maintain a badblocks list so that we know what
> blocks need to be cleared (via the ACPI method) on writes.

Well, why can't we just do the write, see whether we got a CMCI, and if so,
clear the error via the ACPI method?

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [Lsf-pc] [LSF/MM TOPIC] Badblocks checking/representation in filesystems

2017-01-20 Thread Jan Kara
On Thu 19-01-17 11:03:12, Dan Williams wrote:
> On Thu, Jan 19, 2017 at 10:59 AM, Vishal Verma  wrote:
> > On 01/19, Jan Kara wrote:
> >> On Wed 18-01-17 21:56:58, Verma, Vishal L wrote:
> >> > On Wed, 2017-01-18 at 13:32 -0800, Dan Williams wrote:
> >> > > On Wed, Jan 18, 2017 at 1:02 PM, Darrick J. Wong  wrote:
> >> > > > On Wed, Jan 18, 2017 at 03:39:17PM -0500, Jeff Moyer wrote:
> >> > > > > Jan Kara  writes:
> >> > > > >
> >> > > > > > On Tue 17-01-17 15:14:21, Vishal Verma wrote:
> >> > > > > > > Your note on the online repair does raise another tangentially
> >> > > > > > > related
> >> > > > > > > topic. Currently, if there are badblocks, writes via the bio
> >> > > > > > > submission
> >> > > > > > > path will clear the error (if the hardware is able to remap
> >> > > > > > > the bad
> >> > > > > > > locations). However, if the filesystem is mounted with DAX,
> >> > > > > > > even
> >> > > > > > > non-mmap operations - read() and write() will go through the
> >> > > > > > > dax paths
> >> > > > > > > (dax_do_io()). We haven't found a good/agreeable way to
> >> > > > > > > perform
> >> > > > > > > error-clearing in this case. So currently, if a dax mounted
> >> > > > > > > filesystem
> >> > > > > > > has badblocks, the only way to clear those badblocks is to
> >> > > > > > > mount it
> >> > > > > > > without DAX, and overwrite/zero the bad locations. This is a
> >> > > > > > > pretty
> >> > > > > > > terrible user experience, and I'm hoping this can be solved in
> >> > > > > > > a better
> >> > > > > > > way.
> >> > > > > >
> >> > > > > > Please remind me, what is the problem with DAX code doing
> >> > > > > > necessary work to
> >> > > > > > clear the error when it gets EIO from memcpy on write?
> >> > > > >
> >> > > > > You won't get an MCE for a store;  only loads generate them.
> >> > > > >
> >> > > > > Won't fallocate FL_ZERO_RANGE clear bad blocks when mounted with
> >> > > > > -o dax?
> >> > > >
> >> > > > Not necessarily; XFS usually implements this by punching out the
> >> > > > range
> >> > > > and then reallocating it as unwritten blocks.
> >> > > >
> >> > >
> >> > > That does clear the error because the unwritten blocks are zeroed and
> >> > > errors cleared when they become allocated again.
> >> >
> >> > Yes, the problem was that writes won't clear errors. zeroing through
> >> > either hole-punch, truncate, unlinking the file should all work
> >> > (assuming the hole-punch or truncate ranges wholly contain the
> >> > 'badblock' sector).
> >>
> >> Let me repeat my question: You have mentioned that if we do IO through DAX,
> >> writes won't clear errors and we should fall back to normal block path to
> >> do write to clear the error. What does prevent us from directly clearing
> >> the error from DAX path?
> >>
> > With DAX, all IO goes through DAX paths. There are two cases:
> > 1. mmap and loads/stores: Obviously there is no kernel intervention
> > here, and no badblocks handling is possible.
> > 2. read() or write() IO: In the absence of dax, this would go through
> > the bio submission path, through the pmem driver, and that would handle
> > error clearing. With DAX, this goes through dax_iomap_actor, which also
> > doesn't go through the pmem driver (it does a dax mapping, followed by
> > essentially memcpy), and hence cannot handle badblocks.
> 
> Hmm, that may no longer be true after my changes to push dax flushing
> to the driver. I.e. we could have a copy_from_iter() implementation
> that attempts to clear errors... I'll get that series out and we can
> discuss there.

Yeah, that was precisely my point - doing copy_from_iter() that clears
errors should be possible...
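
Something along these lines, presumably. This is only a rough sketch inside
the pmem driver context: copy_from_iter() and badblocks_check() are real
APIs, struct pmem_device is the driver's own type with its badblocks list in
->bb, and pmem_clear_poison() is just a stand-in for whatever driver-level
(ACPI) clearing hook Dan's series ends up using.

static size_t pmem_copy_from_iter(struct pmem_device *pmem, phys_addr_t off,
				  void *pmem_addr, size_t bytes,
				  struct iov_iter *i)
{
	sector_t first_bad;
	int num_bad;
	size_t copied;

	copied = copy_from_iter(pmem_addr, bytes, i);

	/* if the written range covered known-bad sectors, try to clear them */
	if (badblocks_check(&pmem->bb, off >> 9, bytes >> 9,
			    &first_bad, &num_bad))
		pmem_clear_poison(pmem, off, bytes);	/* hypothetical hook */

	return copied;
}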

Honza
-- 
Jan Kara 
SUSE Labs, CR