Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-29 Thread Jason Dillaman
On Mon, Jul 29, 2019 at 5:40 AM Stefano Garzarella  wrote:
>
> On Fri, Jul 26, 2019 at 08:46:56AM -0400, Jason Dillaman wrote:
> > On Fri, Jul 26, 2019 at 4:48 AM Stefano Garzarella  
> > wrote:
> > >
> > > On Thu, Jul 25, 2019 at 09:30:30AM -0400, Jason Dillaman wrote:
> > > > On Thu, Jul 25, 2019 at 4:13 AM Stefano Garzarella 
> > > >  wrote:
> > > > >
> > > > > On Wed, Jul 24, 2019 at 01:48:42PM -0400, Jason Dillaman wrote:
> > > > > > On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella 
> > > > > >  wrote:
> > > > > > >
> > > > > > > This patch adds the support of preallocation (off/full) for the 
> > > > > > > RBD
> > > > > > > block driver.
> > > > > > > If rbd_writesame() is available and supports zeroed buffers, we 
> > > > > > > use
> > > > > > > it to quickly fill the image when full preallocation is required.
> > > > > > >
> > > > > > > Signed-off-by: Stefano Garzarella 
> > > > > > > ---
> > > > > > > v3:
> > > > > > >  - rebased on master
> > > > > > >  - filled with zeroed buffer [Max]
> > > > > > >  - used rbd_writesame() only when we can disable the discard of 
> > > > > > > zeroed
> > > > > > >buffers
> > > > > > >  - added 'since: 4.2' in qapi/block-core.json [Max]
> > > > > > >  - used buffer as large as the "stripe unit"
> > > > > > > ---
> > > > > > >  block/rbd.c  | 202 
> > > > > > > ---
> > > > > > >  qapi/block-core.json |   5 +-
> > > > > > >  2 files changed, 192 insertions(+), 15 deletions(-)
> > > > > > >
> > > > > > > diff --git a/block/rbd.c b/block/rbd.c
> > > > > > > index 59757b3120..d923a5a26c 100644
> > > > > > > --- a/block/rbd.c
> > > > > > > +++ b/block/rbd.c
> > > > > > > @@ -64,6 +64,7 @@
> > > > > > >  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> > > > > > >
> > > > > > >  #define RBD_MAX_SNAPS 100
> > > > > > > +#define RBD_DEFAULT_CONCURRENT_OPS 10
> > > > > > >
> > > > > > >  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
> > > > > > >  #ifdef LIBRBD_SUPPORTS_IOVEC
> > > > > > > @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
> > > > > > >  char *image_name;
> > > > > > >  char *snap;
> > > > > > >  uint64_t image_size;
> > > > > > > +bool ws_zero_supported; /* rbd_writesame() supports zeroed 
> > > > > > > buffers */
> > > > > > >  } BDRVRBDState;
> > > > > > >
> > > > > > >  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t 
> > > > > > > *io_ctx,
> > > > > > > @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, 
> > > > > > > int64_t offs)
> > > > > > >  }
> > > > > > >  }
> > > > > > >
> > > > > > > +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> > > > > > > +{
> > > > > > > +char buf[16];
> > > > > > > +int ret, max_concurrent_ops;
> > > > > > > +
> > > > > > > +ret = rados_conf_get(cluster, 
> > > > > > > "rbd_concurrent_management_ops", buf,
> > > > > > > + sizeof(buf));
> > > > > > > +if (ret < 0) {
> > > > > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > > > > +}
> > > > > > > +
> > > > > > > +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> > > > > > > +if (ret < 0) {
> > > > > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > > > > +}
> > > > > > > +
> > > > > > > +return max_concurrent_ops;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t 
> > > > > > > image,
> > > > > > > +int64_t offset, PreallocMode 
> > > > > > > prealloc,
> > > > > > > +bool ws_zero_supported, Error 
> > > > > > > **errp)
> > > > > > > +{
> > > > > > > +uint64_t current_length;
> > > > > > > +char *buf = NULL;
> > > > > > > +int ret;
> > > > > > > +
> > > > > > > +ret = rbd_get_size(image, _length);
> > > > > > > +if (ret < 0) {
> > > > > > > +error_setg_errno(errp, -ret, "Failed to get file 
> > > > > > > length");
> > > > > > > +goto out;
> > > > > > > +}
> > > > > > > +
> > > > > > > +if (current_length > offset && prealloc != 
> > > > > > > PREALLOC_MODE_OFF) {
> > > > > > > +error_setg(errp, "Cannot use preallocation for shrinking 
> > > > > > > files");
> > > > > > > +ret = -ENOTSUP;
> > > > > > > +goto out;
> > > > > > > +}
> > > > > > > +
> > > > > > > +switch (prealloc) {
> > > > > > > +case PREALLOC_MODE_FULL: {
> > > > > > > +uint64_t buf_size, current_offset = current_length;
> > > > > > > +ssize_t bytes;
> > > > > > > +
> > > > > > > +ret = rbd_get_stripe_unit(image, _size);
> > > > > > > +if (ret < 0) {
> > > > > > > +error_setg_errno(errp, -ret, "Failed to get stripe 
> > > > > > > unit");
> > > > > > > +goto out;
> > > > > > > +}
> > > > > > > +
> > > > > > > +ret = rbd_resize(image, offset);
> > > > > > > +if (ret < 0) {
> > > > > > > +error_setg_errno(errp, -ret, 

Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-29 Thread Stefano Garzarella
On Fri, Jul 26, 2019 at 08:46:56AM -0400, Jason Dillaman wrote:
> On Fri, Jul 26, 2019 at 4:48 AM Stefano Garzarella  
> wrote:
> >
> > On Thu, Jul 25, 2019 at 09:30:30AM -0400, Jason Dillaman wrote:
> > > On Thu, Jul 25, 2019 at 4:13 AM Stefano Garzarella  
> > > wrote:
> > > >
> > > > On Wed, Jul 24, 2019 at 01:48:42PM -0400, Jason Dillaman wrote:
> > > > > On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella 
> > > > >  wrote:
> > > > > >
> > > > > > This patch adds the support of preallocation (off/full) for the RBD
> > > > > > block driver.
> > > > > > If rbd_writesame() is available and supports zeroed buffers, we use
> > > > > > it to quickly fill the image when full preallocation is required.
> > > > > >
> > > > > > Signed-off-by: Stefano Garzarella 
> > > > > > ---
> > > > > > v3:
> > > > > >  - rebased on master
> > > > > >  - filled with zeroed buffer [Max]
> > > > > >  - used rbd_writesame() only when we can disable the discard of 
> > > > > > zeroed
> > > > > >buffers
> > > > > >  - added 'since: 4.2' in qapi/block-core.json [Max]
> > > > > >  - used buffer as large as the "stripe unit"
> > > > > > ---
> > > > > >  block/rbd.c  | 202 
> > > > > > ---
> > > > > >  qapi/block-core.json |   5 +-
> > > > > >  2 files changed, 192 insertions(+), 15 deletions(-)
> > > > > >
> > > > > > diff --git a/block/rbd.c b/block/rbd.c
> > > > > > index 59757b3120..d923a5a26c 100644
> > > > > > --- a/block/rbd.c
> > > > > > +++ b/block/rbd.c
> > > > > > @@ -64,6 +64,7 @@
> > > > > >  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> > > > > >
> > > > > >  #define RBD_MAX_SNAPS 100
> > > > > > +#define RBD_DEFAULT_CONCURRENT_OPS 10
> > > > > >
> > > > > >  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
> > > > > >  #ifdef LIBRBD_SUPPORTS_IOVEC
> > > > > > @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
> > > > > >  char *image_name;
> > > > > >  char *snap;
> > > > > >  uint64_t image_size;
> > > > > > +bool ws_zero_supported; /* rbd_writesame() supports zeroed 
> > > > > > buffers */
> > > > > >  } BDRVRBDState;
> > > > > >
> > > > > >  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t 
> > > > > > *io_ctx,
> > > > > > @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, 
> > > > > > int64_t offs)
> > > > > >  }
> > > > > >  }
> > > > > >
> > > > > > +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> > > > > > +{
> > > > > > +char buf[16];
> > > > > > +int ret, max_concurrent_ops;
> > > > > > +
> > > > > > +ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", 
> > > > > > buf,
> > > > > > + sizeof(buf));
> > > > > > +if (ret < 0) {
> > > > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > > > +}
> > > > > > +
> > > > > > +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> > > > > > +if (ret < 0) {
> > > > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > > > +}
> > > > > > +
> > > > > > +return max_concurrent_ops;
> > > > > > +}
> > > > > > +
> > > > > > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> > > > > > +int64_t offset, PreallocMode 
> > > > > > prealloc,
> > > > > > +bool ws_zero_supported, Error 
> > > > > > **errp)
> > > > > > +{
> > > > > > +uint64_t current_length;
> > > > > > +char *buf = NULL;
> > > > > > +int ret;
> > > > > > +
> > > > > > +ret = rbd_get_size(image, _length);
> > > > > > +if (ret < 0) {
> > > > > > +error_setg_errno(errp, -ret, "Failed to get file length");
> > > > > > +goto out;
> > > > > > +}
> > > > > > +
> > > > > > +if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > > > > > +error_setg(errp, "Cannot use preallocation for shrinking 
> > > > > > files");
> > > > > > +ret = -ENOTSUP;
> > > > > > +goto out;
> > > > > > +}
> > > > > > +
> > > > > > +switch (prealloc) {
> > > > > > +case PREALLOC_MODE_FULL: {
> > > > > > +uint64_t buf_size, current_offset = current_length;
> > > > > > +ssize_t bytes;
> > > > > > +
> > > > > > +ret = rbd_get_stripe_unit(image, _size);
> > > > > > +if (ret < 0) {
> > > > > > +error_setg_errno(errp, -ret, "Failed to get stripe 
> > > > > > unit");
> > > > > > +goto out;
> > > > > > +}
> > > > > > +
> > > > > > +ret = rbd_resize(image, offset);
> > > > > > +if (ret < 0) {
> > > > > > +error_setg_errno(errp, -ret, "Failed to resize file");
> > > > > > +goto out;
> > > > > > +}
> > > > > > +
> > > > > > +buf = g_malloc0(buf_size);
> > > > > > +
> > > > > > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > > > > > +if (ws_zero_supported) {
> > > > > > +uint64_t writesame_max_size;
> > > > > > +int max_concurrent_ops;
> > > > > > +
> > > 

Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-26 Thread Jason Dillaman
On Fri, Jul 26, 2019 at 4:48 AM Stefano Garzarella  wrote:
>
> On Thu, Jul 25, 2019 at 09:30:30AM -0400, Jason Dillaman wrote:
> > On Thu, Jul 25, 2019 at 4:13 AM Stefano Garzarella  
> > wrote:
> > >
> > > On Wed, Jul 24, 2019 at 01:48:42PM -0400, Jason Dillaman wrote:
> > > > On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella 
> > > >  wrote:
> > > > >
> > > > > This patch adds the support of preallocation (off/full) for the RBD
> > > > > block driver.
> > > > > If rbd_writesame() is available and supports zeroed buffers, we use
> > > > > it to quickly fill the image when full preallocation is required.
> > > > >
> > > > > Signed-off-by: Stefano Garzarella 
> > > > > ---
> > > > > v3:
> > > > >  - rebased on master
> > > > >  - filled with zeroed buffer [Max]
> > > > >  - used rbd_writesame() only when we can disable the discard of zeroed
> > > > >buffers
> > > > >  - added 'since: 4.2' in qapi/block-core.json [Max]
> > > > >  - used buffer as large as the "stripe unit"
> > > > > ---
> > > > >  block/rbd.c  | 202 
> > > > > ---
> > > > >  qapi/block-core.json |   5 +-
> > > > >  2 files changed, 192 insertions(+), 15 deletions(-)
> > > > >
> > > > > diff --git a/block/rbd.c b/block/rbd.c
> > > > > index 59757b3120..d923a5a26c 100644
> > > > > --- a/block/rbd.c
> > > > > +++ b/block/rbd.c
> > > > > @@ -64,6 +64,7 @@
> > > > >  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> > > > >
> > > > >  #define RBD_MAX_SNAPS 100
> > > > > +#define RBD_DEFAULT_CONCURRENT_OPS 10
> > > > >
> > > > >  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
> > > > >  #ifdef LIBRBD_SUPPORTS_IOVEC
> > > > > @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
> > > > >  char *image_name;
> > > > >  char *snap;
> > > > >  uint64_t image_size;
> > > > > +bool ws_zero_supported; /* rbd_writesame() supports zeroed 
> > > > > buffers */
> > > > >  } BDRVRBDState;
> > > > >
> > > > >  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
> > > > > @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, 
> > > > > int64_t offs)
> > > > >  }
> > > > >  }
> > > > >
> > > > > +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> > > > > +{
> > > > > +char buf[16];
> > > > > +int ret, max_concurrent_ops;
> > > > > +
> > > > > +ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", 
> > > > > buf,
> > > > > + sizeof(buf));
> > > > > +if (ret < 0) {
> > > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > > +}
> > > > > +
> > > > > +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> > > > > +if (ret < 0) {
> > > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > > +}
> > > > > +
> > > > > +return max_concurrent_ops;
> > > > > +}
> > > > > +
> > > > > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> > > > > +int64_t offset, PreallocMode 
> > > > > prealloc,
> > > > > +bool ws_zero_supported, Error **errp)
> > > > > +{
> > > > > +uint64_t current_length;
> > > > > +char *buf = NULL;
> > > > > +int ret;
> > > > > +
> > > > > +ret = rbd_get_size(image, _length);
> > > > > +if (ret < 0) {
> > > > > +error_setg_errno(errp, -ret, "Failed to get file length");
> > > > > +goto out;
> > > > > +}
> > > > > +
> > > > > +if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > > > > +error_setg(errp, "Cannot use preallocation for shrinking 
> > > > > files");
> > > > > +ret = -ENOTSUP;
> > > > > +goto out;
> > > > > +}
> > > > > +
> > > > > +switch (prealloc) {
> > > > > +case PREALLOC_MODE_FULL: {
> > > > > +uint64_t buf_size, current_offset = current_length;
> > > > > +ssize_t bytes;
> > > > > +
> > > > > +ret = rbd_get_stripe_unit(image, _size);
> > > > > +if (ret < 0) {
> > > > > +error_setg_errno(errp, -ret, "Failed to get stripe 
> > > > > unit");
> > > > > +goto out;
> > > > > +}
> > > > > +
> > > > > +ret = rbd_resize(image, offset);
> > > > > +if (ret < 0) {
> > > > > +error_setg_errno(errp, -ret, "Failed to resize file");
> > > > > +goto out;
> > > > > +}
> > > > > +
> > > > > +buf = g_malloc0(buf_size);
> > > > > +
> > > > > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > > > > +if (ws_zero_supported) {
> > > > > +uint64_t writesame_max_size;
> > > > > +int max_concurrent_ops;
> > > > > +
> > > > > +max_concurrent_ops = 
> > > > > qemu_rbd_get_max_concurrent_ops(cluster);
> > > > > +/*
> > > > > + * We limit the rbd_writesame() size to avoid to spawn 
> > > > > more then
> > > > > + * 'rbd_concurrent_management_ops' concurrent operations.
> > > > > + */
> > > > > +

Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-26 Thread Stefano Garzarella
On Thu, Jul 25, 2019 at 09:30:30AM -0400, Jason Dillaman wrote:
> On Thu, Jul 25, 2019 at 4:13 AM Stefano Garzarella  
> wrote:
> >
> > On Wed, Jul 24, 2019 at 01:48:42PM -0400, Jason Dillaman wrote:
> > > On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella  
> > > wrote:
> > > >
> > > > This patch adds the support of preallocation (off/full) for the RBD
> > > > block driver.
> > > > If rbd_writesame() is available and supports zeroed buffers, we use
> > > > it to quickly fill the image when full preallocation is required.
> > > >
> > > > Signed-off-by: Stefano Garzarella 
> > > > ---
> > > > v3:
> > > >  - rebased on master
> > > >  - filled with zeroed buffer [Max]
> > > >  - used rbd_writesame() only when we can disable the discard of zeroed
> > > >buffers
> > > >  - added 'since: 4.2' in qapi/block-core.json [Max]
> > > >  - used buffer as large as the "stripe unit"
> > > > ---
> > > >  block/rbd.c  | 202 ---
> > > >  qapi/block-core.json |   5 +-
> > > >  2 files changed, 192 insertions(+), 15 deletions(-)
> > > >
> > > > diff --git a/block/rbd.c b/block/rbd.c
> > > > index 59757b3120..d923a5a26c 100644
> > > > --- a/block/rbd.c
> > > > +++ b/block/rbd.c
> > > > @@ -64,6 +64,7 @@
> > > >  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> > > >
> > > >  #define RBD_MAX_SNAPS 100
> > > > +#define RBD_DEFAULT_CONCURRENT_OPS 10
> > > >
> > > >  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
> > > >  #ifdef LIBRBD_SUPPORTS_IOVEC
> > > > @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
> > > >  char *image_name;
> > > >  char *snap;
> > > >  uint64_t image_size;
> > > > +bool ws_zero_supported; /* rbd_writesame() supports zeroed buffers 
> > > > */
> > > >  } BDRVRBDState;
> > > >
> > > >  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
> > > > @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t 
> > > > offs)
> > > >  }
> > > >  }
> > > >
> > > > +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> > > > +{
> > > > +char buf[16];
> > > > +int ret, max_concurrent_ops;
> > > > +
> > > > +ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", buf,
> > > > + sizeof(buf));
> > > > +if (ret < 0) {
> > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > +}
> > > > +
> > > > +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> > > > +if (ret < 0) {
> > > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > > +}
> > > > +
> > > > +return max_concurrent_ops;
> > > > +}
> > > > +
> > > > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> > > > +int64_t offset, PreallocMode prealloc,
> > > > +bool ws_zero_supported, Error **errp)
> > > > +{
> > > > +uint64_t current_length;
> > > > +char *buf = NULL;
> > > > +int ret;
> > > > +
> > > > +ret = rbd_get_size(image, _length);
> > > > +if (ret < 0) {
> > > > +error_setg_errno(errp, -ret, "Failed to get file length");
> > > > +goto out;
> > > > +}
> > > > +
> > > > +if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > > > +error_setg(errp, "Cannot use preallocation for shrinking 
> > > > files");
> > > > +ret = -ENOTSUP;
> > > > +goto out;
> > > > +}
> > > > +
> > > > +switch (prealloc) {
> > > > +case PREALLOC_MODE_FULL: {
> > > > +uint64_t buf_size, current_offset = current_length;
> > > > +ssize_t bytes;
> > > > +
> > > > +ret = rbd_get_stripe_unit(image, _size);
> > > > +if (ret < 0) {
> > > > +error_setg_errno(errp, -ret, "Failed to get stripe unit");
> > > > +goto out;
> > > > +}
> > > > +
> > > > +ret = rbd_resize(image, offset);
> > > > +if (ret < 0) {
> > > > +error_setg_errno(errp, -ret, "Failed to resize file");
> > > > +goto out;
> > > > +}
> > > > +
> > > > +buf = g_malloc0(buf_size);
> > > > +
> > > > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > > > +if (ws_zero_supported) {
> > > > +uint64_t writesame_max_size;
> > > > +int max_concurrent_ops;
> > > > +
> > > > +max_concurrent_ops = 
> > > > qemu_rbd_get_max_concurrent_ops(cluster);
> > > > +/*
> > > > + * We limit the rbd_writesame() size to avoid to spawn 
> > > > more then
> > > > + * 'rbd_concurrent_management_ops' concurrent operations.
> > > > + */
> > > > +writesame_max_size = MIN(buf_size * max_concurrent_ops, 
> > > > INT_MAX);
> > >
> > > In the most efficient world, the 'buf_size' would be some small, fixed
> > > power of 2 value (like 512 bytes) since there isn't much need to send
> > > extra zeroes. You would then want to writesame the full stripe period
> > > (if possible), where a stripe period is the 

Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-25 Thread Jason Dillaman
On Thu, Jul 25, 2019 at 4:13 AM Stefano Garzarella  wrote:
>
> On Wed, Jul 24, 2019 at 01:48:42PM -0400, Jason Dillaman wrote:
> > On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella  
> > wrote:
> > >
> > > This patch adds the support of preallocation (off/full) for the RBD
> > > block driver.
> > > If rbd_writesame() is available and supports zeroed buffers, we use
> > > it to quickly fill the image when full preallocation is required.
> > >
> > > Signed-off-by: Stefano Garzarella 
> > > ---
> > > v3:
> > >  - rebased on master
> > >  - filled with zeroed buffer [Max]
> > >  - used rbd_writesame() only when we can disable the discard of zeroed
> > >buffers
> > >  - added 'since: 4.2' in qapi/block-core.json [Max]
> > >  - used buffer as large as the "stripe unit"
> > > ---
> > >  block/rbd.c  | 202 ---
> > >  qapi/block-core.json |   5 +-
> > >  2 files changed, 192 insertions(+), 15 deletions(-)
> > >
> > > diff --git a/block/rbd.c b/block/rbd.c
> > > index 59757b3120..d923a5a26c 100644
> > > --- a/block/rbd.c
> > > +++ b/block/rbd.c
> > > @@ -64,6 +64,7 @@
> > >  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> > >
> > >  #define RBD_MAX_SNAPS 100
> > > +#define RBD_DEFAULT_CONCURRENT_OPS 10
> > >
> > >  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
> > >  #ifdef LIBRBD_SUPPORTS_IOVEC
> > > @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
> > >  char *image_name;
> > >  char *snap;
> > >  uint64_t image_size;
> > > +bool ws_zero_supported; /* rbd_writesame() supports zeroed buffers */
> > >  } BDRVRBDState;
> > >
> > >  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
> > > @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t 
> > > offs)
> > >  }
> > >  }
> > >
> > > +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> > > +{
> > > +char buf[16];
> > > +int ret, max_concurrent_ops;
> > > +
> > > +ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", buf,
> > > + sizeof(buf));
> > > +if (ret < 0) {
> > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > +}
> > > +
> > > +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> > > +if (ret < 0) {
> > > +return RBD_DEFAULT_CONCURRENT_OPS;
> > > +}
> > > +
> > > +return max_concurrent_ops;
> > > +}
> > > +
> > > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> > > +int64_t offset, PreallocMode prealloc,
> > > +bool ws_zero_supported, Error **errp)
> > > +{
> > > +uint64_t current_length;
> > > +char *buf = NULL;
> > > +int ret;
> > > +
> > > +ret = rbd_get_size(image, _length);
> > > +if (ret < 0) {
> > > +error_setg_errno(errp, -ret, "Failed to get file length");
> > > +goto out;
> > > +}
> > > +
> > > +if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > > +error_setg(errp, "Cannot use preallocation for shrinking files");
> > > +ret = -ENOTSUP;
> > > +goto out;
> > > +}
> > > +
> > > +switch (prealloc) {
> > > +case PREALLOC_MODE_FULL: {
> > > +uint64_t buf_size, current_offset = current_length;
> > > +ssize_t bytes;
> > > +
> > > +ret = rbd_get_stripe_unit(image, _size);
> > > +if (ret < 0) {
> > > +error_setg_errno(errp, -ret, "Failed to get stripe unit");
> > > +goto out;
> > > +}
> > > +
> > > +ret = rbd_resize(image, offset);
> > > +if (ret < 0) {
> > > +error_setg_errno(errp, -ret, "Failed to resize file");
> > > +goto out;
> > > +}
> > > +
> > > +buf = g_malloc0(buf_size);
> > > +
> > > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > > +if (ws_zero_supported) {
> > > +uint64_t writesame_max_size;
> > > +int max_concurrent_ops;
> > > +
> > > +max_concurrent_ops = 
> > > qemu_rbd_get_max_concurrent_ops(cluster);
> > > +/*
> > > + * We limit the rbd_writesame() size to avoid to spawn more 
> > > then
> > > + * 'rbd_concurrent_management_ops' concurrent operations.
> > > + */
> > > +writesame_max_size = MIN(buf_size * max_concurrent_ops, 
> > > INT_MAX);
> >
> > In the most efficient world, the 'buf_size' would be some small, fixed
> > power of 2 value (like 512 bytes) since there isn't much need to send
> > extra zeroes. You would then want to writesame the full stripe period
> > (if possible), where a stripe period is the data block object size
> > (defaults to 4MiB and is availble via 'rbd_stat') * the stripe count.
> > In this case, the stripe count becomes the number of in-flight IOs.
> > Therefore, you could substitute its value w/ the max_concurrent_ops to
> > ensure you are issuing exactly max_concurrent_ops IOs per
> > rbd_writesame call.
> >
>
> 

Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-25 Thread Stefano Garzarella
On Wed, Jul 24, 2019 at 01:48:42PM -0400, Jason Dillaman wrote:
> On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella  
> wrote:
> >
> > This patch adds the support of preallocation (off/full) for the RBD
> > block driver.
> > If rbd_writesame() is available and supports zeroed buffers, we use
> > it to quickly fill the image when full preallocation is required.
> >
> > Signed-off-by: Stefano Garzarella 
> > ---
> > v3:
> >  - rebased on master
> >  - filled with zeroed buffer [Max]
> >  - used rbd_writesame() only when we can disable the discard of zeroed
> >buffers
> >  - added 'since: 4.2' in qapi/block-core.json [Max]
> >  - used buffer as large as the "stripe unit"
> > ---
> >  block/rbd.c  | 202 ---
> >  qapi/block-core.json |   5 +-
> >  2 files changed, 192 insertions(+), 15 deletions(-)
> >
> > diff --git a/block/rbd.c b/block/rbd.c
> > index 59757b3120..d923a5a26c 100644
> > --- a/block/rbd.c
> > +++ b/block/rbd.c
> > @@ -64,6 +64,7 @@
> >  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
> >
> >  #define RBD_MAX_SNAPS 100
> > +#define RBD_DEFAULT_CONCURRENT_OPS 10
> >
> >  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
> >  #ifdef LIBRBD_SUPPORTS_IOVEC
> > @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
> >  char *image_name;
> >  char *snap;
> >  uint64_t image_size;
> > +bool ws_zero_supported; /* rbd_writesame() supports zeroed buffers */
> >  } BDRVRBDState;
> >
> >  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
> > @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t 
> > offs)
> >  }
> >  }
> >
> > +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> > +{
> > +char buf[16];
> > +int ret, max_concurrent_ops;
> > +
> > +ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", buf,
> > + sizeof(buf));
> > +if (ret < 0) {
> > +return RBD_DEFAULT_CONCURRENT_OPS;
> > +}
> > +
> > +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> > +if (ret < 0) {
> > +return RBD_DEFAULT_CONCURRENT_OPS;
> > +}
> > +
> > +return max_concurrent_ops;
> > +}
> > +
> > +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> > +int64_t offset, PreallocMode prealloc,
> > +bool ws_zero_supported, Error **errp)
> > +{
> > +uint64_t current_length;
> > +char *buf = NULL;
> > +int ret;
> > +
> > +ret = rbd_get_size(image, _length);
> > +if (ret < 0) {
> > +error_setg_errno(errp, -ret, "Failed to get file length");
> > +goto out;
> > +}
> > +
> > +if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> > +error_setg(errp, "Cannot use preallocation for shrinking files");
> > +ret = -ENOTSUP;
> > +goto out;
> > +}
> > +
> > +switch (prealloc) {
> > +case PREALLOC_MODE_FULL: {
> > +uint64_t buf_size, current_offset = current_length;
> > +ssize_t bytes;
> > +
> > +ret = rbd_get_stripe_unit(image, _size);
> > +if (ret < 0) {
> > +error_setg_errno(errp, -ret, "Failed to get stripe unit");
> > +goto out;
> > +}
> > +
> > +ret = rbd_resize(image, offset);
> > +if (ret < 0) {
> > +error_setg_errno(errp, -ret, "Failed to resize file");
> > +goto out;
> > +}
> > +
> > +buf = g_malloc0(buf_size);
> > +
> > +#ifdef LIBRBD_SUPPORTS_WRITESAME
> > +if (ws_zero_supported) {
> > +uint64_t writesame_max_size;
> > +int max_concurrent_ops;
> > +
> > +max_concurrent_ops = qemu_rbd_get_max_concurrent_ops(cluster);
> > +/*
> > + * We limit the rbd_writesame() size to avoid to spawn more 
> > then
> > + * 'rbd_concurrent_management_ops' concurrent operations.
> > + */
> > +writesame_max_size = MIN(buf_size * max_concurrent_ops, 
> > INT_MAX);
> 
> In the most efficient world, the 'buf_size' would be some small, fixed
> power of 2 value (like 512 bytes) since there isn't much need to send
> extra zeroes. You would then want to writesame the full stripe period
> (if possible), where a stripe period is the data block object size
> (defaults to 4MiB and is availble via 'rbd_stat') * the stripe count.
> In this case, the stripe count becomes the number of in-flight IOs.
> Therefore, you could substitute its value w/ the max_concurrent_ops to
> ensure you are issuing exactly max_concurrent_ops IOs per
> rbd_writesame call.
> 

Initially, I had a fixed buffer size to 4 KiB, but I noted that, when
we didn't use writesame, the rbd_write() was very slow, so I used the
stripe unit as a buffer size.

Do you think is better to have a small buffer (512 byte) when we use
writesame or a 'stripe unit' buffer when we can't use it?


> > +
> > +while 

Re: [Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-24 Thread Jason Dillaman
On Tue, Jul 23, 2019 at 3:13 AM Stefano Garzarella  wrote:
>
> This patch adds the support of preallocation (off/full) for the RBD
> block driver.
> If rbd_writesame() is available and supports zeroed buffers, we use
> it to quickly fill the image when full preallocation is required.
>
> Signed-off-by: Stefano Garzarella 
> ---
> v3:
>  - rebased on master
>  - filled with zeroed buffer [Max]
>  - used rbd_writesame() only when we can disable the discard of zeroed
>buffers
>  - added 'since: 4.2' in qapi/block-core.json [Max]
>  - used buffer as large as the "stripe unit"
> ---
>  block/rbd.c  | 202 ---
>  qapi/block-core.json |   5 +-
>  2 files changed, 192 insertions(+), 15 deletions(-)
>
> diff --git a/block/rbd.c b/block/rbd.c
> index 59757b3120..d923a5a26c 100644
> --- a/block/rbd.c
> +++ b/block/rbd.c
> @@ -64,6 +64,7 @@
>  #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
>
>  #define RBD_MAX_SNAPS 100
> +#define RBD_DEFAULT_CONCURRENT_OPS 10
>
>  /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
>  #ifdef LIBRBD_SUPPORTS_IOVEC
> @@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
>  char *image_name;
>  char *snap;
>  uint64_t image_size;
> +bool ws_zero_supported; /* rbd_writesame() supports zeroed buffers */
>  } BDRVRBDState;
>
>  static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
> @@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
>  }
>  }
>
> +static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
> +{
> +char buf[16];
> +int ret, max_concurrent_ops;
> +
> +ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", buf,
> + sizeof(buf));
> +if (ret < 0) {
> +return RBD_DEFAULT_CONCURRENT_OPS;
> +}
> +
> +ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
> +if (ret < 0) {
> +return RBD_DEFAULT_CONCURRENT_OPS;
> +}
> +
> +return max_concurrent_ops;
> +}
> +
> +static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
> +int64_t offset, PreallocMode prealloc,
> +bool ws_zero_supported, Error **errp)
> +{
> +uint64_t current_length;
> +char *buf = NULL;
> +int ret;
> +
> +ret = rbd_get_size(image, _length);
> +if (ret < 0) {
> +error_setg_errno(errp, -ret, "Failed to get file length");
> +goto out;
> +}
> +
> +if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
> +error_setg(errp, "Cannot use preallocation for shrinking files");
> +ret = -ENOTSUP;
> +goto out;
> +}
> +
> +switch (prealloc) {
> +case PREALLOC_MODE_FULL: {
> +uint64_t buf_size, current_offset = current_length;
> +ssize_t bytes;
> +
> +ret = rbd_get_stripe_unit(image, _size);
> +if (ret < 0) {
> +error_setg_errno(errp, -ret, "Failed to get stripe unit");
> +goto out;
> +}
> +
> +ret = rbd_resize(image, offset);
> +if (ret < 0) {
> +error_setg_errno(errp, -ret, "Failed to resize file");
> +goto out;
> +}
> +
> +buf = g_malloc0(buf_size);
> +
> +#ifdef LIBRBD_SUPPORTS_WRITESAME
> +if (ws_zero_supported) {
> +uint64_t writesame_max_size;
> +int max_concurrent_ops;
> +
> +max_concurrent_ops = qemu_rbd_get_max_concurrent_ops(cluster);
> +/*
> + * We limit the rbd_writesame() size to avoid to spawn more then
> + * 'rbd_concurrent_management_ops' concurrent operations.
> + */
> +writesame_max_size = MIN(buf_size * max_concurrent_ops, INT_MAX);

In the most efficient world, the 'buf_size' would be some small, fixed
power of 2 value (like 512 bytes) since there isn't much need to send
extra zeroes. You would then want to writesame the full stripe period
(if possible), where a stripe period is the data block object size
(defaults to 4MiB and is availble via 'rbd_stat') * the stripe count.
In this case, the stripe count becomes the number of in-flight IOs.
Therefore, you could substitute its value w/ the max_concurrent_ops to
ensure you are issuing exactly max_concurrent_ops IOs per
rbd_writesame call.

> +
> +while (offset - current_offset > buf_size) {
> +bytes = MIN(offset - current_offset, writesame_max_size);
> +/*
> + * rbd_writesame() supports only request where the size of 
> the
> + * operation is multiple of buffer size.
> + */
> +bytes -= bytes % buf_size;
> +
> +bytes = rbd_writesame(image, current_offset, bytes, buf,
> +  buf_size, 0);

If the RBD in-memory cache is enabled during this operation, the
writesame will effectively just be turned into a write. Therefore,
when 

[Qemu-devel] [PATCH v3] block/rbd: add preallocation support

2019-07-23 Thread Stefano Garzarella
This patch adds the support of preallocation (off/full) for the RBD
block driver.
If rbd_writesame() is available and supports zeroed buffers, we use
it to quickly fill the image when full preallocation is required.

Signed-off-by: Stefano Garzarella 
---
v3:
 - rebased on master
 - filled with zeroed buffer [Max]
 - used rbd_writesame() only when we can disable the discard of zeroed
   buffers
 - added 'since: 4.2' in qapi/block-core.json [Max]
 - used buffer as large as the "stripe unit"
---
 block/rbd.c  | 202 ---
 qapi/block-core.json |   5 +-
 2 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 59757b3120..d923a5a26c 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -64,6 +64,7 @@
 #define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER)
 
 #define RBD_MAX_SNAPS 100
+#define RBD_DEFAULT_CONCURRENT_OPS 10
 
 /* The LIBRBD_SUPPORTS_IOVEC is defined in librbd.h */
 #ifdef LIBRBD_SUPPORTS_IOVEC
@@ -104,6 +105,7 @@ typedef struct BDRVRBDState {
 char *image_name;
 char *snap;
 uint64_t image_size;
+bool ws_zero_supported; /* rbd_writesame() supports zeroed buffers */
 } BDRVRBDState;
 
 static int qemu_rbd_connect(rados_t *cluster, rados_ioctx_t *io_ctx,
@@ -333,6 +335,155 @@ static void qemu_rbd_memset(RADOSCB *rcb, int64_t offs)
 }
 }
 
+static int qemu_rbd_get_max_concurrent_ops(rados_t cluster)
+{
+char buf[16];
+int ret, max_concurrent_ops;
+
+ret = rados_conf_get(cluster, "rbd_concurrent_management_ops", buf,
+ sizeof(buf));
+if (ret < 0) {
+return RBD_DEFAULT_CONCURRENT_OPS;
+}
+
+ret = qemu_strtoi(buf, NULL, 10, _concurrent_ops);
+if (ret < 0) {
+return RBD_DEFAULT_CONCURRENT_OPS;
+}
+
+return max_concurrent_ops;
+}
+
+static int qemu_rbd_do_truncate(rados_t cluster, rbd_image_t image,
+int64_t offset, PreallocMode prealloc,
+bool ws_zero_supported, Error **errp)
+{
+uint64_t current_length;
+char *buf = NULL;
+int ret;
+
+ret = rbd_get_size(image, _length);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Failed to get file length");
+goto out;
+}
+
+if (current_length > offset && prealloc != PREALLOC_MODE_OFF) {
+error_setg(errp, "Cannot use preallocation for shrinking files");
+ret = -ENOTSUP;
+goto out;
+}
+
+switch (prealloc) {
+case PREALLOC_MODE_FULL: {
+uint64_t buf_size, current_offset = current_length;
+ssize_t bytes;
+
+ret = rbd_get_stripe_unit(image, _size);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Failed to get stripe unit");
+goto out;
+}
+
+ret = rbd_resize(image, offset);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Failed to resize file");
+goto out;
+}
+
+buf = g_malloc0(buf_size);
+
+#ifdef LIBRBD_SUPPORTS_WRITESAME
+if (ws_zero_supported) {
+uint64_t writesame_max_size;
+int max_concurrent_ops;
+
+max_concurrent_ops = qemu_rbd_get_max_concurrent_ops(cluster);
+/*
+ * We limit the rbd_writesame() size to avoid to spawn more then
+ * 'rbd_concurrent_management_ops' concurrent operations.
+ */
+writesame_max_size = MIN(buf_size * max_concurrent_ops, INT_MAX);
+
+while (offset - current_offset > buf_size) {
+bytes = MIN(offset - current_offset, writesame_max_size);
+/*
+ * rbd_writesame() supports only request where the size of the
+ * operation is multiple of buffer size.
+ */
+bytes -= bytes % buf_size;
+
+bytes = rbd_writesame(image, current_offset, bytes, buf,
+  buf_size, 0);
+if (bytes < 0) {
+ret = bytes;
+error_setg_errno(errp, -ret,
+ "Failed to write for preallocation");
+goto out;
+}
+
+current_offset += bytes;
+}
+}
+#endif /* LIBRBD_SUPPORTS_WRITESAME */
+
+while (current_offset < offset) {
+bytes = rbd_write(image, current_offset,
+  MIN(offset - current_offset, buf_size), buf);
+if (bytes < 0) {
+ret = bytes;
+error_setg_errno(errp, -ret,
+ "Failed to write for preallocation");
+goto out;
+}
+
+current_offset += bytes;
+}
+
+ret = rbd_flush(image);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Failed to flush the file");
+goto out;
+}
+
+break;
+}
+case PREALLOC_MODE_OFF: