Re: [Qemu-block] [PATCH 2/2] block: align bounce buffers to page

2015-05-12 Thread Denis V. Lunev

On 12/05/15 13:27, Kevin Wolf wrote:

On 12.05.2015 at 07:47, Denis V. Lunev wrote:

The following sequence
    int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
    for (i = 0; i < 100000; i++)
        write(fd, buf, 4096);
performs 5% better if buf is aligned to 4096 bytes.

The difference is quite reliable.
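For anyone who wants to reproduce the measurement, a self-contained version of that micro-benchmark could look roughly like this (a sketch, not the exact program used; the file name, alignment argument, and iteration count are illustrative):

    /* Sketch of the O_DIRECT write micro-benchmark: 4096-byte writes with
     * the buffer alignment taken from the command line (e.g. 512 vs 4096). */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        if (argc < 3) {
            fprintf(stderr, "usage: %s <file> <buffer-alignment>\n", argv[0]);
            return 1;
        }

        int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
        if (fd < 0) {
            perror("open");
            return 1;
        }

        void *buf;
        if (posix_memalign(&buf, strtoul(argv[2], NULL, 0), 4096)) {
            fprintf(stderr, "posix_memalign failed\n");
            return 1;
        }
        memset(buf, 0, 4096);

        for (int i = 0; i < 100000; i++) {
            if (write(fd, buf, 4096) != 4096) {
                perror("write");
                return 1;
            }
        }

        free(buf);
        close(fd);
        return 0;
    }

Timing two runs that differ only in the alignment argument (512 vs 4096) against the same file is what reproduces the difference described above.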

On the other hand, we do not want at the moment to enforce bounce
buffering if the guest request is aligned to 512 bytes.

The patch changes the default bounce buffer optimal alignment to
MAX(page size, 4k). 4k is chosen as the maximal known sector size on
real HDDs.
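For context, the bounce buffers in question are allocated through qemu_blockalign(), which simply asks the driver for its preferred alignment, so raising the default propagates to every bounce buffer; roughly:

    /* block.c (essentially unchanged by this patch): bounce buffer allocation
     * picks up whatever bdrv_opt_mem_align() reports, so the new default of
     * MAX(4096, getpagesize()) makes bounce buffers page-aligned. */
    void *qemu_blockalign(BlockDriverState *bs, size_t size)
    {
        return qemu_memalign(bdrv_opt_mem_align(bs), size);
    }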

The justification of the performance improvement is quite interesting.
From the kernel point of view each request to the disk was split
in two. This can be seen with blktrace like this:
   9,0   11  1     0.000000000 11151  Q  WS 312737792 + 1023 [qemu-img]
   9,0   11  2     0.000007938 11151  Q  WS 312738815 + 8 [qemu-img]
   9,0   11  3     0.000030735 11151  Q  WS 312738823 + 1016 [qemu-img]
   9,0   11  4     0.000032482 11151  Q  WS 312739839 + 8 [qemu-img]
   9,0   11  5     0.000041379 11151  Q  WS 312739847 + 1016 [qemu-img]
   9,0   11  6     0.000042818 11151  Q  WS 312740863 + 8 [qemu-img]
   9,0   11  7     0.000051236 11151  Q  WS 312740871 + 1017 [qemu-img]
   9,0    5  1     0.169071519 11151  Q  WS 312741888 + 1023 [qemu-img]
After the patch the pattern becomes normal:
   9,0    6  1     0.000000000 12422  Q  WS 314834944 + 1024 [qemu-img]
   9,0    6  2     0.000038527 12422  Q  WS 314835968 + 1024 [qemu-img]
   9,0    6  3     0.000072849 12422  Q  WS 314836992 + 1024 [qemu-img]
   9,0    6  4     0.000106276 12422  Q  WS 314838016 + 1024 [qemu-img]
and the number of requests sent to the disk (which can be counted from the
number of lines in the blktrace output) is reduced by about a factor of 2.

Both qemu-img and qemu-io are affected while qemu-kvm is not. The guest
does its job well and real requests come properly aligned (to the page size).

Signed-off-by: Denis V. Lunev d...@openvz.org
CC: Paolo Bonzini pbonz...@redhat.com
CC: Kevin Wolf kw...@redhat.com
CC: Stefan Hajnoczi stefa...@redhat.com
---
  block.c           |  8 ++++----
  block/io.c        |  2 +-
  block/raw-posix.c | 14 ++++++++------
  3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index e293907..325f727 100644
--- a/block.c
+++ b/block.c
@@ -106,8 +106,8 @@ int is_windows_drive(const char *filename)
 size_t bdrv_opt_mem_align(BlockDriverState *bs)
 {
     if (!bs || !bs->drv) {
-        /* 4k should be on the safe side */
-        return 4096;
+        /* page size or 4k (hdd sector size) should be on the safe side */
+        return MAX(4096, getpagesize());
     }
 
     return bs->bl.opt_mem_alignment;
@@ -116,8 +116,8 @@ size_t bdrv_opt_mem_align(BlockDriverState *bs)
 size_t bdrv_min_mem_align(BlockDriverState *bs)
 {
     if (!bs || !bs->drv) {
-        /* 4k should be on the safe side */
-        return 4096;
+        /* page size or 4k (hdd sector size) should be on the safe side */
+        return MAX(4096, getpagesize());
     }
 
     return bs->bl.min_mem_alignment;
diff --git a/block/io.c b/block/io.c
index 908a3d1..071652c 100644
--- a/block/io.c
+++ b/block/io.c
@@ -205,7 +205,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
     } else {
         bs->bl.min_mem_alignment = 512;
-        bs->bl.opt_mem_alignment = 512;
+        bs->bl.opt_mem_alignment = getpagesize();
     }
 
     if (bs->backing_hd) {


I think it would make more sense to keep this specific to the raw-posix
driver. After all, it's only the kernel page cache that we optimise
here. Other backends probably don't take advantage of page alignment.
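A rough sketch of what that raw-posix-only variant might look like (hypothetical, not part of the posted patch): leave the generic default at 4096 and let the file driver advertise page alignment from its refresh_limits callback.

    /* Hypothetical alternative confined to block/raw-posix.c: only the POSIX
     * file backend advertises page-sized optimal memory alignment, while the
     * generic default in block.c stays at 4096. */
    static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
    {
        BDRVRawState *s = bs->opaque;

        raw_probe_alignment(bs, s->fd, errp);
        bs->bl.min_mem_alignment = s->buf_align;
        bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize());
    }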


diff --git a/block/raw-posix.c b/block/raw-posix.c
index 7083924..04f3d4e 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -301,6 +301,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 {
     BDRVRawState *s = bs->opaque;
     char *buf;
+    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 
     /* For /dev/sg devices the alignment is not really used.
        With buffered I/O, we don't have any restrictions. */
@@ -330,9 +331,9 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
     /* If we could not get the sizes so far, we can only guess them */
     if (!s->buf_align) {
         size_t align;
-        buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
-        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
-            if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) {
+        buf = qemu_memalign(max_align, 2 * max_align);
+        for (align = 512; align <= max_align; align <<= 1) {
+            if (raw_is_io_aligned(fd, buf + align, max_align)) {
                 s->buf_align = align;
                 break;
             }
@@ -342,8 +343,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int 
fd, 
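For readers unfamiliar with the probe being modified here: raw_is_io_aligned() essentially issues a pread() on the O_DIRECT file descriptor and reports whether the kernel accepted it, so the loop finds the smallest buffer alignment that works. A standalone sketch of the same idea (not QEMU code):

    /* Standalone sketch of the alignment probe: find the smallest buffer
     * alignment accepted for direct I/O on a file descriptor opened with
     * O_DIRECT. */
    #include <stdlib.h>
    #include <unistd.h>

    static size_t probe_buf_align(int fd, size_t max_align)
    {
        char *buf;
        size_t found = 0;

        if (posix_memalign((void **)&buf, max_align, 2 * max_align)) {
            return 0;
        }
        for (size_t a = 512; a <= max_align; a <<= 1) {
            /* buf is max_align-aligned, so buf + a is aligned to exactly
             * a bytes (for a < max_align); the first accepted offset wins. */
            if (pread(fd, buf + a, max_align, 0) >= 0) {
                found = a;
                break;
            }
        }
        free(buf);
        return found;
    }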

Re: [Qemu-block] [PATCH 2/2] block: align bounce buffers to page

2015-05-12 Thread Kevin Wolf
On 12.05.2015 at 15:41, Denis V. Lunev wrote:
 The following sequence
 int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
 for (i = 0; i < 100000; i++)
 write(fd, buf, 4096);
 performs 5% better if buf is aligned to 4096 bytes.
 
 The difference is quite reliable.
 
 On the other hand, we do not want at the moment to enforce bounce
 buffering if the guest request is aligned to 512 bytes.
 
 The patch changes the default bounce buffer optimal alignment to
 MAX(page size, 4k). 4k is chosen as the maximal known sector size on
 real HDDs.
 
 The justification of the performance improvement is quite interesting.
 From the kernel point of view each request to the disk was split
 in two. This can be seen with blktrace like this:
    9,0   11  1     0.000000000 11151  Q  WS 312737792 + 1023 [qemu-img]
    9,0   11  2     0.000007938 11151  Q  WS 312738815 + 8 [qemu-img]
    9,0   11  3     0.000030735 11151  Q  WS 312738823 + 1016 [qemu-img]
    9,0   11  4     0.000032482 11151  Q  WS 312739839 + 8 [qemu-img]
    9,0   11  5     0.000041379 11151  Q  WS 312739847 + 1016 [qemu-img]
    9,0   11  6     0.000042818 11151  Q  WS 312740863 + 8 [qemu-img]
    9,0   11  7     0.000051236 11151  Q  WS 312740871 + 1017 [qemu-img]
    9,0    5  1     0.169071519 11151  Q  WS 312741888 + 1023 [qemu-img]
 After the patch the pattern becomes normal:
    9,0    6  1     0.000000000 12422  Q  WS 314834944 + 1024 [qemu-img]
    9,0    6  2     0.000038527 12422  Q  WS 314835968 + 1024 [qemu-img]
    9,0    6  3     0.000072849 12422  Q  WS 314836992 + 1024 [qemu-img]
    9,0    6  4     0.000106276 12422  Q  WS 314838016 + 1024 [qemu-img]
 and the number of requests sent to the disk (which can be counted from the
 number of lines in the blktrace output) is reduced by about a factor of 2.
 
 Both qemu-img and qemu-io are affected while qemu-kvm is not. The guest
 does its job well and real requests come properly aligned (to the page size).
 
 Signed-off-by: Denis V. Lunev d...@openvz.org
 CC: Paolo Bonzini pbonz...@redhat.com
 CC: Kevin Wolf kw...@redhat.com
 CC: Stefan Hajnoczi stefa...@redhat.com
 ---
   block.c           |  8 ++++----
   block/io.c        |  2 +-
   block/raw-posix.c | 15 +++++++++------
  3 files changed, 14 insertions(+), 11 deletions(-)
 
 diff --git a/block.c b/block.c
 index e293907..325f727 100644
 --- a/block.c
 +++ b/block.c
 @@ -106,8 +106,8 @@ int is_windows_drive(const char *filename)
  size_t bdrv_opt_mem_align(BlockDriverState *bs)
  {
      if (!bs || !bs->drv) {
 -        /* 4k should be on the safe side */
 -        return 4096;
 +        /* page size or 4k (hdd sector size) should be on the safe side */
 +        return MAX(4096, getpagesize());
      }
 
      return bs->bl.opt_mem_alignment;
 @@ -116,8 +116,8 @@ size_t bdrv_opt_mem_align(BlockDriverState *bs)
  size_t bdrv_min_mem_align(BlockDriverState *bs)
  {
      if (!bs || !bs->drv) {
 -        /* 4k should be on the safe side */
 -        return 4096;
 +        /* page size or 4k (hdd sector size) should be on the safe side */
 +        return MAX(4096, getpagesize());
      }
 
      return bs->bl.min_mem_alignment;
 diff --git a/block/io.c b/block/io.c
 index 908a3d1..071652c 100644
 --- a/block/io.c
 +++ b/block/io.c
 @@ -205,7 +205,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
          bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
      } else {
          bs->bl.min_mem_alignment = 512;
 -        bs->bl.opt_mem_alignment = 512;
 +        bs->bl.opt_mem_alignment = getpagesize();
      }
 
      if (bs->backing_hd) {
 diff --git a/block/raw-posix.c b/block/raw-posix.c
 index 7083924..4659552 100644
 --- a/block/raw-posix.c
 +++ b/block/raw-posix.c
 @@ -301,6 +301,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
  {
      BDRVRawState *s = bs->opaque;
      char *buf;
 +    size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize());
 
      /* For /dev/sg devices the alignment is not really used.
         With buffered I/O, we don't have any restrictions. */
 @@ -330,9 +331,9 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
      /* If we could not get the sizes so far, we can only guess them */
      if (!s->buf_align) {
          size_t align;
 -        buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
 -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
 -            if (raw_is_io_aligned(fd, buf + align, MAX_BLOCKSIZE)) {
 +        buf = qemu_memalign(max_align, 2 * max_align);
 +        for (align = 512; align <= max_align; align <<= 1) {
 +            if (raw_is_io_aligned(fd, buf + align, max_align)) {
                  s->buf_align = align;
                  break;
              }
 @@ -342,8 +343,8 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
 
      if (!bs->request_alignment) {
          size_t align;
 -        buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
 -        for (align = 512; align <= 

Re: [Qemu-block] [PATCH 2/2] block: align bounce buffers to page

2015-05-12 Thread Paolo Bonzini


On 12/05/2015 12:27, Kevin Wolf wrote:
 I think it would make more sense to keep this specific to the raw-posix
 driver. After all, it's only the kernel page cache that we optimise
 here. Other backends probably don't take advantage of page alignment.

I don't think it makes sense to keep it raw-posix-specific, though.
It's not the page cache that we optimize for, because this is with
O_DIRECT.  If anything, making it page aligned means that the buffer
spans one fewer physical page and thus it may economize a bit on TLB misses.

Paolo
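Paolo's observation can be made concrete with a small calculation: a 4096-byte buffer that is page-aligned occupies exactly one 4 KiB page, while the same buffer at an odd 512-byte alignment usually straddles two, which is one extra page for the kernel to pin and translate on every request. A tiny illustration (assuming 4 KiB pages; the addresses are arbitrary examples):

    /* How many pages does a buffer of len bytes starting at addr touch?
     * Illustrative only; assumes 4 KiB pages. */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096u

    static unsigned pages_touched(uintptr_t addr, size_t len)
    {
        uintptr_t first = addr / PAGE_SIZE;
        uintptr_t last = (addr + len - 1) / PAGE_SIZE;
        return (unsigned)(last - first + 1);
    }

    int main(void)
    {
        printf("4096-aligned 4k buffer: %u page(s)\n", pages_touched(0x10000, 4096)); /* 1 */
        printf("512-aligned 4k buffer:  %u page(s)\n", pages_touched(0x10200, 4096)); /* 2 */
        return 0;
    }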