Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking

2013-11-19 Thread Peter Lieven

On 18.11.2013 17:27, Anthony Liguori wrote:



On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de mailto:p...@kamp.de 
wrote:

 vnc_update_client currently scans the dirty bitmap of each client
 bitwise which is a very costly operation if only few bits are dirty.
 vnc_refresh_server_surface does almost the same.
 this patch optimizes both by utilizing the heavily optimized
 function find_next_bit to find the offset of the next dirty
 bit in the dirty bitmaps.

 Signed-off-by: Peter Lieven p...@kamp.de mailto:p...@kamp.de

Can you include performance data?



I made some aritificial analysis of vnc_update_client with the attached test 
code.

$ gcc -O2  -o vnc_perf vnc_perf.c
$ ./vnc_perf
All bits clean - vnc_update_client_new: 0.07 secs
 vnc_update_client_old: 10.82 secs

All bits dirty - vnc_update_client_new: 9.81 secs
 vnc_update_client_old: 20.00 secs

Few bits dirty - vnc_update_client_new: 0.08 secs
 vnc_update_client_old: 10.62 secs

find_and_clear_dirty_height() is still very slow, but I will look at this 
separately.

Peter

---

#define ITERATIONS 16*1024

#define VNC_MAX_WIDTH 2560
#define VNC_MAX_HEIGHT 2048

#define VNC_DIRTY_PIXELS_PER_BIT 16

/* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
#define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)

/* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to 
alignment */
#define VNC_DIRTY_BITS_PER_LINE(x) (sizeof(x) / VNC_MAX_HEIGHT * BITS_PER_BYTE)

#define BITS_PER_BYTE   8
#define BITS_PER_LONG   (sizeof (unsigned long) * BITS_PER_BYTE)

#define BITOP_WORD(nr)((nr) / BITS_PER_LONG)

#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))

#define BIT(nr)(1UL  (nr))
#define BIT_MASK(nr)(1UL  ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr)((nr) / BITS_PER_LONG)
#define BITS_TO_LONGS(nr)DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))

#define DECLARE_BITMAP(name,bits)  \
unsigned long name[BITS_TO_LONGS(bits)]

#define ctzl ctz64

#include stdio.h
#include stdint.h
#include string.h
#include time.h

static inline int ctz64(uint64_t val)
{
return val ? __builtin_ctzll(val) : 64;
}

DECLARE_BITMAP(dirty[VNC_MAX_HEIGHT], VNC_DIRTY_BITS);

unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
const unsigned long *p = addr + BITOP_WORD(offset);
unsigned long result = offset  ~(BITS_PER_LONG-1);
unsigned long tmp;

if (offset = size) {
return size;
}
size -= result;
offset %= BITS_PER_LONG;
if (offset) {
tmp = *(p++);
tmp = (~0UL  offset);
if (size  BITS_PER_LONG) {
goto found_first;
}
if (tmp) {
goto found_middle;
}
size -= BITS_PER_LONG;
result += BITS_PER_LONG;
}
while (size = 4*BITS_PER_LONG) {
unsigned long d1, d2, d3;
tmp = *p;
d1 = *(p+1);
d2 = *(p+2);
d3 = *(p+3);
if (tmp) {
goto found_middle;
}
if (d1 | d2 | d3) {
break;
}
p += 4;
result += 4*BITS_PER_LONG;
size -= 4*BITS_PER_LONG;
}
while (size = BITS_PER_LONG) {
if ((tmp = *(p++))) {
goto found_middle;
}
result += BITS_PER_LONG;
size -= BITS_PER_LONG;
}
if (!size) {
return result;
}
tmp = *p;

found_first:
tmp = (~0UL  (BITS_PER_LONG - size));
if (tmp == 0UL) {/* Are any bits set? */
return result + size;/* Nope. */
}
found_middle:
return result + ctzl(tmp);
}

static inline int test_bit(int nr, const unsigned long *addr)
{
return 1UL  (addr[BIT_WORD(nr)]  (nr  (BITS_PER_LONG-1)));
}

static inline void clear_bit(int nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = addr + BIT_WORD(nr);

*p = ~mask;
}

static inline void set_bit(int nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = addr + BIT_WORD(nr);

*p  |= mask;
}

static inline int test_and_clear_bit(int nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = addr + BIT_WORD(nr);
unsigned long old = *p;

*p = old  ~mask;
return (old  mask) != 0;
}

static int find_and_clear_dirty_height(int y, int last_x, int x, int height)
{
int h;

for (h = 1; h  (height - y); h++) {
int tmp_x;
if (!test_bit(last_x, dirty[y + h])) {
break;
}
for (tmp_x = last_x; tmp_x  x; tmp_x++) {
clear_bit(tmp_x, dirty[y + h]);
}
}

return h;
}

#define VNC_CLIENT_UPDATE_RECT() \
if (last_x != -1) {\
int h = find_and_clear_dirty_height(y, last_x, x, height);\
}

void vnc_update_client_new() {
int width = VNC_MAX_WIDTH;
int height = 

Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking

2013-11-19 Thread Peter Lieven

On 19.11.2013 14:48, Peter Lieven wrote:

On 18.11.2013 17:27, Anthony Liguori wrote:



On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de mailto:p...@kamp.de 
wrote:

 vnc_update_client currently scans the dirty bitmap of each client
 bitwise which is a very costly operation if only few bits are dirty.
 vnc_refresh_server_surface does almost the same.
 this patch optimizes both by utilizing the heavily optimized
 function find_next_bit to find the offset of the next dirty
 bit in the dirty bitmaps.

 Signed-off-by: Peter Lieven p...@kamp.de mailto:p...@kamp.de

Can you include performance data?



I made some aritificial analysis of vnc_update_client with the attached test 
code.

$ gcc -O2  -o vnc_perf vnc_perf.c
$ ./vnc_perf
All bits clean - vnc_update_client_new: 0.07 secs
 vnc_update_client_old: 10.82 secs

All bits dirty - vnc_update_client_new: 9.81 secs
 vnc_update_client_old: 20.00 secs

Few bits dirty - vnc_update_client_new: 0.08 secs
 vnc_update_client_old: 10.62 secs

find_and_clear_dirty_height() is still very slow, but I will look at this 
separately.

quite easy, but great effect:

replacing:

 for (tmp_x = last_x; tmp_x  x; tmp_x++) {
 clear_bit(tmp_x, dirty[y + h]);
 }

with:

bitmap_clear(dirty[y + h], last_x, x - last_x);

in find_and_clear_dirty_height(), yields the following performance ;-)

All bits clean - vnc_update_client_new: 0.07 secs
 vnc_update_client_old: 10.65 secs

All bits dirty - vnc_update_client_new: 0.69 secs
 vnc_update_client_old: 19.86 secs

Few bits dirty - vnc_update_client_new: 0.07 secs
 vnc_update_client_old: 10.69 secs



Peter

---

#define ITERATIONS 16*1024

#define VNC_MAX_WIDTH 2560
#define VNC_MAX_HEIGHT 2048

#define VNC_DIRTY_PIXELS_PER_BIT 16

/* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. */
#define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT)

/* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to 
alignment */
#define VNC_DIRTY_BITS_PER_LINE(x) (sizeof(x) / VNC_MAX_HEIGHT * BITS_PER_BYTE)

#define BITS_PER_BYTE   8
#define BITS_PER_LONG   (sizeof (unsigned long) * BITS_PER_BYTE)

#define BITOP_WORD(nr)((nr) / BITS_PER_LONG)

#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))

#define BIT(nr)(1UL  (nr))
#define BIT_MASK(nr)(1UL  ((nr) % BITS_PER_LONG))
#define BIT_WORD(nr)((nr) / BITS_PER_LONG)
#define BITS_TO_LONGS(nr)DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))

#define DECLARE_BITMAP(name,bits)  \
unsigned long name[BITS_TO_LONGS(bits)]

#define ctzl ctz64

#include stdio.h
#include stdint.h
#include string.h
#include time.h

static inline int ctz64(uint64_t val)
{
return val ? __builtin_ctzll(val) : 64;
}

DECLARE_BITMAP(dirty[VNC_MAX_HEIGHT], VNC_DIRTY_BITS);

unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
unsigned long offset)
{
const unsigned long *p = addr + BITOP_WORD(offset);
unsigned long result = offset  ~(BITS_PER_LONG-1);
unsigned long tmp;

if (offset = size) {
return size;
}
size -= result;
offset %= BITS_PER_LONG;
if (offset) {
tmp = *(p++);
tmp = (~0UL  offset);
if (size  BITS_PER_LONG) {
goto found_first;
}
if (tmp) {
goto found_middle;
}
size -= BITS_PER_LONG;
result += BITS_PER_LONG;
}
while (size = 4*BITS_PER_LONG) {
unsigned long d1, d2, d3;
tmp = *p;
d1 = *(p+1);
d2 = *(p+2);
d3 = *(p+3);
if (tmp) {
goto found_middle;
}
if (d1 | d2 | d3) {
break;
}
p += 4;
result += 4*BITS_PER_LONG;
size -= 4*BITS_PER_LONG;
}
while (size = BITS_PER_LONG) {
if ((tmp = *(p++))) {
goto found_middle;
}
result += BITS_PER_LONG;
size -= BITS_PER_LONG;
}
if (!size) {
return result;
}
tmp = *p;

found_first:
tmp = (~0UL  (BITS_PER_LONG - size));
if (tmp == 0UL) {/* Are any bits set? */
return result + size;/* Nope. */
}
found_middle:
return result + ctzl(tmp);
}

static inline int test_bit(int nr, const unsigned long *addr)
{
return 1UL  (addr[BIT_WORD(nr)]  (nr  (BITS_PER_LONG-1)));
}

static inline void clear_bit(int nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = addr + BIT_WORD(nr);

*p = ~mask;
}

static inline void set_bit(int nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = addr + BIT_WORD(nr);

*p  |= mask;
}

static inline int test_and_clear_bit(int nr, unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = addr + BIT_WORD(nr);
unsigned long old 

[Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking

2013-11-18 Thread Peter Lieven
vnc_update_client currently scans the dirty bitmap of each client
bitwise which is a very costly operation if only few bits are dirty.
vnc_refresh_server_surface does almost the same.
this patch optimizes both by utilizing the heavily optimized
function find_next_bit to find the offset of the next dirty
bit in the dirty bitmaps.

Signed-off-by: Peter Lieven p...@kamp.de
---
 ui/vnc.c |  146 ++
 ui/vnc.h |3 ++
 2 files changed, 92 insertions(+), 57 deletions(-)

diff --git a/ui/vnc.c b/ui/vnc.c
index 67b1f75..edf33be 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -572,6 +572,16 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
 ptr += x * VNC_SERVER_FB_BYTES;
 return ptr;
 }
+/* this sets only the visible pixels of a dirty bitmap */
+#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
+int x, y;\
+memset(bitmap, 0x00, sizeof(bitmap));\
+for (y = 0; y  h; y++) {\
+for (x = 0; x  w / VNC_DIRTY_PIXELS_PER_BIT; x++) {\
+set_bit(x, bitmap[y]);\
+} \
+} \
+}
 
 static void vnc_dpy_switch(DisplayChangeListener *dcl,
DisplaySurface *surface)
@@ -597,7 +607,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
 qemu_pixman_image_unref(vd-guest.fb);
 vd-guest.fb = pixman_image_ref(surface-image);
 vd-guest.format = surface-format;
-memset(vd-guest.dirty, 0xFF, sizeof(vd-guest.dirty));
+VNC_SET_VISIBLE_PIXELS_DIRTY(vd-guest.dirty,
+ surface_width(vd-ds),
+ surface_height(vd-ds));
 
 QTAILQ_FOREACH(vs, vd-clients, next) {
 vnc_colordepth(vs);
@@ -605,7 +617,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
 if (vs-vd-cursor) {
 vnc_cursor_define(vs);
 }
-memset(vs-dirty, 0xFF, sizeof(vs-dirty));
+VNC_SET_VISIBLE_PIXELS_DIRTY(vs-dirty,
+ surface_width(vd-ds),
+ surface_height(vd-ds));
 }
 }
 
@@ -882,6 +896,14 @@ static int vnc_update_client_sync(VncState *vs, int 
has_dirty)
 return ret;
 }
 
+#define VNC_CLIENT_UPDATE_RECT() \
+if (last_x != -1) {\
+int h = find_and_clear_dirty_height(vs, y, last_x, x, height);\
+n += vnc_job_add_rect(job,\
+  last_x * VNC_DIRTY_PIXELS_PER_BIT, y,\
+  (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, h);\
+}
+
 static int vnc_update_client(VncState *vs, int has_dirty)
 {
 if (vs-need_update  vs-csock != -1) {
@@ -910,35 +932,32 @@ static int vnc_update_client(VncState *vs, int has_dirty)
 width = MIN(pixman_image_get_width(vd-server), vs-client_width);
 height = MIN(pixman_image_get_height(vd-server), vs-client_height);
 
-for (y = 0; y  height; y++) {
+y = 0;
+for (;;) {
 int x;
 int last_x = -1;
-for (x = 0; x  width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
+unsigned long offset = find_next_bit((unsigned long *) vs-dirty,
+ height * 
VNC_DIRTY_BITS_PER_LINE(vs),
+ y * 
VNC_DIRTY_BITS_PER_LINE(vs));
+if (offset == height * VNC_DIRTY_BITS_PER_LINE(vs)) {
+/* no more dirty bits */
+break;
+}
+y = offset / VNC_DIRTY_BITS_PER_LINE(vs);
+
+for (x = offset % VNC_DIRTY_BITS_PER_LINE(vs);
+ x  width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
 if (test_and_clear_bit(x, vs-dirty[y])) {
 if (last_x == -1) {
 last_x = x;
 }
 } else {
-if (last_x != -1) {
-int h = find_and_clear_dirty_height(vs, y, last_x, x,
-height);
-
-n += vnc_job_add_rect(job,
-  last_x * 
VNC_DIRTY_PIXELS_PER_BIT,
-  y,
-  (x - last_x) * 
VNC_DIRTY_PIXELS_PER_BIT,
-  h);
-}
+VNC_CLIENT_UPDATE_RECT();
 last_x = -1;
 }
 }
-if (last_x != -1) {
-int h = find_and_clear_dirty_height(vs, y, last_x, x, height);
-n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT,
-  y,
-  (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT,
-  h);
-}
+VNC_CLIENT_UPDATE_RECT();
+y++;
 }
 
 vnc_job_push(job);
@@ -2676,8 +2695,8 @@ static int 

Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking

2013-11-18 Thread Anthony Liguori
On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de wrote:

 vnc_update_client currently scans the dirty bitmap of each client
 bitwise which is a very costly operation if only few bits are dirty.
 vnc_refresh_server_surface does almost the same.
 this patch optimizes both by utilizing the heavily optimized
 function find_next_bit to find the offset of the next dirty
 bit in the dirty bitmaps.

 Signed-off-by: Peter Lieven p...@kamp.de

Can you include performance data?

Regards,

Anthony Liguori

 ---
  ui/vnc.c |  146
++
  ui/vnc.h |3 ++
  2 files changed, 92 insertions(+), 57 deletions(-)

 diff --git a/ui/vnc.c b/ui/vnc.c
 index 67b1f75..edf33be 100644
 --- a/ui/vnc.c
 +++ b/ui/vnc.c
 @@ -572,6 +572,16 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y)
  ptr += x * VNC_SERVER_FB_BYTES;
  return ptr;
  }
 +/* this sets only the visible pixels of a dirty bitmap */
 +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\
 +int x, y;\
 +memset(bitmap, 0x00, sizeof(bitmap));\
 +for (y = 0; y  h; y++) {\
 +for (x = 0; x  w / VNC_DIRTY_PIXELS_PER_BIT; x++) {\
 +set_bit(x, bitmap[y]);\
 +} \
 +} \
 +}

  static void vnc_dpy_switch(DisplayChangeListener *dcl,
 DisplaySurface *surface)
 @@ -597,7 +607,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
  qemu_pixman_image_unref(vd-guest.fb);
  vd-guest.fb = pixman_image_ref(surface-image);
  vd-guest.format = surface-format;
 -memset(vd-guest.dirty, 0xFF, sizeof(vd-guest.dirty));
 +VNC_SET_VISIBLE_PIXELS_DIRTY(vd-guest.dirty,
 + surface_width(vd-ds),
 + surface_height(vd-ds));

  QTAILQ_FOREACH(vs, vd-clients, next) {
  vnc_colordepth(vs);
 @@ -605,7 +617,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl,
  if (vs-vd-cursor) {
  vnc_cursor_define(vs);
  }
 -memset(vs-dirty, 0xFF, sizeof(vs-dirty));
 +VNC_SET_VISIBLE_PIXELS_DIRTY(vs-dirty,
 + surface_width(vd-ds),
 + surface_height(vd-ds));
  }
  }

 @@ -882,6 +896,14 @@ static int vnc_update_client_sync(VncState *vs, int
has_dirty)
  return ret;
  }

 +#define VNC_CLIENT_UPDATE_RECT() \
 +if (last_x != -1) {\
 +int h = find_and_clear_dirty_height(vs, y, last_x, x, height);\
 +n += vnc_job_add_rect(job,\
 +  last_x * VNC_DIRTY_PIXELS_PER_BIT, y,\
 +  (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, h);\
 +}
 +
  static int vnc_update_client(VncState *vs, int has_dirty)
  {
  if (vs-need_update  vs-csock != -1) {
 @@ -910,35 +932,32 @@ static int vnc_update_client(VncState *vs, int
has_dirty)
  width = MIN(pixman_image_get_width(vd-server),
vs-client_width);
  height = MIN(pixman_image_get_height(vd-server),
vs-client_height);

 -for (y = 0; y  height; y++) {
 +y = 0;
 +for (;;) {
  int x;
  int last_x = -1;
 -for (x = 0; x  width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
 +unsigned long offset = find_next_bit((unsigned long *)
vs-dirty,
 + height *
VNC_DIRTY_BITS_PER_LINE(vs),
 + y *
VNC_DIRTY_BITS_PER_LINE(vs));
 +if (offset == height * VNC_DIRTY_BITS_PER_LINE(vs)) {
 +/* no more dirty bits */
 +break;
 +}
 +y = offset / VNC_DIRTY_BITS_PER_LINE(vs);
 +
 +for (x = offset % VNC_DIRTY_BITS_PER_LINE(vs);
 + x  width / VNC_DIRTY_PIXELS_PER_BIT; x++) {
  if (test_and_clear_bit(x, vs-dirty[y])) {
  if (last_x == -1) {
  last_x = x;
  }
  } else {
 -if (last_x != -1) {
 -int h = find_and_clear_dirty_height(vs, y,
last_x, x,
 -height);
 -
 -n += vnc_job_add_rect(job,
 -  last_x *
VNC_DIRTY_PIXELS_PER_BIT,
 -  y,
 -  (x - last_x) *
VNC_DIRTY_PIXELS_PER_BIT,
 -  h);
 -}
 +VNC_CLIENT_UPDATE_RECT();
  last_x = -1;
  }
  }
 -if (last_x != -1) {
 -int h = find_and_clear_dirty_height(vs, y, last_x, x,
height);
 -n += vnc_job_add_rect(job, last_x *
VNC_DIRTY_PIXELS_PER_BIT,
 -  y,
 -  (x - last_x) *

Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking

2013-11-18 Thread Peter Lieven
Am 18.11.2013 17:27, schrieb Anthony Liguori:


 On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de mailto:p...@kamp.de 
 wrote:
 
  vnc_update_client currently scans the dirty bitmap of each client
  bitwise which is a very costly operation if only few bits are dirty.
  vnc_refresh_server_surface does almost the same.
  this patch optimizes both by utilizing the heavily optimized
  function find_next_bit to find the offset of the next dirty
  bit in the dirty bitmaps.
 
  Signed-off-by: Peter Lieven p...@kamp.de mailto:p...@kamp.de

 Can you include performance data?

I hoped that the checking 32bits (pipelined) at once compared to
checking 32bits one-by-one would be convincing enough ;-)

Do you have a special test in mind? Otherwise I could try
to create an artificial test case with e.g. no bits dirty, all bits
dirty, only a few bits dirty (cursor update) and compare the
timing for both versions.

Peter