Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking
On 18.11.2013 17:27, Anthony Liguori wrote: On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de mailto:p...@kamp.de wrote: vnc_update_client currently scans the dirty bitmap of each client bitwise which is a very costly operation if only few bits are dirty. vnc_refresh_server_surface does almost the same. this patch optimizes both by utilizing the heavily optimized function find_next_bit to find the offset of the next dirty bit in the dirty bitmaps. Signed-off-by: Peter Lieven p...@kamp.de mailto:p...@kamp.de Can you include performance data? I made some aritificial analysis of vnc_update_client with the attached test code. $ gcc -O2 -o vnc_perf vnc_perf.c $ ./vnc_perf All bits clean - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.82 secs All bits dirty - vnc_update_client_new: 9.81 secs vnc_update_client_old: 20.00 secs Few bits dirty - vnc_update_client_new: 0.08 secs vnc_update_client_old: 10.62 secs find_and_clear_dirty_height() is still very slow, but I will look at this separately. Peter --- #define ITERATIONS 16*1024 #define VNC_MAX_WIDTH 2560 #define VNC_MAX_HEIGHT 2048 #define VNC_DIRTY_PIXELS_PER_BIT 16 /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. 
*/ #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) /* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to alignment */ #define VNC_DIRTY_BITS_PER_LINE(x) (sizeof(x) / VNC_MAX_HEIGHT * BITS_PER_BYTE) #define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) #define BITOP_WORD(nr)((nr) / BITS_PER_LONG) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define BIT(nr)(1UL (nr)) #define BIT_MASK(nr)(1UL ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr)((nr) / BITS_PER_LONG) #define BITS_TO_LONGS(nr)DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] #define ctzl ctz64 #include stdio.h #include stdint.h #include string.h #include time.h static inline int ctz64(uint64_t val) { return val ? __builtin_ctzll(val) : 64; } DECLARE_BITMAP(dirty[VNC_MAX_HEIGHT], VNC_DIRTY_BITS); unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset ~(BITS_PER_LONG-1); unsigned long tmp; if (offset = size) { return size; } size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp = (~0UL offset); if (size BITS_PER_LONG) { goto found_first; } if (tmp) { goto found_middle; } size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size = 4*BITS_PER_LONG) { unsigned long d1, d2, d3; tmp = *p; d1 = *(p+1); d2 = *(p+2); d3 = *(p+3); if (tmp) { goto found_middle; } if (d1 | d2 | d3) { break; } p += 4; result += 4*BITS_PER_LONG; size -= 4*BITS_PER_LONG; } while (size = BITS_PER_LONG) { if ((tmp = *(p++))) { goto found_middle; } result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) { return result; } tmp = *p; found_first: tmp = (~0UL (BITS_PER_LONG - size)); if (tmp == 0UL) {/* Are any bits set? */ return result + size;/* Nope. 
*/ } found_middle: return result + ctzl(tmp); } static inline int test_bit(int nr, const unsigned long *addr) { return 1UL (addr[BIT_WORD(nr)] (nr (BITS_PER_LONG-1))); } static inline void clear_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); *p = ~mask; } static inline void set_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); *p |= mask; } static inline int test_and_clear_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); unsigned long old = *p; *p = old ~mask; return (old mask) != 0; } static int find_and_clear_dirty_height(int y, int last_x, int x, int height) { int h; for (h = 1; h (height - y); h++) { int tmp_x; if (!test_bit(last_x, dirty[y + h])) { break; } for (tmp_x = last_x; tmp_x x; tmp_x++) { clear_bit(tmp_x, dirty[y + h]); } } return h; } #define VNC_CLIENT_UPDATE_RECT() \ if (last_x != -1) {\ int h = find_and_clear_dirty_height(y, last_x, x, height);\ } void vnc_update_client_new() { int width = VNC_MAX_WIDTH; int height =
Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking
On 19.11.2013 14:48, Peter Lieven wrote: On 18.11.2013 17:27, Anthony Liguori wrote: On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de mailto:p...@kamp.de wrote: vnc_update_client currently scans the dirty bitmap of each client bitwise which is a very costly operation if only few bits are dirty. vnc_refresh_server_surface does almost the same. this patch optimizes both by utilizing the heavily optimized function find_next_bit to find the offset of the next dirty bit in the dirty bitmaps. Signed-off-by: Peter Lieven p...@kamp.de mailto:p...@kamp.de Can you include performance data? I made some aritificial analysis of vnc_update_client with the attached test code. $ gcc -O2 -o vnc_perf vnc_perf.c $ ./vnc_perf All bits clean - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.82 secs All bits dirty - vnc_update_client_new: 9.81 secs vnc_update_client_old: 20.00 secs Few bits dirty - vnc_update_client_new: 0.08 secs vnc_update_client_old: 10.62 secs find_and_clear_dirty_height() is still very slow, but I will look at this separately. quite easy, but great effect: replacing: for (tmp_x = last_x; tmp_x x; tmp_x++) { clear_bit(tmp_x, dirty[y + h]); } with: bitmap_clear(dirty[y + h], last_x, x - last_x); in find_and_clear_dirty_height(), yields the following performance ;-) All bits clean - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.65 secs All bits dirty - vnc_update_client_new: 0.69 secs vnc_update_client_old: 19.86 secs Few bits dirty - vnc_update_client_new: 0.07 secs vnc_update_client_old: 10.69 secs Peter --- #define ITERATIONS 16*1024 #define VNC_MAX_WIDTH 2560 #define VNC_MAX_HEIGHT 2048 #define VNC_DIRTY_PIXELS_PER_BIT 16 /* VNC_DIRTY_BITS is the number of bits in the dirty bitmap. 
*/ #define VNC_DIRTY_BITS (VNC_MAX_WIDTH / VNC_DIRTY_PIXELS_PER_BIT) /* VNC_DIRTY_BITS_PER_LINE might be greater than VNC_DIRTY_BITS due to alignment */ #define VNC_DIRTY_BITS_PER_LINE(x) (sizeof(x) / VNC_MAX_HEIGHT * BITS_PER_BYTE) #define BITS_PER_BYTE 8 #define BITS_PER_LONG (sizeof (unsigned long) * BITS_PER_BYTE) #define BITOP_WORD(nr)((nr) / BITS_PER_LONG) #define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) #define BIT(nr)(1UL (nr)) #define BIT_MASK(nr)(1UL ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr)((nr) / BITS_PER_LONG) #define BITS_TO_LONGS(nr)DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] #define ctzl ctz64 #include stdio.h #include stdint.h #include string.h #include time.h static inline int ctz64(uint64_t val) { return val ? __builtin_ctzll(val) : 64; } DECLARE_BITMAP(dirty[VNC_MAX_HEIGHT], VNC_DIRTY_BITS); unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BITOP_WORD(offset); unsigned long result = offset ~(BITS_PER_LONG-1); unsigned long tmp; if (offset = size) { return size; } size -= result; offset %= BITS_PER_LONG; if (offset) { tmp = *(p++); tmp = (~0UL offset); if (size BITS_PER_LONG) { goto found_first; } if (tmp) { goto found_middle; } size -= BITS_PER_LONG; result += BITS_PER_LONG; } while (size = 4*BITS_PER_LONG) { unsigned long d1, d2, d3; tmp = *p; d1 = *(p+1); d2 = *(p+2); d3 = *(p+3); if (tmp) { goto found_middle; } if (d1 | d2 | d3) { break; } p += 4; result += 4*BITS_PER_LONG; size -= 4*BITS_PER_LONG; } while (size = BITS_PER_LONG) { if ((tmp = *(p++))) { goto found_middle; } result += BITS_PER_LONG; size -= BITS_PER_LONG; } if (!size) { return result; } tmp = *p; found_first: tmp = (~0UL (BITS_PER_LONG - size)); if (tmp == 0UL) {/* Are any bits set? */ return result + size;/* Nope. 
*/ } found_middle: return result + ctzl(tmp); } static inline int test_bit(int nr, const unsigned long *addr) { return 1UL (addr[BIT_WORD(nr)] (nr (BITS_PER_LONG-1))); } static inline void clear_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); *p = ~mask; } static inline void set_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); *p |= mask; } static inline int test_and_clear_bit(int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); unsigned long *p = addr + BIT_WORD(nr); unsigned long old
[Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking
vnc_update_client currently scans the dirty bitmap of each client bitwise which is a very costly operation if only few bits are dirty. vnc_refresh_server_surface does almost the same. this patch optimizes both by utilizing the heavily optimized function find_next_bit to find the offset of the next dirty bit in the dirty bitmaps. Signed-off-by: Peter Lieven p...@kamp.de --- ui/vnc.c | 146 ++ ui/vnc.h |3 ++ 2 files changed, 92 insertions(+), 57 deletions(-) diff --git a/ui/vnc.c b/ui/vnc.c index 67b1f75..edf33be 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -572,6 +572,16 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y) ptr += x * VNC_SERVER_FB_BYTES; return ptr; } +/* this sets only the visible pixels of a dirty bitmap */ +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\ +int x, y;\ +memset(bitmap, 0x00, sizeof(bitmap));\ +for (y = 0; y h; y++) {\ +for (x = 0; x w / VNC_DIRTY_PIXELS_PER_BIT; x++) {\ +set_bit(x, bitmap[y]);\ +} \ +} \ +} static void vnc_dpy_switch(DisplayChangeListener *dcl, DisplaySurface *surface) @@ -597,7 +607,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, qemu_pixman_image_unref(vd-guest.fb); vd-guest.fb = pixman_image_ref(surface-image); vd-guest.format = surface-format; -memset(vd-guest.dirty, 0xFF, sizeof(vd-guest.dirty)); +VNC_SET_VISIBLE_PIXELS_DIRTY(vd-guest.dirty, + surface_width(vd-ds), + surface_height(vd-ds)); QTAILQ_FOREACH(vs, vd-clients, next) { vnc_colordepth(vs); @@ -605,7 +617,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, if (vs-vd-cursor) { vnc_cursor_define(vs); } -memset(vs-dirty, 0xFF, sizeof(vs-dirty)); +VNC_SET_VISIBLE_PIXELS_DIRTY(vs-dirty, + surface_width(vd-ds), + surface_height(vd-ds)); } } @@ -882,6 +896,14 @@ static int vnc_update_client_sync(VncState *vs, int has_dirty) return ret; } +#define VNC_CLIENT_UPDATE_RECT() \ +if (last_x != -1) {\ +int h = find_and_clear_dirty_height(vs, y, last_x, x, height);\ +n += vnc_job_add_rect(job,\ + last_x * VNC_DIRTY_PIXELS_PER_BIT, y,\ + (x - 
last_x) * VNC_DIRTY_PIXELS_PER_BIT, h);\ +} + static int vnc_update_client(VncState *vs, int has_dirty) { if (vs-need_update vs-csock != -1) { @@ -910,35 +932,32 @@ static int vnc_update_client(VncState *vs, int has_dirty) width = MIN(pixman_image_get_width(vd-server), vs-client_width); height = MIN(pixman_image_get_height(vd-server), vs-client_height); -for (y = 0; y height; y++) { +y = 0; +for (;;) { int x; int last_x = -1; -for (x = 0; x width / VNC_DIRTY_PIXELS_PER_BIT; x++) { +unsigned long offset = find_next_bit((unsigned long *) vs-dirty, + height * VNC_DIRTY_BITS_PER_LINE(vs), + y * VNC_DIRTY_BITS_PER_LINE(vs)); +if (offset == height * VNC_DIRTY_BITS_PER_LINE(vs)) { +/* no more dirty bits */ +break; +} +y = offset / VNC_DIRTY_BITS_PER_LINE(vs); + +for (x = offset % VNC_DIRTY_BITS_PER_LINE(vs); + x width / VNC_DIRTY_PIXELS_PER_BIT; x++) { if (test_and_clear_bit(x, vs-dirty[y])) { if (last_x == -1) { last_x = x; } } else { -if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, -height); - -n += vnc_job_add_rect(job, - last_x * VNC_DIRTY_PIXELS_PER_BIT, - y, - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, - h); -} +VNC_CLIENT_UPDATE_RECT(); last_x = -1; } } -if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, height); -n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT, - y, - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, - h); -} +VNC_CLIENT_UPDATE_RECT(); +y++; } vnc_job_push(job); @@ -2676,8 +2695,8 @@ static int
Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking
On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de wrote: vnc_update_client currently scans the dirty bitmap of each client bitwise which is a very costly operation if only few bits are dirty. vnc_refresh_server_surface does almost the same. this patch optimizes both by utilizing the heavily optimized function find_next_bit to find the offset of the next dirty bit in the dirty bitmaps. Signed-off-by: Peter Lieven p...@kamp.de Can you include performance data? Regards, Anthony Liguori --- ui/vnc.c | 146 ++ ui/vnc.h |3 ++ 2 files changed, 92 insertions(+), 57 deletions(-) diff --git a/ui/vnc.c b/ui/vnc.c index 67b1f75..edf33be 100644 --- a/ui/vnc.c +++ b/ui/vnc.c @@ -572,6 +572,16 @@ void *vnc_server_fb_ptr(VncDisplay *vd, int x, int y) ptr += x * VNC_SERVER_FB_BYTES; return ptr; } +/* this sets only the visible pixels of a dirty bitmap */ +#define VNC_SET_VISIBLE_PIXELS_DIRTY(bitmap, w, h) {\ +int x, y;\ +memset(bitmap, 0x00, sizeof(bitmap));\ +for (y = 0; y h; y++) {\ +for (x = 0; x w / VNC_DIRTY_PIXELS_PER_BIT; x++) {\ +set_bit(x, bitmap[y]);\ +} \ +} \ +} static void vnc_dpy_switch(DisplayChangeListener *dcl, DisplaySurface *surface) @@ -597,7 +607,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, qemu_pixman_image_unref(vd-guest.fb); vd-guest.fb = pixman_image_ref(surface-image); vd-guest.format = surface-format; -memset(vd-guest.dirty, 0xFF, sizeof(vd-guest.dirty)); +VNC_SET_VISIBLE_PIXELS_DIRTY(vd-guest.dirty, + surface_width(vd-ds), + surface_height(vd-ds)); QTAILQ_FOREACH(vs, vd-clients, next) { vnc_colordepth(vs); @@ -605,7 +617,9 @@ static void vnc_dpy_switch(DisplayChangeListener *dcl, if (vs-vd-cursor) { vnc_cursor_define(vs); } -memset(vs-dirty, 0xFF, sizeof(vs-dirty)); +VNC_SET_VISIBLE_PIXELS_DIRTY(vs-dirty, + surface_width(vd-ds), + surface_height(vd-ds)); } } @@ -882,6 +896,14 @@ static int vnc_update_client_sync(VncState *vs, int has_dirty) return ret; } +#define VNC_CLIENT_UPDATE_RECT() \ +if (last_x != -1) {\ +int h = 
find_and_clear_dirty_height(vs, y, last_x, x, height);\ +n += vnc_job_add_rect(job,\ + last_x * VNC_DIRTY_PIXELS_PER_BIT, y,\ + (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, h);\ +} + static int vnc_update_client(VncState *vs, int has_dirty) { if (vs-need_update vs-csock != -1) { @@ -910,35 +932,32 @@ static int vnc_update_client(VncState *vs, int has_dirty) width = MIN(pixman_image_get_width(vd-server), vs-client_width); height = MIN(pixman_image_get_height(vd-server), vs-client_height); -for (y = 0; y height; y++) { +y = 0; +for (;;) { int x; int last_x = -1; -for (x = 0; x width / VNC_DIRTY_PIXELS_PER_BIT; x++) { +unsigned long offset = find_next_bit((unsigned long *) vs-dirty, + height * VNC_DIRTY_BITS_PER_LINE(vs), + y * VNC_DIRTY_BITS_PER_LINE(vs)); +if (offset == height * VNC_DIRTY_BITS_PER_LINE(vs)) { +/* no more dirty bits */ +break; +} +y = offset / VNC_DIRTY_BITS_PER_LINE(vs); + +for (x = offset % VNC_DIRTY_BITS_PER_LINE(vs); + x width / VNC_DIRTY_PIXELS_PER_BIT; x++) { if (test_and_clear_bit(x, vs-dirty[y])) { if (last_x == -1) { last_x = x; } } else { -if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, -height); - -n += vnc_job_add_rect(job, - last_x * VNC_DIRTY_PIXELS_PER_BIT, - y, - (x - last_x) * VNC_DIRTY_PIXELS_PER_BIT, - h); -} +VNC_CLIENT_UPDATE_RECT(); last_x = -1; } } -if (last_x != -1) { -int h = find_and_clear_dirty_height(vs, y, last_x, x, height); -n += vnc_job_add_rect(job, last_x * VNC_DIRTY_PIXELS_PER_BIT, - y, - (x - last_x) *
Re: [Qemu-devel] [PATCH 2/3] ui/vnc: optimize dirty bitmap tracking
Am 18.11.2013 17:27, schrieb Anthony Liguori: On Nov 18, 2013 12:20 AM, Peter Lieven p...@kamp.de mailto:p...@kamp.de wrote: vnc_update_client currently scans the dirty bitmap of each client bitwise which is a very costly operation if only few bits are dirty. vnc_refresh_server_surface does almost the same. this patch optimizes both by utilizing the heavily optimized function find_next_bit to find the offset of the next dirty bit in the dirty bitmaps. Signed-off-by: Peter Lieven p...@kamp.de mailto:p...@kamp.de Can you include performance data? I hoped that checking 32 bits (pipelined) at once, compared to checking 32 bits one-by-one, would be convincing enough ;-) Do you have a special test in mind? Otherwise I could try to create an artificial test case with e.g. no bits dirty, all bits dirty, only a few bits dirty (cursor update) and compare the timing for both versions. Peter