Re: [RFC] vhost-blk implementation

2010-04-05 Thread Stefan Hajnoczi
On Mon, Mar 29, 2010 at 4:41 PM, Badari Pulavarty pbad...@us.ibm.com wrote:
 +static void handle_io_work(struct work_struct *work)
 +{
 +       struct vhost_blk_io *vbio;
 +       struct vhost_virtqueue *vq;
 +       struct vhost_blk *blk;
 +       int i, ret = 0;
 +       loff_t pos;
 +       uint8_t status = 0;
 +
 +       vbio = container_of(work, struct vhost_blk_io, work);
 +       blk = vbio->blk;
 +       vq = &blk->dev.vqs[0];
 +       pos = vbio->sector << 8;
 +
 +       use_mm(blk->dev.mm);
 +
 +       if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
 +               ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
 +       } else if (vbio->type & VIRTIO_BLK_T_OUT) {
 +               ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
 +       } else {
 +               ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
 +       }
 +
 +       status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
 +       if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
 +               printk("copy to user failed\n");
 +               vhost_discard_vq_desc(vq);
 +               unuse_mm(blk->dev.mm);
 +               return;

Do you need to kfree(vbio) here?
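
For illustration, the error path could release the request state like
this (a sketch only; it assumes vbio comes from kmalloc() in the
submission path, as elsewhere in this RFC):

    if (copy_to_user(vbio->iov[vbio->nvecs].iov_base, &status, sizeof status) < 0) {
            printk("copy to user failed\n");
            vhost_discard_vq_desc(vq);
            unuse_mm(blk->dev.mm);
            kfree(vbio);    /* sketch: free the per-request state to avoid a leak */
            return;
    }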

 +static long vhost_blk_set_backend(struct vhost_blk *n, unsigned index, int 
 fd)
 +{
 +       struct file *file;
 +       struct vhost_virtqueue *vq;
 +
 +       file = fget(fd);
 +       if (!file)
 +               return -EBADF;
 +
 +       vq = n->vqs + index;
 +       mutex_lock(&vq->mutex);
 +       rcu_assign_pointer(vq->private_data, file);
 +       mutex_unlock(&vq->mutex);
 +       return 0;
 +}
 +
 +
 +static long vhost_blk_ioctl(struct file *f, unsigned int ioctl,
 +                            unsigned long arg)
 +{
 +       struct vhost_blk *n = f->private_data;
 +       void __user *argp = (void __user *)arg;
 +       struct vhost_vring_file backend;
 +       int r;
 +
 +       switch (ioctl) {
 +        case VHOST_NET_SET_BACKEND:
 +               r = copy_from_user(&backend, argp, sizeof backend);
 +               if (r < 0)
 +                       return r;
 +               return vhost_blk_set_backend(n, backend.index, backend.fd);

I don't see backend.index being checked against VHOST_BLK_VQ_MAX.
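
A bounds check along the lines of what vhost-net does for its queues
would close that hole (sketch; -ENOBUFS mirrors the vhost-net
convention, the exact errno is a judgment call):

    if (index >= VHOST_BLK_VQ_MAX)
            return -ENOBUFS;    /* reject out-of-range virtqueue indices */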

Stefan


Re: [RFC] vhost-blk implementation

2010-04-08 Thread Stefan Hajnoczi
On Fri, Mar 26, 2010 at 6:53 PM, Eran Rom er...@il.ibm.com wrote:
 Christoph Hellwig hch at infradead.org writes:


 Ok.  cache=writeback performance is something I haven't bothered looking
 at at all.  For cache=none any streaming write or random workload with
 large enough record sizes got basically the same performance as native
 using kernel aio, and same for write but slightly degraded for reads
 using the thread pool.  See my attached JLS presentation for some
 numbers.

 Looks like the presentation did not make it...

I am interested in the JLS presentation too.  Here is what I found,
hope it's the one you meant, Christoph:

http://events.linuxfoundation.org/images/stories/slides/jls09/jls09_hellwig.odp

Stefan


Re: [GSoC 2010] Pass-through filesystem support.

2010-04-08 Thread Stefan Hajnoczi
On Thu, Apr 8, 2010 at 5:02 PM, Mohammed Gamal m.gamal...@gmail.com wrote:
 On Thu, Apr 8, 2010 at 6:01 PM, Mohammed Gamal m.gamal...@gmail.com wrote:
 1- What does the community prefer to use and improve? CIFS, 9p, or
 both? And which is better taken up for GSoC.

There have been recent patches for filesystem passthrough using 9P:

http://www.mail-archive.com/qemu-de...@nongnu.org/msg28100.html

You might want to consider them if you haven't seen them already.

Stefan


Re: [PATCH] KVM: Enhance the coalesced_mmio_write() parameter to avoid stack buffer overflow

2010-04-12 Thread Stefan Hajnoczi
Does len need to be int?  Perhaps it should be unsigned int?
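
For background, the usual worry with signed lengths is sign confusion
in bounds checks (a generic illustration, not the actual
coalesced_mmio code):

    int len = -1;                       /* guest-influenced value */
    char buf[8];
    if (len <= (int)sizeof(buf)) {
        /* passes (-1 <= 8), yet memcpy(buf, src, len) would convert
         * len to a huge size_t and overrun the buffer */
    }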

Stefan


Re: [RFC PATCH 04/20] Make QEMUFile buf expandable, and introduce qemu_realloc_buffer() and qemu_clear_buffer().

2010-04-21 Thread Stefan Hajnoczi
On Wed, Apr 21, 2010 at 6:57 AM, Yoshiaki Tamura
tamura.yoshi...@lab.ntt.co.jp wrote:
 @@ -454,6 +458,25 @@ void qemu_fflush(QEMUFile *f)
     }
  }

 +void *qemu_realloc_buffer(QEMUFile *f, int size)
 +{
 +    f->buf_max_size = size;
 +
 +    f->buf = qemu_realloc(f->buf, f->buf_max_size);
 +    if (f->buf == NULL) {
 +        fprintf(stderr, "qemu file buffer realloc failed\n");
 +        exit(1);
 +    }
 +
 +    return f->buf;
 +}
 +

qemu_realloc() will abort() if there is not enough memory to realloc.
Just like qemu_malloc(), you don't need to check for NULL.
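
So the function could shrink to something like this (sketch):

    void *qemu_realloc_buffer(QEMUFile *f, int size)
    {
        f->buf_max_size = size;

        /* qemu_realloc() aborts on failure, so no NULL check is needed */
        f->buf = qemu_realloc(f->buf, f->buf_max_size);
        return f->buf;
    }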

Stefan


[PATCH] block: Free iovec arrays allocated by multiwrite_merge()

2010-04-21 Thread Stefan Hajnoczi
A new iovec array is allocated when creating a merged write request.
This patch ensures that the iovec array is deleted in addition to its
qiov owner.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 block.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/block.c b/block.c
index e891544..2d31474 100644
--- a/block.c
+++ b/block.c
@@ -1731,6 +1731,9 @@ static void multiwrite_user_cb(MultiwriteCB *mcb)
 
     for (i = 0; i < mcb->num_callbacks; i++) {
         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+        if (mcb->callbacks[i].free_qiov) {
+            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+        }
         qemu_free(mcb->callbacks[i].free_qiov);
         qemu_vfree(mcb->callbacks[i].free_buf);
 }
-- 
1.7.0



Re: Huge memory leak in virtio, see kvm-Bugs-2989366

2010-04-21 Thread Stefan Hajnoczi
Leszek,
Please try the qemu-kvm.git patch I have sent called "block: Free
iovec arrays allocated by multiwrite_merge()" to confirm that it fixes
the leak.

Thanks,
Stefan


[PATCH][STABLE] block: Free iovec arrays allocated by multiwrite_merge()

2010-04-21 Thread Stefan Hajnoczi
A new iovec array is allocated when creating a merged write request.
This patch ensures that the iovec array is deleted in addition to its
qiov owner.

Reported-by: Leszek Urbanski tyg...@moo.pl
Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---

This fixes the virtio-blk memory leak that has recently been reported by Leszek
Urbanski tyg...@moo.pl.

The patch should apply to qemu.git and qemu-kvm.git.  I'm proposing this patch
for both qemu and qemu-kvm and their stable branches.  Sorry if the CCs are
overkill, please let me know so I can follow the process better next time.

 block.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/block.c b/block.c
index 0881c93..99dd0f3 100644
--- a/block.c
+++ b/block.c
@@ -1731,6 +1731,9 @@ static void multiwrite_user_cb(MultiwriteCB *mcb)
 
     for (i = 0; i < mcb->num_callbacks; i++) {
         mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error);
+        if (mcb->callbacks[i].free_qiov) {
+            qemu_iovec_destroy(mcb->callbacks[i].free_qiov);
+        }
         qemu_free(mcb->callbacks[i].free_qiov);
         qemu_vfree(mcb->callbacks[i].free_buf);
 }
-- 
1.7.0



Re: PXE Boot Timeout Issue...

2010-04-23 Thread Stefan Hajnoczi
On Fri, Apr 23, 2010 at 1:45 AM, Stuart Sheldon s...@actusa.net wrote:
 Just upgraded to 12.3 user space tools from 11.0, and now when I attempt
 to netboot a guest, it appears that the pxe rom is timing out on dhcp
 before the bridge has enough time to come up.

 Is there a command line switch to set the dhcp timeout, or a build
 option that can be changed to set the timeout to a longer value, or
 disable it entirely?

The bridge shouldn't need significant amounts of time to come up.  Can
you describe the networking setup?  Are you using libvirt and with
what network config?

If you have a bridge configured, can you show the output of:

$ sudo brctl showstp $bridge_name

Stefan


Re: PXE Boot Timeout Issue...

2010-04-23 Thread Stefan Hajnoczi
For reference, my libvirt managed virbr0 has forwarding delay 0.  This
is the default:

http://libvirt.org/formatnetwork.html#elementsConnect

I know that my VM is a leaf node (it only has one NIC and isn't going
to create a loop in the network) and therefore it makes sense to
eliminate the forwarding delay entirely.

You might be interested in this link about STP delay on physical networks:

http://www.cisco.com/en/US/products/hw/switches/ps700/products_tech_note09186a00800b1500.shtml

Stefan


Re: [PATCH] block: Free iovec arrays allocated by multiwrite_merge()

2010-04-25 Thread Stefan Hajnoczi
From: Stefan Hajnoczi stefa...@gmail.com

The MALLOC_TRACE output didn't look useful when I tried it either.

Instead I used the following to find the origin of the leak.  It is still very
basic but works better with qemu_malloc() and friends.

This is just a hack but I wanted to share it in case someone finds it useful in
the future.

---
 Makefile.objs |2 +-
 leakcheck.c   |   17 +++
 leakcheck.py  |   63 +
 osdep.c   |7 +-
 qemu-malloc.c |   26 +++
 5 files changed, 108 insertions(+), 7 deletions(-)
 create mode 100644 leakcheck.c
 create mode 100755 leakcheck.py

diff --git a/Makefile.objs b/Makefile.objs
index 59ec879..82a4fac 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -7,7 +7,7 @@ qobject-obj-y += qerror.o
 ###
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
-block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
+block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o leakcheck.o
 block-obj-y += nbd.o block.o aio.o aes.o osdep.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
diff --git a/leakcheck.c b/leakcheck.c
new file mode 100644
index 000..a5fa51a
--- /dev/null
+++ b/leakcheck.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+static FILE *fp;
+
+extern void leakcheck_log(char action, void *old_addr, void *addr, size_t size, void *ret1);
+
+void leakcheck_log(char action, void *old_addr, void *addr, size_t size, void *ret1)
+{
+   if (!fp) {
+       fp = fopen("/tmp/leakcheck.log", "w");
+       if (!fp) {
+           return;
+       }
+   }
+
+   fprintf(fp, "%c %p %p %zd %p\n", action, old_addr, addr, size, ret1);
+}
diff --git a/leakcheck.py b/leakcheck.py
new file mode 100755
index 000..64b1a1b
--- /dev/null
+++ b/leakcheck.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+import sys
+
+class Event(object):
+    def __init__(self, num, action, old_addr, addr, size, ret_addr):
+        self.num = num
+        self.action = action
+        self.old_addr = old_addr
+        self.addr = addr
+        self.size = size
+        self.ret_addr = ret_addr
+
+    def __str__(self):
+        return '%d %s %s %s %s %s' % (self.num, self.action, self.old_addr, self.addr, self.size, self.ret_addr)
+
+def malloc(event):
+    if event.addr in allocs:
+        sys.stderr.write('malloc returned duplicate address from %s\n' % event)
+    allocs[event.addr] = event
+
+def free(event):
+    if event.addr == '(nil)':
+        return
+    if event.addr not in allocs:
+        sys.stderr.write('free of unallocated address from %s\n' % event)
+        return
+    malloc_event = allocs[event.addr]
+    del allocs[event.addr]
+    if (malloc_event.action in 'msz' and event.action == 'f') or \
+       (malloc_event.action == 'a' and event.action == 'v'):
+        return
+    sys.stderr.write('mismatched actions for %s and %s\n' % (malloc_event, event))
+
+def realloc(event):
+    free(Event(event.num, 'f', event.old_addr, '(nil)', 0, event.ret_addr))
+    malloc(Event(event.num, 'm', '(nil)', event.addr, event.size, event.ret_addr))
+
+allocs = {}
+watermark = 0
+event_num = 0
+for line in sys.stdin:
+    event_num += 1
+
+    cmd = line.strip()
+    if cmd == 'watermark':
+        watermark = event_num
+        continue
+
+    action, old_addr, addr, size, ret_addr = cmd.split()
+    event = Event(event_num, action, old_addr, addr, size, ret_addr)
+    if action in 'amsz':
+        malloc(event)
+    elif action in 'fv':
+        free(event)
+    elif action == 'r':
+        realloc(event)
+    else:
+        sys.stderr.write('invalid action %c\n' % action)
+        sys.exit(1)
+
+for event in sorted(allocs.itervalues(), key=lambda e: e.num):
+    if event.num > watermark:
+        print event
diff --git a/osdep.c b/osdep.c
index 8a710e7..40788e5 100644
--- a/osdep.c
+++ b/osdep.c
@@ -95,6 +95,8 @@ void qemu_vfree(void *ptr)
 
 #else
 
+extern void leakcheck_log(char action, void *old_addr, void *addr, size_t size, void *ret1);
+
 void *qemu_memalign(size_t alignment, size_t size)
 {
 #if defined(_POSIX_C_SOURCE) && !defined(__sun__)
@@ -110,7 +112,9 @@ void *qemu_memalign(size_t alignment, size_t size)
 #elif defined(CONFIG_BSD)
     return oom_check(valloc(size));
 #else
-    return oom_check(memalign(alignment, size));
+    void *p = oom_check(memalign(alignment, size));
+    leakcheck_log('a', NULL, p, size, __builtin_return_address(0));
+    return p;
 #endif
 }
 
@@ -126,6 +130,7 @@ void *qemu_vmalloc(size_t size)
 
 void qemu_vfree(void *ptr)
 {
+    leakcheck_log('v', NULL, ptr, 0, __builtin_return_address(0));
     free(ptr);
 }
 
diff --git a/qemu-malloc.c b/qemu-malloc.c
index 6cdc5de..bf832f2 100644
--- a/qemu-malloc.c
+++ b/qemu-malloc.c
@@ -24,6 +24,8 @@
 #include "qemu-common.h"
 #include

Re: Potential thread synchronization issue in qcow2.c and qcow2-cluster.c

2010-04-30 Thread Stefan Hajnoczi
 I profiled all executions of
 qemu_mutex_lock_iothread(), and found that
 it only protects the vl.c:main_loop_wait() thread but does NOT protect
 the qemu-kvm.c:kvm_cpu_exec() thread. Did I miss something or is this
 a defect?

Hi again, I took another look at qemu-kvm 0.12.3 and here is how I read it:

The mutex which is supposed to protect IO emulation is qemu-kvm.c:qemu_mutex.

The cpu thread will unlock qemu_mutex in pre_kvm_run() before
ioctl(fd, KVM_RUN, 0).  Then it will lock qemu_mutex again in
post_kvm_run().

The io thread will unlock qemu_mutex via
qemu-kvm.c:qemu_mutex_unlock_iothread() before waiting in select().
Then it will lock qemu_mutex again in
qemu-kvm.c:qemu_mutex_lock_iothread().

I believe this *does* protect IO emulation correctly.  The code is
confusing because there are multiple definitions of the same functions
and #ifdefs, maybe I made a mistake.
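
As a toy illustration of that protocol (not the real qemu-kvm code;
the names are borrowed only for orientation):

    #include <pthread.h>
    #include <sys/ioctl.h>
    #include <sys/select.h>
    #include <linux/kvm.h>

    static pthread_mutex_t qemu_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* cpu thread: holds qemu_mutex except while the guest runs */
    static void *cpu_thread(void *opaque)
    {
        int vcpu_fd = *(int *)opaque;

        pthread_mutex_lock(&qemu_mutex);
        for (;;) {
            pthread_mutex_unlock(&qemu_mutex);  /* pre_kvm_run() */
            ioctl(vcpu_fd, KVM_RUN, 0);
            pthread_mutex_lock(&qemu_mutex);    /* post_kvm_run() */
            /* IO emulation happens here with the mutex held */
        }
        return NULL;
    }

    /* io thread: holds qemu_mutex except while waiting in select() */
    static void *io_thread(void *opaque)
    {
        pthread_mutex_lock(&qemu_mutex);
        for (;;) {
            pthread_mutex_unlock(&qemu_mutex);  /* qemu_mutex_unlock_iothread() */
            select(0, NULL, NULL, NULL, NULL);  /* wait for fd activity */
            pthread_mutex_lock(&qemu_mutex);    /* qemu_mutex_lock_iothread() */
            /* fd handlers run here with the mutex held */
        }
        return NULL;
    }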

 Here is the trace showing that
 qemu_mutex_lock_iothread() does not protect the thread
 that executes kvm_cpu_exec()->...->qcow_aio_write_cb().

 /home/ctang/kvm/qemu-kvm-0.12.3/qemu-kvm.c : 2530    thread: b7e056d0
       /home/ctang/kvm/bin/qemu-system-x86_64(qemu_mutex_unlock_iothread+0x1a)
 [0x8092242]
       /home/ctang/kvm/bin/qemu-system-x86_64(main_loop_wait+0x221) [0x806edef]
       /home/ctang/kvm/bin/qemu-system-x86_64(kvm_main_loop+0x1ff) [0x80916a1]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x806f5c2]
       /home/ctang/kvm/bin/qemu-system-x86_64(main+0x2e2c) [0x80736d1]
       /lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xe5) [0xb7e33775]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x8068bb1]

 block/qcow2-cluster.c : 721    thread: b7dc2b90
       /home/ctang/kvm/bin/qemu-system-x86_64(qcow2_alloc_cluster_offset+0x3c)
 [0x81175fa]
       /home/ctang/kvm/bin/qemu-system-x86_64(qcow_aio_write_cb+0x158)
 [0x8111d73]
       /home/ctang/kvm/bin/qemu-system-x86_64(qcow_aio_writev+0x94) [0x8112054]
       /home/ctang/kvm/bin/qemu-system-x86_64(bdrv_aio_writev+0xe1) [0x80fa8e9]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x81f4a96]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x81f4c04]
       /home/ctang/kvm/bin/qemu-system-x86_64(dma_bdrv_write+0x48) [0x81f4cbf]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x80a437c]
       /home/ctang/kvm/bin/qemu-system-x86_64(bmdma_cmd_writeb+0x73)
 [0x80a9503]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x812b1eb]
       /home/ctang/kvm/bin/qemu-system-x86_64(cpu_outb+0x27) [0x812b4e6]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x808d267]
       /home/ctang/kvm/bin/qemu-system-x86_64(kvm_run+0x2f4) [0x808f4b8]
       /home/ctang/kvm/bin/qemu-system-x86_64(kvm_cpu_exec+0x56) [0x80907b2]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x8090f4d]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x8091098]
       /lib/tls/i686/cmov/libpthread.so.0 [0xb7fd24ff]
       /lib/tls/i686/cmov/libc.so.6(clone+0x5e) [0xb7f0149e]

 /home/ctang/kvm/qemu-kvm-0.12.3/qemu-kvm.c : 2537    thread: b7e056d0
       /home/ctang/kvm/bin/qemu-system-x86_64(qemu_mutex_lock_iothread+0x1a)
 [0x809229d]
       /home/ctang/kvm/bin/qemu-system-x86_64(main_loop_wait+0x25c) [0x806ee2a]
       /home/ctang/kvm/bin/qemu-system-x86_64(kvm_main_loop+0x1ff) [0x80916a1]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x806f5c2]
       /home/ctang/kvm/bin/qemu-system-x86_64(main+0x2e2c) [0x80736d1]
       /lib/tls/i686/cmov/libc.so.6(__libc_start_main+0xe5) [0xb7e33775]
       /home/ctang/kvm/bin/qemu-system-x86_64 [0x8068bb1]

kvm_cpu_exec() never calls qemu_mutex_lock_iothread() but it does lock
the underlying mutex via post_kvm_run().  It's just confusing because
vl.c calls it the "iothread mutex" whereas qemu-kvm.c calls it the
"qemu mutex" and there are wrapper functions.

Does this help?

Stefan


Re: [SeaBIOS] [PATCH] Support for booting from virtio disks

2010-05-09 Thread Stefan Hajnoczi
On Sun, May 9, 2010 at 4:23 PM, Gleb Natapov g...@redhat.com wrote:
Neat!  I believe SeaBIOS will see virtio-blk devices as harddisks and
not attempt to boot ISOs?  Many existing OS installers probably cannot
boot from virtio-blk, but in the longer term folks might like to get
rid of ATAPI CD-ROMs in their VMs.

 +        char *desc = malloc_tmphigh(MAXDESCSIZE);
 +        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
 +        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
 +        if (!vdrive_g || !desc || !vq) {
 +            warn_noalloc();
 +            return;
 +        }
[...]
 +        if (vp_find_vq(ioaddr, 0, vdrive_g->vq) < 0) {
 +            free(vdrive_g);
 +            dprintf(1, "fail to find vq for virtio-blk %x:%x\n",
 +                    pci_bdf_to_bus(bdf), pci_bdf_to_dev(bdf));
 +            continue;
 +        }

Are desc, vdrive_g, and/or vq getting leaked on error?
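
A cleanup along these lines on each failure path would avoid it
(sketch; assumes SeaBIOS's free() tolerates NULL like the C library's):

    if (!vdrive_g || !desc || !vq) {
        warn_noalloc();
        free(desc);       /* release whatever was allocated above */
        free(vq);
        free(vdrive_g);
        return;
    }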

Stefan


Re: [PATCHv2] Support for booting from virtio disks

2010-05-10 Thread Stefan Hajnoczi
 diff --git a/src/virtio-blk.c b/src/virtio-blk.c
 new file mode 100644
 index 000..a41c336
 --- /dev/null
 +++ b/src/virtio-blk.c
 @@ -0,0 +1,155 @@
 +// Virtio blovl boot support.

Just noticed the blovl typo.

 +        char *desc = malloc_tmphigh(MAXDESCSIZE);
 +        struct virtiodrive_s *vdrive_g = malloc_fseg(sizeof(*vdrive_g));
 +        struct vring_virtqueue *vq = malloc_low(sizeof(*vq));
 +        if (!vdrive_g || !desc || !vq) {
 +            warn_noalloc();
 +            return;
 +        }

This error return can still leak.

Stefan


Re: [SeaBIOS] [PATCHv3] Support for booting from virtio disks

2010-05-10 Thread Stefan Hajnoczi
Looks good.

Stefan


Re: [PATCHv2] Support for booting from virtio disks

2010-05-11 Thread Stefan Hajnoczi
From what I can tell SeaBIOS is reading CMOS_BIOS_BOOTFLAG1 and
CMOS_BIOS_BOOTFLAG2 from non-volatile memory.  The values index into
bev[], which contains IPL entries (the drives).

Is the order of bev[] entries well-defined?  Is there a way for QEMU
command-line to know that the first virtio-blk device corresponds to x
and the IDE CD-ROM corresponds to y?

Stefan


Wiki docs on counting and tracing KVM perf events

2010-05-13 Thread Stefan Hajnoczi
How to count and trace KVM perf events:

http://www.linux-kvm.org/page/Perf_events

I want to draw attention to this because traditional kvm_stat and
kvm_trace use has been moving over to the debugfs based tracing
mechanisms.  Perhaps we can flesh out documentation and examples of
common perf event usage.

Stefan


[PATCH RFC] virtio_blk: Use blk-iopoll for host->guest notify

2010-05-14 Thread Stefan Hajnoczi
This patch adds blk-iopoll interrupt mitigation to virtio-blk.  Instead
of processing completed requests inside the virtqueue interrupt handler,
a softirq is scheduled to process up to a maximum number of completed
requests in one go.

If the number of completed requests exceeds the maximum number, then another
softirq is scheduled to continue polling.  Otherwise the virtqueue interrupt is
enabled again and we return to interrupt-driven mode.

The patch sets the maximum number of completed requests (aka budget, aka
weight) to 4.  This is a low number but reflects the expensive context
switch between guest and host virtio-blk emulation.

The blk-iopoll infrastructure is enabled system-wide by default:

kernel.blk_iopoll = 1

It can be disabled to always use interrupt-driven mode (useful for comparison):

kernel.blk_iopoll = 0

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
No performance figures yet.

 drivers/block/virtio_blk.c |   71 ++-
 1 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2138a7a..1523895 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -6,6 +6,7 @@
 #include <linux/virtio.h>
 #include <linux/virtio_blk.h>
 #include <linux/scatterlist.h>
+#include <linux/blk-iopoll.h>
 
 #define PART_BITS 4
 
@@ -26,6 +27,9 @@ struct virtio_blk
 
mempool_t *pool;
 
+   /* Host->guest notify mitigation */
+   struct blk_iopoll iopoll;
+
	/* What host tells us, plus 2 for header & tailer. */
unsigned int sg_elems;
 
@@ -42,16 +46,18 @@ struct virtblk_req
u8 status;
 };
 
-static void blk_done(struct virtqueue *vq)
+/* Assumes vblk->lock held */
+static int __virtblk_end_requests(struct virtio_blk *vblk, int weight)
 {
-   struct virtio_blk *vblk = vq->vdev->priv;
struct virtblk_req *vbr;
unsigned int len;
-   unsigned long flags;
+   int error;
+   int work = 0;
 
-   spin_lock_irqsave(&vblk->lock, flags);
-   while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
-   int error;
+   while (!weight || work < weight) {
+   vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len);
+   if (!vbr)
+   break;
 
	switch (vbr->status) {
case VIRTIO_BLK_S_OK:
@@ -74,10 +80,53 @@ static void blk_done(struct virtqueue *vq)
	__blk_end_request_all(vbr->req, error);
	list_del(&vbr->list);
	mempool_free(vbr, vblk->pool);
+   work++;
}
+
/* In case queue is stopped waiting for more buffers. */
	blk_start_queue(vblk->disk->queue);
+   return work;
+}
+
+static int virtblk_iopoll(struct blk_iopoll *iopoll, int weight)
+{
+   struct virtio_blk *vblk =
+   container_of(iopoll, struct virtio_blk, iopoll);
+   unsigned long flags;
+   int work;
+
+   spin_lock_irqsave(&vblk->lock, flags);
+
+   work = __virtblk_end_requests(vblk, weight);
+   if (work < weight) {
+   /* Keep polling if there are pending requests. */
+   if (vblk->vq->vq_ops->enable_cb(vblk->vq))
+   __blk_iopoll_complete(&vblk->iopoll);
+   else
+   vblk->vq->vq_ops->disable_cb(vblk->vq);
+   }
+
	spin_unlock_irqrestore(&vblk->lock, flags);
+   return work;
+}
+
+static void blk_done(struct virtqueue *vq)
+{
+   struct virtio_blk *vblk = vq->vdev->priv;
+   unsigned long flags;
+
+   if (blk_iopoll_enabled) {
+   if (!blk_iopoll_sched_prep(&vblk->iopoll)) {
+   spin_lock_irqsave(&vblk->lock, flags);
+   vblk->vq->vq_ops->disable_cb(vblk->vq);
+   spin_unlock_irqrestore(&vblk->lock, flags);
+   blk_iopoll_sched(&vblk->iopoll);
+   }
+   } else {
+   spin_lock_irqsave(&vblk->lock, flags);
+   __virtblk_end_requests(vblk, 0);
+   spin_unlock_irqrestore(&vblk->lock, flags);
+   }
 }
 
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
@@ -289,11 +338,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
goto out_free_vq;
}
 
+   blk_iopoll_init(&vblk->iopoll, 4 /* budget */, virtblk_iopoll);
+   blk_iopoll_enable(&vblk->iopoll);
+
	/* FIXME: How many partitions?  How long is a piece of string? */
	vblk->disk = alloc_disk(1 << PART_BITS);
	if (!vblk->disk) {
err = -ENOMEM;
-   goto out_mempool;
+   goto out_iopoll;
}
 
	q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
@@ -401,13 +453,13 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
	if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
 
-
	add_disk(vblk->disk);
	return 0;

Re: [PATCH RFC] virtio_blk: Use blk-iopoll for host->guest notify

2010-05-18 Thread Stefan Hajnoczi
On Fri, May 14, 2010 at 05:30:56PM -0500, Brian Jackson wrote:
 Any preliminary numbers? latency, throughput, cpu use? What about comparing 
 different weights?

I am running benchmarks and will report results when they are in.

Stefan


Re: [PATCH +stable] block: don't attempt to merge overlapping requests

2010-05-18 Thread Stefan Hajnoczi
On Tue, May 18, 2010 at 6:18 PM, Avi Kivity a...@redhat.com wrote:
 The block multiwrite code pretends to be able to merge overlapping requests,
 but doesn't do so in fact.  This leads to I/O errors (for example on mkfs
 of a large virtio disk).

Are overlapping write requests correct guest behavior?  I thought the
ordering semantics require a flush between overlapping writes to
ensure A is written before B.

What cache= mode are you running?

Stefan


Re: [PATCH +stable] block: don't attempt to merge overlapping requests

2010-05-18 Thread Stefan Hajnoczi
I just caught up on mails and saw you had already mentioned that
overlapping writes from the guest look fishy in the 1Tb block
issue.  Cache mode might still be interesting because it affects how
guest virtio-blk chooses queue ordering mode.

Stefan


Re: [PATCH +stable] block: don't attempt to merge overlapping requests

2010-05-19 Thread Stefan Hajnoczi
On Wed, May 19, 2010 at 9:09 AM, Avi Kivity a...@redhat.com wrote:
 On 05/18/2010 10:22 PM, Stefan Hajnoczi wrote:
 What cache= mode are you running?

 writeback.

In the cache=writeback case the virtio-blk guest driver does:

blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, ...)

Stefan


Re: [PATCH +stable] block: don't attempt to merge overlapping requests

2010-05-19 Thread Stefan Hajnoczi
On Wed, May 19, 2010 at 10:06 AM, Avi Kivity a...@redhat.com wrote:
 In the cache=writeback case the virtio-blk guest driver does:

 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, ...)


 I don't follow.  What's the implication?

I was wondering whether the queue is incorrectly set to a mode where
overlapping write requests aren't being ordered.  Anyway, Christoph
says overlapping write requests can be issued so my theory is broken.

 btw I really dislike how the cache attribute (which I see as a pure host
 choice) is exposed to the guest.  It means we can't change caching mode on
 the fly (for example after live migration), or that changing caching mode
 during a restart may expose a previously hidden guest bug.

Christoph has mentioned wanting to make the write cache switchable
from inside the guest.

Stefan


Re: Wiki docs on counting and tracing KVM perf events

2010-05-20 Thread Stefan Hajnoczi
8330  kvm:kvm_entry   #  0.000 M/sec
^--- count since starting perf

The 8330 number means that kvm_entry has fired 8330 times since perf
was started.  Like Avi says, you need to keep the perf process
running.  I run benchmarks using a script that kills perf after the
benchmark completes.

Jes, you're right, something like "perf stat -e 'kvm:*' --start" and
"perf stat --stop" would be more usable for system-wide monitoring.  I
wonder if it is possible to support this or whether the perf process
needs to periodically accumulate the counters (i.e. babysit the kernel
infrastructure)?

Stefan


Re: Wiki docs on counting and tracing KVM perf events

2010-05-20 Thread Stefan Hajnoczi
On Thu, May 20, 2010 at 12:16 PM, Jes Sorensen jes.soren...@redhat.com wrote:
 On 05/20/10 13:10, Avi Kivity wrote:
 What's wrong with starting perf after the warm-up period and stopping it
 before it's done?

 It's pretty hard to script.

I use the following.  It ain't pretty:

#!/bin/bash
cleanup() {
    trap - 2
    kill -2 $sleep_pid
    echo 0 > /sys/kernel/debug/tracing/events/kvm/enable
    kill $cat_pid
}

perf stat -a -e 'kvm:*' sleep 1h > results/perf_stat 2>&1 &
sleep_pid=$(sleep 1 && pgrep -x -f 'sleep 1h')  # sleep 1 is to avoid race with forked perf process
trap cleanup 2
echo 1 > /sys/kernel/debug/tracing/events/kvm/enable
cat /sys/kernel/debug/tracing/trace_pipe > results/trace &
cat_pid=$!

# ...do stuff here...

cleanup

Stefan


Re: Wiki docs on counting and tracing KVM perf events

2010-05-20 Thread Stefan Hajnoczi
On Thu, May 20, 2010 at 1:14 PM, Avi Kivity a...@redhat.com wrote:
 echo 1 > /sys/kernel/debug/tracing/events/kvm/enable
 cat /sys/kernel/debug/tracing/trace_pipe > results/trace

 perf will enable the events by itself (no?), so all you need is is the perf
 call in the middle.

Yes, it will enable events.  However, I am also generating a
kvm_trace-like log using trace_pipe.  On this box I couldn't get perf
trace working so I used trace_pipe for the kvm_trace-equivalent and
perf stat for the kvm_stat equivalent.

Stefan


Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-20 Thread Stefan Hajnoczi
On Thu, May 20, 2010 at 11:16 PM, Christian Brunner c...@muc.de wrote:
 2010/5/20 Anthony Liguori anth...@codemonkey.ws:
 Both sheepdog and ceph ultimately transmit I/O over a socket to a central
 daemon, right?  So could we not standardize a protocol for this that both
 sheepdog and ceph could implement?

 There is no central daemon. The concept is that they talk to many
 storage nodes at the same time. Data is distributed and replicated
 over many nodes in the network. The mechanism to do this is quite
 complex. I don't know about sheepdog, but in Ceph this is called RADOS
 (reliable autonomic distributed object store). Sheepdog and Ceph may
 look similar, but this is where they act different. I don't think that
 it would be possible to implement a common protocol.

I believe Sheepdog has a local daemon on each node.  The QEMU storage
backend talks to the daemon on the same node, which then does the real
network communication with the rest of the distributed storage system.
 So I think we're not talking about a network protocol here, we're
talking about a common interface that can be used by QEMU and other
programs to take advantage of Ceph, Sheepdog, etc services available
on the local node.

Haven't looked into your patch enough yet, but does librados talk
directly over the network or does it connect to a local daemon/driver?

Stefan


[RFC 0/2] Tracing

2010-05-21 Thread Stefan Hajnoczi
Trace events in QEMU/KVM can be very useful for debugging and performance
analysis.  I'd like to discuss tracing support and hope others have an interest
in this feature, too.

Following this email are patches I am using to debug virtio-blk and storage.
The patches provide trivial tracing support, but they don't address the details
of real tracing tools: enabling/disabling events at runtime, no overhead for
disabled events, multithreading support, etc.

It would be nice to have userland tracing facilities that work out-of-the-box
on production systems.  Unfortunately, I'm not aware of any such facilities out
there right now on Linux.  Perhaps SystemTap userspace tracing is the way to
go, has anyone tried it with KVM?

For the medium term, without userspace tracing facilities in the OS we could
put something into QEMU to address the need for tracing.  Here are my thoughts
on fleshing out the tracing patch I have posted:

1. Make it possible to enable/disable events at runtime.  Users enable only the
   events they are interested in and aren't flooded with trace data for all
   other events.

2. Either make trace events cheap or build without trace events by default.
   Disabled by default still allows tracing to be used for development but
   less for production.

3. Allow events in any execution context (cpu, io, aio emulation threads).  The
   current code does not support concurrency and is meant for when the iothread
   mutex is held.

4. Make it easy to add new events.  Instead of keeping trace.h and trace.py in
   sync manually, use something like .hx to produce the appropriate C and
   Python.

Summary: Tracing is useful, are there external tools we can use right now?  If
not, should we put in something that works well enough until external tools
catch up?

Stefan



[PATCH 1/2] trace: Add simple tracing support

2010-05-21 Thread Stefan Hajnoczi
Trace events should be defined in trace.h.  Events are written to
/tmp/trace.log and can be formatted using trace.py.  Remember to add
events to trace.py for pretty-printing.
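
With this scheme, adding an event means extending the TraceEvent enum
and calling one of the traceN() helpers, roughly like this (sketch;
TRACE_MY_EVENT, ptr, and value are placeholders):

    /* trace.h: add an event ID before TRACE_MAX */
    typedef enum {
        TRACE_MY_EVENT,
        TRACE_MAX
    } TraceEvent;

    /* call site: record two values for the event */
    trace2(TRACE_MY_EVENT, (unsigned long)ptr, value);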

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 Makefile.objs |2 +-
 trace.c   |   64 +
 trace.h   |9 
 trace.py  |   30 ++
 4 files changed, 104 insertions(+), 1 deletions(-)
 create mode 100644 trace.c
 create mode 100644 trace.h
 create mode 100755 trace.py

diff --git a/Makefile.objs b/Makefile.objs
index acbaf22..307e989 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -8,7 +8,7 @@ qobject-obj-y += qerror.o
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
-block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o
+block-obj-y += nbd.o block.o aio.o aes.o osdep.o qemu-config.o trace.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
diff --git a/trace.c b/trace.c
new file mode 100644
index 000..2fec4d3
--- /dev/null
+++ b/trace.c
@@ -0,0 +1,64 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "trace.h"
+
+typedef struct {
+    unsigned long event;
+    unsigned long x1;
+    unsigned long x2;
+    unsigned long x3;
+    unsigned long x4;
+    unsigned long x5;
+} TraceRecord;
+
+enum {
+    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
+};
+
+static TraceRecord trace_buf[TRACE_BUF_LEN];
+static unsigned int trace_idx;
+static FILE *trace_fp;
+
+static void trace(TraceEvent event, unsigned long x1,
+                  unsigned long x2, unsigned long x3,
+                  unsigned long x4, unsigned long x5) {
+    TraceRecord *rec = &trace_buf[trace_idx];
+    rec->event = event;
+    rec->x1 = x1;
+    rec->x2 = x2;
+    rec->x3 = x3;
+    rec->x4 = x4;
+    rec->x5 = x5;
+
+    if (++trace_idx == TRACE_BUF_LEN) {
+        trace_idx = 0;
+
+        if (!trace_fp) {
+            trace_fp = fopen("/tmp/trace.log", "w");
+        }
+        if (trace_fp) {
+            size_t result = fwrite(trace_buf, sizeof trace_buf, 1, trace_fp);
+            result = result;
+        }
+    }
+}
+
+void trace1(TraceEvent event, unsigned long x1) {
+    trace(event, x1, 0, 0, 0, 0);
+}
+
+void trace2(TraceEvent event, unsigned long x1, unsigned long x2) {
+    trace(event, x1, x2, 0, 0, 0);
+}
+
+void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3) {
+    trace(event, x1, x2, x3, 0, 0);
+}
+
+void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4) {
+    trace(event, x1, x2, x3, x4, 0);
+}
+
+void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5) {
+    trace(event, x1, x2, x3, x4, x5);
+}
diff --git a/trace.h b/trace.h
new file mode 100644
index 000..144aa1e
--- /dev/null
+++ b/trace.h
@@ -0,0 +1,9 @@
+typedef enum {
+    TRACE_MAX
+} TraceEvent;
+
+void trace1(TraceEvent event, unsigned long x1);
+void trace2(TraceEvent event, unsigned long x1, unsigned long x2);
+void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3);
+void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4);
+void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5);
diff --git a/trace.py b/trace.py
new file mode 100755
index 000..f38ab6b
--- /dev/null
+++ b/trace.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+import sys
+import struct
+
+trace_fmt = 'LLLLLL'
+trace_len = struct.calcsize(trace_fmt)
+
+events = {
+}
+
+def read_record(fobj):
+    s = fobj.read(trace_len)
+    if len(s) != trace_len:
+        return None
+    return struct.unpack(trace_fmt, s)
+
+def format_record(rec):
+    event = events[rec[0]]
+    fields = [event[0]]
+    for i in xrange(1, len(event)):
+        fields.append('%s=0x%x' % (event[i], rec[i]))
+    return ' '.join(fields)
+
+f = open(sys.argv[1], 'rb')
+while True:
+    rec = read_record(f)
+    if rec is None:
+        break
+
+    print format_record(rec)
-- 
1.7.1



[PATCH 2/2] trace: Trace write requests in virtio-blk, multiwrite, and paio_submit

2010-05-21 Thread Stefan Hajnoczi
Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 block.c|7 +++
 hw/virtio-blk.c|6 ++
 posix-aio-compat.c |2 ++
 trace.h|   42 +-
 trace.py   |8 
 5 files changed, 64 insertions(+), 1 deletions(-)

diff --git a/block.c b/block.c
index bfe46e3..a7fb040 100644
--- a/block.c
+++ b/block.c
@@ -27,6 +27,7 @@
 #include "block_int.h"
 #include "module.h"
 #include "qemu-objects.h"
+#include "trace.h"
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
@@ -1913,6 +1914,8 @@ static void multiwrite_cb(void *opaque, int ret)
 {
     MultiwriteCB *mcb = opaque;
 
+    trace_multiwrite_cb(mcb, ret);
+
     if (ret < 0 && !mcb->error) {
         mcb->error = ret;
         multiwrite_user_cb(mcb);
@@ -2044,6 +2047,8 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
     // Check for mergable requests
     num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
 
+    trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs);
+
     // Run the aio requests
     for (i = 0; i < num_reqs; i++) {
         acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
@@ -2054,9 +2059,11 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs)
         // submitted yet. Otherwise we'll wait for the submitted AIOs to
         // complete and report the error in the callback.
         if (mcb->num_requests == 0) {
+            trace_bdrv_aio_multiwrite_earlyfail(mcb);
             reqs[i].error = -EIO;
             goto fail;
         } else {
+            trace_bdrv_aio_multiwrite_latefail(mcb, i);
             mcb->num_requests++;
             multiwrite_cb(mcb, -EIO);
             break;
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index b05d15e..73b873e 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -13,6 +13,7 @@
 
 #include "qemu-common.h"
 #include "sysemu.h"
+#include "trace.h"
 #include "virtio-blk.h"
 #include "block_int.h"
 #ifdef __linux__
@@ -50,6 +51,8 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
 {
     VirtIOBlock *s = req->dev;
 
+    trace_virtio_blk_req_complete(req, status);
+
     req->in->status = status;
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(&s->vdev, s->vq);
@@ -87,6 +90,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *req = opaque;
 
+    trace_virtio_blk_rw_complete(req, ret);
+
     if (ret) {
         int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
         if (virtio_blk_handle_rw_error(req, -ret, is_read))
@@ -270,6 +275,7 @@ static void virtio_blk_handle_write(BlockRequest *blkreq, int *num_writes,
     blkreq[*num_writes].cb = virtio_blk_rw_complete;
     blkreq[*num_writes].opaque = req;
     blkreq[*num_writes].error = 0;
+    trace_virtio_blk_handle_write(req, req->out->sector, req->qiov.size / 512);
 
     (*num_writes)++;
 }
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index b43c531..57d83f0 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -23,6 +23,7 @@
 #include <stdio.h>
 
 #include "qemu-queue.h"
+#include "trace.h"
 #include "osdep.h"
 #include "qemu-common.h"
 #include "block_int.h"
@@ -583,6 +584,7 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->next = posix_aio_state->first_aio;
     posix_aio_state->first_aio = acb;
 
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
     qemu_paio_submit(acb);
     return &acb->common;
 }
diff --git a/trace.h b/trace.h
index 144aa1e..3c4564f 100644
--- a/trace.h
+++ b/trace.h
@@ -1,5 +1,12 @@
 typedef enum {
-    TRACE_MAX
+    TRACE_MULTIWRITE_CB,
+    TRACE_BDRV_AIO_MULTIWRITE,
+    TRACE_BDRV_AIO_MULTIWRITE_EARLYFAIL,
+    TRACE_BDRV_AIO_MULTIWRITE_LATEFAIL,
+    TRACE_VIRTIO_BLK_REQ_COMPLETE,
+    TRACE_VIRTIO_BLK_RW_COMPLETE,
+    TRACE_VIRTIO_BLK_HANDLE_WRITE,
+    TRACE_PAIO_SUBMIT,
 } TraceEvent;
 
 void trace1(TraceEvent event, unsigned long x1);
@@ -7,3 +14,36 @@ void trace2(TraceEvent event, unsigned long x1, unsigned long x2);
 void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3);
 void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4);
 void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5);
+
+static inline void trace_multiwrite_cb(void *mcb, int ret) {
+    trace2(TRACE_MULTIWRITE_CB, (unsigned long)mcb, ret);
+}
+
+static inline void trace_bdrv_aio_multiwrite(void *mcb, int num_callbacks, int num_reqs) {
+    trace3(TRACE_BDRV_AIO_MULTIWRITE, (unsigned long)mcb, num_callbacks, num_reqs);
+}
+
+static inline void trace_bdrv_aio_multiwrite_earlyfail(void *mcb) {
+    trace1(TRACE_BDRV_AIO_MULTIWRITE_EARLYFAIL, (unsigned long)mcb);
+}
+
+static inline void trace_bdrv_aio_multiwrite_latefail(void *mcb, int i) {
+    trace2(TRACE_BDRV_AIO_MULTIWRITE_LATEFAIL, (unsigned long)mcb, i);

Re: [PATCH 1/2] trace: Add simple tracing support

2010-05-21 Thread Stefan Hajnoczi
I should have used the [RFC] tag to make it clear that I'm not
proposing these patches for merge, sorry.

Stefan


Re: [PATCH 1/2] trace: Add simple tracing support

2010-05-21 Thread Stefan Hajnoczi
On Fri, May 21, 2010 at 12:13 PM, Jan Kiszka jan.kis...@siemens.com wrote:
 Stefan Hajnoczi wrote:
 Trace events should be defined in trace.h.  Events are written to
 /tmp/trace.log and can be formatted using trace.py.  Remember to add
 events to trace.py for pretty-printing.

 When already writing to a file, why not reusing QEMU's logging
 infrastructure ("log foo" / "-d foo")? Shouldn't make a huge
 performance difference if the data is saved in clear-text.

 Also, having support for ftrace's user space markers would be a very
 nice option (only an option as it's Linux-specific), see
 http://lwn.net/Articles/366796.

Thanks for the links.

I think using the platform's tracing facility has many advantages.
The main one being that we can focus on QEMU/KVM development rather
than re-implementing tracing infrastructure :).

It may be possible to have SystemTap, DTrace, or nop static trace
event code.  A platform with no tracing support can only use the nop
backend, which results in a build without static trace events.
Platforms with tracing support can build with the appropriate backend
or nop.  The backend tracing facility is abstracted and most of QEMU
doesn't need to know which one is being used.
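
As a rough sketch of that abstraction (hypothetical macros, not a
worked-out design), each event could compile down to the selected
backend or to nothing at all:

    /* trace.h sketch: backend chosen at configure time */
    #if defined(CONFIG_TRACE_SIMPLE)
    void simple_trace_foo(unsigned long x);
    #define trace_foo(x) simple_trace_foo(x)
    #elif defined(CONFIG_TRACE_SYSTEMTAP)
    #define trace_foo(x) systemtap_probe_foo(x)  /* hypothetical generated probe */
    #else
    #define trace_foo(x) do { } while (0)        /* nop backend: zero overhead */
    #endif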

I hadn't seen trace markers.  However, I suspect they aren't ideal for
static trace events because logging an event requires a write system
call.  They look useful for annotating kernel tracing information, but
less for high frequency/low overhead userspace tracing.

Stefan


Re: [Qemu-devel] [PATCH 1/2] trace: Add simple tracing support

2010-05-21 Thread Stefan Hajnoczi
On Fri, May 21, 2010 at 5:52 PM, Jan Kiszka jan.kis...@siemens.com wrote:
 I would just like to avoid that too much efforts are spent on
 re-inventing smart trace buffers, trace daemons, or trace visualization
 tools. Then better pick up some semi-perfect approach (e.g. [1], it
 unfortunately still seems to lack kernel integration) and drive it
 according to our needs.

I agree we have to consider existing solutions.  The killer is the
usability: what dependencies are required to build with tracing?  Is a
patched kernel or module required?  How easy is it to add static trace
events during debugging?

If there are too many dependencies, especially to unpackaged software,
many people will stop right there and not bother.  A patched kernel or
module isn't acceptable since the hassle of reconfiguring a system for
tracing becomes too great (or in some cases changing the kernel is not
possible/allowed).

Adding new static trace events should be easy, too.  Ideally it
doesn't require adding information about the trace event in multiple
places (header files, C files, etc).  It also shouldn't require
learning about the tracing system, adding a trace event should be
self-explanatory so anyone can easily add one for debugging.

A lot of opinions there, but what I'm saying is that friction must be
low.  If the tracing system is a pain to use, then no-one will use it.

http://lttng.org/files/ust/manual/ust.html

LTTng Userspace Tracer looks interesting - no kernel support required
AFAICT.  Toggling trace events in a running process supported.
Similar to kernel tracepoint.h and existing report/visualization tool.

x86 (32- and 64-bit) only.  Like you say, no correlation with kernel trace data.

I'll try to give LTTng UST a spin by converting my trace events to use
UST.  This seems closest to an existing tracing system we can drop in.

http://sourceware.org/systemtap/wiki/AddingUserSpaceProbingToApps

Requires kernel support - not sure if enough of utrace is in mainline
for this to work out-of-the-box across distros.

Unclear how exactly SystemTap userspace probing would work out.  Does
anyone have experience or want to try this?

Stefan


[PATCH 3/5] trace: Add LTTng Userspace Tracer backend

2010-05-22 Thread Stefan Hajnoczi
This patch adds LTTng Userspace Tracer (UST) backend support.  The UST
system requires no kernel support but libust and liburcu must be
installed.

$ ./configure --trace-backend ust
$ make

Start the UST daemon:
$ ustd &

List available tracepoints and enable some:
$ ustctl --list-markers $(pgrep qemu)
[...]
{PID: 5458, channel/marker: ust/paio_submit, state: 0, fmt: "acb %p
opaque %p sector_num %lu nb_sectors %lu type %lu" 0x4b32ba}
$ ustctl --enable-marker ust/paio_submit $(pgrep qemu)

Run the trace:
$ ustctl --create-trace $(pgrep qemu)
$ ustctl --start-trace $(pgrep qemu)
[...]
$ ustctl --stop-trace $(pgrep qemu)
$ ustctl --destroy-trace $(pgrep qemu)

Trace results can be viewed using lttv-gui.

More information about UST:
http://lttng.org/ust

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
I wrote this as part of trying out UST.  Although UST is promising, the
usability is poor at the moment.

The dependencies include the lttv trace viewer which I had to build from source
(and it required a makefile tweak to build).  Luckily libust, liburcu, and
ust-bin are packaged on my distro.

Error messages are periodically printed by the UST code when running QEMU.  I
haven't investigated, but this may be due to signals interrupting UST's
thread in poll().

Finally, the UST header files include some userspace ported kernel
infrastructure and pollute the namespace.  I had to add some #undefs to get
QEMU to build after including UST headers.

I don't see LTTng UST as a default option at the moment.  Hopefully this will
change in the future.

 configure |5 +++-
 tracetool |   77 +++-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index d599879..307dbcb 100755
--- a/configure
+++ b/configure
@@ -829,7 +829,7 @@ echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
-echo "  --trace-backend=B        Trace backend nop simple"
+echo "  --trace-backend=B        Trace backend nop simple ust"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
@@ -2302,6 +2302,9 @@ bsd)
 esac
 
 echo "TRACE_BACKEND=$trace_backend" >> $config_host_mak
+if test "$trace_backend" = "ust"; then
+  LIBS="-lust $LIBS"
+fi
 
 tools=
 if test `expr "$target_list" : ".*softmmu.*"` != 0 ; then
diff --git a/tracetool b/tracetool
index bcd163e..72beb20 100755
--- a/tracetool
+++ b/tracetool
@@ -3,12 +3,13 @@
 usage()
 {
 cat >&2 <<EOF
-usage: $0 [--nop | --simple] [-h | -c | --py]
+usage: $0 [--nop | --simple | --ust] [-h | -c | --py]
 Generate tracing code for a file on stdin.
 
 Backends:
   --nop     Tracing disabled
   --simple  Simple built-in backend
+  --ust     LTTng User Space Tracing backend
 
 Output formats:
   -h    Generate .h file
@@ -220,6 +221,78 @@ linetopy_end_simple()
 echo }
 }
 
+linetoh_begin_ust()
+{
+echo "#include <ust/tracepoint.h>"
+}
+
+linetoh_ust()
+{
+local name args argnames
+name=$(get_name "$1")
+args=$(get_args "$1")
+argnames=$(get_argnames "$1")
+
+cat <<EOF
+DECLARE_TRACE(ust_$name, TPPROTO($args), TPARGS($argnames));
+#define trace_$name trace_ust_$name
+EOF
+}
+
+linetoh_end_ust()
+{
+# Clean up after UST headers which pollute the namespace
+cat EOF
+#undef mutex_lock
+#undef mutex_unlock
+EOF
+}
+
+linetoc_begin_ust()
+{
+cat <<EOF
+#include <ust/marker.h>
+#include "trace.h"
+EOF
+}
+
+linetoc_ust()
+{
+local name args argnames fmt
+name=$(get_name "$1")
+args=$(get_args "$1")
+argnames=$(get_argnames "$1")
+fmt=$(get_fmt "$1")
+
+cat <<EOF
+DEFINE_TRACE(ust_$name);
+
+static void ust_${name}_probe($args)
+{
+trace_mark(ust, $name, $fmt, $argnames);
+}
+EOF
+
+# Collect names for later
+names="$names $name"
+}
+
+linetoc_end_ust()
+{
+cat <<EOF
+static void __attribute__((constructor)) trace_init(void)
+{
+EOF
+
+for name in $names; do
+cat <<EOF
+register_trace_ust_$name(ust_${name}_probe);
+EOF
+done
+
+echo }
+}
+
 # Process stdin by calling begin, line, and end functions for the backend
 convert()
 {
@@ -267,7 +340,7 @@ tracetopy()
 
 # Choose backend
 case $1 in
---nop | --simple) backend=${1#--} ;;
+--nop | --simple | --ust) backend=${1#--} ;;
 *) usage ;;
 esac
 shift
-- 
1.7.1



[PATCH 4/5] trace: Trace qemu_malloc() and qemu_vmalloc()

2010-05-22 Thread Stefan Hajnoczi
It is often useful to instrument memory management functions in order to
find leaks or performance problems.  This patch adds trace events for
the memory allocation primitives.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
An example of adding trace events.

 osdep.c   |9 +
 qemu-malloc.c |4 
 trace-events  |   10 ++
 3 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/osdep.c b/osdep.c
index abbc8a2..8e4b8ea 100644
--- a/osdep.c
+++ b/osdep.c
@@ -50,6 +50,7 @@
 #endif
 
 #include "qemu-common.h"
+#include "trace.h"
 #include "sysemu.h"
 #include "qemu_socket.h"
 
@@ -71,6 +72,8 @@ static void *oom_check(void *ptr)
 #if defined(_WIN32)
 void *qemu_memalign(size_t alignment, size_t size)
 {
+    trace_qemu_memalign(alignment, size);
+
     if (!size) {
         abort();
     }
@@ -79,6 +82,8 @@ void *qemu_memalign(size_t alignment, size_t size)
 
 void *qemu_vmalloc(size_t size)
 {
+    trace_qemu_vmalloc(size);
+
     /* FIXME: this is not exactly optimal solution since VirtualAlloc
        has 64Kb granularity, but at least it guarantees us that the
        memory is page aligned. */
@@ -90,6 +95,7 @@ void *qemu_vmalloc(size_t size)
 
 void qemu_vfree(void *ptr)
 {
+    trace_qemu_vfree(ptr);
     VirtualFree(ptr, 0, MEM_RELEASE);
 }
 
@@ -97,6 +103,8 @@ void qemu_vfree(void *ptr)
 
 void *qemu_memalign(size_t alignment, size_t size)
 {
+    trace_qemu_memalign(alignment, size);
+
 #if defined(_POSIX_C_SOURCE) && !defined(__sun__)
     int ret;
     void *ptr;
@@ -122,6 +130,7 @@ void *qemu_vmalloc(size_t size)
 
 void qemu_vfree(void *ptr)
 {
+    trace_qemu_vfree(ptr);
     free(ptr);
 }
 
diff --git a/qemu-malloc.c b/qemu-malloc.c
index 6cdc5de..69fc3cf 100644
--- a/qemu-malloc.c
+++ b/qemu-malloc.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
+#include "trace.h"
 #include <stdlib.h>
 
 static void *oom_check(void *ptr)
@@ -39,6 +40,7 @@ void *get_mmap_addr(unsigned long size)
 
 void qemu_free(void *ptr)
 {
+    trace_qemu_free(ptr);
     free(ptr);
 }
 
@@ -53,6 +55,7 @@ static int allow_zero_malloc(void)
 
 void *qemu_malloc(size_t size)
 {
+    trace_qemu_malloc(size);
     if (!size && !allow_zero_malloc()) {
         abort();
     }
@@ -61,6 +64,7 @@ void *qemu_malloc(size_t size)
 
 void *qemu_realloc(void *ptr, size_t size)
 {
+    trace_qemu_realloc(ptr, size);
     if (!size && !allow_zero_malloc()) {
         abort();
     }
diff --git a/trace-events b/trace-events
index a37d3cc..a93ea29 100644
--- a/trace-events
+++ b/trace-events
@@ -22,3 +22,13 @@
 # system may not have the necessary headers included.
 #
 # The format-string should be a sprintf()-compatible format string.
+
+# qemu-malloc.c
+qemu_malloc(size_t size) size %zu
+qemu_realloc(void *ptr, size_t size) ptr %p size %zu
+qemu_free(void *ptr) ptr %p
+
+# osdep.c
+qemu_memalign(size_t alignment, size_t size) alignment %zu size %zu
+qemu_valloc(size_t size) size %zu
+qemu_vfree(void *ptr) ptr %p
-- 
1.7.1



[PATCH 5/5] trace: Trace virtio-blk, multiwrite, and paio_submit

2010-05-22 Thread Stefan Hajnoczi
This patch adds trace events that make it possible to observe
virtio-blk.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 block.c|7 +++
 hw/virtio-blk.c|7 +++
 posix-aio-compat.c |2 ++
 trace-events   |   14 ++
 4 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/block.c b/block.c
index bfe46e3..86fe7f5 100644
--- a/block.c
+++ b/block.c
@@ -23,6 +23,7 @@
  */
 #include "config-host.h"
 #include "qemu-common.h"
+#include "trace.h"
 #include "monitor.h"
 #include "block_int.h"
 #include "module.h"
@@ -1913,6 +1914,8 @@ static void multiwrite_cb(void *opaque, int ret)
 {
 MultiwriteCB *mcb = opaque;
 
+trace_multiwrite_cb(mcb, ret);
+
     if (ret < 0 && !mcb->error) {
         mcb->error = ret;
         multiwrite_user_cb(mcb);
@@ -2044,6 +2047,8 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, 
BlockRequest *reqs, int num_reqs)
 // Check for mergable requests
 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
 
+trace_bdrv_aio_multiwrite(mcb, mcb-num_callbacks, num_reqs);
+
 // Run the aio requests
     for (i = 0; i < num_reqs; i++) {
 acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
@@ -2054,9 +2059,11 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, 
BlockRequest *reqs, int num_reqs)
 // submitted yet. Otherwise we'll wait for the submitted AIOs to
 // complete and report the error in the callback.
 if (mcb-num_requests == 0) {
+trace_bdrv_aio_multiwrite_earlyfail(mcb);
 reqs[i].error = -EIO;
 goto fail;
 } else {
+trace_bdrv_aio_multiwrite_latefail(mcb, i);
 mcb-num_requests++;
 multiwrite_cb(mcb, -EIO);
 break;
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index b05d15e..ef384e0 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -13,6 +13,7 @@
 
 #include "qemu-common.h"
 #include "sysemu.h"
+#include "trace.h"
 #include "virtio-blk.h"
 #include "block_int.h"
 #ifdef __linux__
@@ -50,6 +51,8 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int 
status)
 {
     VirtIOBlock *s = req->dev;
 
+    trace_virtio_blk_req_complete(req, status);
+
     req->in->status = status;
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(s->vdev, s->vq);
@@ -87,6 +90,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
 {
 VirtIOBlockReq *req = opaque;
 
+trace_virtio_blk_rw_complete(req, ret);
+
     if (ret) {
         int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
         if (virtio_blk_handle_rw_error(req, -ret, is_read))
@@ -251,6 +256,8 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req)
 static void virtio_blk_handle_write(BlockRequest *blkreq, int *num_writes,
 VirtIOBlockReq *req, BlockDriverState **old_bs)
 {
+    trace_virtio_blk_handle_write(req, req->out->sector, req->qiov.size / 512);
+
     if (req->out->sector & req->dev->sector_mask) {
         virtio_blk_rw_complete(req, -EIO);
         return;
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index b43c531..c2200fe 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -25,6 +25,7 @@
 #include "qemu-queue.h"
 #include "osdep.h"
 #include "qemu-common.h"
+#include "trace.h"
 #include "block_int.h"
 
 #include "block/raw-posix-aio.h"
@@ -583,6 +584,7 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->next = posix_aio_state->first_aio;
     posix_aio_state->first_aio = acb;
 
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
     qemu_paio_submit(acb);
     return &acb->common;
 }
diff --git a/trace-events b/trace-events
index a93ea29..4d96b8e 100644
--- a/trace-events
+++ b/trace-events
@@ -32,3 +32,17 @@ qemu_free(void *ptr) ptr %p
 qemu_memalign(size_t alignment, size_t size) alignment %zu size %zu
 qemu_valloc(size_t size) size %zu
 qemu_vfree(void *ptr) ptr %p
+
+# block.c
+multiwrite_cb(void *mcb, int ret) mcb %p ret %d
+bdrv_aio_multiwrite(void *mcb, int num_callbacks, int num_reqs) mcb %p num_callbacks %d num_reqs %d
+bdrv_aio_multiwrite_earlyfail(void *mcb) mcb %p
+bdrv_aio_multiwrite_latefail(void *mcb, int i) mcb %p i %d
+
+# hw/virtio-blk.c
+virtio_blk_req_complete(void *req, int status) req %p status %d
+virtio_blk_rw_complete(void *req, int ret) req %p ret %d
+virtio_blk_handle_write(void *req, unsigned long sector, unsigned long nsectors) req %p sector %lu nsectors %lu
+
+# posix-aio-compat.c
+paio_submit(void *acb, void *opaque, unsigned long sector_num, unsigned long nb_sectors, unsigned long type) acb %p opaque %p sector_num %lu nb_sectors %lu type %lu
-- 
1.7.1



[RFC 0/5] Tracing backends

2010-05-22 Thread Stefan Hajnoczi
The following patches against qemu.git allow static trace events to be declared
in QEMU.  Trace events use a lightweight syntax and are independent of the
backend tracing system (e.g. LTTng UST).

Supported backends are:
 * my trivial tracer (simple)
 * LTTng Userspace Tracer (ust)
 * no tracer (nop, the default)

The ./configure option to choose a backend is --trace-backend=.

Main point of this patchset: adding new trace events is easy and we can switch
between backends without modifying the code.

Prerna: Would you like to add your tracing system as a backend?  This would be
similar to my patches to add simple and ust backend support.

Jan: Adding kernel marker backend support should be straightforward if you are
interested.

These patches are also available at:
http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/tracing



[PATCH 2/5] trace: Add simple built-in tracing backend

2010-05-22 Thread Stefan Hajnoczi
This patch adds a simple tracer which produces binary trace files and is
built into QEMU.  The main purpose of this patch is to show how new
tracing backends can be added to tracetool.

To try out the simple backend:

./configure --trace-backend=simple
make

After running QEMU you can pretty-print the trace:

./tracetool --simple --py < trace-events > events.py  # first time only
./simpletrace.py /tmp/trace.log

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
This is the same trivial tracer that I posted previously.

 .gitignore |2 +
 Makefile.objs  |3 +
 configure  |2 +-
 simpletrace.c  |   64 
 simpletrace.py |   38 +
 tracetool  |  127 ++-
 6 files changed, 232 insertions(+), 4 deletions(-)
 create mode 100644 simpletrace.c
 create mode 100755 simpletrace.py

diff --git a/.gitignore b/.gitignore
index 4644557..68fb21d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ config-host.*
 config-target.*
 trace.h
 trace.c
+events.py
 *-softmmu
 *-darwin-user
 *-linux-user
@@ -39,6 +40,7 @@ qemu-monitor.texi
 *.log
 *.pdf
 *.pg
+*.pyc
 *.toc
 *.tp
 *.vr
diff --git a/Makefile.objs b/Makefile.objs
index 9bbdf6f..d870767 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -252,6 +252,9 @@ libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o
 # trace
 
 trace-obj-y = trace.o
+ifeq ($(TRACE_BACKEND),simple)
+trace-obj-y += simpletrace.o
+endif
 
 vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)
 
diff --git a/configure b/configure
index 5e66f3a..d599879 100755
--- a/configure
+++ b/configure
@@ -829,7 +829,7 @@ echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
-echo "  --trace-backend=B        Trace backend nop"
+echo "  --trace-backend=B        Trace backend nop simple"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
diff --git a/simpletrace.c b/simpletrace.c
new file mode 100644
index 000..2fec4d3
--- /dev/null
+++ b/simpletrace.c
@@ -0,0 +1,64 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "trace.h"
+
+typedef struct {
+    unsigned long event;
+    unsigned long x1;
+    unsigned long x2;
+    unsigned long x3;
+    unsigned long x4;
+    unsigned long x5;
+} TraceRecord;
+
+enum {
+    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
+};
+
+static TraceRecord trace_buf[TRACE_BUF_LEN];
+static unsigned int trace_idx;
+static FILE *trace_fp;
+
+static void trace(TraceEvent event, unsigned long x1,
+                  unsigned long x2, unsigned long x3,
+                  unsigned long x4, unsigned long x5) {
+    TraceRecord *rec = &trace_buf[trace_idx];
+    rec->event = event;
+    rec->x1 = x1;
+    rec->x2 = x2;
+    rec->x3 = x3;
+    rec->x4 = x4;
+    rec->x5 = x5;
+
+    if (++trace_idx == TRACE_BUF_LEN) {
+        trace_idx = 0;
+
+        if (!trace_fp) {
+            trace_fp = fopen("/tmp/trace.log", "w");
+        }
+        if (trace_fp) {
+            size_t result = fwrite(trace_buf, sizeof trace_buf, 1, trace_fp);
+            result = result;
+        }
+    }
+}
+
+void trace1(TraceEvent event, unsigned long x1) {
+    trace(event, x1, 0, 0, 0, 0);
+}
+
+void trace2(TraceEvent event, unsigned long x1, unsigned long x2) {
+    trace(event, x1, x2, 0, 0, 0);
+}
+
+void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3) {
+    trace(event, x1, x2, x3, 0, 0);
+}
+
+void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4) {
+    trace(event, x1, x2, x3, x4, 0);
+}
+
+void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5) {
+    trace(event, x1, x2, x3, x4, x5);
+}
diff --git a/simpletrace.py b/simpletrace.py
new file mode 100755
index 000..70609cf
--- /dev/null
+++ b/simpletrace.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+import sys
+import struct
+
+try:
+from events import events
+except ImportError:
+sys.stderr.write('''Unable to import trace events from current working 
directory.  Please run:
+tracetool --simple --py < trace-events > events.py\n''')
+sys.exit(1)
+
+trace_fmt = 'LLLLLL'
+trace_len = struct.calcsize(trace_fmt)
+
+def read_record(fobj):
+s = fobj.read(trace_len)
+if len(s) != trace_len:
+return None
+return struct.unpack(trace_fmt, s)
+
+def format_record(rec):
+event = events[rec[0]]
+fields = [event[0]]
+for i in xrange(1, len(event)):
+fields.append('%s=0x%x' % (event[i], rec[i]))
+return ' '.join(fields)
+
+if len(sys.argv) != 2:
+sys.stderr.write('usage: %s trace-file\n' % sys.argv[0])
+sys.exit(1)
+
+f = open(sys.argv[1], 'rb')
+while True:
+rec = read_record(f)
+if rec is None:
+break
+
+print

[PATCH 1/5] trace: Add trace-events file for declaring trace events

2010-05-22 Thread Stefan Hajnoczi
This patch introduces the trace-events file where trace events can be
declared like so:

qemu_malloc(size_t size) size %zu
qemu_free(void *ptr) ptr %p

These trace event declarations are processed by a new tool called
tracetool to generate code for the trace events.  Trace event
declarations are independent of the backend tracing system (LTTng User
Space Tracing, kernel markers, DTrace).

The default nop backend generates empty trace event functions.
Therefore trace events are disabled by default.
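
As an illustration (this is a sketch of the generated code; the exact
output is up to tracetool), the nop backend turns the qemu_malloc
declaration above into an empty static inline function in trace.h,
which the caller simply invokes as trace_qemu_malloc(size):

    static inline void trace_qemu_malloc(size_t size)
    {
    }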

The trace-events file serves two purposes:

1. Adding trace events is easy.  It is not necessary to understand the
   details of a backend tracing system.  The trace-events file is a
   single location where trace events can be declared without code
   duplication.

2. QEMU is not tightly coupled to one particular backend tracing system.
   In order to support tracing across QEMU host platforms and to
   anticipate new backend tracing systems that are currently maturing,
   it is important to be flexible and not tied to one system.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 .gitignore  |2 +
 Makefile|   17 +--
 Makefile.objs   |5 ++
 Makefile.target |1 +
 configure   |   19 +++
 trace-events|   24 
 tracetool   |  162 +++
 7 files changed, 226 insertions(+), 4 deletions(-)
 create mode 100644 trace-events
 create mode 100755 tracetool

diff --git a/.gitignore b/.gitignore
index fdfe2f0..4644557 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@ config-devices.*
 config-all-devices.*
 config-host.*
 config-target.*
+trace.h
+trace.c
 *-softmmu
 *-darwin-user
 *-linux-user
diff --git a/Makefile b/Makefile
index 306a1a4..ff57845 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile for QEMU.
 
-GENERATED_HEADERS = config-host.h
+GENERATED_HEADERS = config-host.h trace.h
 
 ifneq ($(wildcard config-host.mak),)
 # Put the all: rule here so that config-host.mak can contain dependencies.
@@ -130,16 +130,24 @@ bt-host.o: QEMU_CFLAGS += $(BLUEZ_CFLAGS)
 
 iov.o: iov.c iov.h
 
+trace.h: trace-events
+	$(call quiet-command,sh $(SRC_PATH)/tracetool --$(TRACE_BACKEND) -h < $< > $@,"  GEN   $@")
+
+trace.c: trace-events
+	$(call quiet-command,sh $(SRC_PATH)/tracetool --$(TRACE_BACKEND) -c < $< > $@,"  GEN   $@")
+
+trace.o: trace.c
+
 ##
 
 qemu-img.o: qemu-img-cmds.h
 qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o: $(GENERATED_HEADERS)
 
-qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y)
+qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y)
 
-qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y)
+qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y)
 
-qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y)
+qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y)
 
 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/hxtool -h < $< > $@,"  GEN   $@")
@@ -157,6 +165,7 @@ clean:
rm -f *.o *.d *.a $(TOOLS) TAGS cscope.* *.pod *~ */*~
rm -f slirp/*.o slirp/*.d audio/*.o audio/*.d block/*.o block/*.d 
net/*.o net/*.d
rm -f qemu-img-cmds.h
+   rm -f trace.c trace.h
$(MAKE) -C tests clean
for d in $(ALL_SUBDIRS) libhw32 libhw64 libuser libdis libdis-user; do \
if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
diff --git a/Makefile.objs b/Makefile.objs
index acbaf22..9bbdf6f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -248,6 +248,11 @@ libdis-$(CONFIG_S390_DIS) += s390-dis.o
 libdis-$(CONFIG_SH4_DIS) += sh4-dis.o
 libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o
 
+##
+# trace
+
+trace-obj-y = trace.o
+
 vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)
 
 vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS)
diff --git a/Makefile.target b/Makefile.target
index a22484e..4e63c02 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -294,6 +294,7 @@ $(obj-y) $(obj-$(TARGET_BASE_ARCH)-y): $(GENERATED_HEADERS)
 
 obj-y += $(addprefix ../, $(common-obj-y))
 obj-y += $(addprefix ../libdis/, $(libdis-y))
+obj-y += $(addprefix ../, $(trace-obj-y))
 obj-y += $(libobj-y)
 obj-y += $(addprefix $(HWDIR)/, $(hw-obj-y))
 
diff --git a/configure b/configure
index 3cd2c5f..5e66f3a 100755
--- a/configure
+++ b/configure
@@ -299,6 +299,7 @@ pkgversion=
 check_utests=no
 user_pie=no
 zero_malloc=
+trace_backend=nop
 
 # OS specific
 if check_define __linux__ ; then
@@ -494,6 +495,8 @@ for opt do
   ;;
   --target-list=*) target_list=$optarg
   ;;
+  --trace-backend=*) trace_backend=$optarg
+  ;;
   --enable-gprof) gprof=yes
   ;;
   --static)
@@ -826,6 +829,7 @@ echo

Re: raw disks no longer work in latest kvm (kvm-88 was fine)

2010-05-23 Thread Stefan Hajnoczi
On Sun, May 23, 2010 at 5:18 PM, Antoine Martin anto...@nagafix.co.uk wrote:
 Why does it work in a chroot for the other options (aio=native, if=ide, etc)
 but not for aio!=native??
 Looks like I am misunderstanding the semantics of chroot...

It might not be the chroot() semantics but the environment inside that
chroot, like the glibc.  Have you compared strace inside and outside
the chroot?
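
For example (made-up paths; adjust to your setup):

    strace -f -o /tmp/qemu-outside.log qemu-system-x86_64 ...
    chroot /srv/mychroot strace -f -o /tmp/qemu-inside.log qemu-system-x86_64 ...

and then compare the syscalls leading up to the failure.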

Stefan


Re: [Qemu-devel] [RFC PATCH 1/1] ceph/rbd block driver for qemu-kvm

2010-05-24 Thread Stefan Hajnoczi
On Sun, May 23, 2010 at 1:01 PM, Avi Kivity a...@redhat.com wrote:
 On 05/21/2010 12:29 AM, Anthony Liguori wrote:

 I'd be more interested in enabling people to build these types of storage
 systems without touching qemu.

 Both sheepdog and ceph ultimately transmit I/O over a socket to a central
 daemon, right?

 That incurs an extra copy.

Besides a shared memory approach, I wonder if the splice() family of
syscalls could be used to send/receive data through a storage daemon
without the daemon looking at or copying the data?
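
Something along these lines, as an untested sketch (all names here are
invented and error handling is trimmed):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>

    /* Move len bytes from a connected socket to a disk fd through a
     * pipe (from pipe()), so the payload never lands in the daemon's
     * userspace buffers. */
    static int relay(int sock_fd, int disk_fd, int pipefd[2], size_t len)
    {
        while (len > 0) {
            /* socket -> pipe: the kernel moves page references */
            ssize_t n = splice(sock_fd, NULL, pipefd[1], NULL, len,
                               SPLICE_F_MOVE | SPLICE_F_MORE);
            if (n <= 0) {
                return -1;
            }
            /* pipe -> file: again without a userspace copy */
            while (n > 0) {
                ssize_t m = splice(pipefd[0], NULL, disk_fd, NULL, n,
                                   SPLICE_F_MOVE);
                if (m <= 0) {
                    return -1;
                }
                n -= m;
                len -= m;
            }
        }
        return 0;
    }

Whether this actually avoids the copy depends on the fd types involved,
so it would need measuring.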

Stefan


Re: [Qemu-devel] [PATCH 1/5] trace: Add trace-events file for declaring trace events

2010-05-25 Thread Stefan Hajnoczi
On Mon, May 24, 2010 at 11:20 PM, Anthony Liguori
aligu...@linux.vnet.ibm.com wrote:
 +# check if trace backend exists
 +
 +sh tracetool --$trace_backend --check-backend > /dev/null 2> /dev/null


 This will fail if objdir != srcdir.  You have to qualify tracetool with the
 path to srcdir.

Thanks Anthony, fixed on my branch.  I'll resend a v2 together with other fixes.

Stefan


[PATCH v2 0/7] Tracing backends

2010-05-25 Thread Stefan Hajnoczi
After the RFC discussion, updated patches which I propose for review and merge:

The following patches against qemu.git allow static trace events to be declared
in QEMU.  Trace events use a lightweight syntax and are independent of the
backend tracing system (e.g. LTTng UST).

Supported backends are:
 * my trivial tracer (simple)
 * LTTng Userspace Tracer (ust)
 * no tracer (nop, the default)

The ./configure option to choose a backend is --trace-backend=.

Main point of this patchset: adding new trace events is easy and we can switch
between backends without modifying the code.

These patches are also available at:
http://repo.or.cz/w/qemu/stefanha.git/shortlog/refs/heads/tracing

v2:
[PATCH 1/7] trace: Add trace-events file for declaring trace events
 * Use $source_path/tracetool in ./configure
 * Include qemu-common.h in trace.h so common types are available

[PATCH 2/7] trace: Support disabled events in trace-events
 * New in v2: makes it easy to build only a subset of trace events

[PATCH 3/7] trace: Add simple built-in tracing backend
 * Make simpletrace.py parse trace-events instead of generating Python

[PATCH 4/7] trace: Add LTTng Userspace Tracer backend

[PATCH 5/7] trace: Trace qemu_malloc() and qemu_vmalloc()
 * Record pointer result from allocation functions

[PATCH 6/7] trace: Trace virtio-blk, multiwrite, and paio_submit

[PATCH 7/7] trace: Trace virtqueue operations
 * New in v2: observe virtqueue buffer add/remove and notifies



[PATCH 2/7] trace: Support disabled events in trace-events

2010-05-25 Thread Stefan Hajnoczi
Sometimes it is useful to disable a trace event.  Removing the event
from trace-events is not enough since source code will call the
trace_*() function for the event.

This patch makes it easy to build without specific trace events by
marking them disabled in trace-events:

disable multiwrite_cb(void *mcb, int ret) mcb %p ret %d

This builds without the multiwrite_cb trace event.
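
Because disabled lines are diverted to the nop backend (see the
tracetool hunk below), trace.h should still end up with an empty stub
roughly like this, so existing call sites compile unchanged:

    static inline void trace_multiwrite_cb(void *mcb, int ret)
    {
    }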

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
v2:
 * This patch is new in v2

 trace-events |4 +++-
 tracetool|   10 --
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/trace-events b/trace-events
index a37d3cc..5efaa86 100644
--- a/trace-events
+++ b/trace-events
@@ -12,10 +12,12 @@
 #
 # Format of a trace event:
 #
-# name(type1 arg1[, type2 arg2] ...) format-string
+# [disable] name(type1 arg1[, type2 arg2] ...) format-string
 #
 # Example: qemu_malloc(size_t size) size %zu
 #
+# The disable keyword will build without the trace event.
+#
 # The name must be a valid as a C function name.
 #
 # Types should be standard C types.  Use void * for pointers because the trace
diff --git a/tracetool b/tracetool
index 766a9ba..53d3612 100755
--- a/tracetool
+++ b/tracetool
@@ -110,7 +110,7 @@ linetoc_end_nop()
 # Process stdin by calling begin, line, and end functions for the backend
 convert()
 {
-    local begin process_line end
+    local begin process_line end str disable
     begin=lineto$1_begin_$backend
     process_line=lineto$1_$backend
     end=lineto$1_end_$backend
@@ -123,8 +123,14 @@ convert()
         str=${str%%#*}
         test -z "$str" && continue
 
+        # Process the line.  The nop backend handles disabled lines.
+        disable=${str%%disable*}
         echo
-        $process_line $str
+        if test -z "$disable"; then
+            lineto$1_nop ${str##disable}
+        else
+            $process_line $str
+        fi
 done
 
 echo
-- 
1.7.1



[PATCH 1/7] trace: Add trace-events file for declaring trace events

2010-05-25 Thread Stefan Hajnoczi
This patch introduces the trace-events file where trace events can be
declared like so:

qemu_malloc(size_t size) size %zu
qemu_free(void *ptr) ptr %p

These trace event declarations are processed by a new tool called
tracetool to generate code for the trace events.  Trace event
declarations are independent of the backend tracing system (LTTng User
Space Tracing, ftrace markers, DTrace).

The default nop backend generates empty trace event functions.
Therefore trace events are disabled by default.

The trace-events file serves two purposes:

1. Adding trace events is easy.  It is not necessary to understand the
   details of a backend tracing system.  The trace-events file is a
   single location where trace events can be declared without code
   duplication.

2. QEMU is not tightly coupled to one particular backend tracing system.
   In order to support tracing across QEMU host platforms and to
   anticipate new backend tracing systems that are currently maturing,
   it is important to be flexible and not tied to one system.
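
As a usage sketch (mirroring the Makefile rules below), the header and
stub file are generated with commands along these lines:

    sh tracetool --nop -h < trace-events > trace.h
    sh tracetool --nop -c < trace-events > trace.c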

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
v2:
 * Use $source_path/tracetool in ./configure
 * Include qemu-common.h in trace.h so common types are available

 .gitignore  |2 +
 Makefile|   17 -
 Makefile.objs   |5 ++
 Makefile.target |1 +
 configure   |   19 ++
 trace-events|   24 
 tracetool   |  165 +++
 7 files changed, 229 insertions(+), 4 deletions(-)
 create mode 100644 trace-events
 create mode 100755 tracetool

diff --git a/.gitignore b/.gitignore
index fdfe2f0..4644557 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,8 @@ config-devices.*
 config-all-devices.*
 config-host.*
 config-target.*
+trace.h
+trace.c
 *-softmmu
 *-darwin-user
 *-linux-user
diff --git a/Makefile b/Makefile
index 7986bf6..a9f79a9 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 # Makefile for QEMU.
 
-GENERATED_HEADERS = config-host.h
+GENERATED_HEADERS = config-host.h trace.h
 
 ifneq ($(wildcard config-host.mak),)
 # Put the all: rule here so that config-host.mak can contain dependencies.
@@ -130,16 +130,24 @@ bt-host.o: QEMU_CFLAGS += $(BLUEZ_CFLAGS)
 
 iov.o: iov.c iov.h
 
+trace.h: trace-events
+	$(call quiet-command,sh $(SRC_PATH)/tracetool --$(TRACE_BACKEND) -h < $< > $@,"  GEN   $@")
+
+trace.c: trace-events
+	$(call quiet-command,sh $(SRC_PATH)/tracetool --$(TRACE_BACKEND) -c < $< > $@,"  GEN   $@")
+
+trace.o: trace.c
+
 ##
 
 qemu-img.o: qemu-img-cmds.h
 qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o: $(GENERATED_HEADERS)
 
-qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y)
+qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y)
 
-qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y)
+qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y)
 
-qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(block-obj-y) $(qobject-obj-y)
+qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(trace-obj-y) $(block-obj-y) $(qobject-obj-y)
 
 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/hxtool -h < $< > $@,"  GEN   $@")
@@ -157,6 +165,7 @@ clean:
rm -f *.o *.d *.a $(TOOLS) TAGS cscope.* *.pod *~ */*~
rm -f slirp/*.o slirp/*.d audio/*.o audio/*.d block/*.o block/*.d 
net/*.o net/*.d
rm -f qemu-img-cmds.h
+   rm -f trace.c trace.h
$(MAKE) -C tests clean
for d in $(ALL_SUBDIRS) libhw32 libhw64 libuser libdis libdis-user; do \
if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \
diff --git a/Makefile.objs b/Makefile.objs
index 1a942e5..20e709e 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -251,6 +251,11 @@ libdis-$(CONFIG_S390_DIS) += s390-dis.o
 libdis-$(CONFIG_SH4_DIS) += sh4-dis.o
 libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o
 
+##
+# trace
+
+trace-obj-y = trace.o
+
 vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)
 
 vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS)
diff --git a/Makefile.target b/Makefile.target
index fda5bf3..8f7b564 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -293,6 +293,7 @@ $(obj-y) $(obj-$(TARGET_BASE_ARCH)-y): $(GENERATED_HEADERS)
 
 obj-y += $(addprefix ../, $(common-obj-y))
 obj-y += $(addprefix ../libdis/, $(libdis-y))
+obj-y += $(addprefix ../, $(trace-obj-y))
 obj-y += $(libobj-y)
 obj-y += $(addprefix $(HWDIR)/, $(hw-obj-y))
 
diff --git a/configure b/configure
index 3cd2c5f..e94e113 100755
--- a/configure
+++ b/configure
@@ -299,6 +299,7 @@ pkgversion=
 check_utests=no
 user_pie=no
 zero_malloc=
+trace_backend=nop
 
 # OS specific
 if check_define __linux__ ; then
@@ -494,6 +495,8 @@ for opt do
   ;;
   --target-list=*) target_list=$optarg
   ;;
+  --trace

[PATCH 4/7] trace: Add LTTng Userspace Tracer backend

2010-05-25 Thread Stefan Hajnoczi
This patch adds LTTng Userspace Tracer (UST) backend support.  The UST
system requires no kernel support but libust and liburcu must be
installed.

$ ./configure --trace-backend ust
$ make

Start the UST daemon:
$ ustd &

List available tracepoints and enable some:
$ ustctl --list-markers $(pgrep qemu)
[...]
{PID: 5458, channel/marker: ust/paio_submit, state: 0, fmt: acb %p
opaque %p sector_num %lu nb_sectors %lu type %lu 0x4b32ba}
$ ustctl --enable-marker ust/paio_submit $(pgrep qemu)

Run the trace:
$ ustctl --create-trace $(pgrep qemu)
$ ustctl --start-trace $(pgrep qemu)
[...]
$ ustctl --stop-trace $(pgrep qemu)
$ ustctl --destroy-trace $(pgrep qemu)

Trace results can be viewed using lttv-gui.

More information about UST:
http://lttng.org/ust

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 configure |5 +++-
 tracetool |   77 +++-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 7d2c69b..675d0fc 100755
--- a/configure
+++ b/configure
@@ -829,7 +829,7 @@ echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
-echo "  --trace-backend=B        Trace backend nop simple"
+echo "  --trace-backend=B        Trace backend nop simple ust"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
@@ -2302,6 +2302,9 @@ bsd)
 esac
 
 echo "TRACE_BACKEND=$trace_backend" >> $config_host_mak
+if test "$trace_backend" = "ust"; then
+  LIBS="-lust $LIBS"
+fi
 
 tools=
 if test `expr $target_list : .*softmmu.*` != 0 ; then
diff --git a/tracetool b/tracetool
index f094ddc..9ea9c08 100755
--- a/tracetool
+++ b/tracetool
@@ -3,12 +3,13 @@
 usage()
 {
 cat >&2 <<EOF
-usage: $0 [--nop | --simple] [-h | -c]
+usage: $0 [--nop | --simple | --ust] [-h | -c]
 Generate tracing code for a file on stdin.
 
 Backends:
   --nop Tracing disabled
   --simple  Simple built-in backend
+  --ust LTTng User Space Tracing backend
 
 Output formats:
   -hGenerate .h file
@@ -179,6 +180,78 @@ linetoc_end_simple()
 return
 }
 
+linetoh_begin_ust()
+{
+    echo "#include <ust/tracepoint.h>"
+}
+
+linetoh_ust()
+{
+    local name args argnames
+    name=$(get_name "$1")
+    args=$(get_args "$1")
+    argnames=$(get_argnames "$1")
+
+    cat <<EOF
+DECLARE_TRACE(ust_$name, TPPROTO($args), TPARGS($argnames));
+#define trace_$name trace_ust_$name
+EOF
+}
+
+linetoh_end_ust()
+{
+    # Clean up after UST headers which pollute the namespace
+    cat <<EOF
+#undef mutex_lock
+#undef mutex_unlock
+EOF
+}
+
+linetoc_begin_ust()
+{
+    cat <<EOF
+#include <ust/marker.h>
+#include "trace.h"
+EOF
+}
+
+linetoc_ust()
+{
+    local name args argnames fmt
+    name=$(get_name "$1")
+    args=$(get_args "$1")
+    argnames=$(get_argnames "$1")
+    fmt=$(get_fmt "$1")
+
+    cat <<EOF
+DEFINE_TRACE(ust_$name);
+
+static void ust_${name}_probe($args)
+{
+    trace_mark(ust, $name, $fmt, $argnames);
+}
+EOF
+
+    # Collect names for later
+    names="$names $name"
+}
+
+linetoc_end_ust()
+{
+    cat <<EOF
+static void __attribute__((constructor)) trace_init(void)
+{
+EOF
+
+    for name in $names; do
+        cat <<EOF
+    register_trace_ust_$name(ust_${name}_probe);
+EOF
+    done
+
+    echo "}"
+}
+
 # Process stdin by calling begin, line, and end functions for the backend
 convert()
 {
@@ -228,7 +301,7 @@ tracetoc()
 
 # Choose backend
 case $1 in
---nop | --simple) backend=${1#--} ;;
+--nop | --simple | --ust) backend=${1#--} ;;
 *) usage ;;
 esac
 shift
-- 
1.7.1



[PATCH 7/7] trace: Trace virtqueue operations

2010-05-25 Thread Stefan Hajnoczi
This patch adds trace events for virtqueue operations including
adding/removing buffers, notifying the guest, and receiving a notify
from the guest.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
v2:
 * This patch is new in v2

 hw/virtio.c  |8 
 trace-events |8 
 2 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 4475bb3..a5741ae 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -13,6 +13,7 @@
 
 #include <inttypes.h>
 
+#include "trace.h"
 #include "virtio.h"
 #include "sysemu.h"
 
@@ -205,6 +206,8 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement 
*elem,
 unsigned int offset;
 int i;
 
+trace_virtqueue_fill(vq, elem, len, idx);
+
 offset = 0;
     for (i = 0; i < elem->in_num; i++) {
         size_t size = MIN(len - offset, elem->in_sg[i].iov_len);
@@ -232,6 +235,7 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count)
 {
 /* Make sure buffer is written before we update index. */
 wmb();
+trace_virtqueue_flush(vq, count);
 vring_used_idx_increment(vq, count);
 vq-inuse -= count;
 }
@@ -422,6 +426,7 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
 
     vq->inuse++;
 
+    trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
     return elem->in_num + elem->out_num;
 }
 
@@ -560,6 +565,7 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n)
 void virtio_queue_notify(VirtIODevice *vdev, int n)
 {
     if (n < VIRTIO_PCI_QUEUE_MAX && vdev->vq[n].vring.desc) {
+        trace_virtio_queue_notify(vdev, n, &vdev->vq[n]);
         vdev->vq[n].handle_output(vdev, &vdev->vq[n]);
 }
 }
@@ -597,6 +603,7 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int 
queue_size,
 
 void virtio_irq(VirtQueue *vq)
 {
+    trace_virtio_irq(vq);
     vq->vdev->isr |= 0x01;
     virtio_notify_vector(vq->vdev, vq->vector);
 }
@@ -609,6 +616,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
          (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
         return;
 
+    trace_virtio_notify(vdev, vq);
     vdev->isr |= 0x01;
     virtio_notify_vector(vdev, vq->vector);
 }
diff --git a/trace-events b/trace-events
index 48415f8..a533414 100644
--- a/trace-events
+++ b/trace-events
@@ -35,6 +35,14 @@ qemu_memalign(size_t alignment, size_t size, void *ptr) alignment %zu size %zu
 qemu_valloc(size_t size, void *ptr) size %zu ptr %p
 qemu_vfree(void *ptr) ptr %p
 
+# hw/virtio.c
+virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned int idx) vq %p elem %p len %u idx %u
+virtqueue_flush(void *vq, unsigned int count) vq %p count %u
+virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int out_num) vq %p elem %p in_num %u out_num %u
+virtio_queue_notify(void *vdev, int n, void *vq) vdev %p n %d vq %p
+virtio_irq(void *vq) vq %p
+virtio_notify(void *vdev, void *vq) vdev %p vq %p
+
 # block.c
 multiwrite_cb(void *mcb, int ret) mcb %p ret %d
 bdrv_aio_multiwrite(void *mcb, int num_callbacks, int num_reqs) mcb %p num_callbacks %d num_reqs %d
-- 
1.7.1



[PATCH 6/7] trace: Trace virtio-blk, multiwrite, and paio_submit

2010-05-25 Thread Stefan Hajnoczi
This patch adds trace events that make it possible to observe
virtio-blk.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
 block.c|7 +++
 hw/virtio-blk.c|7 +++
 posix-aio-compat.c |2 ++
 trace-events   |   14 ++
 4 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/block.c b/block.c
index 0b0966c..56db112 100644
--- a/block.c
+++ b/block.c
@@ -23,6 +23,7 @@
  */
 #include "config-host.h"
 #include "qemu-common.h"
+#include "trace.h"
 #include "monitor.h"
 #include "block_int.h"
 #include "module.h"
@@ -1922,6 +1923,8 @@ static void multiwrite_cb(void *opaque, int ret)
 {
 MultiwriteCB *mcb = opaque;
 
+trace_multiwrite_cb(mcb, ret);
+
     if (ret < 0 && !mcb->error) {
         mcb->error = ret;
 multiwrite_user_cb(mcb);
@@ -2065,6 +2068,8 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, 
BlockRequest *reqs, int num_reqs)
 // Check for mergable requests
 num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb);
 
+trace_bdrv_aio_multiwrite(mcb, mcb-num_callbacks, num_reqs);
+
 // Run the aio requests
     for (i = 0; i < num_reqs; i++) {
 acb = bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov,
@@ -2075,9 +2080,11 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, 
BlockRequest *reqs, int num_reqs)
 // submitted yet. Otherwise we'll wait for the submitted AIOs to
 // complete and report the error in the callback.
 if (mcb-num_requests == 0) {
+trace_bdrv_aio_multiwrite_earlyfail(mcb);
 reqs[i].error = -EIO;
 goto fail;
 } else {
+trace_bdrv_aio_multiwrite_latefail(mcb, i);
 mcb-num_requests++;
 multiwrite_cb(mcb, -EIO);
 break;
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 5d7f1a2..706f109 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -13,6 +13,7 @@
 
 #include "qemu-common.h"
 #include "sysemu.h"
+#include "trace.h"
 #include "virtio-blk.h"
 #include "block_int.h"
 #ifdef __linux__
@@ -50,6 +51,8 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int 
status)
 {
     VirtIOBlock *s = req->dev;
 
+    trace_virtio_blk_req_complete(req, status);
+
     req->in->status = status;
     virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
     virtio_notify(s->vdev, s->vq);
@@ -87,6 +90,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
 {
     VirtIOBlockReq *req = opaque;
 
+    trace_virtio_blk_rw_complete(req, ret);
+
     if (ret) {
         int is_read = !(req->out->type & VIRTIO_BLK_T_OUT);
         if (virtio_blk_handle_rw_error(req, -ret, is_read))
@@ -263,6 +268,8 @@ static void virtio_blk_handle_flush(BlockRequest *blkreq, 
int *num_writes,
 static void virtio_blk_handle_write(BlockRequest *blkreq, int *num_writes,
 VirtIOBlockReq *req, BlockDriverState **old_bs)
 {
+    trace_virtio_blk_handle_write(req, req->out->sector, req->qiov.size / 512);
+
     if (req->out->sector & req->dev->sector_mask) {
         virtio_blk_rw_complete(req, -EIO);
         return;
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index b43c531..c2200fe 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -25,6 +25,7 @@
 #include "qemu-queue.h"
 #include "osdep.h"
 #include "qemu-common.h"
+#include "trace.h"
 #include "block_int.h"
 
 #include "block/raw-posix-aio.h"
@@ -583,6 +584,7 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->next = posix_aio_state->first_aio;
     posix_aio_state->first_aio = acb;
 
+    trace_paio_submit(acb, opaque, sector_num, nb_sectors, type);
     qemu_paio_submit(acb);
     return &acb->common;
 }
diff --git a/trace-events b/trace-events
index 3fde0c6..48415f8 100644
--- a/trace-events
+++ b/trace-events
@@ -34,3 +34,17 @@ qemu_free(void *ptr) ptr %p
 qemu_memalign(size_t alignment, size_t size, void *ptr) alignment %zu size %zu ptr %p
 qemu_valloc(size_t size, void *ptr) size %zu ptr %p
 qemu_vfree(void *ptr) ptr %p
+
+# block.c
+multiwrite_cb(void *mcb, int ret) mcb %p ret %d
+bdrv_aio_multiwrite(void *mcb, int num_callbacks, int num_reqs) mcb %p num_callbacks %d num_reqs %d
+bdrv_aio_multiwrite_earlyfail(void *mcb) mcb %p
+bdrv_aio_multiwrite_latefail(void *mcb, int i) mcb %p i %d
+
+# hw/virtio-blk.c
+virtio_blk_req_complete(void *req, int status) req %p status %d
+virtio_blk_rw_complete(void *req, int ret) req %p ret %d
+virtio_blk_handle_write(void *req, unsigned long sector, unsigned long nsectors) req %p sector %lu nsectors %lu
+
+# posix-aio-compat.c
+paio_submit(void *acb, void *opaque, unsigned long sector_num, unsigned long nb_sectors, unsigned long type) acb %p opaque %p sector_num %lu nb_sectors %lu type %lu
-- 
1.7.1



[PATCH 5/7] trace: Trace qemu_malloc() and qemu_vmalloc()

2010-05-25 Thread Stefan Hajnoczi
It is often useful to instrument memory management functions in order to
find leaks or performance problems.  This patch adds trace events for
the memory allocation primitives.

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
v2:
 * Record pointer result from allocation functions

 osdep.c   |   24 ++--
 qemu-malloc.c |   12 ++--
 trace-events  |   10 ++
 3 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/osdep.c b/osdep.c
index abbc8a2..a6b7726 100644
--- a/osdep.c
+++ b/osdep.c
@@ -50,6 +50,7 @@
 #endif
 
 #include qemu-common.h
+#include trace.h
 #include sysemu.h
 #include qemu_socket.h
 
@@ -71,25 +72,34 @@ static void *oom_check(void *ptr)
 #if defined(_WIN32)
 void *qemu_memalign(size_t alignment, size_t size)
 {
+void *ptr;
+
 if (!size) {
 abort();
 }
-return oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE));
+ptr = oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE));
+trace_qemu_memalign(alignment, size, ptr);
+return ptr;
 }
 
 void *qemu_vmalloc(size_t size)
 {
+void *ptr;
+
 /* FIXME: this is not exactly optimal solution since VirtualAlloc
has 64Kb granularity, but at least it guarantees us that the
memory is page aligned. */
 if (!size) {
 abort();
 }
-return oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE));
+ptr = oom_check(VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE));
+trace_qemu_vmalloc(size, ptr);
+return ptr;
 }
 
 void qemu_vfree(void *ptr)
 {
+trace_qemu_vfree(ptr);
 VirtualFree(ptr, 0, MEM_RELEASE);
 }
 
@@ -97,21 +107,22 @@ void qemu_vfree(void *ptr)
 
 void *qemu_memalign(size_t alignment, size_t size)
 {
+void *ptr;
 #if defined(_POSIX_C_SOURCE) && !defined(__sun__)
     int ret;
-    void *ptr;
     ret = posix_memalign(&ptr, alignment, size);
     if (ret != 0) {
         fprintf(stderr, "Failed to allocate %zu B: %s\n",
                 size, strerror(ret));
 abort();
 }
-return ptr;
 #elif defined(CONFIG_BSD)
-return oom_check(valloc(size));
+ptr = oom_check(valloc(size));
 #else
-return oom_check(memalign(alignment, size));
+ptr = oom_check(memalign(alignment, size));
 #endif
+trace_qemu_memalign(alignment, size, ptr);
+return ptr;
 }
 
 /* alloc shared memory pages */
@@ -122,6 +133,7 @@ void *qemu_vmalloc(size_t size)
 
 void qemu_vfree(void *ptr)
 {
+trace_qemu_vfree(ptr);
 free(ptr);
 }
 
diff --git a/qemu-malloc.c b/qemu-malloc.c
index 6cdc5de..72de60a 100644
--- a/qemu-malloc.c
+++ b/qemu-malloc.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
+#include "trace.h"
 #include <stdlib.h>
 
 static void *oom_check(void *ptr)
@@ -39,6 +40,7 @@ void *get_mmap_addr(unsigned long size)
 
 void qemu_free(void *ptr)
 {
+trace_qemu_free(ptr);
 free(ptr);
 }
 
@@ -53,18 +55,24 @@ static int allow_zero_malloc(void)
 
 void *qemu_malloc(size_t size)
 {
+void *ptr;
     if (!size && !allow_zero_malloc()) {
 abort();
 }
-return oom_check(malloc(size ? size : 1));
+ptr = oom_check(malloc(size ? size : 1));
+trace_qemu_malloc(size, ptr);
+return ptr;
 }
 
 void *qemu_realloc(void *ptr, size_t size)
 {
+void *newptr;
     if (!size && !allow_zero_malloc()) {
 abort();
 }
-return oom_check(realloc(ptr, size ? size : 1));
+newptr = oom_check(realloc(ptr, size ? size : 1));
+trace_qemu_realloc(ptr, size, newptr);
+return newptr;
 }
 
 void *qemu_mallocz(size_t size)
diff --git a/trace-events b/trace-events
index 5efaa86..3fde0c6 100644
--- a/trace-events
+++ b/trace-events
@@ -24,3 +24,13 @@
 # system may not have the necessary headers included.
 #
 # The format-string should be a sprintf()-compatible format string.
+
+# qemu-malloc.c
+qemu_malloc(size_t size, void *ptr) size %zu ptr %p
+qemu_realloc(void *ptr, size_t size, void *newptr) ptr %p size %zu newptr %p
+qemu_free(void *ptr) ptr %p
+
+# osdep.c
+qemu_memalign(size_t alignment, size_t size, void *ptr) alignment %zu size 
%zu ptr %p
+qemu_valloc(size_t size, void *ptr) size %zu ptr %p
+qemu_vfree(void *ptr) ptr %p
-- 
1.7.1



[PATCH 3/7] trace: Add simple built-in tracing backend

2010-05-25 Thread Stefan Hajnoczi
This patch adds a simple tracer which produces binary trace files and is
built into QEMU.  The main purpose of this patch is to show how new
tracing backends can be added to tracetool.

To try out the simple backend:

./configure --trace-backend=simple
make

After running QEMU you can pretty-print the trace:

./simpletrace.py trace-events /tmp/trace.log

Signed-off-by: Stefan Hajnoczi stefa...@linux.vnet.ibm.com
---
I intend for this tracing backend to be replaced by something based on Prerna's
work.  For now it is useful for basic tracing.

v2:
 * Make simpletrace.py parse trace-events instead of generating Python

 .gitignore |1 +
 Makefile.objs  |3 ++
 configure  |2 +-
 simpletrace.c  |   64 ++
 simpletrace.py |   53 ++
 tracetool  |   78 +--
 6 files changed, 197 insertions(+), 4 deletions(-)
 create mode 100644 simpletrace.c
 create mode 100755 simpletrace.py

diff --git a/.gitignore b/.gitignore
index 4644557..5128452 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,6 +39,7 @@ qemu-monitor.texi
 *.log
 *.pdf
 *.pg
+*.pyc
 *.toc
 *.tp
 *.vr
diff --git a/Makefile.objs b/Makefile.objs
index 20e709e..7cb40ac 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -255,6 +255,9 @@ libdis-$(CONFIG_SPARC_DIS) += sparc-dis.o
 # trace
 
 trace-obj-y = trace.o
+ifeq ($(TRACE_BACKEND),simple)
+trace-obj-y += simpletrace.o
+endif
 
 vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS)
 
diff --git a/configure b/configure
index e94e113..7d2c69b 100755
--- a/configure
+++ b/configure
@@ -829,7 +829,7 @@ echo "  --enable-docs            enable documentation build"
 echo "  --disable-docs           disable documentation build"
 echo "  --disable-vhost-net      disable vhost-net acceleration support"
 echo "  --enable-vhost-net       enable vhost-net acceleration support"
-echo "  --trace-backend=B        Trace backend nop"
+echo "  --trace-backend=B        Trace backend nop simple"
 echo ""
 echo "NOTE: The object files are built at the place where configure is launched"
 exit 1
diff --git a/simpletrace.c b/simpletrace.c
new file mode 100644
index 000..2fec4d3
--- /dev/null
+++ b/simpletrace.c
@@ -0,0 +1,64 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include "trace.h"
+
+typedef struct {
+    unsigned long event;
+    unsigned long x1;
+    unsigned long x2;
+    unsigned long x3;
+    unsigned long x4;
+    unsigned long x5;
+} TraceRecord;
+
+enum {
+    TRACE_BUF_LEN = 64 * 1024 / sizeof(TraceRecord),
+};
+
+static TraceRecord trace_buf[TRACE_BUF_LEN];
+static unsigned int trace_idx;
+static FILE *trace_fp;
+
+static void trace(TraceEvent event, unsigned long x1,
+                  unsigned long x2, unsigned long x3,
+                  unsigned long x4, unsigned long x5) {
+    TraceRecord *rec = &trace_buf[trace_idx];
+    rec->event = event;
+    rec->x1 = x1;
+    rec->x2 = x2;
+    rec->x3 = x3;
+    rec->x4 = x4;
+    rec->x5 = x5;
+
+    if (++trace_idx == TRACE_BUF_LEN) {
+        trace_idx = 0;
+
+        if (!trace_fp) {
+            trace_fp = fopen("/tmp/trace.log", "w");
+        }
+        if (trace_fp) {
+            size_t result = fwrite(trace_buf, sizeof trace_buf, 1, trace_fp);
+            result = result;
+        }
+    }
+}
+
+void trace1(TraceEvent event, unsigned long x1) {
+    trace(event, x1, 0, 0, 0, 0);
+}
+
+void trace2(TraceEvent event, unsigned long x1, unsigned long x2) {
+    trace(event, x1, x2, 0, 0, 0);
+}
+
+void trace3(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3) {
+    trace(event, x1, x2, x3, 0, 0);
+}
+
+void trace4(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4) {
+    trace(event, x1, x2, x3, x4, 0);
+}
+
+void trace5(TraceEvent event, unsigned long x1, unsigned long x2, unsigned long x3, unsigned long x4, unsigned long x5) {
+    trace(event, x1, x2, x3, x4, x5);
+}
diff --git a/simpletrace.py b/simpletrace.py
new file mode 100755
index 000..d6631ba
--- /dev/null
+++ b/simpletrace.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+import sys
+import struct
+import re
+
+trace_fmt = 'LLLLLL'
+trace_len = struct.calcsize(trace_fmt)
+event_re  = re.compile(r'(disable\s+)?([a-zA-Z0-9_]+)\(([^)]*)\)\s+"([^"]*)"')
+
+def parse_events(fobj):
+def get_argnames(args):
+return tuple(arg.split()[-1].lstrip('*') for arg in args.split(','))
+
+events = {}
+event_num = 0
+for line in fobj:
+m = event_re.match(line.strip())
+if m is None:
+continue
+
+disable, name, args, fmt = m.groups()
+if disable:
+continue
+
+events[event_num] = (name,) + get_argnames(args)
+event_num += 1
+return events
+
+def read_record(fobj):
+s = fobj.read(trace_len)
+if len(s) != trace_len:
+return None
+return struct.unpack(trace_fmt, s)
+
+def format_record(events, rec):
+event = events[rec[0]]
+fields

Re: [PATCH 7/7] trace: Trace virtqueue operations

2010-05-25 Thread Stefan Hajnoczi
On Tue, May 25, 2010 at 1:04 PM, Avi Kivity a...@redhat.com wrote:
 Those %ps are more or less useless.  We need better ways of identifying
 them.

You're right, the vq pointer is useless in isolation.  We don't know
which virtio device or which virtqueue number.

With the full context of a trace it would be possible to correlate the
vq pointer if we had trace events for vdev and vq setup.

Adding custom formatters could be tricky since the format string is
passed only to tracing backends that use it, like UST.  And UST uses
its own sprintf implementation, which we don't have direct control
over.

I think we just need to guarantee that any pointer can be correlated
with previous trace entries that give context for that pointer.
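
For example, hypothetical setup events (not part of this series) along
these lines would provide that context:

    virtio_add_queue(void *vdev, int queue_size, void *vq) vdev %p queue_size %d vq %p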

Stefan


Re: [PATCH 7/7] trace: Trace virtqueue operations

2010-05-25 Thread Stefan Hajnoczi
On Tue, May 25, 2010 at 2:52 PM, Avi Kivity a...@redhat.com wrote:
 Hm.  Perhaps we can convert %{type} to %p for backends which don't support
 it, and to whatever format they do support for those that do.

True.

Stefan


Perf trace event parse errors for KVM events

2010-05-26 Thread Stefan Hajnoczi
The perf trace command produces the following messages:

For kvm:kvm_apic:
$ perf trace
  Warning: Error: expected type 4 but read 7
  Warning: Error: expected type 5 but read 0
  Warning: failed to read event print fmt for kvm_apic

For kvm:kvm_inj_exception:
$ perf trace
  Warning: Error: expected type 4 but read 7
  Warning: Error: expected type 5 but read 0
  Warning: failed to read event print fmt for kvm_inj_exception

For kvm:kvm_emulate_insn:
$ perf trace
  Fatal: bad op token {

The other KVM trace events work fine.

To reproduce this issue (you don't need any actual KVM activity while running):
$ perf record -a -f -e 'kvm:kvm_apic' sleep 1
$ perf trace

Both linux-2.6.git 7e125f7b9cbfce4101191b8076d606c517a73066 and Avi's kvm.git
show this problem.

It appears that the perf.data written by perf record cannot be parsed by
perf trace run using the same perf binary.  The perf binary was built from
the same kernel tree that I am running (there should be no mismatch between the
kernel and the perf binary).

At the end of this email is a gdb session that shows where the
kvm:kvm_apic expected type 4 but read 7 error occurs.

Any ideas?

Thanks,
Stefan

$ sudo gdb --args ./perf trace
[...]
(gdb) b warning
Breakpoint 1 at 0x42d000: file util/usage.c, line 74.
(gdb) r
Starting program: /home/stefanha/linux-2.6/tools/perf/perf trace
[Thread debugging using libthread_db enabled]

Breakpoint 1, warning (warn=0x463390 Error: expected type %d but read %d) at 
util/usage.c:74
74  {
(gdb) bt
#0  warning (warn=0x463390 Error: expected type %d but read %d) at 
util/usage.c:74
#1  0x00443123 in __test_type_token (event=0x9d0d40, list=value 
optimized out, tok=value optimized out) at util/trace-event-parse.c:639
#2  test_type_token (event=0x9d0d40, list=value optimized out, tok=value 
optimized out) at util/trace-event-parse.c:656
#3  process_fields (event=0x9d0d40, list=value optimized out, tok=value 
optimized out) at util/trace-event-parse.c:1499
#4  0x00442598 in process_symbols (event=0x9d0d40, arg=0x9d2110, 
tok=0x7fff9fa8, type=value optimized out) at util/trace-event-parse.c:1588
#5  process_arg_token (event=0x9d0d40, arg=0x9d2110, tok=0x7fff9fa8, 
type=value optimized out) at util/trace-event-parse.c:1703
#6  0x00443cee in process_arg (event=0x9d0d40) at 
util/trace-event-parse.c:1034
#7  event_read_print_args (event=0x9d0d40) at util/trace-event-parse.c:1774
#8  event_read_print (event=0x9d0d40) at util/trace-event-parse.c:1853
#9  0x00444c0e in parse_event_file (buf=value optimized out, 
size=value optimized out, sys=0x9ccf90 kvm) at util/trace-event-parse.c:3192
#10 0x00445e3d in read_event_file (fd=value optimized out, 
__repipe=value optimized out) at util/trace-event-read.c:236
#11 read_event_files (fd=value optimized out, __repipe=value optimized out) 
at util/trace-event-read.c:270
#12 trace_report (fd=value optimized out, __repipe=value optimized out) at 
util/trace-event-read.c:514
#13 0x00434c52 in perf_file_section__process (self=0x9cc540, 
ph=0x9cc9a0, feat=1, fd=5) at util/header.c:777
#14 0x00435ad9 in perf_header__process_sections (self=0x9cc9a0, fd=5, 
process=0x434b80 perf_file_section__process) at util/header.c:618
#15 0x00436c31 in perf_header__read (session=0x9cc9a0, fd=5) at 
util/header.c:893
#16 0x0043d566 in perf_session__open (filename=value optimized out, 
mode=0, force=value optimized out, repipe=false) at util/session.c:51
#17 perf_session__new (filename=value optimized out, mode=0, force=value 
optimized out, repipe=false) at util/session.c:103
#18 0x004169b7 in cmd_trace (argc=0, argv=0x7fffe6e0, prefix=value 
optimized out) at builtin-trace.c:666
#19 0x00404dd7 in run_builtin (p=0x66ee58, argc=1, argv=0x7fffe6e0) 
at perf.c:265
#20 0x00405876 in handle_internal_command (argc=1, argv=0x7fffe6e0) 
at perf.c:336
#21 run_argv (argc=1, argv=0x7fffe6e0) at perf.c:380
#22 main (argc=1, argv=0x7fffe6e0) at perf.c:464
(gdb) p *(struct event*)0x9d0d40
$1 = {next = 0x0, name = 0x9d0d90 kvm_apic, id = 762, flags = 0, format = 
{nr_common = 5, nr_fields = 3, common_fields = 0x9d0df0, fields = 0x9d1d20},
  print_fmt = {format = 0x9d1ef0 apic_%s %s = 0x%x, args = 0x9d1f30}, system 
= 0x0}


Re: Perf trace event parse errors for KVM events

2010-05-28 Thread Stefan Hajnoczi
I get parse errors when using Steven Rostedt's trace-cmd tool, too.

Any ideas what is going on here?  I can provide more info (e.g. trace
files) if necessary.

Stefan


Re: Perf trace event parse errors for KVM events

2010-05-29 Thread Stefan Hajnoczi
On Fri, May 28, 2010 at 05:45:57PM -0400, Steven Rostedt wrote:
 On Fri, 2010-05-28 at 17:42 +0100, Stefan Hajnoczi wrote:
  I get parse errors when using Steven Rostedt's trace-cmd tool, too.
  
  Any ideas what is going on here?  I can provide more info (e.g. trace
  files) if necessary.
 
 Does trace-cmd fail on the same tracepoints? Have you checkout the
 latest code?.

$ sudo trace-cmd record -e kvm:kvm_pio
$ trace-cmd report
version = 6
  bad op token {
  failed to read event print fmt for kvm_mmu_get_page
  bad op token {
  failed to read event print fmt for kvm_mmu_sync_page
  bad op token {
  failed to read event print fmt for kvm_mmu_unsync_page
  bad op token {
  failed to read event print fmt for kvm_mmu_zap_page
  Error: expected type 4 but read 7
  Error: expected type 5 but read 0
  failed to read event print fmt for kvm_apic
  function ftrace_print_symbols_seq not defined
  failed to read event print fmt for kvm_exit
  Error: expected type 4 but read 7
  Error: expected type 5 but read 0
  failed to read event print fmt for kvm_inj_exception
  function ftrace_print_symbols_seq not defined
  failed to read event print fmt for kvm_nested_vmexit
  function ftrace_print_symbols_seq not defined
  failed to read event print fmt for kvm_nested_vmexit_inject
  bad op token {
  failed to read event print fmt for kvm_emulate_insn

These are different from those reported by perf.

Yes, I use trace-cmd.git master branch (currently built from
b530a23f0442be322b1717e6dbce2bd502634cb4).

My kernel is 2.6.34 based.

 I do know it fails on some of the KVM tracerpoints since the formatting
 they use is obnoxious.
 
 Could you show the print-fmt of the failing events?

Here are the details along with my amateur comments on what might have gone 
wrong:

$ for event in kvmmmu/kvm_mmu_get_page kvmmmu/kvm_mmu_sync_page 
kvmmmu/kvm_mmu_unsync_page kvmmmu/kvm_mmu_zap_page kvm/kvm_apic kvm/kvm_exit 
kvm/kvm_inj_exception kvm/kvm_nested_vmexit kvm/kvm_nested_vmexit_inject 
kvm/kvm_emulate_insn; do echo -n $event: ; grep 'print fmt:' 
/sys/kernel/debug/tracing/events/$event/format; done

kvmmmu/kvm_mmu_get_page: print fmt: "%s %s", ({ const char *ret = p->buffer + p->len; static const char *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" }; union kvm_mmu_page_role role; role.word = REC->role; trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" " %snxe root %u %s%c", REC->gfn, role.level, role.cr4_pae ? " pae" : "", role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", role.nxe ? "" : "!", REC->root_count, REC->unsync ? "unsync" : "sync", 0); ret; }), REC->created ? "new" : "existing"
kvmmmu/kvm_mmu_sync_page: print fmt: "%s", ({ const char *ret = p->buffer + p->len; static const char *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" }; union kvm_mmu_page_role role; role.word = REC->role; trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" " %snxe root %u %s%c", REC->gfn, role.level, role.cr4_pae ? " pae" : "", role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", role.nxe ? "" : "!", REC->root_count, REC->unsync ? "unsync" : "sync", 0); ret; })
kvmmmu/kvm_mmu_unsync_page: print fmt: "%s", ({ const char *ret = p->buffer + p->len; static const char *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" }; union kvm_mmu_page_role role; role.word = REC->role; trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" " %snxe root %u %s%c", REC->gfn, role.level, role.cr4_pae ? " pae" : "", role.quadrant, role.direct ? " direct" : "", access_str[role.access], role.invalid ? " invalid" : "", role.nxe ? "" : "!", REC->root_count, REC->unsync ? "unsync" : "sync", 0); ret; })
kvmmmu/kvm_mmu_zap_page: print fmt: %s, ({ const char *ret = p-buffer + 
p-len; static const char *access_str[] = { ---, --x, w--, w-x, -u-, 
-ux, wu-, wux }; union kvm_mmu_page_role role; role.word = REC-role; 
trace_seq_printf(p, sp gfn %llx %u%s q%u%s %s%s  %snxe root %u %s%c, 
REC-gfn, role.level, role.cr4_pae ?  pae : , role.quadrant, role.direct ? 
 direct : , access_str[role.access], role.invalid ?  invalid : , 
role.nxe ?  : !, REC-root_count, REC-unsync ? unsync : sync, 0); ret; 
})
kvm/kvm_emulate_insn: print fmt: %x:%llx:%s (%s)%s, REC-csbase, REC-rip, ({ 
int i; const char *ret = p-buffer + p-len; for (i = 0; i  REC-len; ++i) 
trace_seq_printf(p,  %02x, REC-insn[i]); trace_seq_printf(p, %c, 0); ret; 
}), __print_symbolic(REC-flags, { 0, real }, { (1  0) | (1  1), vm16 
}, { (1  0), prot16 }, { (1  0) | (1  2), prot32 }, { (1  0) | (1 
 3), prot64 }), REC-failed ?  failed : 

Macro expanded into C code that shouldn't have?

kvm/kvm_apic: print fmt: "apic_%s %s = 0x%x", REC->rw ? "write" : "read",
__print_symbolic(REC->reg, { 0x20, "APIC_" "ID" }, { 0x30, "APIC_" "LVR" }, {
0x80, "APIC_" "TASKPRI" }, { 0x90, "APIC_" "ARBPRI" }, { 0xA0, "APIC_"
"PROCPRI" }, { 0xB0, "APIC_" "EOI" }, { 0xC0, "APIC_" "RRR" }, { 0xD0, "APIC_"
"LDR" }, { 0xE0, "APIC_" "DFR" }, { 0xF0, "APIC_" "SPIV" }, { 0x100, "APIC_"
"ISR" }, { 0x180, "APIC_" "TMR" }, { 0x200, "APIC_" "IRR" }, { 0x280, "APIC_"
"ESR"

Re: raw disks no longer work in latest kvm (kvm-88 was fine)

2010-05-29 Thread Stefan Hajnoczi
On Sat, May 29, 2010 at 10:42 AM, Antoine Martin anto...@nagafix.co.uk wrote:
 Can someone explain the aio options?
 All I can find is this:
 # qemu-system-x86_64 -h | grep -i aio
       [,addr=A][,id=name][,aio=threads|native]
 I assume it means that aio=threads emulates the kernel's aio with
 separate threads? And is therefore likely to be slower, right?
 Is there a reason why aio=native is not the default? Shouldn't
 aio=threads be the fallback?

aio=threads uses posix-aio-compat.c, a POSIX AIO-like implementation
using a thread pool.  Each thread services queued I/O requests using
blocking syscalls (e.g. preadv()/pwritev()).

aio=native uses Linux libaio, the native (non-POSIX) AIO interface.

I would expect that aio=native is faster but benchmarks show that this
isn't true for all workloads.
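
For example (illustrative command line, with the image path as a
placeholder; aio=native is normally used together with cache=none):

qemu-system-x86_64 -drive file=disk.img,if=virtio,cache=none,aio=native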

Stefan


Re: raw disks no longer work in latest kvm (kvm-88 was fine)

2010-05-29 Thread Stefan Hajnoczi
On Sat, May 29, 2010 at 11:34 AM, Christoph Hellwig h...@infradead.org wrote:
 In what benchmark do you see worse results for aio=native compared to
 aio=threads?

Sequential reads using 4 concurrent dd if=/dev/vdb iflag=direct
of=/dev/null bs=8k processes.  2 vcpu guest with 4 GB RAM, virtio
block devices, cache=none.  Host storage is a striped LVM volume.
Host kernel kvm.git and qemu-kvm.git userspace.

aio=native and aio=threads each run 3 times.

Result: aio=native has 15% lower throughput than aio=threads.

I haven't looked into this so I don't know what causes these results.

Stefan


Re: ATA Trim for qcow(2)

2011-01-23 Thread Stefan Hajnoczi
On Sun, Jan 23, 2011 at 9:35 PM, Emil Langrock emil.langr...@gmx.de wrote:
 there is support for ext4 to use the trim ATA command when a block is freed. I
 read that there should be an extra command which does that freeing afterwards.
 So is it possible to use that information inside the qcow to mark those
 sectors as free? This would make it possible to shrink the size of an image
 significantly using some offline (maybe also some online) tools.

There is currently no TRIM support in qcow2.  Christoph Hellwig
recently added TRIM support to raw images on an XFS host file system.
In the future we'll see wider support.

Stefan


Re: [Qemu-devel] Re: KVM call agenda for Jan 25

2011-01-25 Thread Stefan Hajnoczi
On Tue, Jan 25, 2011 at 2:02 PM, Luiz Capitulino lcapitul...@redhat.com wrote:
  - Google summer of code 2011 is on, are we interested? (note: I just saw the
   news, I don't have any information yet)

http://www.google-melange.com/document/show/gsoc_program/google/gsoc2011/timeline

I'd like to see an in-place QCOW2 - QED image converter with tests.
I'm interested in mentoring this year.

Stefan


Re: KVM call agenda for Jan 25

2011-01-25 Thread Stefan Hajnoczi
On Tue, Jan 25, 2011 at 2:26 PM, Avi Kivity a...@redhat.com wrote:
 On 01/25/2011 12:06 AM, Anthony Liguori wrote:

 On 01/24/2011 07:25 AM, Chris Wright wrote:

 Please send in any agenda items you are interested in covering.

 - coroutines for the block layer

 I have a perpetually in progress branch for this, and would very much like
 to see this done.

Seen this?
http://repo.or.cz/w/qemu/stefanha.git/commit/8179e8ff20bb3f14f361109afe5b3bf2bac24f0d
http://repo.or.cz/w/qemu/stefanha.git/shortlog/8179e8ff20bb3f14f361109afe5b3bf2bac24f0d

And the qemu-devel thread:
http://www.mail-archive.com/qemu-devel@nongnu.org/msg52522.html

Stefan


Re: Qemu-img create problem

2011-01-28 Thread Stefan Hajnoczi
On Fri, Jan 28, 2011 at 1:13 PM, Himanshu Chauhan
hschau...@nulltrace.org wrote:
 I just cloned qemu-kvm, built and installed it. But qemu-img fails
 to create any disk image above 1G. The problem as I see it is the use of
 ssize_t for the image size. When the size is 2G, the check if (sval < 0)
 succeeds and I get the error:

This is fixed in qemu.git 70b4f4bb05ff5e6812c6593eeefbd19bd61b517d
Make strtosz() return int64_t instead of ssize_t.
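
For reference, a minimal sketch (not the actual qemu-img code) of why a
32-bit ssize_t cannot hold a 2G size and trips the negative check:

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
    long long size = 2LL * 1024 * 1024 * 1024;  /* 2G */
    /* On a 32-bit host ssize_t is 32 bits, so the value truncates to a
     * negative number and the (sval < 0) check fires. */
    ssize_t sval = (ssize_t)size;
    if (sval < 0)
        printf("invalid size\n");  /* the error path hit for 2G images */
    return 0;
}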

Stefan


Re: Zero-copy block driver?

2011-01-29 Thread Stefan Hajnoczi
2011/1/29 Darko Petrović darko.b.petro...@gmail.com:
 Could you please tell me if it is possible to use a block driver that
 completely avoids the guest kernel and copies block data directly to/from
 the given buffer in the guest userspace?
 If yes, how to activate it? If not... why not? :)

Inside the guest, open files using the O_DIRECT flag.  This tells the
guest kernel to avoid the page cache when possible, enabling
zero-copy.  You need to use aligned memory buffers and perform I/O in
multiples of the block size.

See the open(2) man page for details.  Make sure you really want to do
this, most applications don't.
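
A minimal sketch of what this looks like in a guest application
(illustrative only; the file name is a placeholder and error handling
is trimmed):

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    void *buf;
    int fd = open("data.img", O_RDONLY | O_DIRECT);

    /* Buffer, file offset, and length must all be suitably aligned;
     * 4096 bytes is a safe choice on most setups. */
    posix_memalign(&buf, 4096, 4096);
    pread(fd, buf, 4096, 0);    /* bypasses the guest page cache */

    free(buf);
    close(fd);
    return 0;
}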

Stefan


Re: Zero-copy block driver?

2011-01-29 Thread Stefan Hajnoczi
2011/1/29 Darko Petrović darko.b.petro...@gmail.com:
 Thanks for your help. Actually, I am more interested in doing it from the
 outside, if possible (I am not allowed to change the application code). Can
 the guest be tricked by KVM somehow, using the appropriate drivers? Just to
 clear it out, copying to/from a host buffer is fine, I just want to avoid
 having guest buffers.

Not really.  If the application is designed to use the page cache then
it will use it.

You might want to look at unmapped page cache control which is not in
mainline Linux yet:
http://lwn.net/Articles/419713/

Stefan


Re: [Qemu-devel] Re: [PATCH v3 14/22] kvm: Fix race between timer signals and vcpu entry under !IOTHREAD

2011-01-31 Thread Stefan Hajnoczi
On Mon, Jan 31, 2011 at 11:27 AM, Jan Kiszka jan.kis...@siemens.com wrote:
 On 2011-01-31 11:03, Avi Kivity wrote:
 On 01/27/2011 04:33 PM, Jan Kiszka wrote:
 Found by Stefan Hajnoczi: There is a race in kvm_cpu_exec between
 checking for exit_request on vcpu entry and timer signals arriving
 before KVM starts to catch them. Plug it by blocking both timer related
 signals also on !CONFIG_IOTHREAD and process those via signalfd.

 As this fix depends on real signalfd support (otherwise the timer
 signals only kick the compat helper thread, and the main thread hangs),
 we need to detect the invalid constellation and abort configure.

 Signed-off-by: Jan Kiszkajan.kis...@siemens.com
 CC: Stefan Hajnoczistefa...@linux.vnet.ibm.com
 ---

 I don't want to invest that much into !IOTHREAD anymore, so let's see if
 the proposed catchabort is acceptable.


 I don't understand the dependency on signalfd.  The normal way of doing
 things, either waiting for the signal in sigtimedwait() or in
 ioctl(KVM_RUN), works with SIGALRM just fine.

 And how would you be kicked out of the select() call if it is waiting
 with a timeout? We only have a single thread here.

 The only alternative is Stefan's original proposal. But that required
 fiddling with the signal mask twice per KVM_RUN.

I think my original patch messed with the sigmask in the wrong place,
as you mentioned doing it twice per KVM_RUN isn't a good idea.  I
wonder if we can enable SIGALRM only in blocking calls and guest code
execution but without signalfd.  It might be possible, I don't see an
immediate problem with doing that, we might have to use pselect(2) or
similar in a few places.
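
A rough sketch of that idea (not QEMU code, just the generic pselect(2)
pattern for keeping SIGALRM blocked except while sleeping):

#include <signal.h>
#include <sys/select.h>

static int wait_readable(int fd)
{
    sigset_t alarm_set, wait_mask;
    fd_set rfds;

    sigemptyset(&alarm_set);
    sigaddset(&alarm_set, SIGALRM);
    /* SIGALRM stays blocked during normal execution... */
    sigprocmask(SIG_BLOCK, &alarm_set, &wait_mask);
    sigdelset(&wait_mask, SIGALRM);

    FD_ZERO(&rfds);
    FD_SET(fd, &rfds);

    /* ...and is atomically unblocked only for the duration of the
     * blocking call, closing the race between checking state and
     * going to sleep. */
    return pselect(fd + 1, &rfds, NULL, NULL, NULL, &wait_mask);
}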

Stefan


Re: [Qemu-devel] Re: [PATCH v3 14/22] kvm: Fix race between timer signals and vcpu entry under !IOTHREAD

2011-01-31 Thread Stefan Hajnoczi
On Mon, Jan 31, 2011 at 12:18 PM, Jan Kiszka jan.kis...@siemens.com wrote:
 On 2011-01-31 13:13, Stefan Hajnoczi wrote:
 On Mon, Jan 31, 2011 at 11:27 AM, Jan Kiszka jan.kis...@siemens.com wrote:
 On 2011-01-31 11:03, Avi Kivity wrote:
 On 01/27/2011 04:33 PM, Jan Kiszka wrote:
 Found by Stefan Hajnoczi: There is a race in kvm_cpu_exec between
 checking for exit_request on vcpu entry and timer signals arriving
 before KVM starts to catch them. Plug it by blocking both timer related
 signals also on !CONFIG_IOTHREAD and process those via signalfd.

 As this fix depends on real signalfd support (otherwise the timer
 signals only kick the compat helper thread, and the main thread hangs),
 we need to detect the invalid constellation and abort configure.

 Signed-off-by: Jan Kiszkajan.kis...@siemens.com
 CC: Stefan Hajnoczistefa...@linux.vnet.ibm.com
 ---

 I don't want to invest that much into !IOTHREAD anymore, so let's see if
 the proposed catchabort is acceptable.


 I don't understand the dependency on signalfd.  The normal way of doing
 things, either waiting for the signal in sigtimedwait() or in
 ioctl(KVM_RUN), works with SIGALRM just fine.

 And how would you be kicked out of the select() call if it is waiting
 with a timeout? We only have a single thread here.

 The only alternative is Stefan's original proposal. But that required
 fiddling with the signal mask twice per KVM_RUN.

 I think my original patch messed with the sigmask in the wrong place,
 as you mentioned doing it twice per KVM_RUN isn't a good idea.  I
 wonder if we can enable SIGALRM only in blocking calls and guest code
 execution but without signalfd.  It might be possible, I don't see an
 immediate problem with doing that, we might have to use pselect(2) or
 similar in a few places.

 My main concern about alternative approaches is that IOTHREAD is about
 to become the default, and hardly anyone (of the few upstream KVM users)
 will run without it in the foreseeable future. The next step will be the
 removal of any !CONFIG_IOTHREAD section. So, how much do we want to
 invest here (provided my proposal has no remaining issues)?

Yes, you're right.  I'm not volunteering to dig more into this, the
best case would be to switch to a non-I/O thread world that works for
everybody.

Stefan


Re: KVM call agenda for Feb 8

2011-02-08 Thread Stefan Hajnoczi
On Mon, Feb 7, 2011 at 10:40 PM, Chris Wright chr...@redhat.com wrote:
 Please send in any agenda items you are interested in covering.

Automated builds and testing: maintainer trees, integrating
KVM-Autotest, and QEMU tests we need but don't exist

Stefan


Re: KVM call minutes for Feb 8

2011-02-08 Thread Stefan Hajnoczi
On Tue, Feb 8, 2011 at 3:55 PM, Chris Wright chr...@redhat.com wrote:
 Automated builds and testing
 - found broken 32-bit

The broken build was found (and fixed?) before automated qemu.git
builds.  It's a good motivator though.

Stefan


Re: Network performance with small packets

2011-02-08 Thread Stefan Hajnoczi
On Wed, Feb 9, 2011 at 1:55 AM, Michael S. Tsirkin m...@redhat.com wrote:
 On Wed, Feb 09, 2011 at 12:09:35PM +1030, Rusty Russell wrote:
 On Wed, 9 Feb 2011 11:23:45 am Michael S. Tsirkin wrote:
  On Wed, Feb 09, 2011 at 11:07:20AM +1030, Rusty Russell wrote:
   On Wed, 2 Feb 2011 03:12:22 pm Michael S. Tsirkin wrote:
On Wed, Feb 02, 2011 at 10:09:18AM +0530, Krishna Kumar2 wrote:
  Michael S. Tsirkin m...@redhat.com 02/02/2011 03:11 AM
 
  On Tue, Feb 01, 2011 at 01:28:45PM -0800, Shirley Ma wrote:
   On Tue, 2011-02-01 at 23:21 +0200, Michael S. Tsirkin wrote:
Confused. We compare capacity to skb frags, no?
That's sg I think ...
  
   Current guest kernel use indirect buffers, num_free returns how 
   many
   available descriptors not skb frags. So it's wrong here.
  
   Shirley
 
  I see. Good point. In other words when we complete the buffer
  it was indirect, but when we add a new one we
  can not allocate indirect so we consume.
  And then we start the queue and add will fail.
  I guess we need some kind of API to figure out
  whether the buf we complete was indirect?
  
   I've finally read this thread... I think we need to get more serious
   with our stats gathering to diagnose these kind of performance issues.
  
   This is a start; it should tell us what is actually happening to the
   virtio ring(s) without significant performance impact...
  
   Subject: virtio: CONFIG_VIRTIO_STATS
  
   For performance problems we'd like to know exactly what the ring looks
   like.  This patch adds stats indexed by how-full-ring-is; we could extend
   it to also record them by how-used-ring-is if we need.
  
   Signed-off-by: Rusty Russell ru...@rustcorp.com.au
 
  Not sure whether the intent is to merge this. If yes -
  would it make sense to use tracing for this instead?
  That's what kvm does.

 Intent wasn't; I've not used tracepoints before, but maybe we should
 consider a longer-term monitoring solution?

 Patch welcome!

 Cheers,
 Rusty.

 Sure, I'll look into this.

There are several virtio trace events already in QEMU today (see the
trace-events file):
virtqueue_fill(void *vq, const void *elem, unsigned int len, unsigned
int idx) vq %p elem %p len %u idx %u
virtqueue_flush(void *vq, unsigned int count) vq %p count %u
virtqueue_pop(void *vq, void *elem, unsigned int in_num, unsigned int
out_num) vq %p elem %p in_num %u out_num %u
virtio_queue_notify(void *vdev, int n, void *vq) vdev %p n %d vq %p
virtio_irq(void *vq) vq %p
virtio_notify(void *vdev, void *vq) vdev %p vq %p

These can be used by building QEMU with a suitable tracing backend
like SystemTap (see docs/tracing.txt).
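
For example, with the SystemTap backend something like the following
works on the host (a sketch only - the exact probe naming depends on
how QEMU was built and installed):

stap -e 'probe qemu.system.x86_64.virtqueue_pop {
    printf("vq=%p in_num=%d out_num=%d\n", vq, in_num, out_num)
}'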

Inside the guest I've used dynamic ftrace in the past, although static
tracepoints would be nice.

Stefan


Re: Slow disk IO on virtio kvm guests with Centos 5.5 as hypervisor

2011-02-15 Thread Stefan Hajnoczi
On Mon, Feb 14, 2011 at 6:15 PM, Thomas Broda tho...@bassfimass.de wrote:
 dd'ing /dev/zero to a testfile gives me a throughput of about 400MB/s when
 done directly on the hypervisor. If I try this from within a virtual guest,
 it's only 19MB/s to 24MB/s if the guest is on the LVM volume (raw device,
 not qcow2 or something, no filesystem on top of the LVM).

Did you run dd with O_DIRECT?

dd if=/dev/zero of=path-to-device oflag=direct bs=64k

In order to exercise the disk and eliminate page cache effects you
need to do this.

Also, you are using oldish KVM packages.  You could try a modern
kernel and KVM userspace.

Stefan


Re: Slow disk IO on virtio kvm guests with Centos 5.5 as hypervisor

2011-02-15 Thread Stefan Hajnoczi
On Tue, Feb 15, 2011 at 10:15 AM, Thomas Broda tho...@bassfimass.de wrote:
 On Tue, 15 Feb 2011 09:19:23 +, Stefan Hajnoczi
 stefa...@gmail.com wrote:

 Did you run dd with O_DIRECT?

 dd if=/dev/zero of=path-to-device oflag=direct bs=64k

 Using O_DIRECT, performance went down to 11 MB/s on the hypervisor...

Hmm...can you restate that as:

host  X MB/s
guest Y MB/s

I don't understand from your answer which values you have found.

Stefan


Re: [Qemu-devel] KVM call agenda for Feb 15

2011-02-15 Thread Stefan Hajnoczi
On Mon, Feb 14, 2011 at 10:18 PM, Anthony Liguori anth...@codemonkey.ws wrote:
 On 02/14/2011 11:56 AM, Chris Wright wrote:

 Please send in any agenda items you are interested in covering.


 -rc2 is tagged and waiting for announcement.  Please take a look at -rc2 and
 make sure there is nothing critical missing.  Will tag 0.14.0 very late
 tomorrow but unless there's something critical, it'll be 0.14.0-rc2 with an
 updated version.

Most of my -rc2 testing is done now and has passed.

http://wiki.qemu.org/Planning/0.14/Testing

Stefan


Re: Slow disk IO on virtio kvm guests with Centos 5.5 as hypervisor

2011-02-16 Thread Stefan Hajnoczi
On Wed, Feb 16, 2011 at 12:50 PM, Thomas Broda tho...@bassfimass.de wrote:
 On Tue, 15 Feb 2011 15:50:00 +, Stefan Hajnoczi
 stefa...@gmail.com wrote:

 On Tue, Feb 15, 2011 at 10:15 AM, Thomas Broda tho...@bassfimass.de
 wrote:
 Using O_DIRECT, performance went down to 11 MB/s on the hypervisor...


 Hmm...can you restate that as:

 host  X MB/s
 guest Y MB/s

 Trying dd with oflag=direct an of=/dev/vg0/lvtest (directly on the
 KVM hypervisor) yielded a result of 11MB/s.

 If I try this on the guest with /dev/vda1 as output device, results are
 between 1.9MB/s and 7.7MB/s, usually around 3.5MB/s.

 To sum it up:

 Host: 11 MB/s
 Guest: 3.5 MB/s

 I've checked the RAID controller in the meantime. It's a HP Smart Array
 P400. Write Caching is switched off since the contoller has no BBU
 (yet).

 Could it be related to this?

The disabled write cache will result in slow writes so your host
benchmark result is low on an absolute scale.  However, the relative
Guest/Host performance is very poor here (3.5/11 = 31%).

A number of performance improvements have been made to KVM and Centos
5.5 does not contain them because it is too old.  If you want to see a
more current reflection of KVM performance, you could try Fedora 14
host and guest.  The components that matter are: host kernel, qemu-kvm
userspace, and guest kernel.

Stefan


Re: RFH: Windos 7 64 + VirtIO stalls during installation / crashed with qcow2

2011-02-17 Thread Stefan Hajnoczi
On Thu, Feb 17, 2011 at 10:44 AM, Philipp Hahn h...@univention.de wrote:
 Hello,

 I tried to install Windows 7 Professional 64 Bit with VirtIO 1.16 on an Debian
 based system using AMD64 CPUs. During the install, the system froze (progress
 bar didn't advance) and kvm was slowly eating CPU cycles on the host.

 $ dpkg-query -W libvirt0 qemu-kvm linux-image-`uname -r`
 libvirt0        0.8.7-1.48.201102031226
 linux-image-2.6.32-ucs37-amd64  2.6.32-30.37.201102031101
 qemu-kvm        0.12.4+dfsg-1~bpo50+1.3.201010011432

 It was started using virsh, which generated the following command line:
 /usr/bin/kvm.bin -S \
  -M pc-0.12 \
  -enable-kvm \
  -m 768 \
  -smp 1,sockets=1,cores=1,threads=1 \
  -name 7-Professional_amd64 \
  -uuid 89c82cf9-0797-3da4-62f4-8767e4f59b7e \
  -nodefaults \
  -chardev
 socket,id=monitor,path=/var/lib/libvirt/qemu/7-Professional_amd64.monitor,server,nowait
 \
  -mon chardev=monitor,mode=readline \
  -rtc base=utc \
  -boot dc \
  -drive
 file=/var/lib/libvirt/images/7-Professional_amd64.qcow2,if=none,id=drive-virtio-disk0,boot=on,format=qcow2
 -device
 virtio-blk-pci,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0 \
  -drive
 file=/mnt/omar/vmwares/kvm/iso/windows/win_7_pro_64bit.iso,if=none,media=cdrom,id=drive-ide0-0-1,readonly=on,format=raw
 -device ide-drive,bus=ide.0,unit=1,drive=drive-ide0-0-1,id=ide0-0-1 \
  -drive
 file=/mnt/omar/vmwares/kvm/iso/others/virtio-win-1.1.16.iso,if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw
 -device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 \
  -device
 virtio-net-pci,vlan=0,id=net0,mac=52:54:00:f7:da:b5,bus=pci.0,addr=0x3
 \
  -net tap,fd=20,vlan=0,name=hostnet0 \
  -usb \
  -device usb-tablet,id=input0 \
  -vnc 0.0.0.0:0 \
  -k de \
  -vga cirrus \
  -incoming exec:cat \
  -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 \
  -no-kvm-irqchip

 The -no-kvm-irqchip option was added because we experienced shutdown/resume
 problems with other machines, which either received no interrupts anymore or
 were caught in their interrupt service routine, never being able to
 acknowledge the interrupts. Adding that option solved that problem, but might
 be causing other problems now.

 Using gdb I was able to track down Windows hanging in the following routine,
 which looks like some spin-lock / semaphore acquire() implementation:
 (gdb) x/20i 0xf8000c485a80
 0xf8000c485a80:     mov    %rbx,0x8(%rsp)
 0xf8000c485a85:     push   %rdi
 0xf8000c485a86:     sub    $0x20,%rsp
 0xf8000c485a8a:     mov    %rcx,%rdi
 0xf8000c485a8d:     xor    %ebx,%ebx
 0xf8000c485a8f:     nop
 0xf8000c485a90:     inc    %ebx
 0xf8000c485a92:     test   %ebx,0x274834(%rip)        # 0xf8000c6fa2cc
 0xf8000c485a98:     je     0xf8000c48adad
 0xf8000c485a9e:     pause
 0xf8000c485aa0:     mov    (%rdi),%rcx
 0xf8000c485aa3:     test   %rcx,%rcx
 0xf8000c485aa6:     jne    0xf8000c485a90
 0xf8000c485aa8:     lock btsq $0x0,(%rdi)
 0xf8000c485aae:     jb     0xf8000c485a90
 0xf8000c485ab0:     mov    %ebx,%eax
 0xf8000c485ab2:     mov    0x30(%rsp),%rbx
 0xf8000c485ab7:     add    $0x20,%rsp
 0xf8000c485abb:     pop    %rdi
 0xf8000c485abc:     retq
 (gdb) x/w 0xf8000c6fa2cc
 0xf8000c6fa2cc:     0x
 (gdb) x/w $rdi
 0xfa800131f600:     0x0001

 Did someone experience similar problems or does somebody know if there was a
 fix for such a problem in newer kvm- or Linux-kernel versions?

 We also encountered problems with some Windows versions when using VirtIO with
 qcow2 images that use backing files for copy-on-write: they just crashed with
 a blue screen. Changing from the CoW qcow2 to the master qcow2 file fixed the
 problem, but this isn't satisfactory, since we would like to use the CoW
 functionality. Not using VirtIO also fixed the problem, but has performance
 penalties.

Vadim: Any suggestions for extracting more relevant information in these cases?

One option might be to set up the Windows debugger in order to
closely monitor what the guest is doing when it hangs or BSODs:
http://etherboot.org/wiki/sanboot/winnt_iscsi_debug

Stefan


Re: RFH: Windos 7 64 + VirtIO stalls during installation / crashed with qcow2

2011-02-17 Thread Stefan Hajnoczi
On Thu, Feb 17, 2011 at 12:45 PM, Vadim Rozenfeld vroze...@redhat.com wrote:
 On Thu, 2011-02-17 at 13:41 +0200, Gleb Natapov wrote:
 On Thu, Feb 17, 2011 at 11:30:25AM +, Stefan Hajnoczi wrote:
  On Thu, Feb 17, 2011 at 10:44 AM, Philipp Hahn h...@univention.de wrote:
   Hello,
  
   I tried to install Windows 7 Professional 64 Bit with VirtIO 1.16 on an 
   Debian
   based system using AMD64 CPUs. During the install, the system froze 
   (progress
   bar didn't advance) and kvm was slowly eating CPU cycles on the host.
  
   $ dpkg-query -W libvirt0 qemu-kvm linux-image-`uname -r`
   libvirt0        0.8.7-1.48.201102031226
   linux-image-2.6.32-ucs37-amd64  2.6.32-30.37.201102031101
   qemu-kvm        0.12.4+dfsg-1~bpo50+1.3.201010011432
  
   It was started using virsh, which generated the following command line:
   /usr/bin/kvm.bin -S \
    -M pc-0.12 \
    -enable-kvm \
    -m 768 \
    -smp 1,sockets=1,cores=1,threads=1 \
    -name 7-Professional_amd64 \
    -uuid 89c82cf9-0797-3da4-62f4-8767e4f59b7e \
    -nodefaults \
    -chardev
   socket,id=monitor,path=/var/lib/libvirt/qemu/7-Professional_amd64.monitor,server,nowait
   \
    -mon chardev=monitor,mode=readline \
    -rtc base=utc \
    -boot dc \
    -drive
   file=/var/lib/libvirt/images/7-Professional_amd64.qcow2,if=none,id=drive-virtio-disk0,boot=on,format=qcow2
   -device
   virtio-blk-pci,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0
\
    -drive
   file=/mnt/omar/vmwares/kvm/iso/windows/win_7_pro_64bit.iso,if=none,media=cdrom,id=drive-ide0-0-1,readonly=on,format=raw
   -device ide-drive,bus=ide.0,unit=1,drive=drive-ide0-0-1,id=ide0-0-1 \
    -drive
   file=/mnt/omar/vmwares/kvm/iso/others/virtio-win-1.1.16.iso,if=none,media=cdrom,id=drive-ide0-1-0,readonly=on,format=raw
   -device ide-drive,bus=ide.1,unit=0,drive=drive-ide0-1-0,id=ide0-1-0 \
    -device
   virtio-net-pci,vlan=0,id=net0,mac=52:54:00:f7:da:b5,bus=pci.0,addr=0x3
   \
    -net tap,fd=20,vlan=0,name=hostnet0 \
    -usb \
    -device usb-tablet,id=input0 \
    -vnc 0.0.0.0:0 \
    -k de \
    -vga cirrus \
    -incoming exec:cat \
    -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4 \
    -no-kvm-irqchip
  
   The -no-kvm-irqchip-Option was added, because we experienced 
   shutdown/resume
   problems with other machines, which either received no interrupts 
   anymore or
   where caught in their interrupt service routine, never being able to
   acknowledge the interrupts. Adding that option solved that problem, but 
   might
   be causing other problems now.
  
   Using gdb I was able to track down Windows hanging in the following 
   routine,
   which look like some spin-lock / semaphore aquire() implementation:
   (gdb) x/20i 0xf8000c485a80
   0xf8000c485a80:     mov    %rbx,0x8(%rsp)
   0xf8000c485a85:     push   %rdi
   0xf8000c485a86:     sub    $0x20,%rsp
   0xf8000c485a8a:     mov    %rcx,%rdi
   0xf8000c485a8d:     xor    %ebx,%ebx
   0xf8000c485a8f:     nop
   0xf8000c485a90:     inc    %ebx
   0xf8000c485a92:     test   %ebx,0x274834(%rip)        # 
   0xf8000c6fa2cc
   0xf8000c485a98:     je     0xf8000c48adad
   0xf8000c485a9e:     pause
   0xf8000c485aa0:     mov    (%rdi),%rcx
   0xf8000c485aa3:     test   %rcx,%rcx
   0xf8000c485aa6:     jne    0xf8000c485a90
   0xf8000c485aa8:     lock btsq $0x0,(%rdi)
   0xf8000c485aae:     jb     0xf8000c485a90
   0xf8000c485ab0:     mov    %ebx,%eax
   0xf8000c485ab2:     mov    0x30(%rsp),%rbx
   0xf8000c485ab7:     add    $0x20,%rsp
   0xf8000c485abb:     pop    %rdi
   0xf8000c485abc:     retq
   (gdb) x/w 0xf8000c6fa2cc
   0xf8000c6fa2cc:     0x
   (gdb) x/w $rdi
   0xfa800131f600:     0x0001
  
   Did someone experience similar problems or does somebody know if there 
   was a
   fix for such a problem in newer kvm- or Linux-kernel versions?
  
   We also encountered problems with some Windows Versions when using 
   VirtIO with
   Qcow2 images, which were using backing-files for copy-on-write: they just
   crashed with a blue-screen. Just changing from the CoW-qcow2 to the
   master-qcow2 file fixed the problem, but this isn't satisfactory, 
   since we
   would like to use the CoW-functionality. Not using VirtIO also fixed the
   problem, but has performance penalties.
 
  Vadim: Any suggestions for extracting more relevant information in these 
  cases?
 Debugging installation-phase problems on 64-bit platforms is a very
 complicated thing. If the problem is reproducible on x86 platforms, you
 can try printing messages (RhelDbgPrint function) to localize the
 problem. You will need to adjust RhelDbgLevel in virtio_stor.c and build
 a checked (debug) version of the driver.

Is that even possible - I thought these drivers need to be signed on
recent versions of Windows?

Stefan

Re: [PATCH v3 uq/master 05/22] add win32 qemu-thread implementation

2011-02-28 Thread Stefan Hajnoczi
On Mon, Feb 28, 2011 at 9:10 AM, Paolo Bonzini pbonz...@redhat.com wrote:
 +static unsigned __stdcall win32_start_routine(void *arg)
 +{
 +    struct QemuThreadData data = *(struct QemuThreadData *) arg;
 +    QemuThread *thread = data.thread;
 +
 +    free(arg);

qemu_free(arg);

Stefan


Re: problem about blocked monitor when disk image on NFS can not be reached.

2011-03-01 Thread Stefan Hajnoczi
On Tue, Mar 1, 2011 at 5:01 AM, ya su suya94...@gmail.com wrote:
   kvm starts with its disk image on an nfs server; when the nfs server
 cannot be reached, the monitor is blocked. I changed io_thread to the
 SCHED_RR policy, but it still runs haltingly, waiting for disk
 read/write timeouts.

There are some synchronous disk image reads that can put qemu-kvm to
sleep until NFS responds or errors.  For example, when starting
hw/virtio-blk.c calls bdrv_guess_geometry() which may invoke
bdrv_read().

Once the VM is running and you're using virtio-blk then disk I/O
should be asynchronous.  There are some synchronous cases to do with
migration, snapshotting, etc where we wait for outstanding aio
requests.  Again this can block qemu-kvm.

So in short, there's no easy way to avoid blocking the VM in all cases
today.  You should find, however, that normal read/write operation to
a running VM does not cause qemu-kvm to sleep.

Stefan


Re: Poor disk IO performance from Windows 2003 guest

2011-03-01 Thread Stefan Hajnoczi
On Tue, Mar 1, 2011 at 10:23 AM, Kevin Clark kevin.cl...@csoft.co.uk wrote:
 Any thoughts/ideas?

There are a lot of variables here.  Are you using virtio-blk devices
and Windows guest drivers?  Are you using hardware RAID5 on the NFS
server?  Could it be a network issue (contention during benchmark
runs)?

I'd start by benchmarking NFS on the host without running a virtual
machine.  Make sure you're getting acceptable performance and
repeatable results there first.
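
For example, with the same dd approach used elsewhere in this thread
(the path is a placeholder for a file on the NFS mount):

dd if=/dev/zero of=/mnt/nfs/testfile oflag=direct bs=64k count=16384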

Stefan


Re: problem about blocked monitor when disk image on NFS can not be reached.

2011-03-01 Thread Stefan Hajnoczi
On Tue, Mar 1, 2011 at 12:39 PM, ya su suya94...@gmail.com wrote:
    how about moving kvm_handle_io/handle_mmio from the kvm_run function
 into kvm_main_loop? These are I/O operations, so moving them would
 remove the qemu_mutex contention between the 2 threads. Is this a
 reasonable thought?

    An easier way to keep the monitor responding to the user quickly in
 this situation is to take monitor I/O out of qemu_mutex protection.
 This includes the VNC/serial/telnet I/O related to the monitor; since
 this I/O does not affect the running of the VM itself, it does not
 need such strict protection.

The qemu_mutex protects all QEMU global state.  The monitor does some
I/O and parsing which is not necessarily global state but once it
begins actually performing the command you sent, access to global
state will be required (pretty much any monitor command will operate
on global state).

I think there are two options for handling NFS hangs:
1. Ensure that QEMU is never put to sleep by NFS for disk images.  The
guest continues executing, may time out and notice that storage is
unavailable.
2. Pause the VM but keep the monitor running if a timeout error
occurs.  Not sure if there is a timeout from NFS that we can detect.

For I/O errors (e.g. running out of disk space on the host) there is a
configurable policy.  You can choose whether to return an error to the
guest or to pause the VM.  I think we should treat NFS hangs as an
extension to this and as a block layer problem rather than an io
thread problem.
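
For reference, the policy is set per drive on the command line, e.g.
(illustrative; werror=stop pauses the VM on a write error instead of
reporting it to the guest):

-drive file=disk.img,if=virtio,cache=none,werror=stop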

Can you get backtraces when KVM hangs (gdb command: thread apply all
bt)?  It would be interesting to see some of the blocking cases that
you are hitting.

Stefan


Re: problem about blocked monitor when disk image on NFS can not be reached.

2011-03-02 Thread Stefan Hajnoczi
On Wed, Mar 2, 2011 at 10:39 AM, ya su suya94...@gmail.com wrote:
 io_thread bt is as follows:
 #0  0x7f3086eaa034 in __lll_lock_wait () from /lib64/libpthread.so.0
 #1  0x7f3086ea5345 in _L_lock_870 () from /lib64/libpthread.so.0
 #2  0x7f3086ea5217 in pthread_mutex_lock () from /lib64/libpthread.so.0
 #3  0x00436018 in kvm_mutex_lock () at
 /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:1730
 #4  qemu_mutex_lock_iothread () at
 /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:1744
 #5  0x0041ca67 in main_loop_wait (nonblocking=value optimized out)
    at /root/rpmbuild/BUILD/qemu-kvm-0.14/vl.c:1377
 #6  0x004363e7 in kvm_main_loop () at
 /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:1589
 #7  0x0041dc3a in main_loop (argc=value optimized out,
 argv=value optimized out,
    envp=value optimized out) at /root/rpmbuild/BUILD/qemu-kvm-0.14/vl.c:1429
 #8  main (argc=value optimized out, argv=value optimized out,
 envp=value optimized out)
    at /root/rpmbuild/BUILD/qemu-kvm-0.14/vl.c:3201

 cpu thread bt is as follows:
 #0  0x7f3084dff093 in select () from /lib64/libc.so.6
 #1  0x004453ea in qemu_aio_wait () at aio.c:193
 #2  0x00444175 in bdrv_write_em (bs=0x1ec3090, sector_num=2009871,
    buf=0x7f3087532800
 F\b\200u\022\366F$\004u\fPV\350\226\367\377\377\003Ft\353\fPV\350\212\367\377\377\353\003\213Ft^]\302\b,
 nb_sectors=16) at block.c:2577
 #3  0x0059ca13 in ide_sector_write (s=0x215f508) at
 /root/rpmbuild/BUILD/qemu-kvm-0.14/hw/ide/core.c:574
 #4  0x00438ced in kvm_handle_io (env=0x202ef60) at
 /root/rpmbuild/BUILD/qemu-kvm-0.14/kvm-all.c:821
 #5  kvm_run (env=0x202ef60) at 
 /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:617
 #6  0x00438e09 in kvm_cpu_exec (env=value optimized out)
    at /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:1233
 #7  0x0043a0f7 in kvm_main_loop_cpu (_env=0x202ef60)
    at /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:1419
 #8  ap_main_loop (_env=0x202ef60) at
 /root/rpmbuild/BUILD/qemu-kvm-0.14/qemu-kvm.c:1466
 #9  0x7f3086ea37e1 in start_thread () from /lib64/libpthread.so.0
 #10 0x7f3084e0653d in clone () from /lib64/libc.so.6

 aio_thread bt is as follows:
 #0  0x7f3086eaae83 in pwrite64 () from /lib64/libpthread.so.0
 #1  0x00447501 in handle_aiocb_rw_linear (aiocb=0x21cff10,
    buf=0x7f3087532800
 F\b\200u\022\366F$\004u\fPV\350\226\367\377\377\003Ft\353\fPV\350\212\367\377\377\353\003\213Ft^]\302\b)
 at posix-aio-compat.c:212
 #2  0x00447d48 in handle_aiocb_rw (unused=value optimized
 out) at posix-aio-compat.c:247
 #3  aio_thread (unused=value optimized out) at posix-aio-compat.c:341
 #4  0x7f3086ea37e1 in start_thread () from /lib64/libpthread.so.0
 #5  0x7f3084e0653d in clone () from /lib64/libc.so.6

 I think io_thread is blocked by the cpu thread, which takes the
 qemu_mutex first; the cpu thread is waiting for aio_thread's result in
 the qemu_aio_wait function, and aio_thread spends a long time in
 pwrite64, about 5-10s, before returning an error (it seems like a
 non-blocking timeout call). Only after that does io_thread get a chance
 to receive monitor input, so the monitor appears blocked frequently. In
 this situation, if I stop the vm, the monitor responds faster.

 The problem is caused by the unavailability of the block layer: the
 block layer processes the I/O error in the normal way and reports the
 error to the IDE device, where it is handled in ide_sector_write. The
 root cause is that monitor input and the I/O operation (the pwrite
 function) must execute serialized (under the qemu_mutex semaphore), so
 a long blocking pwrite holds up monitor input.

 As Stefan says, it seems difficult to take monitor input out of that
 protection; for now I will stop the vm if the disk image cannot be
 reached.

If you switch to -drive if=virtio instead of IDE then the problem
should be greatly reduced.  Virtio-blk uses aio instead of synchronous
calls, which means that the vcpu thread does not run qemu_aio_wait().

Kevin and I have been looking into the limitations imposed by
synchronous calls.  Today there is unfortunately synchronous code in
QEMU and we can hit these NFS hang situations.  qemu_aio_wait() runs a
nested event loop that does a subset of what the full event loop does.
 This is why the monitor does not respond.

If all code was asynchronous then only a top-level event loop would be
necessary and the monitor would continue to function.

In the immediate term I suggest using virtio-blk instead of IDE.
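
On the command line that is, for example (image path is a placeholder):

-drive file=disk.img,if=virtio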

Stefan


Re: Poor disk IO performance from Windows 2003 guest

2011-03-02 Thread Stefan Hajnoczi
On Wed, Mar 2, 2011 at 10:30 AM, Kevin Clark kevin.cl...@csoft.co.uk wrote:
 The results are much better, with 64MB writes on the system drive coming in 
 at 39MB/s and reads 310MB/s.  The second drive gives me 94MB/s for writes and 
 777MB/s for reads for a 64MB file.  Again, that's wildly different results for 
 two storage devices in the same guest, and it needs further investigation, 
 but now the system is usable and I need to move on.

Good to hear that you're seeing acceptable performance now.

Stefan


Re: Degraded performance with Windows 2008 R2 with applications

2011-03-07 Thread Stefan Hajnoczi
On Sun, Mar 6, 2011 at 10:25 PM, Mathias Klette mkle...@gmail.com wrote:
 I've tested with iozone to compare IO with a linux guest and also to
 verify changes made to improve the situation - but nothing really helped.

 TESTS with iozone -s 4G -r 256k -c -e:

Please use the -I option to bypass the page cache, otherwise buffered
I/O will be used and requests may be satisfied from memory rather than
actually accessing the disk.
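
In other words, adding -I to your existing invocation:

iozone -s 4G -r 256k -c -e -I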

What is the qemu-kvm command-line (ps aux | grep kvm)?

Are you using virtio-blk and the Windows guest drivers from here:

http://www.linux-kvm.org/page/WindowsGuestDrivers/Download_Drivers

Stefan


Re: [Qemu-devel] KVM call minutes for Mar 8

2011-03-08 Thread Stefan Hajnoczi
On Tue, Mar 8, 2011 at 4:00 PM, Anthony Liguori anth...@codemonkey.ws wrote:
 http://wiki.qemu.org/Features/QAPI/VirtAgent

That page does not exist.  I think you meant this one:
http://wiki.qemu.org/Features/QAPI/GuestAgent

Stefan


Re: [PATCH] vnc: threaded server depends on io-thread

2011-03-09 Thread Stefan Hajnoczi
On Wed, Mar 9, 2011 at 10:57 AM, Corentin Chary
corentin.ch...@gmail.com wrote:
 The threaded VNC servers messed up with QEMU fd handlers without
 any kind of locking, and that can cause some nasty race conditions.

 The IO-Thread provides appropriate locking primitives to avoid that.
 This patch makes CONFIG_VNC_THREAD depends on CONFIG_IO_THREAD,
 and add lock and unlock calls around the two faulty calls.

 qemu-kvm currently doesn't compile with --enable-io-thread. is there an easy 
 fix
 for this?

 If IO Thread is not available, I'm afraid that --disable-vnc-thread is
 the only fix.
 Or, you can try to define some global mutex acting like iothread
 locks, but that doesn't sound like an easy fix.

Jan or Marcelo can help here but qemu-kvm has an iothread equivalent
built in by default.  It should be possible to use that.

Stefan


Re: default elevator=noop for virtio block devices?

2011-03-09 Thread Stefan Hajnoczi
On Wed, Mar 9, 2011 at 10:01 AM, Avi Kivity a...@redhat.com wrote:
 On 03/09/2011 11:42 AM, Harald Dunkel wrote:

 Hi folks,

 would it make sense to make elevator=noop the default
 for virtio block devices? Or would you recommend to
 set this on the kvm server instead?


 I think leaving the defaults is best.  The elevator on the guest serves to
 schedule I/O among processes in the guest, and the elevator on the host
 partitions I/O among the guests.

It depends on the workload.  Khoa has seen cases where CFQ does not
scale with multi-threaded workloads and deadline is preferred.  But
it's not one-size-fits-all, it depends on your workload and requires
benchmarking.
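
For example, the scheduler can be switched per device at runtime inside
the guest (vdb is a placeholder for your virtio disk), which makes it
easy to benchmark the alternatives:

echo deadline > /sys/block/vdb/queue/scheduler
cat /sys/block/vdb/queue/scheduler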

Stefan


Re: [PATCH] kvm: ppc: Fix breakage of kvm_arch_pre_run/process_irqchip_events

2011-03-10 Thread Stefan Hajnoczi
On Fri, Mar 11, 2011 at 5:55 AM, Alexander Graf ag...@suse.de wrote:

 On 17.02.2011, at 22:01, Jan Kiszka wrote:

 On 2011-02-07 12:19, Jan Kiszka wrote:
 We do not check them, and the only arch with non-empty implementations
 always returns 0 (this is also true for qemu-kvm).

 Signed-off-by: Jan Kiszka jan.kis...@siemens.com
 CC: Alexander Graf ag...@suse.de
 ---
 kvm.h              |    5 ++---
 target-i386/kvm.c  |    8 ++--
 target-ppc/kvm.c   |    6 ++
 target-s390x/kvm.c |    6 ++
 4 files changed, 8 insertions(+), 17 deletions(-)


 ...

 diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
 index 93ecc57..bd4012a 100644
 --- a/target-ppc/kvm.c
 +++ b/target-ppc/kvm.c
 @@ -256,14 +256,12 @@ int kvm_arch_pre_run(CPUState *env, struct kvm_run 
 *run)
     return 0;
 }

 -int kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 +void kvm_arch_post_run(CPUState *env, struct kvm_run *run)
 {
 -    return 0;
 }

 -int kvm_arch_process_irqchip_events(CPUState *env)
 +void kvm_arch_process_irqchip_events(CPUState *env)
 {
 -    return 0;
 }

 Oops. Do we already have a build-bot for KVM-enabled PPC (and s390)
 targets somewhere?

 Just before leaving for vacation I prepared a machine for each and gave 
 stefan access to them. Looks like they're not officially running though - 
 will try to look at this asap.

They are in the process of being added to the buildbot by Daniel
Gollub.  However, the ppc box is unable to build qemu.git because it
hits ENOMEM while compiling.  I doubled swap size but that didn't fix
the issue so I need to investigate more.  At least s390 should be good
to go soon and I will send an update when it is up and running.

Stefan


Re: Virtual SCSI disks hangs on heavy IO

2011-03-14 Thread Stefan Hajnoczi
On Mon, Mar 14, 2011 at 6:05 PM, Guido Winkelmann
guido-k...@thisisnotatest.de wrote:
 Does anybody have an idea what might cause this or what might be done about 
 it?

The lsi_scsi emulation code is incomplete.  It does not handle some
situations like the ORDERED commands or message 0x0c.

There is a patch to address the message 0xc issue, it has not been
applied to qemu.git or qemu-kvm.git yet:
http://patchwork.ozlabs.org/patch/63926/

Basically there is no one actively maintaining or reviewing patches
for the lsi53c895a SCSI controller.

virtio-blk works very well with Linux guests.  Is there a reason you
need to use SCSI emulation instead of virtio-blk?

Stefan


Re: Virtual SCSI disks hangs on heavy IO

2011-03-15 Thread Stefan Hajnoczi
On Mon, Mar 14, 2011 at 10:57 PM, Guido Winkelmann
guido-k...@thisisnotatest.de wrote:
 On Monday 14 March 2011 20:32:23 Stefan Hajnoczi wrote:
 On Mon, Mar 14, 2011 at 6:05 PM, Guido Winkelmann

 guido-k...@thisisnotatest.de wrote:
  Does anybody have an idea what might cause this or what might be done
  about it?

 The lsi_scsi emulation code is incomplete.  It does not handle some
 situations like the ORDERED commands or message 0x0c.

 There is a patch to address the message 0xc issue, it has not been
 applied to qemu.git or qemu-kvm.git yet:
 http://patchwork.ozlabs.org/patch/63926/

 Basically there is no one actively maintaining or reviewing patches
 for the lsi53c895a SCSI controller.

 Does that mean that using the SCSI transport for virtual disks is officially
 unsupported or deprecated or that it should be?

The LSI SCSI emulation in particular has not seen much attention.  As
for the wider SCSI emulation there has been work over the past few
months so it's alive and being used.

 Are things better with the IDE driver?

IDE is commonly used for compatibility with guests that do not have
virtio-blk drivers.  It should work fine although performance is poor
due to the IDE interface.

 virtio-blk works very will with Linux guests.  Is there a reason you
 need to use SCSI emulation instead of virtio-blk?

 I can probably use virtio-blk most of the time, I was just hoping to be able
 to virtualize a wider array of operating systems, like the *BSDs,
 (Open)Solaris, Windows, or even just some linux distributions whose installers
 don't anticipate KVM and thus don't support virtio-anything.

Windows virtio-blk drivers are available and should be used:
http://www.linux-kvm.org/page/WindowsGuestDrivers/Download_Drivers

BSD and Solaris don't ship with virtio-blk AFAIK.

Stefan


Re: Virtual SCSI disks hangs on heavy IO

2011-03-15 Thread Stefan Hajnoczi
On Tue, Mar 15, 2011 at 7:47 AM, Alexander Graf ag...@suse.de wrote:

 On 15.03.2011, at 08:09, Stefan Hajnoczi wrote:

 On Mon, Mar 14, 2011 at 10:57 PM, Guido Winkelmann
 guido-k...@thisisnotatest.de wrote:
 On Monday 14 March 2011 20:32:23 Stefan Hajnoczi wrote:
 On Mon, Mar 14, 2011 at 6:05 PM, Guido Winkelmann

 guido-k...@thisisnotatest.de wrote:
 Does anybody have an idea what might cause this or what might be done
 about it?

 The lsi_scsi emulation code is incomplete.  It does not handle some
 situations like the ORDERED commands or message 0x0c.

 There is a patch to address the message 0xc issue, it has not been
 applied to qemu.git or qemu-kvm.git yet:
 http://patchwork.ozlabs.org/patch/63926/

 Basically there is no one actively maintaining or reviewing patches
 for the lsi53c895a SCSI controller.

 Does that mean that using the SCSI transport for virtual disks is officially
 unsupported or deprecated or that it should be?

 The LSI SCSI emulation in particular has not seen much attention.  As
 for the wider SCSI emulation there has been work over the past few
 months so it's alive and being used.

 Are things better with the IDE driver?

 IDE is commonly used for compatibility with guests that do not have
 virtio-blk drivers.  It should work fine although performance is poor
 due to the IDE interface.

 virtio-blk works very will with Linux guests.  Is there a reason you
 need to use SCSI emulation instead of virtio-blk?

 I can probably use virtio-blk most of the time, I was just hoping to be able
 to virtualize a wider array of operating systems, like the *BSDs,
 (Open)Solaris, Windows, or even just some linux distributions whose 
 installers
 don't anticipate KVM and thus don't support virtio-anything.

 Windows virtio-blk drivers are available and should be used:
 http://www.linux-kvm.org/page/WindowsGuestDrivers/Download_Drivers

 BSD and Solaris don't ship with virtio-blk AFAIK.

 This is pretty much the gap that AHCI is trying to fill. It's a 
 well-supported HBA that pretty much every OS supports, but is still simple 
 enough to implement. Unfortunately, 0.14 ships without BIOS support for it, 
 so you can't boot off an AHCI disk yet. But as of 0.15, AHCI is pretty much 
 the adapter of choice for your use case.

 Please keep in mind that I didn't get FreeBSD rolling with AHCI emulation 
 yet. OpenBSD works just fine.

I think one missing AHCI feature was legacy PATA mode?  Perhaps that
is a good GSoC project if you're willing to mentor it, Alex.  I'm
thinking that with complete AHCI and legacy mode it would be a good
choice as the default non-virtio-blk disk interface.

Stefan


Re: Virtual SCSI disks hangs on heavy IO

2011-03-15 Thread Stefan Hajnoczi
On Tue, Mar 15, 2011 at 9:16 AM, Alexander Graf ag...@suse.de wrote:

 On 15.03.2011, at 10:03, Stefan Hajnoczi wrote:

 On Tue, Mar 15, 2011 at 7:47 AM, Alexander Graf ag...@suse.de wrote:

 On 15.03.2011, at 08:09, Stefan Hajnoczi wrote:

 On Mon, Mar 14, 2011 at 10:57 PM, Guido Winkelmann
 guido-k...@thisisnotatest.de wrote:
 On Monday 14 March 2011 20:32:23 Stefan Hajnoczi wrote:
 On Mon, Mar 14, 2011 at 6:05 PM, Guido Winkelmann

 guido-k...@thisisnotatest.de wrote:
 Does anybody have an idea what might cause this or what might be done
 about it?

 The lsi_scsi emulation code is incomplete.  It does not handle some
 situations like the ORDERED commands or message 0x0c.

 There is a patch to address the message 0xc issue, it has not been
 applied to qemu.git or qemu-kvm.git yet:
 http://patchwork.ozlabs.org/patch/63926/

 Basically there is no one actively maintaining or reviewing patches
 for the lsi53c895a SCSI controller.

 Does that mean that using the SCSI transport for virtual disks is 
 officially
 unsupported or deprecated or that it should be?

 The LSI SCSI emulation in particular has not seen much attention.  As
 for the wider SCSI emulation there has been work over the past few
 months so it's alive and being used.

 Are things better with the IDE driver?

 IDE is commonly used for compatibility with guests that do not have
 virtio-blk drivers.  It should work fine although performance is poor
 due to the IDE interface.

 virtio-blk works very will with Linux guests.  Is there a reason you
 need to use SCSI emulation instead of virtio-blk?

 I can probably use virtio-blk most of the time, I was just hoping to be 
 able
 to virtualize a wider array of operating systems, like the *BSDs,
 (Open)Solaris, Windows, or even just some linux distributions whose 
 installers
 don't anticipate KVM and thus don't support virtio-anything.

 Windows virtio-blk drivers are available and should be used:
 http://www.linux-kvm.org/page/WindowsGuestDrivers/Download_Drivers

 BSD and Solaris don't ship with virtio-blk AFAIK.

 This is pretty much the gap that AHCI is trying to fill. It's a 
 well-supported HBA that pretty much every OS supports, but is still simple 
 enough to implement. Unfortunately, 0.14 ships without BIOS support for it, 
 so you can't boot off an AHCI disk yet. But as of 0.15, AHCI is pretty much 
 the adapter of choice for your use case.

 Please keep in mind that I didn't get FreeBSD rolling with AHCI emulation 
 yet. OpenBSD works just fine.

 I think one missing AHCI feature was legacy PATA mode?  Perhaps that
 is a good GSoC project if you're willing to mentor it, Alex.  I'm
 thinking that with complete AHCI and legacy mode it would be a good
 choice as the default non-virtio-blk disk interface.

 Or to be more precise: There are two different dimensions

 SATA / PATA
 IDE / AHCI

 The first is the link model - the type of connection the disk/cd-rom is 
 connected to the hba with. The second is the OS interface.

 AHCI can handle SATA and PATA devices in AHCI mode. IIUC both link models 
 also work in IDE (legacy) mode.
 The ICH-HBA can be BIOS configured to either operate in AHCI mode or in 
 legacy mode, but not both at the same time. You can split between channels 
 though. So you can have channels 1,2 operate through legacy while channels 
 3,4 go through AHCI. The same disk still can only be accessed either through 
 IDE _or_ AHCI.

 Since we already have properly working PIIX3 IDE emulation code, I don't see 
 the point in implementing ICH-7 AHCI IDE legacy compatibility mode.

 There are SATA controllers out there that apparently can expose the same disk 
 through the legacy IDE interface and a faster SATA interface. I'm not sure 
 any of those is AHCI compatible - the spec doesn't forbid you to do this.

Okay, I was thinking that having just the AHCI device, guest- (BIOS-?)
configurable to work with legacy guests, is nicer than having to switch
QEMU command-line options on the host.  But then we don't have
non-volatile storage for the BIOS AFAIK, so it currently doesn't make
much difference whether AHCI supports IDE emulation or whether you
explicitly use the piix IDE emulation.

Stefan


Re: KVM lock contention on 48 core AMD machine

2011-03-18 Thread Stefan Hajnoczi
On Fri, Mar 18, 2011 at 12:02 PM, Ben Nagy b...@iagu.net wrote:
 KVM commandline (using libvirt):
 LC_ALL=C PATH=/usr/local/sbin:/usr/local/bin:/usr/bin:/usr/sbin:/sbin:/bin
 QEMU_AUDIO_DRV=none /usr/local/bin/kvm-snapshot -S -M pc-0.14
 -enable-kvm -m 1024 -smp 1,sockets=1,cores=1,threads=1 -name fb-0
 -uuid de59229b-eb06-9ecc-758e-d20bc5ddc291 -nodefconfig -nodefaults
 -chardev 
 socket,id=charmonitor,path=/var/lib/libvirt/qemu/fb-0.monitor,server,nowait
 -mon chardev=charmonitor,id=monitor,mode=readline -rtc base=localtime
 -no-acpi -boot cd -drive
 file=/mnt/big/bigfiles/kvm_disks/eax/fb-0.ovl,if=none,id=drive-ide0-0-0,format=qcow2
 -device ide-drive,bus=ide.0,unit=0,drive=drive-ide0-0-0,id=ide0-0-0
 -drive if=none,media=cdrom,id=drive-ide0-0-1,readonly=on,format=raw
 -device ide-drive,bus=ide.0,unit=1,drive=drive-ide0-0-1,id=ide0-0-1
 -netdev tap,fd=17,id=hostnet0 -device
 virtio-net-pci,netdev=hostnet0,id=net0,mac=52:54:00:d9:09:ef,bus=pci.0,addr=0x3
 -usb -device usb-tablet,id=input0 -vnc 127.0.0.1:0 -k en-us -vga
 cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x4

Please try without "-usb -device usb-tablet,id=input0".  That is known
to cause increased CPU utilization.  I notice that idr_lock is either
Infiniband- or POSIX-timers-related:
drivers/infiniband/core/sa_query.c
kernel/posix-timers.c

-usb sets up a 1000 Hz timer for each VM.
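
Since the guests are managed by libvirt, dropping the tablet means removing
the corresponding input element from the domain XML so that libvirt
regenerates the command line without it (an illustrative snippet of the
element to delete):

    <input type='tablet' bus='usb'/>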

Stefan


Re: Virtual SCSI disks hangs on heavy IO

2011-03-18 Thread Stefan Hajnoczi
On Fri, Mar 18, 2011 at 4:06 PM, Guido Winkelmann
guido-k...@thisisnotatest.de wrote:
 On Wednesday 16 March 2011 Stefan Hajnoczi wrote:
 On Tue, Mar 15, 2011 at 1:20 PM, Guido Winkelmann

 guido-k...@thisisnotatest.de wrote:
  On Tuesday 15 March 2011 you wrote:
  On Mon, Mar 14, 2011 at 10:57 PM, Guido Winkelmann
 
  guido-k...@thisisnotatest.de wrote:
   On Monday 14 March 2011 20:32:23 Stefan Hajnoczi wrote:
   On Mon, Mar 14, 2011 at 6:05 PM, Guido Winkelmann
  
   guido-k...@thisisnotatest.de wrote:
Does anybody have an idea what might cause this or what might be
done about it?
  
   The lsi_scsi emulation code is incomplete.  It does not handle some
   situations like the ORDERED commands or message 0x0c.
  
   There is a patch to address the message 0xc issue, it has not been
   applied to qemu.git or qemu-kvm.git yet:
   http://patchwork.ozlabs.org/patch/63926/
  
   Basically there is no one actively maintaining or reviewing patches
   for the lsi53c895a SCSI controller.
  
   Does that mean that using the SCSI transport for virtual disks is
   officially unsupported or deprecated or that it should be?
 
  The LSI SCSI emulation in particular has not seen much attention.  As
  for the wider SCSI emulation there has been work over the past few
  months so it's alive and being used.
 
  Well, I cannot find any other HBAs than LSI when I run qemu -device ? -
  or at least nothing I would recognize as a SCSI HBA. As far as I can
  see, that pretty much means I cannot use SCSI disks in KVM at all,
  unless I'm prepared to live with the problems described earlier...

 The LSI controller is the only available PCI SCSI HBA.  Are you able
 to try the patch I linked?
 http://patchwork.ozlabs.org/patch/63926/

 I haven't tried the patch yet. At work, it was decided that we are not
 going to use a manually patched version of qemu-kvm unless absolutely
 necessary, and at home, I'm unlikely to ever want to virtualize an OS
 without virtio drivers.

 I can still try the patch on my home machine, if you want me to.

Don't worry about it if you're going virtio-blk already.

Stefan


Re: qemu-kvm crash with

2011-03-25 Thread Stefan Hajnoczi
On Thu, Mar 24, 2011 at 1:38 PM, Conor Murphy
conor_murphy_v...@hotmail.com wrote:
 #4  _int_free (av=<value optimized out>, p=0x7fa24c0009f0, have_lock=0) at
 malloc.c:4795
 #5  0x004a18fe in qemu_vfree (ptr=0x7fa24c000a00) at oslib-posix.c:76
 #6  0x0045af3d in handle_aiocb_rw (aiocb=0x7fa2dc034cd0) at
 posix-aio-compat.c:301

I don't see a way for a double-free to occur, so I think something has
overwritten the memory preceding the allocated buffer.

In gdb you could inspect the aiocb structure to look at its aio_iov[],
aio_niov, and aio_nbytes fields.  They might be invalid or corrupted
somehow.

You could also dump out the memory before 0x7fa24c000a00, specifically
0x7fa24c0009f0, to see if you notice any pattern or printable
characters that give a clue as to what has corrupted the memory here.
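
For example, something along these lines, assuming the hung process or a
core dump is loaded in gdb and aiocb is in scope (the exact commands are
just a sketch):

    (gdb) print aiocb->aio_niov
    (gdb) print aiocb->aio_nbytes
    (gdb) print *aiocb->aio_iov@aiocb->aio_niov
    (gdb) x/32xb 0x7fa24c0009e0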

Are you running qemu-kvm.git/master?

Stefan


Re: [PATCH 1/2] rbd: use the higher level librbd instead of just librados

2011-03-28 Thread Stefan Hajnoczi
On Thu, Mar 24, 2011 at 03:51:36PM -0700, Josh Durgin wrote:
You have sent a malformed patch.  Please send patches that follow the
guidelines at http://wiki.qemu.org/Contribute/SubmitAPatch and test that
your mail client is not line wrapping or mangling whitespace.

Stefan


Re: virtio-blk.c handling of i/o which is not a 512 multiple

2011-03-30 Thread Stefan Hajnoczi
On Wed, Mar 30, 2011 at 9:15 AM, Conor Murphy
conor_murphy_v...@hotmail.com wrote:
 I'm trying to write a virtio-blk driver for Solaris. I've gotten it to the 
 point
 where Solaris can see the device and create a ZFS file system on it.

 However when I try and create a UFS filesystem on the device, the VM crashed
 with the error
 *** glibc detected *** /usr/bin/qemu-kvm: double free or corruption (!prev):
 0x7f2d38000a00 ***

This is a bug in QEMU.  A guest must not be able to trigger a crash.

 I can reproduce the problem with a simple dd, i.e.
 dd if=/dev/zero of=/dev/rdsk/c2d10p0 bs=5000 count=1

I think this is a raw character device, which is why you're even able to
perform non-blocksize accesses?  Have you looked at how other drivers
(like the Xen pv blkfront) handle this?

 My driver will create a virtio-blk request with two elements in the sg list, 
 one
 for the first 4096 byes and the other for the remaining 904.

 From stepping through with gdb, virtio_blk_handle_write will set n_sectors
 to 9 (5000 / 512). Later on in the code, n_sectors is used to calculate
 the size of the buffer required, but 9 * 512 is too small, so when the
 request is processed it ends up writing past the end of the buffer, and I
 guess this triggers the glibc error.

We need to validate that (qiov->size % BDRV_SECTOR_SIZE) == 0 and
reject invalid requests.
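
A minimal sketch of such a check (hypothetical placement in the request
handling path of hw/virtio-blk.c; treat it as an illustration rather than
the actual fix):

    /* Sketch: reject any request whose total payload is not a
     * multiple of the 512-byte virtio-blk sector size. */
    if (req->qiov.size % BDRV_SECTOR_SIZE) {
        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
        return;
    }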

 Is there a requirement for virtio-blk guest drivers that all i/o requests are
 sized in multiples of 512 bytes?

There is no strict requirement according to the virtio specification,
but maybe there should be:

http://ozlabs.org/~rusty/virtio-spec/virtio-spec-0.8.9.pdf

Stefan


Re: [PATCH] KVM: Automatic user feedback

2011-04-01 Thread Stefan Hajnoczi
On Fri, Apr 1, 2011 at 9:33 AM, Alexander Graf ag...@suse.de wrote:
 We're constantly developing and improving KVM, implementing new awesome
 features or simply fixing bugs in the existing code.

 But do people actually use that new code? Are we maybe writing it all in
 vain? Wouldn't it be nice to have some feeling for the number of users
 actually using our code?

 This patch enables us to stress test our automated test suite and at
 the same time figure out if we actually have users. When using a new
 kernel with this patch applied, the user will automatically send feedback,
 telling us that there are people out there, trying recent versions!

 Signed-off-by: Alexander Graf ag...@suse.de
 ---
  virt/kvm/kvm_main.c |    2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)

/me migrates to linux-kvm


Re: 2.6.32.x guest dies when trying to run tcpdump

2011-04-02 Thread Stefan Hajnoczi
On Sat, Apr 2, 2011 at 4:23 PM, Nikola Ciprich extmaill...@linuxbox.cz wrote:
 I'm using the virtio network channel, and on one of the guests (the one
 with the aborted ext4) I also use it for one of the virtual disks.
 One more interesting thing: I can't reproduce this immediately after guest
 boot, but, for example, on the second day after boot I can.
 Perhaps this suggests something?
 Could somebody please help me find and possibly fix this bug?

Softlockups are a symptom that a guest vcpu hasn't been able to
execute.  Unfortunately I don't see anything that points to a specific
bug in the backtraces.

 If needed, I can provide further debugging information, bisect etc...

It looks like your guests are SMP.  How many vcpus are you running?
How many physical cpus does /proc/cpuinfo list on the host?

Is the host overloaded when this occurs?

Are there any clues in host dmesg?

Stefan


Re: [Qemu-devel] KVM call minutes for Apr 5

2011-04-05 Thread Stefan Hajnoczi
On Tue, Apr 5, 2011 at 4:07 PM, Chris Wright chr...@redhat.com wrote:
 kvm-autotest
 - roadmap...refactor to centralize testing (handle the xen-autotest split off)
 - internally at RH, lmr and cleber maintain autotest server to test
  branches (testing qemu.git daily)
  - have good automation for installs and testing
 - seems more QA focused than developers
  - plenty of benefit for developers, so lack of developer use partly
    cultural/visibility...
  - kvm-autotest team always looking for feedback to improve for
    developer use case
 - kvm-autotest day to have folks use it, write test, give feedback?
  - startup cost is/was steep, the day might be too much handholding
  - install-fest? (to get it installed and up and running)
 - buildbot or autotest for testing patches to verify building and working
 - one goal is to reduce mailing list load (patch resubmission because
  they haven't handled basic cases that buildbot or autotest would have
  caught)
 - fedora-virt test day coming up on April 14th.  lucas will be on hand and
  we can piggy back on that to include kvm-autotest install and virt testing
 - kvm autotest run before qemu pull request and post merge to track
  regressions, more frequent testing helps developers see breakage
  quickly
  - qemu.git daily testing already, only the sanity test subset
    - run more comprehensive stable set of tests on weekends
 - one issue is the large number of known failures, need to make these
  easier to identify (and fix the failures one way or another)
 - create database and verify (regressions) against that
  - red/yellow/green (yellow shows area was already broken)
 - autotest can be run against server, not just on laptop

Features that I think are important for a qemu.git kvm-autotest:
* Public results display (sounds like you're working on this)
* Public notifications of breakage: qemu.git/master failures sent to the
qemu-devel mailing list.
* A one-time contributor can get their code tested.  No requirement to
set up a server, because contributors may not have the resources.

Perhaps kvm-autotest is a good platform for the automated testing of
ARM TCG.  Paul is CCed, I recently saw the Jenkins qemu build and boot
tests he has set up.  Lucas, do you have ideas on how these efforts
can work together to bring testing to upstream QEMU?
http://validation.linaro.org/jenkins/job/qemu-boot-images/

Stefan


Re: [PATCH v2 2/2] rbd: allow configuration of rados from the rbd filename

2011-04-07 Thread Stefan Hajnoczi
On Thu, Apr 07, 2011 at 10:14:03AM +0900, Yoshiaki Tamura wrote:
 2011/3/29 Josh Durgin josh.dur...@dreamhost.com:
  The new format is 
  rbd:pool/image[@snapshot][:option1=value1[:option2=value2...]]
  Each option is used to configure rados, and may be any Ceph option, or
  "conf".
  The "conf" option specifies a Ceph configuration file to read.
 
  This allows rbd volumes from more than one Ceph cluster to be used by
  specifying different monitor addresses, as well as having different
  logging levels or locations for different volumes.
 
  Signed-off-by: Josh Durgin josh.dur...@dreamhost.com
  ---
   block/rbd.c |  119 
  ++
   1 files changed, 102 insertions(+), 17 deletions(-)
 
  diff --git a/block/rbd.c b/block/rbd.c
  index cb76dd3..bc3323d 100644
  --- a/block/rbd.c
  +++ b/block/rbd.c
  @@ -22,13 +22,17 @@
   /*
   * When specifying the image filename use:
   *
  - * rbd:poolname/devicename
  + * 
  rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]]
 
 I'm not sure IIUC, but currently this @snapshotname seems to be
 meaningless; it doesn't allow you to boot from a snapshot because it's
 read-only.  Am I misunderstanding, or did I test it incorrectly?

Read-only block devices are supported by QEMU and can be useful.
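
For example (hypothetical pool/image/snapshot names; the readonly drive
option spelling is from memory), a snapshot can be attached as a read-only
drive:

    -drive file=rbd:mypool/myimage@mysnap:conf=/etc/ceph/ceph.conf,readonly=on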

Stefan


Re: [Qemu-devel] KVM call minutes for Apr 5

2011-04-07 Thread Stefan Hajnoczi
On Tue, Apr 5, 2011 at 6:37 PM, Lucas Meneghel Rodrigues l...@redhat.com 
wrote:

Thanks for your detailed response!

 On Tue, 2011-04-05 at 16:29 +0100, Stefan Hajnoczi wrote:
 * Public notifications of breakage, qemu.git/master failures to
 qemu-devel mailing list.

 ^ The challenge is mainly to get enough data to distinguish a new breakage
 from a known issue.  It's more about having historical data from test
 results than anything else, IMO.

I agree.  Does kvm-autotest currently archive test results?

 * A one-time contributor can get their code tested.  No requirement to
 set up a server because contributors may not have the resources.

 Coming back to the point that many colleagues made: we need a sort of
 'make test' on the qemu trees that would fetch autotest and could set up
 basic tests that people could run, maybe suggest test sets...

 The problem I see is that getting guests up and running using configs that
 actually matter is not trivial (there are things such as ensuring that
 all auxiliary utilities are installed in a distro-agnostic fashion,
 having bridges and a DHCP server set up on a possibly disconnected work
 laptop, and so on).

 So having a 'no brains involved at all' setup is quite a challenge;
 suggestions welcome. Also, downloading ISOs, waiting for guests to
 install, and running thorough tests won't be fast. So J. Random Developer
 might not bother to run tests even if we can provide a foolproof,
 perfectly automated setup, because at first it'd take a long time to get
 the tests run. This is also a challenge.

I'm actually starting to think that there is no one-size-fits-all solution.

Developers need "make check"-type unit tests for various QEMU
subsystems.  kvm-autotest could also run these unit tests as part of
its execution.

Then there are end-to-end acceptance tests.  They simply require
storage, network, and time resources and there's no way around that.
These tests are more suited to centralized testing infrastructure that
periodically tests qemu.git.

On the community call I was trying to see if there is a lightweight
version of kvm-autotest that could be merged into qemu.git.  But now I
think that this isn't realistic and it would be better to grow unit
tests in qemu.git while covering it with kvm-autotest for acceptance
testing.

 Perhaps kvm-autotest is a good platform for the automated testing of
 ARM TCG.  Paul is CCed, I recently saw the Jenkins qemu build and boot
 tests he has set up.  Lucas, do you have ideas on how these efforts
 can work together to bring testing to upstream QEMU?
 http://validation.linaro.org/jenkins/job/qemu-boot-images/

 I had heard about jenkins before and it is indeed a nice project. What
 they do here, from what I could assess by browsing the webpage you
 provided, is:

 1) Build qemu.git every time there are commits
 2) Boot pre-made 'pristine' images, one is a lenny arm image and the
 other is a linaro arm image.

 It is possible to do the same with kvm autotest; it's just a matter of not
 performing guest install tests and executing only the boot tests with
 pre-made images. What jenkins does here is an even quicker and shorter
 version of our sanity jobs.

 About how we can work together, I thought about some possibilities:

 1) Modify the jenkins test step to execute a kvm autotest job after the
 build, with the stripped-down test set. We might gain some extra debug
 info that the current test step does not seem to provide
 2) Do the normal test step and if that succeeds, trigger a kvm autotest
 job that does more comprehensive testing, such as migration, time drift,
 block layer, etc

 The funny thing is that KVM autotest has infrastructure to do the same
 things jenkins does, but jenkins is highly streamlined for the buildbot
 use case (continuous build and integration), and I see that as a very nice
 advantage. So I'd rather keep using jenkins and have kvm autotest plugged
 into it conveniently.

That sounds good.  I think the benefit of working together is that
different entities (Linaro, Red Hat, etc.) can contribute QEMU tests
into a single place.  That testing can then cover both upstream and
downstream to prevent breakage.

So kvm-autotest can run in single-job mode and be kicked off from jenkins
or buildbot?

It sounds like kvm-autotest has or needs its own cron, result
archiving, etc. infrastructure.  Does it make sense to use a harness
like jenkins or buildbot instead, and treat kvm-autotest purely as a
testing framework?

Stefan


Re: [PATCH v2 1/2] rbd: use the higher level librbd instead of just librados

2011-04-08 Thread Stefan Hajnoczi
On Mon, Mar 28, 2011 at 04:15:57PM -0700, Josh Durgin wrote:
 librbd stacks on top of librados to provide access
 to rbd images.
 
 Using librbd simplifies the qemu code, and allows
 qemu to use new versions of the rbd format
 with few (if any) changes.
 
 Signed-off-by: Josh Durgin josh.dur...@dreamhost.com
 Signed-off-by: Yehuda Sadeh yeh...@hq.newdream.net
 ---
  block/rbd.c   |  785 
 +++--
  block/rbd_types.h |   71 -
  configure |   33 +--
  3 files changed, 221 insertions(+), 668 deletions(-)
  delete mode 100644 block/rbd_types.h

Hi Josh,
I have applied your patches onto qemu.git/master and am running
ceph.git/master.

Unfortunately qemu-iotests fails for me.


Test 016 seems to hang in qemu-io -g -c "write -P 66 128M 512"
rbd:rbd/t.raw.  I can reproduce this consistently.  Here is the
backtrace of the hung process (not consuming CPU, probably deadlocked):

Thread 9 (Thread 0x7f9ded6d6700 (LWP 26049)):
#0  0x7f9def41d16c in pthread_cond_wait@@GLIBC_2.3.2 () from 
/lib/libpthread.so.0
#1  0x7f9dee676d9a in Wait (this=0x2723950) at ./common/Cond.h:46
#2  SimpleMessenger::dispatch_entry (this=0x2723950) at 
msg/SimpleMessenger.cc:362
#3  0x7f9dee66180c in SimpleMessenger::DispatchThread::entry (this=<value optimized out>) at msg/SimpleMessenger.h:533
#4  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#5  0x7f9dee14d02d in clone () from /lib/libc.so.6
#6  0x0000000000000000 in ?? ()

Thread 8 (Thread 0x7f9deced5700 (LWP 26050)):
#0  0x7f9def41d16c in pthread_cond_wait@@GLIBC_2.3.2 () from 
/lib/libpthread.so.0
#1  0x7f9dee674fab in Wait (this=0x2723950) at ./common/Cond.h:46
#2  SimpleMessenger::reaper_entry (this=0x2723950) at 
msg/SimpleMessenger.cc:2251
#3  0x7f9dee6617ac in SimpleMessenger::ReaperThread::entry (this=0x2723d80) 
at msg/SimpleMessenger.h:485
#4  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#5  0x7f9dee14d02d in clone () from /lib/libc.so.6
#6  0x0000000000000000 in ?? ()

Thread 7 (Thread 0x7f9dec6d4700 (LWP 26051)):
#0  0x7f9def41d4d9 in pthread_cond_timedwait@@GLIBC_2.3.2 () from 
/lib/libpthread.so.0
#1  0x7f9dee72187a in WaitUntil (this=0x2722c00) at common/Cond.h:60
#2  SafeTimer::timer_thread (this=0x2722c00) at common/Timer.cc:110
#3  0x7f9dee722d7d in SafeTimerThread::entry (this=<value optimized out>) at common/Timer.cc:38
#4  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#5  0x7f9dee14d02d in clone () from /lib/libc.so.6
#6  0x0000000000000000 in ?? ()

Thread 6 (Thread 0x7f9df07ea700 (LWP 26052)):
#0  0x7f9def41d16c in pthread_cond_wait@@GLIBC_2.3.2 () from 
/lib/libpthread.so.0
#1  0x7f9dee67cae1 in Wait (this=0x2729890) at ./common/Cond.h:46
#2  SimpleMessenger::Pipe::writer (this=0x2729890) at 
msg/SimpleMessenger.cc:1746
#3  0x7f9dee66187d in SimpleMessenger::Pipe::Writer::entry (this=<value optimized out>) at msg/SimpleMessenger.h:204
#4  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#5  0x7f9dee14d02d in clone () from /lib/libc.so.6
#6  0x0000000000000000 in ?? ()

Thread 5 (Thread 0x7f9debed3700 (LWP 26055)):
#0  0x7f9dee142113 in poll () from /lib/libc.so.6
#1  0x7f9dee66d599 in tcp_read_wait (sd=<value optimized out>, timeout=<value optimized out>) at msg/tcp.cc:48
#2  0x7f9dee66e89b in tcp_read (sd=3, buf=<value optimized out>, len=1, timeout=90) at msg/tcp.cc:25
#3  0x7f9dee67ffd2 in SimpleMessenger::Pipe::reader (this=0x2729890) at msg/SimpleMessenger.cc:1539
#4  0x7f9dee66185d in SimpleMessenger::Pipe::Reader::entry (this=<value optimized out>) at msg/SimpleMessenger.h:196
#5  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#6  0x7f9dee14d02d in clone () from /lib/libc.so.6
#7  0x0000000000000000 in ?? ()

Thread 4 (Thread 0x7f9debdd2700 (LWP 26056)):
#0  0x7f9def41d4d9 in pthread_cond_timedwait@@GLIBC_2.3.2 () from 
/lib/libpthread.so.0
#1  0x7f9dee72187a in WaitUntil (this=0x2722e58) at common/Cond.h:60
#2  SafeTimer::timer_thread (this=0x2722e58) at common/Timer.cc:110
#3  0x7f9dee722d7d in SafeTimerThread::entry (this=<value optimized out>) at common/Timer.cc:38
#4  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#5  0x7f9dee14d02d in clone () from /lib/libc.so.6
#6  0x0000000000000000 in ?? ()

Thread 3 (Thread 0x7f9deb2ce700 (LWP 26306)):
#0  0x7f9def41d16c in pthread_cond_wait@@GLIBC_2.3.2 () from 
/lib/libpthread.so.0
#1  0x7f9dee67cae1 in Wait (this=0x272f090) at ./common/Cond.h:46
#2  SimpleMessenger::Pipe::writer (this=0x272f090) at 
msg/SimpleMessenger.cc:1746
#3  0x7f9dee66187d in SimpleMessenger::Pipe::Writer::entry (this=<value optimized out>) at msg/SimpleMessenger.h:204
#4  0x7f9def4188ba in start_thread () from /lib/libpthread.so.0
#5  0x7f9dee14d02d in clone () from /lib/libc.so.6
#6  0x0000000000000000 in ?? ()

Thread 2 (Thread 0x7f9deb3cf700 (LWP 26309)):
