[Qemu-devel] Re: [PATCH] ceph/rbd block driver for qemu-kvm (v3)

2010-06-11 Thread Simone Gotti
Hi Christian,

thanks for you patch. I tried it a little and it worked quite well but
during some live migration tests I noticed a problem.


The problem is related to live migration with high I/O using the AIO
calls (I triggered it with a simple dd).

If you launch a live migration and the guest is stopped and started on
the new qemu process while some AIO was in flight the guest on the new
qemu will wait undefinitely for data this will never come. With ata
emulation an ata reset is sent after some seconds but with virtio this
won't happen.

I'm not a qemu expert but from what I understand qemu in
savevm.c:do_savevm calls qemu_aio_flush to wait that all the asyncronous
aio returned (the callback si called). But the rbd block driver doesn't
use the qemu aio model but the rados one so that function will never
know of the rados aio.

So a solution will be to glue the block driver with the qemu aio model.
I tried to do this to test if this will work in the attached patch. I
only tested with one rbd block device but the live migration tests
worked (in the patch I removed all the debug prints I adedd to see if
all AIO requets really returned.

This is an RFC just to know what you think about this possible solution.
As qemu's aio model is event based and it needs a file descriptor for
event communication i used eventfd to do this.
Let me know if you need a detailed description of the patch!


I've also got a question: as librados is multithreaded the callbacks are
called in another thread. Is there the need to protect some critical
sections with a lock (for example in rbd_aio_rw_vector and in
rbd_finish_aiocb)?


Thanks!

Bye!


From: Simone Gotti simone.go...@gmail.com
Date: Fri, 11 Jun 2010 21:19:39 +0200
Subject: [PATCH] block/rbd: Added glue to qemu aio model to fix live
migration with outstanding aio

Signed-off-by: Simone Gotti simone.go...@gmail.com


---
 block/rbd.c |   63
+-
 1 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 4d22069..83b7898 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -25,6 +25,8 @@
 
 #include signal.h
 
+#include sys/eventfd.h
+
 /*
  * When specifying the image filename use:
  *
@@ -47,6 +49,15 @@
 
 #define OBJ_MAX_SIZE (1UL  OBJ_DEFAULT_OBJ_ORDER)
 
+typedef struct BDRVRBDState {
+int efd;
+rados_pool_t pool;
+char name[RBD_MAX_OBJ_NAME_SIZE];
+uint64_t size;
+uint64_t objsize;
+int qemu_aio_count;
+} BDRVRBDState;
+
 typedef struct RBDAIOCB {
 BlockDriverAIOCB common;
 QEMUBH *bh;
@@ -57,6 +68,7 @@ typedef struct RBDAIOCB {
 int64_t sector_num;
 int aiocnt;
 int error;
+BDRVRBDState *s;
 } RBDAIOCB;
 
 typedef struct RADOSCB {
@@ -67,12 +79,6 @@ typedef struct RADOSCB {
 char *buf;
 } RADOSCB;
 
-typedef struct BDRVRBDState {
-rados_pool_t pool;
-char name[RBD_MAX_OBJ_NAME_SIZE];
-uint64_t size;
-uint64_t objsize;
-} BDRVRBDState;
 
 typedef struct rbd_obj_header_ondisk RbdHeader1;
 
@@ -255,6 +261,31 @@ done:
 return ret;
 }
 
+static void rbd_aio_completion_cb(void *opaque)
+{
+BDRVRBDState *s = opaque;
+
+uint64_t val;
+ssize_t ret;
+
+do {
+if ((ret = read(s-efd, val, sizeof(val)))  0) {
+s-qemu_aio_count -= val;
+   }
+} while (ret == -1  errno == EINTR);
+
+return;
+}
+
+static int rbd_aio_flush_cb(void *opaque)
+{
+BDRVRBDState *s = opaque;
+
+return (s-qemu_aio_count  0) ? 1 : 0;
+}
+
+
+
 static int rbd_open(BlockDriverState *bs, const char *filename, int flags)
 {
 BDRVRBDState *s = bs-opaque;
@@ -303,6 +334,15 @@ static int rbd_open(BlockDriverState *bs, const
char *filename, int flags)
 s-size = header-image_size;
 s-objsize = 1  header-options.order;
 
+s-efd = eventfd(0, 0);
+if (s-efd == -1) {
+error_report(error opening eventfd);
+goto failed;
+}
+fcntl(s-efd, F_SETFL, O_NONBLOCK);
+qemu_aio_set_fd_handler(s-efd, rbd_aio_completion_cb, NULL,
+rbd_aio_flush_cb, NULL, s);
+
 return 0;
 
 failed:
@@ -393,6 +433,7 @@ static AIOPool rbd_aio_pool = {
 };
 
 /* This is the callback function for rados_aio_read and _write */
+
 static void rbd_finish_aiocb(rados_completion_t c, RADOSCB *rcb)
 {
 RBDAIOCB *acb = rcb-acb;
@@ -427,6 +468,8 @@ static void rbd_finish_aiocb(rados_completion_t c,
RADOSCB *rcb)
 acb-ret += r;
 }
 }
+uint64_t buf = 1;
+write(acb-s-efd, buf, sizeof(buf));
 qemu_free(rcb);
 i = 0;
 if (!acb-aiocnt  acb-bh) {
@@ -435,6 +478,7 @@ static void rbd_finish_aiocb(rados_completion_t c,
RADOSCB *rcb)
 }
 
 /* Callback when all queued rados_aio requests are complete */
+
 static void rbd_aio_bh_cb(void *opaque)
 {
 RBDAIOCB *acb = opaque;
@@ -446,6 +490,10 @@ static void rbd_aio_bh_cb(void *opaque)
 acb-common.cb(acb-common.opaque, (acb-ret  0 ? 0 : acb-ret));
 qemu_bh_delete(acb-bh);
 acb-bh = NULL

[Qemu-devel] VersatetilePB: request for comments on PL031 patch

2007-06-15 Thread Simone

Hello,

My name is Simone Basso, and I study Computer Science at Politecnico di Torino,
Italy. In the three past weeks I've been working to a project that involves
Qemu, as a part of an exam, together with another student, Gaspare Scherma.


The requested task was to add to Qemu 0.9.0 support for AMBA ARM PL031 RTC chip,
to avoid to have the emulated system (it was debian 4.0) always starting in
1970-01-01, which may be _very_ annoying :-).

The patch that we attach provides the following features:

  - The chip is recognized by the kernel;

  - It is possible to read the current time from the RTC;

  - It is possible to request an interrupt in the next 24 hours;

  - It is possible to acknowledge such interrupt, to lower the IRQ signal.


To implement such patch we've used the following knowledge sources:

  - The implementation of rtc-pl031 driver in the linux kernel;

  - The emulated device hw/pl011.c in qemu sources;

  - Documentation in the linux kernel.

Needless to say, any error in the patch was certainly introduced by us,
and does not depends on them :-).


The reason why we used linux kernel driver as a source of documentation were 
two:

  - We were not able to find complete documentation of the chip[1];

  - Another part of the exam task was to study Linux's RTC framework, so
we were yet a bit confident with the driver's code.

Having used the Linux kernel as the main source of information, we've also
taken some code from the kernel itself, to implement some bits. Anyway, it
should not be a problem, since the new file hw/pl031.c, added by the patch,
is in the GPL.


I need also to point that, to test linux kernel with our patch, you also need to
patch the kernel itself: infact we've find out a problem in linux's rtc-pl031
interrupt handler. For this reason, a _very_ little patch for linux kernel is
also attached[2].


I write this email to make you, qemu developers, aware of our effert, in the
hope that the patch could be a base for PL031 support with Qemu. If I had missed
some information useful to revise the patch, please tell me.

Regards,
-Simone Basso

--

Footnotes
---

 [1] We've just found www.arm.com/pdfs/DDI0287B_arm926ejs_dev_chip_trm.pdf 
which

 says something about PL031 on page 222.

 [2] The patch is for 2.6.21.3, however I known for sure that it applies 
without

 any problem to 2.6.18 (I guess that it won't create any problem for any
 kernel = 2.6.18, but I can't say anything about less than .18).

 We've yet send via email the patch to the proper driver mantainer, who
 said that it seemed ok and that he'll forward it upstream.

 This happened a week ago. However, at the moment now, no stable kernel 
from

 kernel.org has the patch, therefore it is necessary to apply it.







--- linux-2.6.21.3/drivers/rtc/rtc-pl031.c.orig	2007-05-24 23:22:47.0 +0200
+++ linux-2.6.21.3/drivers/rtc/rtc-pl031.c	2007-06-06 22:07:53.0 +0200
@@ -49,9 +49,14 @@
 
 static irqreturn_t pl031_interrupt(int irq, void *dev_id)
 {
-	struct rtc_device *rtc = dev_id;
+	struct pl031_local *ldata = dev_id;
 
-	rtc_update_irq(rtc-class_dev, 1, RTC_AF);
+	rtc_update_irq(ldata-rtc-class_dev, 1, RTC_AF);
+
+	/* www.arm.com/pdfs/DDI0287B_arm926ejs_dev_chip_trm.pdf at page
+	   222 tells that The interrupt is cleared by writing any data
+   value to the interrupt clear register RTCICR. */
+	__raw_writel(1, ldata-base + RTC_ICR);
 
 	return IRQ_HANDLED;
 }
@@ -173,8 +178,11 @@
 		goto out_no_remap;
 	}
 
-	if (request_irq(adev-irq[0], pl031_interrupt, IRQF_DISABLED,
-			rtc-pl031, ldata-rtc)) {
+	/* Pass ldata to the interrupt handler, so we're able to write
+	   the register that clears the interrupt: we need ldata-base
+	   for that. */
+ 	if (request_irq(adev-irq[0], pl031_interrupt, IRQF_DISABLED,
+			rtc-pl031, ldata)) {
 		ret = -EIO;
 		goto out_no_irq;
 	}
















--- qemu-0.9.0/hw/versatilepb.c.orig	2007-06-06 21:45:40.0 +0200
+++ qemu-0.9.0/hw/versatilepb.c	2007-06-06 21:45:40.0 +0200
@@ -214,6 +214,9 @@
that includes hardware cursor support from the PL111.  */
 pl110_init(ds, 0x1012, pic, 16, 1);
 
+/* Add PL031 Real Time Clock. */
+pl031_init(0x101e8000,pic,10);
+
 /* Memory map for Versatile/PB:  */
 /* 0x1000 System registers.  */
 /* 0x10001000 PCI controller config registers.  */
--- qemu-0.9.0/vl.h.orig	2007-06-06 21:45:40.0 +0200
+++ qemu-0.9.0/vl.h	2007-06-06 21:45:40.0 +0200
@@ -1307,6 +1307,9 @@
 /* smc91c111.c */
 void smc91c111_init(NICInfo *, uint32_t, void *, int);
 
+/* pl031.c */
+void pl031_init(uint32_t base, void * pic, int irq);
+
 /* pl110.c */
 void *pl110_init(DisplayState *ds, uint32_t base, void *pic, int irq, int);
 
--- qemu-0.9.0/Makefile.target.orig	2007-06-06 21:45:40.0 +0200
+++ qemu-0.9.0/Makefile.target	2007-06-06 21:45:40.0 +0200