Attached is an updated version of my rados kvm driver. I did the following
changes:

- modifications for qemu "configure" - you must use "--enable-rbd" to 
  compile the rbd driver
- new naming scheme for the objects stored
- object size can be specified when creating an image with
  "-o cluster_size=<size in bytes>" (default is 4MB)
- new method for "qemu-img info"
- extended header format, to be prepared for encryption, compression and
  snapshots

Everything seems to work fine now and I think that the driver is in a usable
state. During the next two weeks I'll do some testing. I also hope that
'll be able to test live migration of a virtual machine.

If there are no objections on this list, I would like to send the patch to
the qemu-devel list.

Snapshots and AIO might be the next things I'll try to implement. Snapshots
will probably be taken on a per pool basis, since this will make things
a lot simpler. If you have multiple disk images in one pool, every image
will be snapshot, but there won't be a reference to the snapshot in the
metadata of the image. To avoid extra overhead you will have to create a
new pool for every image. - Ideas for a better solution are welcome.

Christian

-- 
Christian Brunner
--- qemu-kvm-0.12.3.orig/block/rbd.c    1970-01-01 01:00:00.000000000 +0100
+++ qemu-kvm-0.12.3/block/rbd.c 2010-03-14 11:37:30.490285574 +0100
@@ -0,0 +1,346 @@
+/*
+ * QEMU Block driver for RADOS (Ceph)
+ *
+ * Copyright (C) 2010 Christian Brunner <c...@muc.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include <qemu-common.h>
+
+//#include "rbd.h"
+#include "module.h"
+#include "block_int.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <rados/librados.h>
+
+#include <signal.h>
+
+/*
+ * When specifying the image filename use:
+ *
+ * rbd:poolname/devicename
+ *
+ * poolname must be the name of an existing rados pool
+ *
+ * devicename is the basename for all objects used to
+ * emulate the raw device.
+ *
+ * Metadata information (image size, ...) is stored in an 
+ * object with the name "devicename.metadata".
+ *
+ * The raw device is split into 4MB sized objects by default.
+ * The sequencenumber is encoded in a 12 byte long hex-string,
+ * and is attached to the devicename, separated by a dot.
+ * e.g. "devicename.1234567890ab"
+ *
+ */
+
+#define OBJ_MAX_SIZE (1UL<<22) // 22 Bit = 4MB (as default)
+#define MAX_NAME 128           // Maximum size of the poolname plus objectname
+#define MAX_SNAPS 4096         // Maximum number of snapshots (unsupported at
+                               // the moment)
+
+typedef struct RBDRVRBDState {
+       rados_pool_t pool;
+       char name[MAX_NAME];
+       int name_len;
+       uint64_t size;
+       uint64_t objsize;
+} RBDRVRBDState;
+       
+#define RBD_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_SIGNATURE "RBD"
+#define RBD_VERSION1 "001.000"
+
+#define COMP_NONE 0
+#define CRYPT_NONE 0
+
+typedef struct {
+       char text[64];
+       char signature[4];
+       char version[8];
+       uint64_t imagesize;
+       uint64_t objsize;
+       uint8_t crypt_type;
+       uint8_t comp_type;                      // unsupported at the moment
+       uint16_t snap_count;                    // unsupported at the moment
+       uint32_t snap_id[MAX_SNAPS];    // unsupported at the moment
+} RbdHeader1;
+
+static int rbd_parsename(const char *filename, char *pool, char *name) {
+       const char *rbdname;
+       char *p, *n;
+       int l;
+
+       if(!strstart(filename, "rbd:", &rbdname)) {
+               return -EINVAL;
+       }
+               
+       pstrcpy(pool, MAX_NAME, rbdname);
+       p = strchr(pool, '/');
+       if (p == NULL) {
+               return -EINVAL;
+       }
+
+       *p = '\0';
+       n = ++p;
+
+       l = strlen(n);
+
+       if (l > MAX_NAME-14) {
+               fprintf(stderr, "object name to long\n");
+               return -EINVAL;
+       } else if (l <= 0) {
+               fprintf(stderr, "object name to short\n");
+               return -EINVAL;
+       }
+
+       strcpy(name, n);
+
+       return l;
+}
+
+static int rbd_create(const char *filename, QEMUOptionParameter *options) {
+       int64_t bytes = 0;
+       int64_t objsize = OBJ_MAX_SIZE;
+       char pool[MAX_NAME];
+       char n[MAX_NAME];
+       char name[MAX_NAME];
+       RbdHeader1 header;
+       rados_pool_t p;
+       int name_len;
+
+       if ((name_len = rbd_parsename(filename, pool, name)) < 0) {
+               return -EINVAL;
+       }
+       snprintf(n, MAX_NAME, "%s.metadata", name);
+
+       /* Read out options */
+       while (options && options->name) {
+               if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+                       bytes = options->value.n;
+               } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) {
+                       if (options->value.n) {
+                               objsize = options->value.n;
+                       }
+               }
+               options++;
+       }
+
+       memset(&header, 0, sizeof(header));
+       pstrcpy(header.text, sizeof(header.text), RBD_TEXT);
+       pstrcpy(header.signature, sizeof(header.signature), RBD_SIGNATURE);
+       pstrcpy(header.version, sizeof(header.version), RBD_VERSION1);
+       header.imagesize = bytes;
+       cpu_to_le64s(&header.imagesize);
+       header.objsize = objsize;
+       cpu_to_le64s(&header.objsize);
+       header.crypt_type = CRYPT_NONE;
+       header.comp_type = COMP_NONE;
+       header.snap_count = 0;
+       cpu_to_le16s(&header.snap_count);
+
+        if (rados_initialize(0, NULL) < 0) {
+                fprintf(stderr, "error initializing\n");
+               return -EIO;
+        }
+
+       if (rados_open_pool(pool, &p)) {
+                fprintf(stderr, "error opening pool %s\n", pool);
+               return -EIO;
+       }
+
+       if (rados_write(p, n, 0, (const char *) &header, sizeof(header)) < 0) {
+               return -errno;
+       }
+
+       rados_close_pool(p);
+       rados_deinitialize();
+
+       return 0;
+}
+
+static int rbd_open(BlockDriverState *bs, const char *filename, int flags) {
+       RBDRVRBDState *s = bs->opaque;
+       char pool[MAX_NAME];
+       char n[MAX_NAME];
+       char hbuf[4096];
+
+       if ((s->name_len = rbd_parsename(filename, pool, s->name)) < 0) {
+               return -EINVAL;
+       }
+       snprintf(n, MAX_NAME, "%s.metadata", s->name);
+       
+        if (rados_initialize(0, NULL) < 0) {
+                fprintf(stderr, "error initializing\n");
+               return -EIO;
+        }
+
+       if (rados_open_pool(pool, &s->pool)) {
+                fprintf(stderr, "error opening pool %s\n", pool);
+               return -EIO;
+       }
+
+       if (rados_read(s->pool, n, 0, hbuf, 4096) < 0) {
+                fprintf(stderr, "error reading header from %s\n", s->name);
+               return -EIO;
+       }
+       if (!strncmp(hbuf+64, RBD_SIGNATURE, 4)) {
+               if(!strncmp(hbuf+68, RBD_VERSION1, 8)) {
+                       RbdHeader1 *header;
+
+                       header = (RbdHeader1 *) hbuf;
+                       le64_to_cpus(&header->imagesize);
+                       s->size = header->imagesize;
+                       le64_to_cpus(&header->objsize);
+                       s->objsize = header->objsize;
+               } else {
+                       fprintf(stderr, "Unknown image version %s\n", hbuf+68);
+                       return -EIO;
+               }
+       } else {
+                fprintf(stderr, "Invalid header signature %s\n", hbuf+64);
+               return -EIO;
+       }
+
+       return 0;
+}
+
+static void rbd_close(BlockDriverState *bs) {
+       RBDRVRBDState *s = bs->opaque;
+
+       rados_close_pool(s->pool);
+       rados_deinitialize();
+}
+
+static int rbd_write(BlockDriverState *bs, int64_t sector_num, 
+               const uint8_t *buf, int nb_sectors) {
+       RBDRVRBDState *s = bs->opaque;
+       char n[MAX_NAME];
+
+       int64_t segnr, segoffs, segsize;
+       int64_t off, size;
+
+       off = sector_num * 512;
+       size = nb_sectors * 512;
+       segnr = (int64_t) (off / s->objsize);
+       segoffs = (int64_t) (off % s->objsize);
+       segsize  = (int64_t) (s->objsize - segoffs);
+
+       while (size > 0) {
+               if (size < segsize) {
+                       segsize = size;
+               }
+
+               snprintf(n, MAX_NAME, "%s.%012llx", s->name, (long long 
unsigned int) segnr);
+
+               if (rados_write(s->pool, n, segoffs, (const char *) buf, 
segsize) < 0) {
+                       return -errno;
+               }
+
+               buf += segsize;
+               size -= segsize;
+               segoffs = 0;
+               segsize = s->objsize;
+               segnr++;
+       }
+
+       return(0);
+}
+
+static int rbd_read(BlockDriverState *bs, int64_t sector_num, 
+               uint8_t *buf, int nb_sectors) {
+       RBDRVRBDState *s = bs->opaque;
+       char n[MAX_NAME];
+
+       int64_t segnr, segoffs, segsize, r;
+       int64_t off, size;
+
+       off = sector_num * 512;
+       size = nb_sectors * 512;
+       segnr = (int64_t) (off / s->objsize);
+       segoffs = (int64_t) (off % s->objsize);
+       segsize  = (int64_t) (s->objsize - segoffs);
+
+       while (size > 0) {
+               if (size < segsize) {
+                       segsize = size;
+               }
+
+               snprintf(n, MAX_NAME, "%s.%012llx", s->name, (long long 
unsigned int) segnr);
+
+               r = rados_read(s->pool, n, segoffs, (char *) buf, segsize);
+               if (r < 0) {
+                       memset(buf, 0, segsize);
+               } else if (r < segsize) {
+                       memset(buf+r, 0, segsize-r);
+               }
+
+               buf += segsize;
+               size -= segsize;
+               segoffs = 0;
+               segsize = s->objsize;
+               segnr++;
+       }
+
+       return(0);
+}
+
+static int rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    RBDRVRBDState *s = bs->opaque;
+    bdi->cluster_size = s->objsize;
+    return 0;
+}
+
+static int64_t rbd_getlength(BlockDriverState *bs) {
+       RBDRVRBDState *s = bs->opaque;
+
+       return s->size;
+}
+
+
+static QEMUOptionParameter rbd_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    {
+        .name = BLOCK_OPT_CLUSTER_SIZE,
+        .type = OPT_SIZE,
+        .help = "RBD object size"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_rbd = {
+       .format_name    = "rbd",
+       .instance_size  = sizeof(RBDRVRBDState),
+       .bdrv_open      = rbd_open,
+       .bdrv_read      = rbd_read,
+       .bdrv_write     = rbd_write,
+       .bdrv_close     = rbd_close,
+       .bdrv_create    = rbd_create,
+       .bdrv_get_info  = rbd_getinfo,
+       .create_options = rbd_create_options,
+       .bdrv_getlength = rbd_getlength,
+       .protocol_name  = "rbd",
+};
+
+static void bdrv_rbd_init(void) {
+       bdrv_register(&bdrv_rbd);
+}
+
+block_init(bdrv_rbd_init);
+
+
--- qemu-kvm-0.12.3.orig/Makefile       2010-02-26 03:34:00.000000000 +0100
+++ qemu-kvm-0.12.3/Makefile    2010-03-11 21:25:07.400595864 +0100
@@ -28,6 +28,7 @@
 VPATH=$(SRC_PATH):$(SRC_PATH)/hw
 
 LIBS+=-lz $(LIBS_TOOLS)
+LIBS+=-lrados
 
 ifdef BUILD_DOCS
 DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8
@@ -114,6 +115,7 @@
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
+block-nested-$(CONFIG_RBD) += rbd.o
 
 block-obj-y +=  $(addprefix block/, $(block-nested-y))
 
--- qemu-kvm-0.12.3.orig/configure      2010-02-26 03:34:00.000000000 +0100
+++ qemu-kvm-0.12.3/configure   2010-03-11 21:24:01.073595962 +0100
@@ -272,6 +272,7 @@
 check_utests="no"
 user_pie="no"
 zero_malloc=""
+rbd="no"
 
 # OS specific
 if check_define __linux__ ; then
@@ -635,6 +636,8 @@
   ;;
   --disable-cpu-emulation) cpu_emulation="no"
   ;;
+  --enable-rbd) rbd="yes"
+  ;;
   *) echo "ERROR: unknown option $opt"; show_help="yes"
   ;;
   esac
@@ -793,6 +796,7 @@
 echo "  --kerneldir=PATH         look for kernel includes in PATH"
 echo "  --with-kvm-trace         enable building the KVM module with the kvm 
trace option"
 echo "  --disable-cpu-emulation  disables use of qemu cpu emulation code"
+echo "  --enable-rbd            enable building the rados block device (rbd)"
 echo ""
 echo "NOTE: The object files are built at the place where configure is 
launched"
 exit 1
@@ -1556,6 +1560,25 @@
 fi
 
 ##########################################
+# rbd probe
+if test "$rbd" != "no" ; then
+  cat > $TMPC <<EOF
+#include <stdio.h>
+#include <rados/librados.h>
+int main(void) { rados_initialize(0, NULL); return 0; }
+EOF
+  if compile_prog "" "-lrados -lcrypto" ; then
+    rbd=yes
+    LIBS="$LIBS -lrados -lcrypto"
+  else
+    if test "$rbd" = "yes" ; then
+      feature_not_found "rados block device"
+    fi
+    rbd=no
+  fi
+fi
+
+##########################################
 # linux-aio probe
 
 if test "$linux_aio" != "no" ; then
@@ -2058,6 +2081,7 @@
 echo "preadv support    $preadv"
 echo "fdatasync         $fdatasync"
 echo "uuid support      $uuid"
+echo "rbd support       $rbd"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -2277,6 +2301,9 @@
 else
   echo "CONFIG_NO_CPU_EMULATION=y" >> $config_host_mak
 fi
+if test "$rbd" = "yes" ; then
+  echo "CONFIG_RBD=y" >> $config_host_mak
+fi
 
 # XXX: suppress that
 if [ "$bsd" = "yes" ] ; then
------------------------------------------------------------------------------
Download Intel&#174; Parallel Studio Eval
Try the new software tools for yourself. Speed compiling, find bugs
proactively, and fine-tune applications for parallel performance.
See why Intel Parallel Studio got high marks during beta.
http://p.sf.net/sfu/intel-sw-dev
_______________________________________________
Ceph-devel mailing list
Ceph-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/ceph-devel

Reply via email to