date:20120709

[Qemu-devel] [PATCH v16 9/9] Add XBZRLE statistics

2012-07-09 Thread Orit Wasserman

Signed-off-by: Benoit Hudzia benoit.hud...@sap.com
Signed-off-by: Petter Svard pett...@cs.umu.se
Signed-off-by: Aidan Shribman aidan.shrib...@sap.com
Signed-off-by: Orit Wasserman owass...@redhat.com
---
 arch_init.c  |   66 ++
 hmp.c|   13 ++
 migration.c  |   49 
 migration.h  |9 +++
 qapi-schema.json |   37 ++
 qmp-commands.hx  |   35 +++-
 6 files changed, 203 insertions(+), 6 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index c9c3fe0..53ae2b2 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -202,6 +202,64 @@ int64_t xbzrle_cache_resize(int64_t new_size)
 return pow2floor(new_size);
 }
 
+/* accounting for migration statistics */
+typedef struct AccountingInfo {
+uint64_t dup_pages;
+uint64_t norm_pages;
+uint64_t xbzrle_bytes;
+uint64_t xbzrle_pages;
+uint64_t xbzrle_cache_miss;
+uint64_t iterations;
+uint64_t xbzrle_overflows;
+} AccountingInfo;
+
+static AccountingInfo acct_info;
+
+static void acct_clear(void)
+{
+memset(&acct_info, 0, sizeof(acct_info));
+}
+
+uint64_t dup_mig_bytes_transferred(void)
+{
+return acct_info.dup_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t dup_mig_pages_transferred(void)
+{
+return acct_info.dup_pages;
+}
+
+uint64_t norm_mig_bytes_transferred(void)
+{
+return acct_info.norm_pages * TARGET_PAGE_SIZE;
+}
+
+uint64_t norm_mig_pages_transferred(void)
+{
+return acct_info.norm_pages;
+}
+
+uint64_t xbzrle_mig_bytes_transferred(void)
+{
+return acct_info.xbzrle_bytes;
+}
+
+uint64_t xbzrle_mig_pages_transferred(void)
+{
+return acct_info.xbzrle_pages;
+}
+
+uint64_t xbzrle_mig_pages_cache_miss(void)
+{
+return acct_info.xbzrle_cache_miss;
+}
+
+uint64_t xbzrle_mig_pages_overflow(void)
+{
+return acct_info.xbzrle_overflows;
+}
+
 static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 int cont, int flag)
 {
@@ -236,6 +294,7 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t 
*current_data,
 cache_insert(XBZRLE.cache, current_addr,
  g_memdup(current_data, TARGET_PAGE_SIZE));
 }
+acct_info.xbzrle_cache_miss++;
 return -1;
 }
 
@@ -250,6 +309,7 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t 
*current_data,
 return 0;
 } else if (encoded_len == -1) {
 DPRINTF("Overflow\n");
+acct_info.xbzrle_overflows++;
 /* update data in the cache */
 memcpy(prev_cached_page, current_data, TARGET_PAGE_SIZE);
 return -1;
@@ -269,7 +329,9 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t 
*current_data,
 qemu_put_byte(f, hdr.xh_flags);
 qemu_put_be16(f, hdr.xh_len);
 qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
+acct_info.xbzrle_pages++;
 bytes_sent = encoded_len + sizeof(hdr);
+acct_info.xbzrle_bytes += bytes_sent;
 
 return bytes_sent;
 }
@@ -301,6 +363,7 @@ static int ram_save_block(QEMUFile *f, int stage)
 p = memory_region_get_ram_ptr(mr) + offset;
 
 if (is_dup_page(p)) {
+acct_info.dup_pages++;
 save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS);
 qemu_put_byte(f, *p);
 bytes_sent = 1;
@@ -323,6 +386,7 @@ static int ram_save_block(QEMUFile *f, int stage)
 save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE);
 qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 bytes_sent = TARGET_PAGE_SIZE;
+acct_info.norm_pages++;
 }
 
 /* if page is unmodified, continue to the next */
@@ -449,6 +513,7 @@ int ram_save_live(QEMUFile *f, int stage, void *opaque)
 return -1;
 }
 XBZRLE.encoded_buf = g_malloc0(TARGET_PAGE_SIZE);
+acct_clear();
 }
 
 /* Make sure all dirty bits are set */
@@ -484,6 +549,7 @@ int ram_save_live(QEMUFile *f, int stage, void *opaque)
bytes_sent -1 represent no more blocks*/
 if (bytes_sent > 0) {
 bytes_transferred += bytes_sent;
+acct_info.iterations++;
 } else if (bytes_sent == -1) { /* no more blocks */
 break;
 }
diff --git a/hmp.c b/hmp.c
index 5833e1c..5c56f4a 100644
--- a/hmp.c
+++ b/hmp.c
@@ -168,6 +168,19 @@ void hmp_info_migrate(Monitor *mon)
info->disk->total >> 10);
 }
 
+if (info->has_cache) {
+monitor_printf(mon, "cache size: %" PRIu64 " bytes\n",
+   info->cache->cache_size);
+monitor_printf(mon, "xbzrle transferred: %" PRIu64 " kbytes\n",
+   info->cache->xbzrle_bytes >> 10);
+monitor_printf(mon, "xbzrle pages: %" PRIu64 " pages\n",
+   info->cache->xbzrle_pages);
+monitor_printf(mon, "xbzrle cache miss: %" PRIu64 "\n",

[Qemu-devel] [PATCH v16 6/9] Add xbzrle_encode_buffer and xbzrle_decode_buffer functions

2012-07-09 Thread Orit Wasserman

For performance we are encoding long word at a time.
For nzrun we use long-word-at-a-time NULL-detection tricks from strcmp():
using ((lword - 0x0101010101010101) & (~lword) & 0x8080808080808080) test
to find out if any byte in the long word is zero.

Signed-off-by: Benoit Hudzia benoit.hud...@sap.com
Signed-off-by: Petter Svard pett...@cs.umu.se
Signed-off-by: Aidan Shribman aidan.shrib...@sap.com
Signed-off-by: Orit Wasserman owass...@redhat.com
Signed-off-by: Eric Blake ebl...@redhat.com
---
 migration.h |4 ++
 savevm.c|  159 +++
 2 files changed, 163 insertions(+), 0 deletions(-)

diff --git a/migration.h b/migration.h
index acc0b94..c46af82 100644
--- a/migration.h
+++ b/migration.h
@@ -100,4 +100,8 @@ void migrate_add_blocker(Error *reason);
  */
 void migrate_del_blocker(Error *reason);
 
+int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen);
+int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
+
 #endif
diff --git a/savevm.c b/savevm.c
index a15c163..26d 100644
--- a/savevm.c
+++ b/savevm.c
@@ -2385,3 +2385,162 @@ void vmstate_register_ram_global(MemoryRegion *mr)
 {
 vmstate_register_ram(mr, NULL);
 }
+
+/*
+  page = zrun nzrun
+   | zrun nzrun page
+
+  zrun = length
+
+  nzrun = length byte...
+
+  length = uleb128 encoded integer
+ */
+int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
+ uint8_t *dst, int dlen)
+{
+uint32_t zrun_len = 0, nzrun_len = 0;
+int d = 0, i = 0;
+long res, xor;
+uint8_t *nzrun_start = NULL;
+
+g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) %
+   sizeof(long)));
+
+while (i < slen) {
+/* overflow */
+if (d + 2 > dlen) {
+return -1;
+}
+
+/* not aligned to sizeof(long) */
+res = (slen - i) % sizeof(long);
+while (res && old_buf[i] == new_buf[i]) {
+zrun_len++;
+i++;
+res--;
+}
+
+/* word at a time for speed */
+if (!res) {
+while (i < slen &&
+   (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) {
+i += sizeof(long);
+zrun_len += sizeof(long);
+}
+
+/* go over the rest */
+while (i < slen && old_buf[i] == new_buf[i]) {
+zrun_len++;
+i++;
+}
+}
+
+/* buffer unchanged */
+if (zrun_len == slen) {
+return 0;
+}
+
+/* skip last zero run */
+if (i == slen) {
+return d;
+}
+
+d += uleb128_encode_small(dst + d, zrun_len);
+
+zrun_len = 0;
+nzrun_start = new_buf + i;
+
+/* overflow */
+if (d + 2 > dlen) {
+return -1;
+}
+/* not aligned to sizeof(long) */
+res = (slen - i) % sizeof(long);
+while (res && old_buf[i] != new_buf[i]) {
+i++;
+nzrun_len++;
+res--;
+}
+
+/* word at a time for speed, use of 32-bit long okay */
+if (!res) {
+/* truncation to 32-bit long okay */
+long mask = 0x0101010101010101ULL;
+while (i < slen) {
+xor = *(long *)(old_buf + i) ^ *(long *)(new_buf + i);
+if ((xor - mask) & ~xor & (mask << 7)) {
+/* found the end of an nzrun within the current long */
+while (old_buf[i] != new_buf[i]) {
+nzrun_len++;
+i++;
+}
+break;
+} else {
+i += sizeof(long);
+nzrun_len += sizeof(long);
+}
+}
+}
+
+d += uleb128_encode_small(dst + d, nzrun_len);
+/* overflow */
+if (d + nzrun_len > dlen) {
+return -1;
+}
+memcpy(dst + d, nzrun_start, nzrun_len);
+d += nzrun_len;
+nzrun_len = 0;
+}
+
+return d;
+}
+
+int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
+{
+int i = 0, d = 0;
+int ret;
+uint32_t count = 0;
+
+while (i < slen) {
+
+/* zrun */
+if ((slen - i) < 2) {
+return -1;
+}
+
+ret = uleb128_decode_small(src + i, &count);
+if (ret < 0 || (i && !count)) {
+return -1;
+}
+i += ret;
+d += count;
+
+/* overflow */
+if (d > dlen) {
+return -1;
+}
+
+/* nzrun */
+if ((slen - i) < 2) {
+return -1;
+}
+
+ret = uleb128_decode_small(src + i, &count);
+if (ret < 0 || !count) {
+return -1;
+}
+i += ret;
+
+/* overflow */
+if (d + count > dlen || i + count > slen) {
+

[Qemu-devel] [PATCH] make: Remove 'build-all' rule

2012-07-09 Thread Stefan Weil

It is not needed, because the 'all' rule does the same.

Signed-off-by: Stefan Weil s...@weilnetz.de
---
 Makefile |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index bad0e31..76dae56 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@ BUILD_DIR=$(CURDIR)
 # All following code might depend on configuration variables
 ifneq ($(wildcard config-host.mak),)
 # Put the all: rule here so that config-host.mak can contain dependencies.
-all: build-all
+all:
 include config-host.mak
 include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
@@ -31,7 +31,7 @@ Makefile: ;
 configure: ;
 
 .PHONY: all clean cscope distclean dvi html info install install-doc \
-   pdf recurse-all speed tar tarbin test build-all
+   pdf recurse-all speed tar tarbin test
 
 $(call set-vpath, $(SRC_PATH))
 
@@ -82,7 +82,7 @@ defconfig:
 
 -include config-all-devices.mak
 
-build-all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all
+all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all
 
 config-host.h: config-host.h-timestamp
 config-host.h-timestamp: config-host.mak
-- 
1.7.10

Re: [Qemu-devel] [PATCH v4 0/7] file descriptor passing using pass-fd

2012-07-09 Thread Anthony Liguori


On 06/26/2012 04:10 AM, Daniel P. Berrange wrote:

On Fri, Jun 22, 2012 at 02:36:07PM -0400, Corey Bryant wrote:

libvirt's sVirt security driver provides SELinux MAC isolation for
Qemu guest processes and their corresponding image files.  In other
words, sVirt uses SELinux to prevent a QEMU process from opening
files that do not belong to it.

sVirt provides this support by labeling guests and resources with
security labels that are stored in file system extended attributes.
Some file systems, such as NFS, do not support the extended
attribute security namespace, and therefore cannot support sVirt
isolation.

A solution to this problem is to provide fd passing support, where
libvirt opens files and passes file descriptors to QEMU.  This,
along with SELinux policy to prevent QEMU from opening files, can
provide image file isolation for NFS files stored on the same NFS
mount.

This patch series adds the pass-fd QMP monitor command, which allows
an fd to be passed via SCM_RIGHTS, and returns the received file
descriptor.  Support is also added to the block layer to allow QEMU
to dup the fd when the filename is of the /dev/fd/X format.  This
is useful if MAC policy prevents QEMU from opening specific types
of files.


I was thinking about some of the sources complexity when using
FD passing from libvirt and wanted to raise one idea for discussion
before we continue.

With this proposed series, we have usage akin to:

   1. pass_fd FDSET={M} -  returns a string /dev/fd/N showing QEMU's
  view of the FD
   2. drive_add file=/dev/fd/N
   3. if failure:
close_fd /dev/fd/N

My problem is that none of this FD passing is transactional.


My original patch series did not suffer from this problem.

QEMU owned the file descriptor once it received it from libvirt.

I don't think the cited problem (QEMU failing an operation if libvirt was down) 
is really an actual problem since it would be libvirt that would be issuing the 
command in the first place (so the command would just fail which libvirt would 
have to assume anyway if it crashed).


I really dislike where this thread has headed with /dev/fdset.  This has become 
extremely complex and cumbersome.


Perhaps we should reconsider using an RPC for QEMU to request an fd as this 
solves all the cited problems in a much simpler fashion.


Regards,

Anthony Liguori

Re: [Qemu-devel] [PATCH v3] sheepdog: do not blindly memset all read buffers

2012-07-09 Thread MORITA Kazutaka

At Mon, 9 Jul 2012 16:34:13 +0200,
Christoph Hellwig wrote:
 
 Only buffers that map to unallocated blocks need to be zeroed.
 
 Signed-off-by: Christoph Hellwig h...@lst.de
 
 ---
  block/sheepdog.c |   37 ++---
  1 file changed, 18 insertions(+), 19 deletions(-)

Acked-by: MORITA Kazutaka morita.kazut...@lab.ntt.co.jp

[Qemu-devel] [PATCH v16 8/9] Add set_cachesize command

2012-07-09 Thread Orit Wasserman

Change XBZRLE cache size in bytes (the size should be a power of 2).
If the XBZRLE cache size is too small there will be many cache misses.

Signed-off-by: Benoit Hudzia benoit.hud...@sap.com
Signed-off-by: Petter Svard pett...@cs.umu.se
Signed-off-by: Aidan Shribman aidan.shrib...@sap.com
Signed-off-by: Orit Wasserman owass...@redhat.com
---
 arch_init.c  |   10 ++
 hmp-commands.hx  |   20 
 hmp.c|   13 +
 hmp.h|1 +
 migration.c  |   14 ++
 migration.h  |2 ++
 qapi-schema.json |   16 
 qmp-commands.hx  |   23 +++
 8 files changed, 99 insertions(+), 0 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 15f0790..c9c3fe0 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -192,6 +192,16 @@ static struct {
 .cache = NULL,
 };
 
+
+int64_t xbzrle_cache_resize(int64_t new_size)
+{
+if (XBZRLE.cache != NULL) {
+return cache_resize(XBZRLE.cache, new_size / TARGET_PAGE_SIZE) *
+TARGET_PAGE_SIZE;
+}
+return pow2floor(new_size);
+}
+
 static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 int cont, int flag)
 {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 9245bef..7ff1d77 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -829,6 +829,26 @@ STEXI
 @item migrate_cancel
 @findex migrate_cancel
 Cancel the current VM migration.
+
+ETEXI
+
+{
+.name   = "migrate_set_cachesize",
+.args_type  = "value:o",
+.params = "value",
+.help   = "set cache size (in bytes) for XBZRLE migrations,"
+  "the cache size will be rounded down to the nearest "
+  "power of 2.\n"
+  "The cache size effects the number of cache misses."
+  "In case of a high cache miss ratio you need to increase"
+   " the cache size",
+.mhandler.cmd = hmp_migrate_set_cachesize,
+},
+
+STEXI
+@item migrate_set_cachesize @var{value}
+@findex migrate_set_cache
+Set cache size to @var{value} (in bytes) for xbzrle migrations.
 ETEXI
 
 {
diff --git a/hmp.c b/hmp.c
index b0440e6..5833e1c 100644
--- a/hmp.c
+++ b/hmp.c
@@ -758,6 +758,19 @@ void hmp_migrate_set_downtime(Monitor *mon, const QDict 
*qdict)
 qmp_migrate_set_downtime(value, NULL);
 }
 
+void hmp_migrate_set_cachesize(Monitor *mon, const QDict *qdict)
+{
+int64_t value = qdict_get_int(qdict, "value");
+Error *err = NULL;
+
+qmp_migrate_set_cachesize(value, &err);
+if (err) {
+monitor_printf(mon, "%s\n", error_get_pretty(err));
+error_free(err);
+return;
+}
+}
+
 void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict)
 {
 int64_t value = qdict_get_int(qdict, "value");
diff --git a/hmp.h b/hmp.h
index 09ba198..7c5117d 100644
--- a/hmp.h
+++ b/hmp.h
@@ -53,6 +53,7 @@ void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
+void hmp_migrate_set_cachesize(Monitor *mon, const QDict *qdict);
 void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
diff --git a/migration.c b/migration.c
index 1a264a9..b3cdf8c 100644
--- a/migration.c
+++ b/migration.c
@@ -533,6 +533,20 @@ void qmp_migrate_cancel(Error **errp)
 migrate_fd_cancel(migrate_get_current());
 }
 
+void qmp_migrate_set_cachesize(int64_t value, Error **errp)
+{
+MigrationState *s = migrate_get_current();
+
+/* Check for truncation */
+if (value != (size_t)value) {
+error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
+  "exceeding address space");
+return;
+}
+
+s->xbzrle_cache_size = xbzrle_cache_resize(value);
+}
+
 void qmp_migrate_set_speed(int64_t value, Error **errp)
 {
 MigrationState *s;
diff --git a/migration.h b/migration.h
index 9b61e70..a73a34a 100644
--- a/migration.h
+++ b/migration.h
@@ -108,4 +108,6 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t 
*dst, int dlen);
 int migrate_use_xbzrle(void);
 int64_t migrate_xbzrle_cache_size(void);
 
+int64_t xbzrle_cache_resize(int64_t new_size);
+
 #endif
diff --git a/qapi-schema.json b/qapi-schema.json
index a8408fd..c13577d 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1390,6 +1390,22 @@
 { 'command': 'migrate_set_speed', 'data': {'value': 'int'} }
 
 ##
+# @migrate_set_cachesize
+#
+# Set XBZRLE cache size
+#
+# @value: cache size in bytes
+#
+# The size will be rounded down to the nearest power of 2.
+# The cache size can be modified before and during ongoing migration
+#
+# Returns: nothing on success
+#
+# Since: 1.2
+##
+{ 'command': 'migrate_set_cachesize', 'data': {'value': 'int'} }
+
+##
 #

Re: [Qemu-devel] [PATCH v4 0/7] file descriptor passing using pass-fd

2012-07-09 Thread Luiz Capitulino

On Mon, 09 Jul 2012 13:40:34 -0500
Anthony Liguori aligu...@us.ibm.com wrote:

 On 06/26/2012 04:10 AM, Daniel P. Berrange wrote:
  On Fri, Jun 22, 2012 at 02:36:07PM -0400, Corey Bryant wrote:
  libvirt's sVirt security driver provides SELinux MAC isolation for
  Qemu guest processes and their corresponding image files.  In other
  words, sVirt uses SELinux to prevent a QEMU process from opening
  files that do not belong to it.
 
  sVirt provides this support by labeling guests and resources with
  security labels that are stored in file system extended attributes.
  Some file systems, such as NFS, do not support the extended
  attribute security namespace, and therefore cannot support sVirt
  isolation.
 
  A solution to this problem is to provide fd passing support, where
  libvirt opens files and passes file descriptors to QEMU.  This,
  along with SELinux policy to prevent QEMU from opening files, can
  provide image file isolation for NFS files stored on the same NFS
  mount.
 
  This patch series adds the pass-fd QMP monitor command, which allows
  an fd to be passed via SCM_RIGHTS, and returns the received file
  descriptor.  Support is also added to the block layer to allow QEMU
  to dup the fd when the filename is of the /dev/fd/X format.  This
  is useful if MAC policy prevents QEMU from opening specific types
  of files.
 
  I was thinking about some of the sources complexity when using
  FD passing from libvirt and wanted to raise one idea for discussion
  before we continue.
 
  With this proposed series, we have usage akin to:
 
 1. pass_fd FDSET={M} -  returns a string /dev/fd/N showing QEMU's
view of the FD
 2. drive_add file=/dev/fd/N
 3. if failure:
  close_fd /dev/fd/N
 
  My problem is that none of this FD passing is transactional.
 
 My original patch series did not suffer from this problem.
 
 QEMU owned the file descriptor once it received it from libvirt.
 
 I don't think the cited problem (QEMU failing an operation if libvirt was 
 down) 
 is really an actual problem since it would be libvirt that would be issuing 
 the 
 command in the first place (so the command would just fail which libvirt 
 would 
 have to assume anyway if it crashed).
 
 I really dislike where this thread has headed with /dev/fdset.  This has 
 become 
 extremely complex and cumbersome.

I agree, maybe it's time to start over and discuss the original problem again.

 
 Perhaps we should reconsider using an RPC for QEMU to request an fd as this 
 solves all the cited problems in a much simpler fashion.
 
 Regards,
 
 Anthony Liguori

[Qemu-devel] [PATCH v16 7/9] Add XBZRLE to ram_save_block and ram_save_live

2012-07-09 Thread Orit Wasserman

In the outgoing migration, check whether the page is cached and has
changed, then send a compressed page using the save_xbzrle_page function.
In the incoming migration, check whether RAM_SAVE_FLAG_XBZRLE is set
and decompress the page (using the load_xbzrle function).

Signed-off-by: Benoit Hudzia benoit.hud...@sap.com
Signed-off-by: Petter Svard pett...@cs.umu.se
Signed-off-by: Aidan Shribman aidan.shrib...@sap.com
Signed-off-by: Orit Wasserman owass...@redhat.com
---
 arch_init.c |  187 +--
 migration.c |   24 
 migration.h |4 +
 3 files changed, 210 insertions(+), 5 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index e36899e..15f0790 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -43,6 +43,7 @@
 #include "hw/smbios.h"
 #include "exec-memory.h"
 #include "hw/pcspk.h"
+#include "qemu/page_cache.h"
 
 #ifdef DEBUG_ARCH_INIT
 #define DPRINTF(fmt, ...) \
@@ -102,6 +103,7 @@ const uint32_t arch_type = QEMU_ARCH;
 #define RAM_SAVE_FLAG_PAGE 0x08
 #define RAM_SAVE_FLAG_EOS  0x10
 #define RAM_SAVE_FLAG_CONTINUE 0x20
+#define RAM_SAVE_FLAG_XBZRLE   0x40
 
 #ifdef __ALTIVEC__
 #include <altivec.h>
@@ -169,6 +171,27 @@ static int is_dup_page(uint8_t *page)
 return 1;
 }
 
+/* XBZRLE (Xor Based Zero Run Length Encoding) */
+typedef struct XBZRLEHeader {
+uint16_t xh_len;
+uint8_t xh_flags;
+} XBZRLEHeader;
+
+/* struct contains XBZRLE cache and a static page
+   used by the compression */
+static struct {
+/* buffer used for XBZRLE encoding */
+uint8_t *encoded_buf;
+/* buffer used for XBZRLE decoding */
+uint8_t *decoded_buf;
+/* Cache for XBZRLE */
+PageCache *cache;
+} XBZRLE = {
+.encoded_buf = NULL,
+.decoded_buf = NULL,
+.cache = NULL,
+};
+
 static void save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 int cont, int flag)
 {
@@ -181,15 +204,76 @@ static void save_block_hdr(QEMUFile *f, RAMBlock *block, 
ram_addr_t offset,
 
 }
 
+#define ENCODING_FLAG_XBZRLE 0x1
+
+static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data,
+ram_addr_t current_addr, RAMBlock *block,
+ram_addr_t offset, int cont, int stage)
+{
+int encoded_len = 0, bytes_sent = -1, ret = -1;
+XBZRLEHeader hdr = {
+.xh_len = 0,
+.xh_flags = 0,
+};
+uint8_t *prev_cached_page;
+
+/* Stage 1 cache the page and exit.
+   Stage 2 check to see if page is cached, if not cache the page.
+   Stage 3 check if the page is cached and if not exit.
+*/
+if (stage == 1 || !cache_is_cached(XBZRLE.cache, current_addr)) {
+if (stage != 3) {
+cache_insert(XBZRLE.cache, current_addr,
+ g_memdup(current_data, TARGET_PAGE_SIZE));
+}
+return -1;
+}
+
+prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
+
+/* XBZRLE encoding (if there is no overflow) */
+encoded_len = xbzrle_encode_buffer(prev_cached_page, current_data,
+   TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+   TARGET_PAGE_SIZE);
+if (encoded_len == 0) {
+DPRINTF("Skipping unmodified page\n");
+return 0;
+} else if (encoded_len == -1) {
+DPRINTF("Overflow\n");
+/* update data in the cache */
+memcpy(prev_cached_page, current_data, TARGET_PAGE_SIZE);
+return -1;
+}
+
+/* we need to update the data in the cache, in order to get the same data
+   we cached we decode the encoded page on the cached data */
+ret = xbzrle_decode_buffer(XBZRLE.encoded_buf, encoded_len,
+   prev_cached_page, TARGET_PAGE_SIZE);
+g_assert(ret != -1);
+
+hdr.xh_len = encoded_len;
+hdr.xh_flags |= ENCODING_FLAG_XBZRLE;
+
+/* Send XBZRLE based compressed page */
+save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_XBZRLE);
+qemu_put_byte(f, hdr.xh_flags);
+qemu_put_be16(f, hdr.xh_len);
+qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
+bytes_sent = encoded_len + sizeof(hdr);
+
+return bytes_sent;
+}
+
 static RAMBlock *last_block;
 static ram_addr_t last_offset;
 
-static int ram_save_block(QEMUFile *f)
+static int ram_save_block(QEMUFile *f, int stage)
 {
 RAMBlock *block = last_block;
 ram_addr_t offset = last_offset;
 int bytes_sent = -1;
 MemoryRegion *mr;
+ram_addr_t current_addr;
 
 if (!block)
 block = QLIST_FIRST(ram_list.blocks);
@@ -210,13 +294,31 @@ static int ram_save_block(QEMUFile *f)
 save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_COMPRESS);
 qemu_put_byte(f, *p);
 bytes_sent = 1;
-} else {
+} else if (migrate_use_xbzrle() && stage != 3) {
+current_addr = block->offset + offset;
+/* In stage 1 we only cache the pages before sending them
+   from

[Qemu-devel] [PATCH v16 2/9] Add XBZRLE documentation

2012-07-09 Thread Orit Wasserman

Signed-off-by: Orit Wasserman owass...@redhat.com
---
 docs/xbzrle.txt |  136 +++
 1 files changed, 136 insertions(+), 0 deletions(-)
 create mode 100644 docs/xbzrle.txt

diff --git a/docs/xbzrle.txt b/docs/xbzrle.txt
new file mode 100644
index 000..f70e851
--- /dev/null
+++ b/docs/xbzrle.txt
@@ -0,0 +1,136 @@
+XBZRLE (Xor Based Zero Run Length Encoding)
+===
+
+Using XBZRLE (Xor Based Zero Run Length Encoding) allows for the reduction
+of VM downtime and the total live-migration time of Virtual machines.
+It is particularly useful for virtual machines running memory write intensive
+workloads that are typical of large enterprise applications such as SAP ERP
+Systems, and generally speaking for any application that uses a sparse memory
+update pattern.
+
+Instead of sending the changed guest memory page this solution will send a
+compressed version of the updates, thus reducing the amount of data sent during
+live migration.
+In order to be able to calculate the update, the previous memory pages need to
+be stored on the source. Those pages are stored in a dedicated cache
+(hash table) and are
+accessed by their address.
+The larger the cache size the better the chances are that the page has already
+been stored in the cache.
+A small cache size will result in high cache miss rate.
+Cache size can be changed before and during migration.
+
+Format
+===
+
+The compression format performs a XOR between the previous and current content
+of the page, where zero represents an unchanged value.
+The page data delta is represented by zero and non zero runs.
+A zero run is represented by its length (in bytes).
+A non zero run is represented by its length (in bytes) and the new data.
+The run length is encoded using ULEB128 (http://en.wikipedia.org/wiki/LEB128)
+
+There can be more than one valid encoding, the sender may send a longer 
encoding
+for the benefit of reducing computation cost.
+
+page = zrun nzrun
+   | zrun nzrun page
+
+zrun = length
+
+nzrun = length byte...
+
+length = uleb128 encoded integer
+
+On the sender side XBZRLE is used as a compact delta encoding of page updates,
+retrieving the old page content from the cache (default size of 512 MB). The
+receiving side uses the existing page's content and XBZRLE to decode the new
+page's content.
+
+This work was originally based on research results published
+VEE 2011: Evaluation of Delta Compression Techniques for Efficient Live
+Migration of Large Virtual Machines by Benoit, Svard, Tordsson and Elmroth.
+Additionally the delta encoder XBRLE was improved further using the XBZRLE
+instead.
+
+XBZRLE has a sustained bandwidth of 2-2.5 GB/s for typical workloads making it
+ideal for in-line, real-time encoding such as is needed for live-migration.
+
+Example
+old buffer:
+1001 zeros
+05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 68 00 00 6b 00 6d
+3074 zeros
+
+new buffer:
+1001 zeros
+01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 68 00 00 67 00 69
+3074 zeros
+
+encoded buffer:
+
+encoded length 24
+e9 07 0f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 03 01 67 01 01 69
+
+Migration Capabilities
+==
+In order to use XBZRLE the destination QEMU version should be able to
+decode the new format.
+Adding a new migration capabilities command that will allow external management
+to query for it support.
+A typical use for the destination
+{qemu} info migrate_capabilities
+{qemu} xbzrle, ...
+
+In order to enable capabilities for future live migration,
+a new command migrate_set_parameter is introduced:
+{qemu} migrate_set_parameter xbzrle
+
+Usage
+==
+
+1. Activate xbzrle
+2. Set the XBZRLE cache size - the cache size is in MBytes and should be a
+power of 2. The cache default value is 64MBytes.
+3. start outgoing migration
+
+A typical usage scenario:
+On the incoming QEMU:
+{qemu} migrate_set_parameter xbzrle on
+On the outgoing QEMU:
+{qemu} migrate_set_parameter xbzrle on
+{qemu} migrate_set_cachesize 256m
+{qemu} migrate -d tcp:destination.host:
+{qemu} info migrate
+...
+cache size: 67108864 bytes
+transferred ram-duplicate: A kbytes
+transferred ram-normal: B kbytes
+transferred ram-xbrle: C kbytes
+overflow ram-xbrle: D pages
+cache-miss ram-xbrle: E pages
+
+cache-miss: the number of cache misses to date - high cache-miss rate
+indicates that the cache size is set too low.
+overflow: the number of overflows in the encoding, where the delta could
+not be compressed. This can happen if the changes in the pages are too large
+or there are many short changes; for example, changing every second byte (half 
a
+page).
+
+Testing: Testing indicated that live migration with XBZRLE was completed in 110
+seconds, whereas without it would not be able to complete.
+
+A simple synthetic memory r/w load generator:
+..include <stdlib.h>
+..include <stdio.h>
+..int

[Qemu-devel] First shot at adding IPMI to qemu

2012-07-09 Thread minyard

I had asked about getting an IPMI device into qemu and received some
interest, and it's useful to me, so I've done some work to add it.
The following patch set has a set of patches to add an IPMI KCS
device, and IPMI BT device, a built-in BMC (IPMI management controller),
and a way to attach an external BMC through a chardev.

There was some discussion on whether to make the BMC internal or
external, but I went ahead and added both.  The internal one is
fairly basic and not extensible, at least without adding code.
I've modified the OpenIPMI library simulator to work with the
external interface to allow it to receive connections from the
qemu external simulator with a fairly basic protocol.

I've also added the ability for the OpenIPMI library to manage
a VM to power it on, power it off, reset it, and handle an IPMI
watchdog timer.  So it looks quite like a real system.  Instructions
for using it are in the OpenIPMI release candidate I uploaded to
https://sourceforge.net/projects/openipmi

Since IPMI can advertise its presence via SMBIOS, I added a
way for a driver to add an SMBIOS entry.  I also added a way
to query a free interrupt from the ISA bus, since the interrupt
is in the SMBIOS entry and nobody really cares which one is used.

[Qemu-devel] [PATCH 1/9] smbios: Add a function to directly add an entry

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

There was no way to directly add a table entry to the SMBIOS table,
even though the BIOS supports this.  So add a function to do this.
This is in preparation for the IPMI handler adding its SMBIOS table
entry.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 hw/smbios.c |   27 +++
 hw/smbios.h |   15 ---
 2 files changed, 35 insertions(+), 7 deletions(-)

diff --git a/hw/smbios.c b/hw/smbios.c
index c57237d..98c7f99 100644
--- a/hw/smbios.c
+++ b/hw/smbios.c
@@ -178,6 +178,33 @@ static void smbios_build_type_1_fields(const char *t)
  strlen(buf) + 1, buf);
 }
 
+int smbios_table_entry_add(struct smbios_structure_header *entry)
+{
+struct smbios_table *table;
+struct smbios_structure_header *header;
+unsigned int size = entry->length;
+
+if (!smbios_entries) {
+smbios_entries_len = sizeof(uint16_t);
+smbios_entries = g_malloc0(smbios_entries_len);
+}
+smbios_entries = g_realloc(smbios_entries, smbios_entries_len +
+  sizeof(*table) + size);
+table = (struct smbios_table *)(smbios_entries + smbios_entries_len);
+table->header.type = SMBIOS_TABLE_ENTRY;
+table->header.length = cpu_to_le16(sizeof(*table) + size);
+
+header = (struct smbios_structure_header *)(table->data);
+memcpy(header, entry, size);
+
+smbios_check_collision(header->type, SMBIOS_TABLE_ENTRY);
+
+smbios_entries_len += sizeof(*table) + size;
+(*(uint16_t *)smbios_entries) =
+   cpu_to_le16(le16_to_cpu(*(uint16_t *)smbios_entries) + 1);
+return 0;
+}
+
 int smbios_entry_add(const char *t)
 {
 char buf[1024];
diff --git a/hw/smbios.h b/hw/smbios.h
index 94e3641..6431a15 100644
--- a/hw/smbios.h
+++ b/hw/smbios.h
@@ -13,21 +13,22 @@
  *
  */
 
+/* This goes at the beginning of every SMBIOS structure. */
+struct smbios_structure_header {
+uint8_t type;
+uint8_t length;
+uint16_t handle;
+} QEMU_PACKED;
+
 int smbios_entry_add(const char *t);
 void smbios_add_field(int type, int offset, int len, void *data);
 uint8_t *smbios_get_table(size_t *length);
+int smbios_table_entry_add(struct smbios_structure_header *entry);
 
 /*
  * SMBIOS spec defined tables
  */
 
-/* This goes at the beginning of every SMBIOS structure. */
-struct smbios_structure_header {
-uint8_t type;
-uint8_t length;
-uint16_t handle;
-} QEMU_PACKED;
-
 /* SMBIOS type 0 - BIOS Information */
 struct smbios_type_0 {
 struct smbios_structure_header header;
-- 
1.7.4.1

[Qemu-devel] [PATCH 7/9] IPMI: Add a BT low-level interface

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

This provides the simulation of the BT hardware interface for
IPMI.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_bt.c   |  265 
 hw/ipmi_bt.h   |   99 +
 5 files changed, 367 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_bt.c
 create mode 100644 hw/ipmi_bt.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index b549389..f8f8e6d 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -10,6 +10,7 @@ CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
+CONFIG_IPMI_BT=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index af7d2a9..8c1177d 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -10,6 +10,7 @@ CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
+CONFIG_IPMI_BT=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 99e5d1e..e1d30cc 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -23,6 +23,7 @@ hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 hw-obj-$(CONFIG_IPMI) += ipmi.o
 hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
+hw-obj-$(CONFIG_IPMI_BT) += ipmi_bt.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_bt.c b/hw/ipmi_bt.c
new file mode 100644
index 000..39f099e
--- /dev/null
+++ b/hw/ipmi_bt.c
@@ -0,0 +1,265 @@
+/*
+ * QEMU IPMI BT emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include hw.h
+
+#include ipmi.h
+#include ipmi_bt.h
+
+#define IPMI_CMD_GET_BT_INTF_CAP   0x36
+
+static void ipmi_bt_handle_event(IPMIState *s)
+{
+IPMIBtState *bt = s-typeinfo;
+
+ipmi_lock();
+if (s-inlen  4)
+   goto out;
+/* Note that overruns are handled by handle_command */
+if (s-inmsg[0] != (s-inlen - 1)) {
+   /* Length mismatch, just ignore. */
+   IPMI_BT_SET_BBUSY(bt-control_reg, 1);
+   s-inlen = 0;
+   goto out;
+}
+if ((s-inmsg[1] == (IPMI_NETFN_APP  2)) 
+   (s-inmsg[3] == IPMI_CMD_GET_BT_INTF_CAP)) {
+   /* We handle this one ourselves. */
+   s-outmsg[0] = 9;
+   s-outmsg[1] = s-inmsg[1] | 0x04;
+   s-outmsg[2] = s-inmsg[2];
+   s-outmsg[3] = s-inmsg[3];
+   s-outmsg[4] = 0;
+   s-outmsg[5] = 1; /* Only support 1 outstanding request. */
+   if (sizeof(s-inmsg)  0xff) /* Input buffer size */
+   s-outmsg[6] = 0xff;
+   else
+   s-outmsg[6] = (unsigned char ) sizeof(s-inmsg);
+   if (sizeof(s-outmsg)  0xff) /* Output buffer size */
+   s-outmsg[7] = 0xff;
+   else
+   s-outmsg[7] = (unsigned char) sizeof(s-outmsg);
+   s-outmsg[8] = 10; /* Max request to response time */
+   s-outmsg[9] = 0; /* Don't recommend retries */
+   s-outlen = 10;
+   IPMI_BT_SET_BBUSY(bt-control_reg, 0);
+   IPMI_BT_SET_B2H_ATN(bt-control_reg, 1);
+   if (s-use_irq  s-irqs_enabled 
+   !IPMI_BT_GET_B2H_IRQ(bt-mask_reg) 
+   IPMI_BT_GET_B2H_IRQ_EN(bt-mask_reg)) {
+   IPMI_BT_SET_B2H_IRQ(bt-mask_reg, 1);
+   qemu_irq_raise(s-irq);
+   }
+   goto out;
+}
+bt-waiting_seq = s-inmsg[2];
+s-inmsg[2] = s-inmsg[1];
+s-handle_command(s, s-inmsg + 2, s-inlen - 2, sizeof(s-inmsg),
+ bt-waiting_rsp);
+ out:
+ipmi_unlock();
+}
+
+static void ipmi_bt_handle_rsp(IPMIState *s, uint8_t msg_id,
+

[Qemu-devel] [PATCH 4/9] Add a base IPMI interface

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

Add the basic IPMI types and infrastructure to QEMU.  Low-level
interfaces and simulation interfaces will register with this; it's
kind of the go-between to tie them together.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |2 +
 hw/ipmi.c  |  147 +++
 hw/ipmi.h  |  192 
 qemu-doc.texi  |2 +
 qemu-options.hx|   35 +++
 sysemu.h   |8 ++
 vl.c   |   46 +
 9 files changed, 434 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi.c
 create mode 100644 hw/ipmi.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 2c78175..eb17afc 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -7,6 +7,7 @@ CONFIG_VGA_ISA=y
 CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
+CONFIG_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 233a856..e4e3e4f 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -7,6 +7,7 @@ CONFIG_VGA_ISA=y
 CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
+CONFIG_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 30c0b78..0d55997 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -20,6 +20,8 @@ hw-obj-$(CONFIG_M48T59) += m48t59.o
 hw-obj-$(CONFIG_ESCC) += escc.o
 hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 
+hw-obj-$(CONFIG_IPMI) += ipmi.o
+
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
 hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o
diff --git a/hw/ipmi.c b/hw/ipmi.c
new file mode 100644
index 000..86a097b
--- /dev/null
+++ b/hw/ipmi.c
@@ -0,0 +1,147 @@
+/*
+ * QEMU IPMI emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include hw.h
+#include ipmi.h
+#include sysemu.h
+#include qmp-commands.h
+
+static void (*ipmi_init_handlers[4])(IPMIState *s);
+
+void register_ipmi_type(unsigned int type, void (*init)(IPMIState *s))
+{
+ipmi_init_handlers[type] = init;
+}
+
+static void (*ipmi_sim_init_handlers[2])(IPMIState *s);
+
+void register_ipmi_sim(unsigned int iftype, void (*init)(IPMIState *s))
+{
+ipmi_sim_init_handlers[iftype] = init;
+}
+
+
+#ifdef DO_IPMI_THREAD
+static void *ipmi_thread(void *opaque)
+{
+IPMIState *s = opaque;
+int64_t wait_until;
+
+ipmi_lock();
+for (;;) {
+   qemu_cond_wait(s-waker, s-lock);
+   wait_until = 0;
+   while (s-do_wake) {
+   s-do_wake = 0;
+   s-handle_if_event(s);
+   }
+}
+ipmi_unlock();
+return NULL;
+}
+#endif
+
+static int ipmi_do_hw_op(IPMIState *s, enum ipmi_op op, int checkonly)
+{
+switch(op) {
+case IPMI_RESET_CHASSIS:
+   if (checkonly)
+   return 0;
+   qemu_system_reset_request();
+   return 0;
+
+case IPMI_POWEROFF_CHASSIS:
+   if (checkonly)
+   return 0;
+   qemu_system_powerdown_request();
+   return 0;
+
+case IPMI_SEND_NMI:
+   if (checkonly)
+   return 0;
+   qemu_mutex_lock_iothread();
+   qmp_inject_nmi(NULL);
+   qemu_mutex_unlock_iothread();
+   return 0;
+
+case IPMI_POWERCYCLE_CHASSIS:
+case IPMI_PULSE_DIAG_IRQ:
+case IPMI_SHUTDOWN_VIA_ACPI_OVERTEMP:
+case IPMI_POWERON_CHASSIS:
+default:
+   return IPMI_CC_COMMAND_NOT_SUPPORTED;
+}
+}
+
+static void ipmi_set_irq_enable(IPMIState *s, int val)
+{
+s-irqs_enabled = val;
+}
+
+static

[Qemu-devel] [PATCH 3/9] isa: Add a way to query for a free interrupt

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

This lets devices that don't care about their interrupt number, like
IPMI, just grab any unused interrupt.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 hw/isa-bus.c |   13 +
 hw/isa.h |2 ++
 2 files changed, 15 insertions(+), 0 deletions(-)

diff --git a/hw/isa-bus.c b/hw/isa-bus.c
index 5a43f03..f561f21 100644
--- a/hw/isa-bus.c
+++ b/hw/isa-bus.c
@@ -71,6 +71,7 @@ qemu_irq isa_get_irq(ISADevice *dev, int isairq)
 if (isairq  0 || isairq  15) {
 hw_error(isa irq %d invalid, isairq);
 }
+isabus-irq_inuse[isairq] = 1;
 return isabus-irqs[isairq];
 }
 
@@ -82,6 +83,18 @@ void isa_init_irq(ISADevice *dev, qemu_irq *p, int isairq)
 dev-nirqs++;
 }
 
+int isa_find_free_irq(ISABus *bus)
+{
+unsigned int i;
+
+/* 0 and 1 are called for, 2 is the chain interrupt */
+for (i = 3; i  ISA_NUM_IRQS; i++) {
+   if (!bus-irq_inuse[i])
+   return i;
+}
+return 0;
+}
+
 static inline void isa_init_ioport(ISADevice *dev, uint16_t ioport)
 {
 if (dev  (dev-ioport_id == 0 || ioport  dev-ioport_id)) {
diff --git a/hw/isa.h b/hw/isa.h
index f7bc4b5..9447296 100644
--- a/hw/isa.h
+++ b/hw/isa.h
@@ -28,6 +28,7 @@ struct ISABus {
 BusState qbus;
 MemoryRegion *address_space_io;
 qemu_irq *irqs;
+int irq_inuse[ISA_NUM_IRQS];
 };
 
 struct ISADevice {
@@ -41,6 +42,7 @@ ISABus *isa_bus_new(DeviceState *dev, MemoryRegion 
*address_space_io);
 void isa_bus_irqs(ISABus *bus, qemu_irq *irqs);
 qemu_irq isa_get_irq(ISADevice *dev, int isairq);
 void isa_init_irq(ISADevice *dev, qemu_irq *p, int isairq);
+int isa_find_free_irq(ISABus *bus);
 MemoryRegion *isa_address_space(ISADevice *dev);
 ISADevice *isa_create(ISABus *bus, const char *name);
 ISADevice *isa_try_create(ISABus *bus, const char *name);
-- 
1.7.4.1

[Qemu-devel] [PATCH 5/9] IPMI: Add a PC ISA type structure

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

This provides the base infrastructure to tie IPMI low-level
interfaces into a PC ISA bus.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/isa_ipmi.c  |  138 
 hw/pc.c|   12 +++
 hw/pc.h|   18 +
 hw/smbios.h|   12 +++
 7 files changed, 183 insertions(+), 0 deletions(-)
 create mode 100644 hw/isa_ipmi.c

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index eb17afc..c0aff0d 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -8,6 +8,7 @@ CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
+CONFIG_ISA_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index e4e3e4f..615e4f2 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -8,6 +8,7 @@ CONFIG_VGA_CIRRUS=y
 CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
+CONFIG_ISA_IPMI=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 0d55997..8f27ffe 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -21,6 +21,7 @@ hw-obj-$(CONFIG_ESCC) += escc.o
 hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 
 hw-obj-$(CONFIG_IPMI) += ipmi.o
+hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/isa_ipmi.c b/hw/isa_ipmi.c
new file mode 100644
index 000..cad78b0
--- /dev/null
+++ b/hw/isa_ipmi.c
@@ -0,0 +1,138 @@
+/*
+ * QEMU ISA IPMI KCS emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include hw.h
+#include isa.h
+#include pc.h
+#include qemu-timer.h
+#include sysemu.h
+#include smbios.h
+#include ipmi.h
+
+
+typedef struct ISAIPMIState {
+ISADevice dev;
+uint32_t type;
+uint32_t iobase;
+uint32_t isairq;
+uint8_t slave_addr;
+IPMIState state;
+} ISAIPMIState;
+
+static int ipmi_isa_initfn(ISADevice *dev)
+{
+ISAIPMIState *isa = DO_UPCAST(ISAIPMIState, dev, dev);
+struct smbios_type_38 smb38;
+
+if (isa-iobase == -1) {
+   /* If no I/O base is specified, set the defaults */
+   switch (isa-type) {
+   case IPMI_KCS:
+   isa-iobase = 0xca2;
+   break;
+   case IPMI_BT:
+   isa-iobase = 0xe4;
+   break;
+   case IPMI_SMIC:
+   isa-iobase = 0xca9;
+   break;
+   default:
+   fprintf(stderr, Unknown IPMI type: %d\n, isa-type);
+   abort();
+   }
+}
+
+isa-state.slave_addr = isa-slave_addr;
+
+qdev_set_legacy_instance_id(dev-qdev, isa-iobase, 3);
+
+ipmi_init(isa-type, isa-state);
+
+if (isa-isairq  0) {
+   isa_init_irq(dev, isa-state.irq, isa-isairq);
+   isa-state.use_irq = 1;
+}
+
+isa_register_ioport(dev, isa-state.io, isa-iobase);
+
+smb38.header.type = 38;
+smb38.header.length = sizeof(smb38);
+smb38.header.handle = 0x3000;
+smb38.interface_type = isa-state.smbios_type;
+smb38.ipmi_version = 0x20;
+smb38.i2c_slave_addr = isa-state.slave_addr;
+smb38.nv_storage_dev_addr = 0;
+
+/* or 1 to set it to I/O space */
+smb38.base_addr = isa-iobase | 1;
+
+ /* 1-byte boundaries, addr bit0=0, level triggered irq */
+smb38.base_addr_mod_and_irq_info = 1;
+smb38.interrupt_number = isa-isairq;
+smbios_table_entry_add((struct smbios_structure_header *) smb38);
+
+return 0;
+}
+
+static Property ipmi_isa_properties[] = {
+DEFINE_PROP_HEX32(type, ISAIPMIState, type,  IPMI_KCS),
+

[Qemu-devel] [PATCH 6/9] IPMI: Add a KCS low-level interface

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

This provides the simulation of the KCS hardware interface.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_kcs.c  |  259 
 hw/ipmi_kcs.h  |   82 +++
 5 files changed, 344 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_kcs.c
 create mode 100644 hw/ipmi_kcs.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index c0aff0d..b549389 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -9,6 +9,7 @@ CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
+CONFIG_IPMI_KCS=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 615e4f2..af7d2a9 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -9,6 +9,7 @@ CONFIG_VMWARE_VGA=y
 CONFIG_VMMOUSE=y
 CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
+CONFIG_IPMI_KCS=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 8f27ffe..99e5d1e 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -22,6 +22,7 @@ hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
 
 hw-obj-$(CONFIG_IPMI) += ipmi.o
 hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
+hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_kcs.c b/hw/ipmi_kcs.c
new file mode 100644
index 000..61188c9
--- /dev/null
+++ b/hw/ipmi_kcs.c
@@ -0,0 +1,259 @@
+/*
+ * QEMU IPMI KCS emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include hw.h
+
+#include ipmi.h
+#include ipmi_kcs.h
+
+#define SET_OBF() \
+do {   \
+   IPMI_KCS_SET_OBF(kcs-status_reg, 1);   \
+   if (s-use_irq  s-irqs_enabled  !s-obf_irq_set) { \
+   s-obf_irq_set = 1; \
+   if (!s-atn_irq_set)\
+   qemu_irq_raise(s-irq); \
+   }   \
+} while (0)
+
+static void ipmi_kcs_handle_event(IPMIState *s)
+{
+IPMIKcsState *kcs = s-typeinfo;
+if (kcs-cmd_reg == IPMI_KCS_ABORT_STATUS_CMD) {
+   if (IPMI_KCS_GET_STATE(kcs-status_reg) != IPMI_KCS_ERROR_STATE) {
+   kcs-waiting_rsp++; /* Invalidate the message */
+   s-outmsg[0] = IPMI_KCS_STATUS_ABORTED_ERR;
+   s-outlen = 1;
+   s-outpos = 0;
+   IPMI_KCS_SET_STATE(kcs-status_reg, IPMI_KCS_ERROR_STATE);
+   SET_OBF();
+   }
+   goto out;
+}
+
+switch (IPMI_KCS_GET_STATE(kcs-status_reg)) {
+case IPMI_KCS_IDLE_STATE:
+   if (kcs-cmd_reg == IPMI_KCS_WRITE_START_CMD) {
+   IPMI_KCS_SET_STATE(kcs-status_reg, IPMI_KCS_WRITE_STATE);
+   kcs-cmd_reg = -1;
+   s-write_end = 0;
+   s-inlen = 0;
+   SET_OBF();
+   }
+   break;
+
+case IPMI_KCS_READ_STATE:
+handle_read:
+   if (s-outpos = s-outlen) {
+   IPMI_KCS_SET_STATE(kcs-status_reg, IPMI_KCS_IDLE_STATE);
+   SET_OBF();
+   } else if (kcs-data_in_reg == IPMI_KCS_READ_CMD) {
+   kcs-data_out_reg = s-outmsg[s-outpos];
+   s-outpos++;
+   SET_OBF();
+   } else {
+   s-outmsg[0] = IPMI_KCS_STATUS_BAD_CC_ERR;
+   s-outlen = 1;
+   s-outpos = 0;
+   IPMI_KCS_SET_STATE(kcs-status_reg, IPMI_KCS_ERROR_STATE);
+   SET_OBF();
+   goto out;

[Qemu-devel] [PATCH 2/9] pc: move SMBIOS setup to after device init

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

Setting up the firmware interface for the SMBIOS table needs to
be done later in the process, after device initialization, so that
devices can add entries to the table.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 hw/pc.c  |   22 +-
 hw/pc.h  |9 +
 hw/pc_piix.c |   12 
 3 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/hw/pc.c b/hw/pc.c
index fb04c8b..c0acb6a 100644
--- a/hw/pc.c
+++ b/hw/pc.c
@@ -971,20 +971,12 @@ void pc_cpus_init(const char *cpu_model)
 }
 
 void pc_memory_init(MemoryRegion *system_memory,
-const char *kernel_filename,
-const char *kernel_cmdline,
-const char *initrd_filename,
 ram_addr_t below_4g_mem_size,
 ram_addr_t above_4g_mem_size,
-MemoryRegion *rom_memory,
 MemoryRegion **ram_memory)
 {
-int linux_boot, i;
-MemoryRegion *ram, *option_rom_mr;
+MemoryRegion *ram;
 MemoryRegion *ram_below_4g, *ram_above_4g;
-void *fw_cfg;
-
-linux_boot = (kernel_filename != NULL);
 
 /* Allocate RAM.  We allocate it as a single memory region and use
  * aliases to address portions of it, mostly for backwards compatibility
@@ -1006,7 +998,17 @@ void pc_memory_init(MemoryRegion *system_memory,
 memory_region_add_subregion(system_memory, 0x1ULL,
 ram_above_4g);
 }
+}
 
+void pc_bios_init(const char *kernel_filename,
+ const char *kernel_cmdline,
+ const char *initrd_filename,
+ ram_addr_t below_4g_mem_size,
+ MemoryRegion *rom_memory)
+{
+MemoryRegion *option_rom_mr;
+void *fw_cfg;
+int linux_boot, i;
 
 /* Initialize PC system firmware */
 pc_system_firmware_init(rom_memory);
@@ -1019,6 +1021,8 @@ void pc_memory_init(MemoryRegion *system_memory,
 option_rom_mr,
 1);
 
+linux_boot = (kernel_filename != NULL);
+
 fw_cfg = bochs_bios_init();
 rom_set_fw(fw_cfg);
 
diff --git a/hw/pc.h b/hw/pc.h
index 8ccf202..33ab689 100644
--- a/hw/pc.h
+++ b/hw/pc.h
@@ -107,13 +107,14 @@ void pc_acpi_smi_interrupt(void *opaque, int irq, int 
level);
 
 void pc_cpus_init(const char *cpu_model);
 void pc_memory_init(MemoryRegion *system_memory,
-const char *kernel_filename,
-const char *kernel_cmdline,
-const char *initrd_filename,
 ram_addr_t below_4g_mem_size,
 ram_addr_t above_4g_mem_size,
-MemoryRegion *rom_memory,
 MemoryRegion **ram_memory);
+void pc_bios_init(const char *kernel_filename,
+ const char *kernel_cmdline,
+ const char *initrd_filename,
+ ram_addr_t below_4g_mem_size,
+ MemoryRegion *rom_memory);
 qemu_irq *pc_allocate_cpu_irq(void);
 DeviceState *pc_vga_init(ISABus *isa_bus, PCIBus *pci_bus);
 void pc_basic_device_init(ISABus *isa_bus, qemu_irq *gsi,
diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index fb86f27..21b4f59 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -176,10 +176,8 @@ static void pc_init1(MemoryRegion *system_memory,
 
 /* allocate ram and load rom/bios */
 if (!xen_enabled()) {
-pc_memory_init(system_memory,
-   kernel_filename, kernel_cmdline, initrd_filename,
-   below_4g_mem_size, above_4g_mem_size,
-   pci_enabled ? rom_memory : system_memory, ram_memory);
+pc_memory_init(system_memory, below_4g_mem_size, above_4g_mem_size,
+  ram_memory);
 }
 
 gsi_state = g_malloc0(sizeof(*gsi_state));
@@ -287,6 +285,12 @@ static void pc_init1(MemoryRegion *system_memory,
 if (pci_enabled) {
 pc_pci_device_init(pci_bus);
 }
+
+if (!xen_enabled()) {
+   pc_bios_init(kernel_filename, kernel_cmdline, initrd_filename,
+below_4g_mem_size,
+pci_enabled ? rom_memory : system_memory);
+}
 }
 
 static void pc_init_pci(ram_addr_t ram_size,
-- 
1.7.4.1

[Qemu-devel] [PATCH 8/9] IPMI: Add a local BMC simulation

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

This provides a minimal local BMC, basically enough to comply with the
spec and provide a complete watchdog timer (including a sensor, SDR,
and event).

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_sim.c  | 1273 
 hw/ipmi_sim.h  |  270 
 5 files changed, 1546 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_sim.c
 create mode 100644 hw/ipmi_sim.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index f8f8e6d..8c99d5d 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -11,6 +11,7 @@ CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
+CONFIG_IPMI_LOCAL=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 8c1177d..4d01883 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -11,6 +11,7 @@ CONFIG_IPMI=y
 CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
+CONFIG_IPMI_LOCAL=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index e1d30cc..193227d 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -24,6 +24,7 @@ hw-obj-$(CONFIG_IPMI) += ipmi.o
 hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
 hw-obj-$(CONFIG_IPMI_BT) += ipmi_bt.o
+hw-obj-$(CONFIG_IPMI_LOCAL) += ipmi_sim.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_sim.c b/hw/ipmi_sim.c
new file mode 100644
index 000..6813a86
--- /dev/null
+++ b/hw/ipmi_sim.c
@@ -0,0 +1,1273 @@
+/*
+ * IPMI BMC emulation
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include stdio.h
+#include string.h
+#include ipmi_sim.h
+
+static void ipmi_sim_handle_timeout(IPMIState *s);
+
+static void ipmi_gettime(struct ipmi_time *time)
+{
+int64_t stime;
+
+stime = get_clock_realtime();
+time-tv_sec = stime / 10LL;
+time-tv_nsec = stime % 10LL;
+}
+
+static int64_t ipmi_getmonotime(void)
+{
+return qemu_get_clock_ns(vm_clock);
+}
+
+static void ipmi_timeout(void *opaque)
+{
+IPMIState *s = opaque;
+
+ipmi_sim_handle_timeout(s);
+}
+
+static void set_timestamp(IPMISimState *ss, uint8_t *ts)
+{
+unsigned int val;
+struct ipmi_time now;
+
+ipmi_gettime(now);
+val = now.tv_sec + ss-sel.time_offset;
+ts[0] = val  0xff;
+ts[1] = (val  8)  0xff;
+ts[2] = (val  16)  0xff;
+ts[3] = (val  24)  0xff;
+}
+
+static void sdr_inc_reservation(IPMISdr *sdr)
+{
+sdr-reservation++;
+if (sdr-reservation == 0)
+   sdr-reservation = 1;
+}
+
+static int sdr_add_entry(IPMISimState *ss, const uint8_t *entry,
+unsigned int len, uint16_t *recid)
+{
+if ((len  5) || (len  255))
+   return 1;
+
+if (entry[ss-sdr.next_free + 4] != len)
+   return 1;
+
+if (ss-sdr.next_free + len  MAX_SDR_SIZE) {
+   ss-sdr.overflow = 1;
+   return 1;
+}
+
+memcpy(ss-sdr.sdr + ss-sdr.next_free, entry, len);
+ss-sdr.sdr[ss-sdr.next_free] = ss-sdr.next_rec_id  0xff;
+ss-sdr.sdr[ss-sdr.next_free+1] = (ss-sdr.next_rec_id  8)  0xff;
+ss-sdr.sdr[ss-sdr.next_free+2] = 0x51; /* Conform to IPMI 1.5 spec */
+
+if (recid)
+   *recid = ss-sdr.next_rec_id;
+ss-sdr.next_rec_id++;
+set_timestamp(ss, ss-sdr.last_addition);
+ss-sdr.next_free += len;
+sdr_inc_reservation(ss-sdr);
+return 0;
+}
+
+static int sdr_find_entry(IPMISdr *sdr, uint16_t recid,
+ unsigned int *retpos, uint16_t *nextrec)
+{

[Qemu-devel] [PATCH 9/9] IPMI: Add an external connection simulation interface

2012-07-09 Thread minyard

From: Corey Minyard cminy...@mvista.com

This adds an interface for IPMI that connects to a remote
BMC over a chardev (generally a TCP socket).  The OpenIPMI
lanserv simulator describes this interface; see its documentation
for interface details.

Signed-off-by: Corey Minyard cminy...@mvista.com
---
 default-configs/i386-softmmu.mak   |1 +
 default-configs/x86_64-softmmu.mak |1 +
 hw/Makefile.objs   |1 +
 hw/ipmi_extern.c   |  421 
 hw/ipmi_extern.h   |   75 +++
 5 files changed, 499 insertions(+), 0 deletions(-)
 create mode 100644 hw/ipmi_extern.c
 create mode 100644 hw/ipmi_extern.h

diff --git a/default-configs/i386-softmmu.mak b/default-configs/i386-softmmu.mak
index 8c99d5d..325f92e 100644
--- a/default-configs/i386-softmmu.mak
+++ b/default-configs/i386-softmmu.mak
@@ -12,6 +12,7 @@ CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
 CONFIG_IPMI_LOCAL=y
+CONFIG_IPMI_EXTERN=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/default-configs/x86_64-softmmu.mak 
b/default-configs/x86_64-softmmu.mak
index 4d01883..2ac9177 100644
--- a/default-configs/x86_64-softmmu.mak
+++ b/default-configs/x86_64-softmmu.mak
@@ -12,6 +12,7 @@ CONFIG_ISA_IPMI=y
 CONFIG_IPMI_KCS=y
 CONFIG_IPMI_BT=y
 CONFIG_IPMI_LOCAL=y
+CONFIG_IPMI_EXTERN=y
 CONFIG_SERIAL=y
 CONFIG_PARALLEL=y
 CONFIG_I8254=y
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 193227d..06757b0 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -25,6 +25,7 @@ hw-obj-$(CONFIG_ISA_IPMI) += isa_ipmi.o
 hw-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs.o
 hw-obj-$(CONFIG_IPMI_BT) += ipmi_bt.o
 hw-obj-$(CONFIG_IPMI_LOCAL) += ipmi_sim.o
+hw-obj-$(CONFIG_IPMI_EXTERN) += ipmi_extern.o
 
 hw-obj-$(CONFIG_SERIAL) += serial.o
 hw-obj-$(CONFIG_PARALLEL) += parallel.o
diff --git a/hw/ipmi_extern.c b/hw/ipmi_extern.c
new file mode 100644
index 000..bbb8469
--- /dev/null
+++ b/hw/ipmi_extern.c
@@ -0,0 +1,421 @@
+/*
+ * IPMI BMC external connection
+ *
+ * Copyright (c) 2012 Corey Minyard, MontaVista Software, LLC
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the Software), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * This is designed to connect with OpenIPMI's lanserv serial interface
+ * using the VM connection type.  See that for details.
+ */
+
+#include ipmi_extern.h
+
+static int can_receive(void *opaque);
+static void receive(void *opaque, const uint8_t *buf, int size);
+static void chr_event(void *opaque, int event);
+
+static unsigned char
+ipmb_checksum(const unsigned char *data, int size, unsigned char start)
+{
+   unsigned char csum = start;
+
+   for (; size  0; size--, data++)
+   csum += *data;
+
+   return csum;
+}
+
+static void continue_send(IPMIState *s, IPMIExternState *es)
+{
+if (es-outlen == 0)
+   goto check_reset;
+
+ send:
+es-outpos += qemu_chr_fe_write(es-chr, es-outbuf + es-outpos,
+   es-outlen - es-outpos);
+if (es-outpos  es-outlen) {
+   /* Not fully transmitted, try again in a 10ms */
+   qemu_mod_timer(es-extern_timer,
+  qemu_get_clock_ns(vm_clock) + 1000);
+} else {
+   /* Sent */
+   es-outlen = 0;
+   es-outpos = 0;
+   if (!es-sending_cmd)
+   es-waiting_rsp = 1;
+   else
+   es-sending_cmd = 0;
+
+check_reset:
+   if (es-send_reset) {
+   /* Send the reset */
+   es-outbuf[0] = VM_CMD_RESET;
+   es-outbuf[1] = VM_CMD_CHAR;
+   es-outlen = 2;
+   es-outpos = 0;
+   es-send_reset = 0;
+   es-sending_cmd = 1;
+   goto send;
+   }
+
+   if (es-waiting_rsp)
+   /* Make sure we get a response within 4 seconds. */
+   qemu_mod_timer(es-extern_timer,
+  qemu_get_clock_ns(vm_clock) + 40ULL);
+}
+return;
+}
+
+static void extern_timeout(void *opaque)
+{
+IPMIState *s = opaque;

Re: [Qemu-devel] [libvirt] [RFC 0/5] block: File descriptor passing using -open-hook-fd

2012-07-09 Thread Anthony Liguori


On 05/17/2012 09:14 AM, Eric Blake wrote:

On 05/17/2012 07:42 AM, Stefan Hajnoczi wrote:



The -open-hook-fd approach allows QEMU to support file descriptor passing
without changing -drive.  It also supports snapshot_blkdev and other commands

By the way, How will it support them?


The problem with snapshot_blkdev is that closing a file and opening a
new file cannot be done by the QEMU process when an SELinux policy is in
place to prevent opening files.


snapshot_blkdev can take an fd:name instead of a /path/to/file for the
file to open, in which case libvirt can pass in the named fd _prior_ to
the snapshot_blkdev using the 'getfd' monitor command.



The -open-hook-fd approach works even when the QEMU process is not
allowed to open files since file descriptor passing over a UNIX domain
socket is used to open files on behalf of QEMU.


The -open-hook-fd approach would indeed allow snapshot_blkdev to ask
for the fd after the fact, but it's much more painful.  Consider a case
with a two-disk snapshot:

with the fd:name approach, the sequence is:

libvirt calls getfd:name1 over normal monitor
qemu responds
libvirt calls getfd:name2 over normal monitor
qemu responds
libvirt calls transaction around blockdev-snapshot-sync over normal
monitor, using fd:name1 and fd:name2
qemu responds

but with -open-hook-fd, the approach would be:

libvirt calls transaction
qemu calls open(file1) over hook
libvirt responds
qemu calls open(file2) over hook
libvirt responds
qemu responds to the original transaction

The 'transaction' operation is thus blocked by the time it takes to do
two intermediate opens over a second channel, which kind of defeats the
purpose of making the transaction take effect with minimal guest
downtime.


How are you defining guest down time?

It's important to note that code running in QEMU does not equate to guest 
visible down time unless QEMU does an explicit vm_stop() which is not happening 
here.


Instead, a VCPU may become blocked *if* it attempts to acquire qemu_mutex while 
QEMU is holding it.


If your concern is qemu_mutex being held while waiting for libvirt, it would be 
fairly easy to implement a qemu_open_async() that allowed dropping back 
to the main loop and then calling a callback when the open completes.


It would be pretty trivial to convert qmp_transaction to use such a command.

But this is all speculative.  There's no reason to believe that an RPC would 
have a noticeable guest visible latency unless you assume there's lock contention. 
 I would strongly suspect that the bdrv_flush() is going to be a much greater 
source of lock contention than the RPC would be.  An RPC is only bound by 
scheduler latency whereas synchronous disk I/O is bound by spinning a platter.



And libvirt code becomes a lot trickier to deal with the fact
that two channels are in use, and that the channel that issued the
'transaction' command must block while the other channel for handling
hooks must be responsive.


All libvirt needs to do is listen on a socket and delegate access according to a 
white list.  Whatever is providing fd's needs to have no knowledge of anything 
other than what the guest is allowed to access which shouldn't depend on an 
executing command.


Regards,

Anthony Liguori


I'm really disliking the hook-fd approach, when a better solution is to
make use of 'getfd' in advance of any operation that will need to open
new fds.

Re: [Qemu-devel] [libvirt] [RFC 0/5] block: File descriptor passing using -open-hook-fd

2012-07-09 Thread Eric Blake

On 07/09/2012 02:00 PM, Anthony Liguori wrote:

 with the fd:name approach, the sequence is:

 libvirt calls getfd:name1 over normal monitor
 qemu responds
 libvirt calls getfd:name2 over normal monitor
 qemu responds
 libvirt calls transaction around blockdev-snapshot-sync over normal
 monitor, using fd:name1 and fd:name2
 qemu responds

This general layout is true whether we rewrite all commands to
understand fd:nnn (proposal 1) or whether we add new magic parsing
(/dev/fd/nnn of proposal 3, or even /dev/fdset/nnn of proposal 5), all
as called out in these messages:

https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg00227.html
https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg01098.html


 but with -open-hook-fd, the approach would be:

 libvirt calls transaction
 qemu calls open(file1) over hook
 libvirt responds
 qemu calls open(file2) over hook
 libvirt responds
 qemu responds to the original transaction

whereas this approach is quite different in semantics, but may indeed be
easier for qemu to implement, at the expense of some more complexity on
the part of libvirt.

At the high level, I think both approaches have one thing in common: by
refactoring all qemu code to go through qemu_open(), we can then
implement our desired complexity (whether fd:nn, /dev/fd/nnn,
/dev/fdset/nnn, or some other magic name parsing; or whether it is an
rpc call over a second socket in parallel to the monitor socket) in just
one location.  Likewise, both approaches have to deal with libvirtd
restarts (magic name parsing by changing an 'inuse' flag when the
monitor detects EOF; rpc passing by failing a qemu_open() when the rpc
socket detects EOF).


 The 'transaction' operation is thus blocked by the time it takes to do
 two intermediate opens over a second channel, which kind of defeats the
 purpose of making the transaction take effect with minimal guest
 downtime.
 
 How are you defining guest down time?
 
 It's important to note that code running in QEMU does not equate to
 guest visible down time unless QEMU does an explicit vm_stop() which is
 not happening here.
 
 Instead, a VCPU may become blocked *if* it attempts to acquire qemu_mutex
 while QEMU is holding it.
 
 If your concern is qemu_mutex being held while waiting for libvirt, it
 would be fairly easy to implement a qemu_open_async() that dropped
 allowed dropping back to the main loop and then calling a callback when
 the open completes.
 
 It would be pretty trivial to convert qmp_transaction to use such a
 command.

In other words, remembering that transactions are divided into phases:

phase 1 - prepare: obtain all needed fds (whether by pre-opening them
via 'pass-fd' or other new 'getfd' relative, or whether by RPC calls);
no guest downtime, and with cleanup that avoids any leaks on any failures
phase 2 - commit: flush all devices and actually make the changes in
qemu state to use the fds obtained in phase 1

and where the guest downtime (if any) is more likely due to flushing
changes in phase 2

 
 But this is all speculative.  There's no reason to believe that an RPC
 would have a noticable guest visible latency unless you assume there's
 lot contention.  I would strongly suspect that the bdrv_flush() is going
 to be a much greater source of lock contention than the RPC would be. 
 An RPC is only bound by scheduler latency whereas synchronous disk I/O
 is bound spinning a platter.
 
 And libvirt code becomes a lot trickier to deal with the fact
 that two channels are in use, and that the channel that issued the
 'transaction' command must block while the other channel for handling
 hooks must be responsive.
 
 All libvirt needs to do is listen on a socket and delegate access
 according to a white list.  Whatever is providing fd's needs to have no
 knowledge of anythign other than what the guest is allowed to access
 which shouldn't depend on an executing command.

That's not quite accurate.  What the guest is allowed to access should
indeed change depending on the executing command.  That is, if I start a
guest with:

base - delta

then I only want to permit O_RDONLY access to base but O_RDWR access to
delta.  If I then call 'blockdev-snapshot-sync', I want to change to the
situation:

base - delta - snap

and give O_RDWR permissions to snap; it would also be nice if qemu
attempts to reopen delta with O_RDONLY permissions (although from a
trust perspective, libvirt must assume that delta is still O_RDWR
because qemu may have been compromised and lie about the tightening of
permissions); at any rate, depending on SELinux capabilities of the
file, libvirt may be able to enforce no further writes to 'delta' by
toggling a SELinux label (obviously, this should only be done after
'blockdev-snapshot-sync' completes).

On the other hand, the user could decide to do a 'block-commit', to
squash things into:

base

where base is now O_RDWR.  But libvirt doesn't want to grant
write-access to 'base' up-front, so the whitelist allowing O_RDWR access
to

Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only

2012-07-09 Thread Catalin Patulea

On Mon, Jul 2, 2012 at 11:25 AM, Catalin Patulea catal...@google.com wrote:
 FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented 
 very
 similarly).

 The code (the bulk of which is remainder_kernel and do_fprem) is derived from
 Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
 Qemu type aliases, C features only, etc. as needed.

 Signed-off-by: Catalin Patulea catal...@google.com
 ---
  fpu/softfloat.c |  195 
 +++
  fpu/softfloat.h |4 +
  target-i386/op_helper.c |  166 
  3 files changed, 266 insertions(+), 99 deletions(-)

 diff --git a/fpu/softfloat.c b/fpu/softfloat.c
 index b29256a..bd1879d 100644
 [...]
Ping - how do people feel about the latest patch?

[Qemu-devel] KVM call agenda for Tuesday, July 10th

2012-07-09 Thread Juan Quintela


Hi

Please send in any agenda items you are interested in covering.

Later, Juan.

Re: [Qemu-devel] [libvirt] [RFC 0/5] block: File descriptor passing using -open-hook-fd

2012-07-09 Thread Anthony Liguori


On 07/09/2012 03:29 PM, Eric Blake wrote:

On 07/09/2012 02:00 PM, Anthony Liguori wrote:


with the fd:name approach, the sequence is:

libvirt calls getfd:name1 over normal monitor
qemu responds
libvirt calls getfd:name2 over normal monitor
qemu responds
libvirt calls transaction around blockdev-snapshot-sync over normal
monitor, using fd:name1 and fd:name2
qemu responds


This general layout is true whether we rewrite all commands to
understand fd:nnn (proposal 1) or whether we add new magic parsing
(/dev/fd/nnn of proposal 3, or even /dev/fdset/nnn of proposal 5), all
as called out in these messages:

https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg00227.html
https://lists.gnu.org/archive/html/qemu-devel/2012-07/msg01098.html



but with -open-hook-fd, the approach would be:

libvirt calls transaction
qemu calls open(file1) over hook
libvirt responds
qemu calls open(file2) over hook
libvirt responds
qemu responds to the original transaction


whereas this approach is quite different in semantics, but may indeed be
easier for qemu to implement, at the expense of some more complexity on
the part of libvirt.

At the high level, I think both approaches have one thing in common: by
refactoring all qemu code to go through qemu_open(), we can then
implement our desired complexity (whether fd:nn, /dev/fd/nnn,
/dev/fdset/nnn, or some other magic name parsing; or whether it is an
rpc call over a second socket in parallel to the monitor socket) in just
one location.  Likewise, both approaches have to deal with libvirtd
restarts (magic name parsing by changing an 'inuse' flag when the
monitor detects EOF; rpc passing by failing a qemu_open() when the rpc
socket detects EOF).


Ack.





The 'transaction' operation is thus blocked by the time it takes to do
two intermediate opens over a second channel, which kind of defeats the
purpose of making the transaction take effect with minimal guest
downtime.


How are you defining guest down time?

It's important to note that code running in QEMU does not equate to
guest visible down time unless QEMU does an explicit vm_stop() which is
not happening here.

Instead, a VCPU may become blocked *if* it attempts to acquire qemu_mutex
while QEMU is holding it.

If your concern is qemu_mutex being held while waiting for libvirt, it
would be fairly easy to implement a qemu_open_async() that dropped
allowed dropping back to the main loop and then calling a callback when
the open completes.

It would be pretty trivial to convert qmp_transaction to use such a
command.


In other words, remembering that transactions are divided into phases:

phase 1 - prepare: obtain all needed fds (whether by pre-opening them
via 'pass-fd' or other new 'getfd' relative, or whether by RPC calls);
no guest downtime, and with cleanup that avoids any leaks on any failures
phase 2 - commit: flush all devices and actually make the changes in
qemu state to use the fds obtained in phase 1

and where the guest downtime (if any) is more likely due to flushing
changes in phase 2


Not quite.  A synchronous flush can cause lock contention.  We need to separate 
out the problem of lock contention from guest down time.


Also, there's no obvious need to move the flushes before opens.  The main issue 
is that we use qemu_mutex to effectively create a write queue.


You can imagine a simple write queueing mechanism that would obviate the need 
for this such that we could flush, queue upcoming writes, and drop 
qemu_mutex to sleep waiting for libvirt to send us our fds.



But this is all speculative.  There's no reason to believe that an RPC
would have a noticable guest visible latency unless you assume there's
lot contention.  I would strongly suspect that the bdrv_flush() is going
to be a much greater source of lock contention than the RPC would be.
An RPC is only bound by scheduler latency whereas synchronous disk I/O
is bound spinning a platter.


And libvirt code becomes a lot trickier to deal with the fact
that two channels are in use, and that the channel that issued the
'transaction' command must block while the other channel for handling
hooks must be responsive.


All libvirt needs to do is listen on a socket and delegate access
according to a white list.  Whatever is providing fd's needs to have no
knowledge of anythign other than what the guest is allowed to access
which shouldn't depend on an executing command.


That's not quite accurate.  What the guest is allowed to access should
indeed change depending on the executing command.  That is, if I start a
guest with:


I should have spoken more clearly.  libvirt may change the white list for various 
reasons dynamically.  But there shouldn't be a direct dependency on whatever is 
serving up fd's and whatever is changing the white list.


Basically, you just need a shared hash table for each guest.  It should be quite 
simple.



Maybe the only reason that I'm still leaning towards a 'pass-fd'
solution instead of a hook fd solution is that

Re: [Qemu-devel] [PATCH v16 8/9] Add set_cachesize command

2012-07-09 Thread Eric Blake

On 07/09/2012 12:22 PM, Orit Wasserman wrote:
 Change XBZRLE cache size in bytes (the size should be a power of 2).
 If XBZRLE cache size is too small there will be many cache miss.
 
 Signed-off-by: Benoit Hudzia benoit.hud...@sap.com
 Signed-off-by: Petter Svard pett...@cs.umu.se
 Signed-off-by: Aidan Shribman aidan.shrib...@sap.com
 Signed-off-by: Orit Wasserman owass...@redhat.com

 +++ b/qapi-schema.json
 @@ -1390,6 +1390,22 @@
  { 'command': 'migrate_set_speed', 'data': {'value': 'int'} }

We are copying after bad practice, but...

  
  ##
 +# @migrate_set_cachesize

...new QMP commands should prefer '-' over '_'.  While the HMP version
is fine with migrate_set_cachesize, the QMP command should be
migrate-set-cachesize (or even 'migrate-set-cache-size').


 +
 +Set cache size to be used by XBZRLE migration, the cache size will be round 
 down

s/round/rounded/

-- 
Eric Blake   ebl...@redhat.com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-devel] [PATCH] target-i386: implement FPREM and FPREM1 using softfloat only

2012-07-09 Thread Peter Maydell

On 2 July 2012 16:25, Catalin Patulea catal...@google.com wrote:
 FPREM1 now passes the TestFloat floatx80_rem suite (and FPREM is implemented 
 very
 similarly).

 The code (the bulk of which is remainder_kernel and do_fprem) is derived from
 Bochs SVN revision 11224 dated 2012-06-21 10:33:37 -0700, with conversions to
 Qemu type aliases, C features only, etc. as needed.

QEMU is the official capitalization.


 Signed-off-by: Catalin Patulea catal...@google.com
 ---
  fpu/softfloat.c |  195 
 +++
  fpu/softfloat.h |4 +
  target-i386/op_helper.c |  166 
  3 files changed, 266 insertions(+), 99 deletions(-)

 diff --git a/fpu/softfloat.c b/fpu/softfloat.c
 index b29256a..bd1879d 100644
 --- a/fpu/softfloat.c
 +++ b/fpu/softfloat.c
 @@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b 
 STATUS_PARAM )
  }

  
 /*
 +| Returns 1 if the extended double-precision floating-point value `a' is an
 +| unsupported value; otherwise returns 0.

Let's try for something a little less cryptic to save future readers having
to dig out the intel architecture manuals:
an unsupported value (ie the bit pattern does not represent a valid
IEEE number).

 +**/
 +int floatx80_is_unsupported(floatx80 a)
 +{
 +return extractFloatx80Exp(a) 
 +   !(extractFloatx80Frac(a)  LIT64(0x8000));
 +}

This doesn't match up with all the cases in the Intel Software Developer's
Manual table 8.3: it catches exponent non-zero but explicit integer bit
is zero (pseudo-NaNs, pseudo-infinities, and unnormals) but not the case
of exponent is zero but explicit integer bit is one (pseudo-denormals).

[For those following along at home, it is because floatx80 has an explicit
integer bit rather than the implicit bit used in IEEE single and double
that you can get 'unsupported' bit patterns, where the explicit bit is
a value different from what the implicit bit would be under IEEE rules.]

 +
 +/*
  | Returns the result of converting the quadruple-precision floating-point
  | value `a' to the 32-bit two's complement integer format.  The conversion
  | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 @@ -6828,6 +6838,191 @@ floatx80 floatx80_scalbn( floatx80 a, int n 
 STATUS_PARAM )
aSign, aExp, aSig, 0 STATUS_VAR );
  }

 +/* executes single exponent reduction cycle */
 +static uint64_t remainder_kernel(uint64_t aSig0, uint64_t bSig, int expDiff,
 + uint64_t *zSig0, uint64_t *zSig1)
 +{
 +uint64_t term0, term1;
 +uint64_t aSig1 = 0;
 +
 +shortShift128Left(aSig1, aSig0, expDiff, aSig1, aSig0);
 +uint64_t q = estimateDiv128To64(aSig1, aSig0, bSig);

Declaring variables in the middle of code isn't QEMU coding style;
top of the function, please. [Other cases below; I haven't bothered
to call them all out.]

 +mul64To128(bSig, q, term0, term1);
 +sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
 +while ((int64)(*zSig1)  0) {

Cast to int64 is almost certainly wrong: this conditional will
give different results if int64 is exactly 64 bits vs if it is
more than 64 bits. You probably wanted int64_t.

 +--q;
 +add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
 +}
 +return q;
 +}
 +
 +static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64_t *q,
 +int rounding_mode STATUS_PARAM)
 +{
 +int32 aExp, bExp, zExp, expDiff;
 +uint64_t aSig0, aSig1, bSig;
 +flag aSign;
 +*q = 0;
 +
 +/* handle unsupported extended double-precision floating encodings */
 +if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) {
 +float_raise(float_flag_invalid, status);

Use STATUS_VAR. It's stupid, but let's be consistently stupid until
we get round to globally fixing it.

 +*r = floatx80_default_nan;
 +return -1;
 +}
 +
 +aSig0 = extractFloatx80Frac(a);
 +aExp = extractFloatx80Exp(a);
 +aSign = extractFloatx80Sign(a);
 +bSig = extractFloatx80Frac(b);
 +bExp = extractFloatx80Exp(b);
 +
 +if (aExp == 0x7FFF) {
 +if ((uint64_t) (aSig01) || ((bExp == 0x7FFF) 
 +  (uint64_t) (bSig1))) {

aSig0 and bSig are already uint64_t, why the casts?

 +*r = propagateFloatx80NaN(a, b, status);
 +return -1;
 +}
 +float_raise(float_flag_invalid, status);
 +*r = floatx80_default_nan;
 +return -1;
 +}
 +if (bExp == 0x7FFF) {
 +if ((uint64_t) (bSig1)) {
 +*r = propagateFloatx80NaN(a, b, status);
 +return -1;
 +}
 +if (aExp == 0

[Qemu-devel] buildbot failure in qemu on default_i386_rhel61

2012-07-09 Thread qemu

The Buildbot has detected a new failure on builder default_i386_rhel61 while 
building qemu.
Full details are available at:
 http://buildbot.b1-systems.de/qemu/builders/default_i386_rhel61/builds/304

Buildbot URL: http://buildbot.b1-systems.de/qemu/

Buildslave for this Build: kraxel_rhel61_32bit

Build Reason: The Nightly scheduler named 'nightly_default' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed compile

sincerely,
 -The Buildbot

Re: [Qemu-devel] [PULL 00/14] SCSI updates for 2012-07-02

2012-07-09 Thread Alexander Graf


On 09.07.2012, at 18:48, Anthony Liguori wrote:

 On 07/02/2012 04:41 AM, Paolo Bonzini wrote:
 Anthony,
 
 The following changes since commit 71ea2e016131a9fcde6f1ffd3e0e34a64c21f593:
 
   bsd-user: fix build (2012-06-28 20:28:36 +)
 
 Pulled.  Thanks.

Megasas? :)


http://buildbot.b1-systems.de/qemu/builders/default_i386_rhel61/builds/304/steps/compile/logs/stdio

[Qemu-devel] [PATCH] megasas: disable due to build breakage

2012-07-09 Thread Anthony Liguori

The Buildbot has detected a new failure on builder default_i386_rhel61 while
building qemu.

Full details are available at:
 http://buildbot.b1-systems.de/qemu/builders/default_i386_rhel61/builds/304

The proper fix is non-trivial so let's disable the build by default until it's
fixed properly.

Signed-off-by: Anthony Liguori aligu...@us.ibm.com
---
 default-configs/pci.mak |1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index 4b49c00..9d3e1db 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -10,7 +10,6 @@ CONFIG_EEPRO100_PCI=y
 CONFIG_PCNET_PCI=y
 CONFIG_PCNET_COMMON=y
 CONFIG_LSI_SCSI_PCI=y
-CONFIG_MEGASAS_SCSI_PCI=y
 CONFIG_RTL8139_PCI=y
 CONFIG_E1000_PCI=y
 CONFIG_IDE_CORE=y
-- 
1.7.5.4

Re: [Qemu-devel] [PULL 00/14] SCSI updates for 2012-07-02

2012-07-09 Thread Anthony Liguori


On 07/09/2012 06:09 PM, Alexander Graf wrote:


On 09.07.2012, at 18:48, Anthony Liguori wrote:


On 07/02/2012 04:41 AM, Paolo Bonzini wrote:

Anthony,

The following changes since commit 71ea2e016131a9fcde6f1ffd3e0e34a64c21f593:

   bsd-user: fix build (2012-06-28 20:28:36 +)


Pulled.  Thanks.


Megasas? :)


So this code is really broken:

info.host.type = MFI_INFO_HOST_PCIX;
info.device.type = MFI_INFO_DEV_SAS3G;
info.device.port_count = 2;
info.device.port_addr[0] = cpu_to_le64(megasas_gen_sas_addr((uint64_t)s));

This will make migration impossible not to mention the fact that casting a 
pointer to a uint64_t is really broken.


This code needs to be refactored to not do this.  It's quite pervasive though 
(there's a half a dozen instances like this).


I'm going to disable the build by default.  I don't want to see a rash fix like 
(uint64_t)(intptr_t).  This needs to be fixed by not making the pointer address 
guest visible.  It can then be re-enabled.  Should be easy enough to update your 
.mak config if you want to test between now and then.


Regards,

Anthony Liguori




http://buildbot.b1-systems.de/qemu/builders/default_i386_rhel61/builds/304/steps/compile/logs/stdio

[Qemu-devel] buildbot failure in qemu on default_mingw32

2012-07-09 Thread qemu

The Buildbot has detected a new failure on builder default_mingw32 while 
building qemu.
Full details are available at:
 http://buildbot.b1-systems.de/qemu/builders/default_mingw32/builds/312

Buildbot URL: http://buildbot.b1-systems.de/qemu/

Buildslave for this Build: kraxel_rhel61

Build Reason: The Nightly scheduler named 'nightly_default' triggered this build
Build Source Stamp: [branch master] HEAD
Blamelist: 

BUILD FAILED: failed compile

sincerely,
 -The Buildbot

[Qemu-devel] [Bug 1018530] Re: No write access in a 9p/virtfs shared folder

2012-07-09 Thread M. Mohan Kumar

No, commit  daf0b9aca9f67323266af1a92e8ea06f9d7bf408 added create
support proxy FS driver model. Local FS had support for creating files
much before.

Georg, is qemu running with root user privileges?

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1018530

Title:
  No write access in a 9p/virtfs shared folder

Status in QEMU:
  New
Status in “qemu-kvm” package in Ubuntu:
  Fix Released

Bug description:
  Ubuntu version:  Ubuntu 12.04 LTS
  Kernel: 3.2.0-25-generic
  Version of qemu-kvm: 1.0+noroms-0ubuntu13

  I have created a shared folder for a virtual machine which is
  managed by libvirt.

  filesystem type='mount' accessmode='passthrough'
  source dir='/storage/data'/
  target dir='data'/
  address type='pci' domain='0x' bus='0x00' slot='0x08' function='0x0'/
  /filesystem

  I mounted it in the virtual machine with this command:  mount -t 9p -o 
trans=virtio,version=9p2000.L data /data
  The filesystem permissions of all files and folders in the shared folder are 
set to 777. I expected that I have the full permissions also in the virtual 
machine.

  Regardless of the permissions on the filesystem I cannot write or create 
files and folders in the virtual machine. The original filesystem (/storage) is 
XFS.
  In another shared folder (similar config in libvirt) which is originally NTFS 
I have no problems.

  ProblemType: Bug
  DistroRelease: Ubuntu 12.04
  Package: qemu-kvm 1.0+noroms-0ubuntu13
  ProcVersionSignature: Ubuntu 3.2.0-25.40-generic 3.2.18
  Uname: Linux 3.2.0-25-generic x86_64
  ApportVersion: 2.0.1-0ubuntu8
  Architecture: amd64
  Date: Wed Jun 27 20:15:20 2012
  InstallationMedia: Ubuntu-Server 12.04 LTS Precise Pangolin - Beta amd64 
(20120409)
  MachineType: To be filled by O.E.M. To be filled by O.E.M.
  ProcEnviron:
   TERM=xterm
   LANG=de_DE.UTF-8
   SHELL=/bin/bash
  ProcKernelCmdLine: BOOT_IMAGE=/vmlinuz-3.2.0-25-generic 
root=/dev/mapper/system-root ro
  SourcePackage: qemu-kvm
  UpgradeStatus: No upgrade log present (probably fresh install)
  dmi.bios.date: 04/18/2012
  dmi.bios.vendor: American Megatrends Inc.
  dmi.bios.version: 1208
  dmi.board.asset.tag: To be filled by O.E.M.
  dmi.board.name: M5A99X EVO
  dmi.board.vendor: ASUSTeK COMPUTER INC.
  dmi.board.version: Rev 1.xx
  dmi.chassis.asset.tag: To Be Filled By O.E.M.
  dmi.chassis.type: 3
  dmi.chassis.vendor: To Be Filled By O.E.M.
  dmi.chassis.version: To Be Filled By O.E.M.
  dmi.modalias: 
dmi:bvnAmericanMegatrendsInc.:bvr1208:bd04/18/2012:svnTobefilledbyO.E.M.:pnTobefilledbyO.E.M.:pvrTobefilledbyO.E.M.:rvnASUSTeKCOMPUTERINC.:rnM5A99XEVO:rvrRev1.xx:cvnToBeFilledByO.E.M.:ct3:cvrToBeFilledByO.E.M.:
  dmi.product.name: To be filled by O.E.M.
  dmi.product.version: To be filled by O.E.M.
  dmi.sys.vendor: To be filled by O.E.M.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1018530/+subscriptions

Re: [Qemu-devel] [RFC] introduce a dynamic library to expose qemu block API

2012-07-09 Thread Wenchao Xia

于 2012-7-9 17:13, Paolo Bonzini 写道:
 Il 09/07/2012 10:54, Wenchao Xia ha scritto:
 Following is my implementing plan draft:
1 introduce libqblock.so in sub directory in qemu.
2 write a nbd client in libqblock, similar to qemu nbd client. Then
 use it to talk with nbd server, by default is qemu-nbd, to get access
 to images. In this way, libqblock.so could be friendly LGPL licensed.
 
 Did you actually assess the license situation of the block layer?
 block.c and large parts of block/* are under a BSD license, for example.
   If the library only has to support raw files, it might do so using
 synchronous I/O only.  This would remove a large body of GPL-licensed code.
 
  If the library was built as nbd-client communicating with nbd-server,
which then employ the BSD licensed code, could the library ignore the
server side's license problem? The reason using nbd-client approach are:
work around qemu block layer license issue and easy to implement, if
other tool found this library useful then considering about directly
employ the qemu block code.


3 still not got a good way to get additional info in (2)(3)(4),
 currently in my head is patch qemu-nbd to add an additional nbd command,
 image-info, in which returns related info.
 
 On the Linux kernel mailing list I would have no qualms labeling such
 command as crap.  However, since the social standards on qemu-devel
 are a bit higher, I'll ask instead: what information would the command
 provide beyond the size?
 
  The API need to report the image format it is using, such as
qcow2. And also API should report if a block at offset have been
allocated or it is a hole.

 Paolo
 


-- 
Best Regards

Wenchao Xia

Re: [Qemu-devel] [PATCH v16 8/9] Add set_cachesize command

2012-07-09 Thread Orit Wasserman

On 07/09/2012 11:59 PM, Eric Blake wrote:
 On 07/09/2012 12:22 PM, Orit Wasserman wrote:
 Change XBZRLE cache size in bytes (the size should be a power of 2).
 If XBZRLE cache size is too small there will be many cache miss.

 Signed-off-by: Benoit Hudzia benoit.hud...@sap.com
 Signed-off-by: Petter Svard pett...@cs.umu.se
 Signed-off-by: Aidan Shribman aidan.shrib...@sap.com
 Signed-off-by: Orit Wasserman owass...@redhat.com
 
 +++ b/qapi-schema.json
 @@ -1390,6 +1390,22 @@
  { 'command': 'migrate_set_speed', 'data': {'value': 'int'} }
 
 We are copying after bad practice, but...
 
  
  ##
 +# @migrate_set_cachesize
 
 ...new QMP commands should prefer '-' over '_'.  While the HMP version
 is fine with migrate_set_cachesize, the QMP command should be
 migrate-set-cachesize (or even 'migrate-set-cache-size').
 
I will change it
Thanks,
Orit
 
 +
 +Set cache size to be used by XBZRLE migration, the cache size will be round 
 down
 
 s/round/rounded/

Re: [Qemu-devel] [RFC] introduce a dynamic library to expose qemu block API

2012-07-09 Thread Wenchao Xia


于 2012-7-9 17:27, Daniel P. Berrange 写道:

On Mon, Jul 09, 2012 at 04:54:08PM +0800, Wenchao Xia wrote:

Hi, Paolo and folks,
   qemu have good capabilities to access different virtual disks, I want
to expose its block layer API to let 3rd party program linked in, such
as management stack or block tools, to access images data directly.

Following is the objects:
   (1) API to write/read block device at offset.
   (2) Determine the image type,qcow2/qed/raw
   (3) Determine which blocks are allocated.
   (4) Determine backing file.

Following is my draft implementation plan:
   1 introduce libqblock.so in a subdirectory of qemu.
   2 write an nbd client in libqblock, similar to the qemu nbd client. Then
use it to talk to an nbd server, qemu-nbd by default, to get access
to images. In this way, libqblock.so could be LGPL licensed.
   3 I still have not found a good way to get the additional info in
(2)(3)(4); my current idea is to patch qemu-nbd to add an additional nbd
command, image-info, which returns the related info.

   What do you think about it?


For arbitrary read/write access to disk images, I can see a little value
in having a standalone libnbd client API, that is able to just talk to
any NBD server. Arguably such a thing does not need to be part of the
QEMU source tree - eg see the recently written libiscsi.so client.

For getting the other metadata about the disk image you mention, another
possibility is to just make 'qemu-img info' return the data in a machine
parseable format, i.e. JSON, and make a client API for extracting data from
this JSON document.


  Thank you for the idea. The .so is introduced to let programs access the
image more directly; parsing strings is not so fast and it depends on
another program's stdout output, so I hope to find a faster way.


For a full-blown RPC API for doing arbitrary tasks related to block
devices, then many apps will tend towards libguestfs, since it provides
such a wide range of functionality for manipulating disk images.


  I used libguestfs to make my image too, but the target of the .so is
a bit different: it exposes block data at a lower level, exposing
everything that qemu main code (except block code) can see. A potential
purpose is to serve as a first step toward making the qemu block layer
independent, so that qemu and other tools would just be users of the
library. But that is now out of plan because of the license issue, and the
library now acts as a client, which lowers the performance.



Regards,
Daniel



--
Best Regards

Wenchao Xia

[Qemu-devel] [PATCH] megasas: Fix compilation for 32 bit hosts

2012-07-09 Thread Stefan Weil

Cc: Hannes Reinecke h...@suse.de
Signed-off-by: Stefan Weil s...@weilnetz.de
---
 default-configs/pci.mak |4 
 hw/megasas.c|   13 +++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/default-configs/pci.mak b/default-configs/pci.mak
index 9d3e1db..120b69d 100644
--- a/default-configs/pci.mak
+++ b/default-configs/pci.mak
@@ -5,6 +5,10 @@ CONFIG_USB_UHCI=y
 CONFIG_USB_OHCI=y
 CONFIG_USB_EHCI=y
 CONFIG_USB_XHCI=y
+
+# RAID adapter
+CONFIG_MEGASAS_SCSI_PCI=y
+
 CONFIG_NE2000_PCI=y
 CONFIG_EEPRO100_PCI=y
 CONFIG_PCNET_PCI=y
diff --git a/hw/megasas.c b/hw/megasas.c
index b48836f..26cf118 100644
--- a/hw/megasas.c
+++ b/hw/megasas.c
@@ -372,12 +372,13 @@ static uint64_t megasas_fw_time(void)
 return bcd_time;
 }
 
-static uint64_t megasas_gen_sas_addr(uint64_t id)
+static uint64_t megasas_gen_sas_addr(void *p)
 {
+uint64_t id = (uintptr_t)p;
 uint64_t addr;
 
 addr = 0x5001a4aULL  36;
-addr |= id  0xf;
+addr |= id  0xfULL;
 
 return addr;
 }
@@ -672,7 +673,7 @@ static int megasas_ctrl_get_info(MegasasState *s, 
MegasasCmd *cmd)
 info.host.type = MFI_INFO_HOST_PCIX;
 info.device.type = MFI_INFO_DEV_SAS3G;
 info.device.port_count = 2;
-info.device.port_addr[0] = cpu_to_le64(megasas_gen_sas_addr((uint64_t)s));
+info.device.port_addr[0] = cpu_to_le64(megasas_gen_sas_addr(s));
 
 memcpy(info.product_name, MegaRAID SAS 8708EM2, 20);
 snprintf(info.serial_number, 32, QEMU%08lx,
@@ -761,7 +762,7 @@ static int megasas_mfc_get_defaults(MegasasState *s, 
MegasasCmd *cmd)
 return MFI_STAT_INVALID_PARAMETER;
 }
 
-info.sas_addr = cpu_to_le64(megasas_gen_sas_addr((uint64_t)s));
+info.sas_addr = cpu_to_le64(megasas_gen_sas_addr(s));
 info.stripe_size = 3;
 info.flush_time = 4;
 info.background_rate = 30;
@@ -891,7 +892,7 @@ static int megasas_dcmd_pd_get_list(MegasasState *s, 
MegasasCmd *cmd)
 info.addr[num_pd_disks].scsi_dev_type = sdev-type;
 info.addr[num_pd_disks].connect_port_bitmap = 0x1;
 info.addr[num_pd_disks].sas_addr[0] =
-cpu_to_le64(megasas_gen_sas_addr((uint64_t)sdev));
+cpu_to_le64(megasas_gen_sas_addr(sdev));
 num_pd_disks++;
 offset += sizeof(struct mfi_pd_address);
 }
@@ -994,7 +995,7 @@ static int megasas_pd_get_info_submit(SCSIDevice *sdev, int 
lun,
 info-slot_number = (sdev-id  0xFF);
 info-path_info.count = 1;
 info-path_info.sas_addr[0] =
-cpu_to_le64(megasas_gen_sas_addr((uint64_t)sdev));
+cpu_to_le64(megasas_gen_sas_addr(sdev));
 info-connected_port_bitmap = 0x1;
 info-device_speed = 1;
 info-link_speed = 1;
-- 
1.7.0.4

Re: [Qemu-devel] [RFC] introduce a dynamic library to expose qemu block API

2012-07-09 Thread Wenchao Xia


于 2012-7-9 22:36, Christoph Hellwig 写道:

On Mon, Jul 09, 2012 at 04:54:08PM +0800, Wenchao Xia wrote:

Hi, Paolo and folks,
   qemu has good capabilities to access different virtual disks. I want
to expose its block layer API so that 3rd party programs, such as
management stacks or block tools, can link against it and access image
data directly.

The objectives are the following:
   (1) API to write/read block device at offset.
   (2) Determine the image type,qcow2/qed/raw
   (3) Determine which blocks are allocated.
   (4) Determine backing file.


Sounds like you want a procedural interface for that.  At least for (1)
I have patches I'll submit soon to add qemu img read/write commands.

  Yes, the purpose is to introduce an API for block data. The
operations are expected to happen frequently, so a linked-in library may
perform better than parsing the string output of another process.







--
Best Regards

Wenchao Xia

[Qemu-devel] [PATCH 0/2] RFC: powerpc-vfio: adding support

2012-07-09 Thread Alexey Kardashevskiy

The two patches in this set are supposed to add VFIO support for POWER.

The first one adds one more step in the initialization sequence, which I am
not sure is correct.

The second patch adds actual VFIO support. It is not ready to submit but
ready to discuss. I would like to get rid of all #ifdef TARGET_PPC64 in patch #2
and I wonder if there is any plan to implement some generic EOI support code, 
etc.


Alexey Kardashevskiy (2):
  pseries pci: spapr_finalize_pci_setup introduced
  vfio-powerpc: added VFIO support

 hw/ppc/Makefile.objs |3 ++
 hw/spapr.c   |7 
 hw/spapr.h   |4 +++
 hw/spapr_iommu.c |   87 ++
 hw/spapr_pci.c   |   36 ++---
 hw/spapr_pci.h   |4 +++
 hw/vfio_pci.c|   76 +--
 hw/vfio_pci.h|2 ++
 8 files changed, 212 insertions(+), 7 deletions(-)

-- 
1.7.10

[Qemu-devel] [PATCH 2/2] vfio-powerpc: added VFIO support

2012-07-09 Thread Alexey Kardashevskiy

The patch enables VFIO on POWER.

It literally does the following:

1. POWERPC IOMMU support (the kernel counterpart is required)

2. Added #ifdef TARGET_PPC64 for EOI handlers initialisation.

3. Added vfio_get_container_fd() to VFIO in order to initialize 1).

4. Makefile fixed and is_vfio flag added into sPAPR PHB - required to
distinguish VFIO's DMA context from the emulated one.

With the patches posted a bit earlier today, this patch fully supports
VFIO, including MSI-X.


Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 hw/ppc/Makefile.objs |3 ++
 hw/spapr.h   |4 +++
 hw/spapr_iommu.c |   87 ++
 hw/spapr_pci.c   |   23 -
 hw/spapr_pci.h   |2 ++
 hw/vfio_pci.c|   76 +--
 hw/vfio_pci.h|2 ++
 7 files changed, 193 insertions(+), 4 deletions(-)

diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index f573a95..c46a049 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -25,4 +25,7 @@ obj-$(CONFIG_FDT) += ../device_tree.o
 # Xilinx PPC peripherals
 obj-y += xilinx_ethlite.o
 
+# VFIO PCI device assignment
+obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
+
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/spapr.h b/hw/spapr.h
index b37f337..9dca704 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -340,4 +340,8 @@ int spapr_dma_dt(void *fdt, int node_off, const char 
*propname,
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
   DMAContext *dma);
 
+void spapr_vfio_init_dma(int fd, uint32_t liobn,
+ uint64_t *dma32_window_start,
+ uint64_t *dma32_window_size);
+
 #endif /* !defined (__HW_SPAPR_H__) */
diff --git a/hw/spapr_iommu.c b/hw/spapr_iommu.c
index 50c288d..0a194e8 100644
--- a/hw/spapr_iommu.c
+++ b/hw/spapr_iommu.c
@@ -16,6 +16,8 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, see http://www.gnu.org/licenses/.
  */
+#include sys/ioctl.h
+
 #include hw.h
 #include kvm.h
 #include qdev.h
@@ -23,6 +25,7 @@
 #include dma.h
 
 #include hw/spapr.h
+#include hw/linux-vfio.h
 
 #include libfdt.h
 
@@ -183,6 +186,86 @@ static int put_tce_emu(target_ulong liobn, target_ulong 
ioba, target_ulong tce)
 return 0;
 }
 
+/*  API for POWERPC IOMMU  */
+
+#define POWERPC_IOMMU   2
+
+struct tce_iommu_info {
+__u32 argsz;
+__u32 dma32_window_start;
+__u32 dma32_window_size;
+};
+
+#define POWERPC_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct tce_iommu_dma_map {
+__u32 argsz;
+__u64 va;
+__u64 dmaaddr;
+};
+
+#define POWERPC_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+#define POWERPC_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+typedef struct sPAPRVFIOTable {
+int fd;
+uint32_t liobn;
+QLIST_ENTRY(sPAPRVFIOTable) list;
+} sPAPRVFIOTable;
+
+QLIST_HEAD(vfio_tce_tables, sPAPRVFIOTable) vfio_tce_tables;
+
+void spapr_vfio_init_dma(int fd, uint32_t liobn,
+ uint64_t *dma32_window_start,
+ uint64_t *dma32_window_size)
+{
+sPAPRVFIOTable *t;
+struct tce_iommu_info info = { .argsz = sizeof(info) };
+
+if (ioctl(fd, POWERPC_IOMMU_GET_INFO, info)) {
+fprintf(stderr, POWERPC_IOMMU_GET_INFO failed %d\n, errno);
+return;
+}
+*dma32_window_start = info.dma32_window_start;
+*dma32_window_size = info.dma32_window_size;
+
+t = g_malloc0(sizeof(*t));
+t-fd = fd;
+t-liobn = liobn;
+
+QLIST_INSERT_HEAD(vfio_tce_tables, t, list);
+}
+
+static int put_tce_vfio(uint32_t liobn, target_ulong ioba, target_ulong tce)
+{
+sPAPRVFIOTable *t;
+struct tce_iommu_dma_map map = {
+.argsz = sizeof(map),
+.va = 0,
+.dmaaddr = ioba,
+};
+
+QLIST_FOREACH(t, vfio_tce_tables, list) {
+if (t-liobn != liobn) {
+continue;
+}
+if (tce) {
+map.va = (uintptr_t)qemu_get_ram_ptr(tce  ~SPAPR_TCE_PAGE_MASK);
+if (ioctl(t-fd, POWERPC_IOMMU_MAP_DMA, map)) {
+fprintf(stderr, TCE_MAP_DMA: %d\n, errno);
+return H_PARAMETER;
+}
+} else {
+if (ioctl(t-fd, POWERPC_IOMMU_UNMAP_DMA, map)) {
+fprintf(stderr, TCE_UNMAP_DMA: %d\n, errno);
+return H_PARAMETER;
+}
+}
+return H_SUCCESS;
+}
+return H_CONTINUE; /* positive non-zero value */
+}
+
 static target_ulong h_put_tce(CPUPPCState *env, sPAPREnvironment *spapr,
   target_ulong opcode, target_ulong *args)
 {
@@ -203,6 +286,10 @@ static target_ulong h_put_tce(CPUPPCState *env, 
sPAPREnvironment *spapr,
 if (0 = ret) {
 return ret ? H_PARAMETER : H_SUCCESS;
 }
+ret = put_tce_vfio(liobn, ioba, tce);
+if (0 = ret) {
+return ret ?

[Qemu-devel] [PATCH 1/2] pseries pci: spapr_finalize_pci_setup introduced

2012-07-09 Thread Alexey Kardashevskiy

Previously PCI bus setup was done in 3 steps:
1) create a PCI bus, configure DMA
2) create PCI devices on the bus
3) populate a PCI bus node in the Device Tree

As some bus parameters can be configured only when some or all
the devices got attached to the bus and initialized,
the spapr_finalize_pci_setup has been introduced.

As an example, such a handler can set up DMA window parameters taken from
an IOMMU file descriptor available from a VFIO PCI device.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 hw/spapr.c |7 +++
 hw/spapr_pci.c |   13 ++---
 hw/spapr_pci.h |2 ++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/hw/spapr.c b/hw/spapr.c
index b83f83b..688a135 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -516,7 +516,14 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
 }
 
 QLIST_FOREACH(phb, spapr-phbs, list) {
+ret = spapr_finalize_pci_setup(phb);
+if (ret  0) {
+break;
+}
 ret = spapr_populate_pci_dt(phb, PHANDLE_XICP, fdt);
+if (ret  0) {
+break;
+}
 }
 
 if (ret  0) {
diff --git a/hw/spapr_pci.c b/hw/spapr_pci.c
index 014297b..5f89003 100644
--- a/hw/spapr_pci.c
+++ b/hw/spapr_pci.c
@@ -573,9 +573,6 @@ static int spapr_phb_init(SysBusDevice *s)
 phb-host_state.bus = bus;
 
 phb-dma_liobn = SPAPR_PCI_BASE_LIOBN | (pci_find_domain(bus)  16);
-phb-dma_window_start = 0;
-phb-dma_window_size = 0x4000;
-phb-dma = spapr_tce_new_dma_context(phb-dma_liobn, phb-dma_window_size);
 pci_setup_iommu(bus, spapr_pci_dma_context_fn, phb);
 
 QLIST_INSERT_HEAD(spapr-phbs, phb, list);
@@ -639,6 +636,16 @@ void spapr_create_phb(sPAPREnvironment *spapr,
 qdev_init_nofail(dev);
 }
 
+/* Finalize PCI setup, called when all devices are already created */
+int spapr_finalize_pci_setup(sPAPRPHBState *phb)
+{
+phb-dma_window_start = 0;
+phb-dma_window_size = 0x4000;
+phb-dma = spapr_tce_new_dma_context(phb-dma_liobn,
+ phb-dma_window_size);
+return 0;
+}
+
 /* Macros to operate with address in OF binding to PCI */
 #define b_x(x, p, l)(((x)  ((1(l))-1))  (p))
 #define b_n(x)  b_x((x), 31, 1) /* 0 if relocatable */
diff --git a/hw/spapr_pci.h b/hw/spapr_pci.h
index 145071c..3aae273 100644
--- a/hw/spapr_pci.h
+++ b/hw/spapr_pci.h
@@ -68,6 +68,8 @@ void spapr_create_phb(sPAPREnvironment *spapr,
   uint64_t mem_win_addr, uint64_t mem_win_size,
   uint64_t io_win_addr, uint64_t msi_win_addr);
 
+int spapr_finalize_pci_setup(sPAPRPHBState *phb);
+
 int spapr_populate_pci_dt(sPAPRPHBState *phb,
   uint32_t xics_phandle,
   void *fdt);
-- 
1.7.10

Re: [Qemu-devel] [PULL 00/14] SCSI updates for 2012-07-02

2012-07-09 Thread Hannes Reinecke

On 07/10/2012 01:19 AM, Anthony Liguori wrote:
 On 07/09/2012 06:09 PM, Alexander Graf wrote:

 On 09.07.2012, at 18:48, Anthony Liguori wrote:

 On 07/02/2012 04:41 AM, Paolo Bonzini wrote:
 Anthony,

 The following changes since commit
 71ea2e016131a9fcde6f1ffd3e0e34a64c21f593:

bsd-user: fix build (2012-06-28 20:28:36 +)

 Pulled.  Thanks.

 Megasas? :)
 
 So this code is really broken:
 
 info.host.type = MFI_INFO_HOST_PCIX;
 info.device.type = MFI_INFO_DEV_SAS3G;
 info.device.port_count = 2;
 info.device.port_addr[0] =
 cpu_to_le64(megasas_gen_sas_addr((uint64_t)s));
 
 This will make migration impossible not to mention the fact that
 casting a pointer to a uint64_t is really broken.
 
Hey, this is _NOT_ an address. It's a simple way of generating a
system-wide unique SAS address.

The whole thing is informational anyway, and can only be seen when
using the (proprietary) MegaCLI userspace command.

 This code needs to be refactored to not do this.  It's quite
 pervasive though (there's a half a dozen instances like this).
 

Okay, so here's the challenge: We need to generate a system-wide
unique SAS address, one per SCSI device and one per megasas instance.
A simple counter won't work, as we might have several qemu instances
running, which would result in all of them having the same SAS
address for the host.

 I'm going to disable the build by default.  I don't want to see a
 rash fix like (uint64_t)(intptr_t).  This needs to be fixed by not
 making the pointer address guest visible.  It can then be
 re-enabled.  Should be easy enough to update your .mak config if you
 want to test between now and then.
 
As said, it's _not_ an address. The address is just used to seed the
SAS address.

But as you object, I'll see about using something else for seeding the SAS
address.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke   zSeries  Storage
h...@suse.de  +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

< 1 2 3

201 - 241 of 241 matches

Mail list logo