[Qemu-devel] [PATCH 10/13 v7] dump: add APIs to operate DataCache

2014-01-17 Thread qiaonuohan
DataCache is used to store data temporarily, then the data will be written to
vmcore. These functions will be called later when writing data of page to
vmcore.

Signed-off-by: Qiao Nuohan qiaonuo...@cn.fujitsu.com
Reviewed-by: Laszlo Ersek ler...@redhat.com
---
 dump.c|   47 +++
 include/sysemu/dump.h |9 +
 2 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/dump.c b/dump.c
index 26a1756..fa183cb 100644
--- a/dump.c
+++ b/dump.c
@@ -1173,6 +1173,53 @@ out:
 return ret;
 }
 
+static void prepare_data_cache(DataCache *data_cache, DumpState *s,
+   off_t offset)
+{
+data_cache-fd = s-fd;
+data_cache-data_size = 0;
+data_cache-buf_size = BUFSIZE_DATA_CACHE;
+data_cache-buf = g_malloc0(BUFSIZE_DATA_CACHE);
+data_cache-offset = offset;
+}
+
+static int write_cache(DataCache *dc, const void *buf, size_t size,
+   bool flag_sync)
+{
+/*
+ * dc-buf_size should not be less than size, otherwise dc will never be
+ * enough
+ */
+assert(size = dc-buf_size);
+
+/*
+ * if flag_sync is set, synchronize data in dc-buf into vmcore.
+ * otherwise check if the space is enough for caching data in buf, if not,
+ * write the data in dc-buf to dc-fd and reset dc-buf
+ */
+if ((!flag_sync  dc-data_size + size  dc-buf_size) ||
+(flag_sync  dc-data_size  0)) {
+if (write_buffer(dc-fd, dc-offset, dc-buf, dc-data_size)  0) {
+return -1;
+}
+
+dc-offset += dc-data_size;
+dc-data_size = 0;
+}
+
+if (!flag_sync) {
+memcpy(dc-buf + dc-data_size, buf, size);
+dc-data_size += size;
+}
+
+return 0;
+}
+
+static void free_data_cache(DataCache *data_cache)
+{
+g_free(data_cache-buf);
+}
+
 static ram_addr_t get_start_block(DumpState *s)
 {
 GuestPhysBlock *block;
diff --git a/include/sysemu/dump.h b/include/sysemu/dump.h
index 6d4d0bc..92a95e4 100644
--- a/include/sysemu/dump.h
+++ b/include/sysemu/dump.h
@@ -41,6 +41,7 @@
 #define DISKDUMP_HEADER_BLOCKS  (1)
 #define BUFSIZE_BITMAP  (TARGET_PAGE_SIZE)
 #define PFN_BUFBITMAP   (CHAR_BIT * BUFSIZE_BITMAP)
+#define BUFSIZE_DATA_CACHE  (TARGET_PAGE_SIZE * 4)
 
 typedef struct ArchDumpInfo {
 int d_machine;  /* Architecture */
@@ -142,6 +143,14 @@ typedef struct QEMU_PACKED KdumpSubHeader64 {
 uint64_t max_mapnr_64;  /* header_version 6 and later */
 } KdumpSubHeader64;
 
+typedef struct DataCache {
+int fd; /* fd of the file where to write the cached data */
+uint8_t *buf;   /* buffer for cached data */
+size_t buf_size;/* size of the buf */
+size_t data_size;   /* size of cached data in buf */
+off_t offset;   /* offset of the file */
+} DataCache;
+
 struct GuestPhysBlockList; /* memory_mapping.h */
 int cpu_get_dump_info(ArchDumpInfo *info,
   const struct GuestPhysBlockList *guest_phys_blocks);
-- 
1.7.1




[Qemu-devel] [PATCH 11/13 v7] dump: add API to write dump pages

2014-01-17 Thread qiaonuohan
functions are used to write page to vmcore. vmcore is written page by page.
page desc is used to store the information of a page, including a page's size,
offset, compression format, etc.

Signed-off-by: Qiao Nuohan qiaonuo...@cn.fujitsu.com
---
 dump.c|  229 +
 include/sysemu/dump.h |7 ++
 2 files changed, 236 insertions(+), 0 deletions(-)

diff --git a/dump.c b/dump.c
index fa183cb..bb03ef7 100644
--- a/dump.c
+++ b/dump.c
@@ -25,6 +25,14 @@
 #include qapi/error.h
 #include qmp-commands.h
 
+#include zlib.h
+#ifdef CONFIG_LZO
+#include lzo/lzo1x.h
+#endif
+#ifdef CONFIG_SNAPPY
+#include snappy-c.h
+#endif
+
 static uint16_t cpu_convert_to_target16(uint16_t val, int endian)
 {
 if (endian == ELFDATA2LSB) {
@@ -1220,6 +1228,227 @@ static void free_data_cache(DataCache *data_cache)
 g_free(data_cache-buf);
 }
 
+static size_t get_len_buf_out(size_t page_size, uint32_t flag_compress)
+{
+size_t len_buf_out_zlib, len_buf_out_lzo, len_buf_out_snappy;
+size_t len_buf_out;
+
+/* init buf_out */
+len_buf_out_zlib = len_buf_out_lzo = len_buf_out_snappy = 0;
+
+/* buf size for zlib */
+len_buf_out_zlib = compressBound(page_size);
+
+/* buf size for lzo */
+#ifdef CONFIG_LZO
+if (flag_compress  DUMP_DH_COMPRESSED_LZO) {
+if (lzo_init() != LZO_E_OK) {
+/* return 0 to indicate lzo is unavailable */
+return 0;
+}
+}
+
+/*
+ * LZO will expand incompressible data by a little amount. please check the
+ * following URL to see the expansion calculation:
+ * http://www.oberhumer.com/opensource/lzo/lzofaq.php
+ */
+len_buf_out_lzo = page_size + page_size / 16 + 64 + 3;
+#endif
+
+#ifdef CONFIG_SNAPPY
+/* buf size for snappy */
+len_buf_out_snappy = snappy_max_compressed_length(page_size);
+#endif
+
+/* get the biggest that can store all kinds of compressed page */
+len_buf_out = MAX(len_buf_out_zlib,
+  MAX(len_buf_out_lzo, len_buf_out_snappy));
+
+return len_buf_out;
+}
+
+/*
+ * check if the page is all 0
+ */
+static inline bool is_zero_page(const uint8_t *buf, size_t page_size)
+{
+return buffer_is_zero(buf, page_size);
+}
+
+static int write_dump_pages(DumpState *s)
+{
+int ret = 0;
+DataCache page_desc, page_data;
+size_t len_buf_out, size_out;
+#ifdef CONFIG_LZO
+lzo_bytep wrkmem = NULL;
+#endif
+uint8_t *buf_out = NULL;
+off_t offset_desc, offset_data;
+PageDesc pd, pd_zero;
+uint8_t *buf;
+int endian = s-dump_info.d_endian;
+
+/* get offset of page_desc and page_data in dump file */
+offset_desc = s-offset_page;
+offset_data = offset_desc + sizeof(PageDesc) * s-num_dumpable;
+
+prepare_data_cache(page_desc, s, offset_desc);
+prepare_data_cache(page_data, s, offset_data);
+
+/* prepare buffer to store compressed data */
+len_buf_out = get_len_buf_out(s-page_size, s-flag_compress);
+if (len_buf_out == 0) {
+dump_error(s, dump: failed to get length of output buffer.\n);
+goto out;
+}
+
+#ifdef CONFIG_LZO
+wrkmem = g_malloc(LZO1X_1_MEM_COMPRESS);
+#endif
+
+buf_out = g_malloc(len_buf_out);
+
+/*
+ * init zero page's page_desc and page_data, because every zero page
+ * uses the same page_data
+ */
+pd_zero.size = cpu_convert_to_target32(s-page_size, endian);
+pd_zero.flags = cpu_convert_to_target32(0, endian);
+pd_zero.offset = cpu_convert_to_target64(offset_data, endian);
+pd_zero.page_flags = cpu_convert_to_target64(0, endian);
+buf = g_malloc0(s-page_size);
+ret = write_cache(page_data, buf, s-page_size, false);
+g_free(buf);
+if (ret  0) {
+dump_error(s, dump: failed to write page data(zero page).\n);
+goto out;
+}
+
+offset_data += s-page_size;
+
+/*
+ * dump memory to vmcore page by page. zero page will all be resided in the
+ * first page of page section
+ */
+while (get_next_page(NULL, buf, s)) {
+/* check zero page */
+if (is_zero_page(buf, s-page_size)) {
+ret = write_cache(page_desc, pd_zero, sizeof(PageDesc),
+  false);
+if (ret  0) {
+dump_error(s, dump: failed to write page desc.\n);
+goto out;
+}
+} else {
+/*
+ * not zero page, then:
+ * 1. compress the page
+ * 2. write the compressed page into the cache of page_data
+ * 3. get page desc of the compressed page and write it into the
+ *cache of page_desc
+ *
+ * only one compression format will be used here, for
+ * s-flag_compress is set. But when compression fails to work,
+ * we fall back to save in plaintext.
+ */
+ size_out = len_buf_out;
+ if ((s-flag_compress  

[Qemu-devel] [PATCH 00/13 v7] Make 'dump-guest-memory' dump in kdump-compressed format

2014-01-17 Thread qiaonuohan
Hi, all

The last version is here:
http://lists.nongnu.org/archive/html/qemu-devel/2014-01/msg00209.html

Command 'dump-guest-memory' was introduced to dump guest's memory. But the
vmcore's format is only elf32 or elf64. The message is here:
http://lists.gnu.org/archive/html/qemu-devel/2012-04/msg03379.html

Compared with migration, the missing of compression feature means regression
to 'dump-guest-memory'. So we post these patches to make 'dump-guest-memory' be
able to dump guest's in kdump-compressed format. Then vmcore can be much
smaller, and easily to be delivered.

The kdump-compressed format is *linux specific* *linux standard* crash dump
format used in kdump framework. The kdump-compressed format is readable only
with the crash utility, and it can be smaller than the ELF format because of
the compression support. To get more detailed information about
kdump-compressed format, please refer to the following URL:
http://sourceforge.net/projects/makedumpfile/

Note, similar to 'dump-guest-memory':
1. The guest should be x86 or x86_64. The other arch is not supported now.
2. If the OS is in the second kernel, gdb may not work well, and crash can
   work by specifying '--machdep phys_addr=xxx' in the command line. The
   reason is that the second kernel will update the page table, and we can
   not get the page table for the first kernel.
3. The cpu's state is stored in QEMU note.
4. The vmcore are able to be compressed with zlib, lzo or snappy. zlib is
   available by default, and option '--enable-lzo' or '--enable-snappy'
   should be specified with 'configure' to make lzo or snappy available.

Changelog:
Changes from v6 to v7:
1. support BE host
2. abandon non-flatten format to avoid using seek on vmcore
3. abandon using of very large array
4. use get_next_page to replace the iteration of guest's pages
5. abandon the support of HMP

Changes from v5 to v6:
1. add run-time check for compression format(lzo/snappy)
2. address Stefan's comments about reusing code and coding style
3. update the version of kdump-compressed format to 6th
4. resplit the patches
5. Add 'query-dump-guest-memory-capability' command

Changes from v4 to v5:
1. using flatten format to avoid using temporary files according to Stefan's
   comments
2. Address Andreas's comments about coding style

Changes from v3 to v4:
1. change to avoid conflict with Andreas's patches
2. rebase

Changes from v2 to v3:
1. Address Eric's comment

Changes from v1 to v2:
1. Address Eric  Daniel's comment: fix manner of string copy.
2. Address Eric's comment: replace reinventing new constants by using the
   ready-made ones accoring.
3. Address Andreas's comment: remove useless include.

qiaonuohan (13):
  dump: const-qualify the buf of WriteCoreDumpFunction
  dump: add argument to write_elfxx_notes
  dump: add API to write header of flatten format
  dump: add API to write vmcore
  dump: add API to write elf notes to buffer
  dump: add support for lzo/snappy
  dump: add members to DumpState and init some of them
  dump: add API to write dump header
  dump: add API to write dump_bitmap
  dump: add APIs to operate DataCache
  dump: add API to write dump pages
  dump: make kdump-compressed format available for 'dump-guest-memory'
  dump: add 'query-dump-guest-memory-capability' command

 configure |   54 +++
 dump.c|  972 -
 hmp.c |5 +-
 include/qom/cpu.h |3 +-
 include/sysemu/dump.h |  138 +++
 qapi-schema.json  |   38 ++-
 qmp-commands.hx   |   38 ++-
 7 files changed, 1228 insertions(+), 20 deletions(-)




Re: [Qemu-devel] [PULL 11/11] vl: Add a blank space between the variable and '='

2014-01-17 Thread Michael Tokarev
17.01.2014 10:04, Kewei Yu пишет:
 Thanks, I will send patch with prefix to indicate which file of the codebase 
 after this.

I already fixed it when applying, see 
http://git.corpit.ru/?p=qemu.git;h=2c02d1ad48ad44cf00522df7d8de9138689fac85

/mjt



Re: [Qemu-devel] [RFC PATCH v4 3/4] raw-posix: Add full image preallocation option

2014-01-17 Thread Stefan Hajnoczi
On Fri, Dec 27, 2013 at 11:05:53AM +0800, Hu Tao wrote:
 This patch adds a new option preallocation for raw format, and implements
 full preallocation.
 
 Signed-off-by: Hu Tao hu...@cn.fujitsu.com
 ---
  block/raw-posix.c | 46 ++
  1 file changed, 46 insertions(+)
 
 diff --git a/block/raw-posix.c b/block/raw-posix.c
 index 6f6b8c1..a722d27 100644
 --- a/block/raw-posix.c
 +++ b/block/raw-posix.c
 @@ -1160,17 +1160,52 @@ static int64_t 
 raw_get_allocated_file_size(BlockDriverState *bs)
  return (int64_t)st.st_blocks * 512;
  }
  
 +#ifdef __linux__
 +static int raw_preallocate2(int fd, int64_t offset, int64_t length)

Why is this function called raw_preallocate2()?  Please choose a
descriptive name.

 +{
 +int ret = -1;
 +
 +ret = fallocate(fd, 0, offset, length);
 +if (ret  0) {
 +ret = -errno;
 +}
 +
 +/* fallback to posix_fallocate() if fallocate() is not supported */
 +if (ret == -ENOSYS || ret == -EOPNOTSUPP) {
 +ret = posix_fallocate(fd, offset, length);

From the posix_fallocate(3) man page:
posix_fallocate() returns zero on success, or an error number on
failure.  Note that errno is not set.

You need to negate the error number:
if (ret  0) {
ret = -ret; /* posix_fallocate(3) returns a positive errno */
}

 +}
 +
 +return ret;
 +}
 +#else
 +static int raw_preallocate2(int fd, int64_t offset, int64_t length)
 +{
 +return posix_fallocate(fd, offset, length);

Same error number problem as above.

 @@ -1185,6 +1220,12 @@ static int raw_create(const char *filename, 
 QEMUOptionParameter *options,
  result = -errno;
  error_setg_errno(errp, -result, Could not resize file);
  }
 +if (prealloc == PREALLOC_MODE_FULL) {
 +result = raw_preallocate2(fd, 0, total_size);

What if ftruncate() failed?  We should not try to fallocate.  Please add
a proper error return path.

 +if (result != 0)
 +error_setg_errno(errp, -result,
 + Could not preallocate data for the new 
 file);

WARNING: braces {} are necessary for all arms of this statement

Please run scripts/checkpatch.pl before submitting patches, it checks
QEMU coding style.



Re: [Qemu-devel] [RFC PATCH v4 4/4] qcow2: Add full image preallocation option

2014-01-17 Thread Stefan Hajnoczi
On Fri, Dec 27, 2013 at 11:05:54AM +0800, Hu Tao wrote:

This approach seems okay but the calculation isn't quite right yet.

On Windows an error would be raised since we don't have preallocate=full
support.  That's okay.

 @@ -1477,16 +1478,53 @@ static int qcow2_create2(const char *filename, 
 int64_t total_size,
  Error *local_err = NULL;
  int ret;
  
 +if (prealloc == PREALLOC_MODE_FULL) {
 +int64_t meta_size = 0;
 +unsigned nrefe, nl2e;
 +BlockDriver *drv;
 +
 +drv = bdrv_find_protocol(filename, true);
 +if (drv == NULL) {
 +error_setg(errp, Could not find protocol for file '%s', 
 filename);
 +return -ENOENT;
 +}
 +
 +alloc_options = append_option_parameters(alloc_options,
 + drv-create_options);
 +alloc_options = append_option_parameters(alloc_options, options);
 +
 +/* header: 1 cluster */
 +meta_size += cluster_size;
 +/* total size of refblocks */
 +nrefe = (total_size / cluster_size);
 +nrefe = align_offset(nrefe, cluster_size / sizeof(uint16_t));
 +meta_size += nrefe * sizeof(uint16_t);

Every host cluster is reference-counted, including metadata (even
refcount blocks are recursively included).  This calculation is wrong
because it only considers data clusters.

 +/* total size of reftables */
 +meta_size += nrefe * sizeof(uint16_t) * sizeof(uint16_t) / 
 cluster_size;

I don't understand this calculation.  The refcount table consists of
contiguous clusters where each element is a 64-bit offset to a refcount
block.

 +/* total size of L2 tables */
 +nl2e = total_size / cluster_size;
 +nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t));
 +meta_size += nl2e * sizeof(uint64_t);
 +/* total size of L1 tables */
 +meta_size += nl2e * sizeof(uint64_t) * sizeof(uint64_t) / 
 cluster_size;

Another strange calculation.  The L1 table consists of contiguous
clusters where each element is a 64-bit offset to an L1 table.



Re: [Qemu-devel] [PATCH 00/13 v7] Make 'dump-guest-memory' dump in kdump-compressed format

2014-01-17 Thread Christian Borntraeger
On 17/01/14 08:46, qiaonuohan wrote:
 Hi, all
 
 The last version is here:
 http://lists.nongnu.org/archive/html/qemu-devel/2014-01/msg00209.html
 
 Command 'dump-guest-memory' was introduced to dump guest's memory. But the
 vmcore's format is only elf32 or elf64. The message is here:
 http://lists.gnu.org/archive/html/qemu-devel/2012-04/msg03379.html
 
 Compared with migration, the missing of compression feature means regression
 to 'dump-guest-memory'. So we post these patches to make 'dump-guest-memory' 
 be
 able to dump guest's in kdump-compressed format. Then vmcore can be much
 smaller, and easily to be delivered.

Nice, we certainly want that.
 
 The kdump-compressed format is *linux specific* *linux standard* crash dump
 format used in kdump framework. The kdump-compressed format is readable only
 with the crash utility, and it can be smaller than the ELF format because of
 the compression support. To get more detailed information about
 kdump-compressed format, please refer to the following URL:
 http://sourceforge.net/projects/makedumpfile/
 
 Note, similar to 'dump-guest-memory':
 1. The guest should be x86 or x86_64. The other arch is not supported now.

What is the reason for that? Looking over the patch set, there seem to
be no architecture folder involved and others like s390x also have the
dump-guest-memory support. 
[...]




Re: [Qemu-devel] [RFC PATCH v4 3/4] raw-posix: Add full image preallocation option

2014-01-17 Thread Stefan Hajnoczi
On Fri, Dec 27, 2013 at 11:05:53AM +0800, Hu Tao wrote:
 diff --git a/block/raw-posix.c b/block/raw-posix.c
 index 6f6b8c1..a722d27 100644
 --- a/block/raw-posix.c
 +++ b/block/raw-posix.c
 @@ -1160,17 +1160,52 @@ static int64_t 
 raw_get_allocated_file_size(BlockDriverState *bs)
  return (int64_t)st.st_blocks * 512;
  }
  
 +#ifdef __linux__
 +static int raw_preallocate2(int fd, int64_t offset, int64_t length)
 +{
 +int ret = -1;
 +
 +ret = fallocate(fd, 0, offset, length);
 +if (ret  0) {
 +ret = -errno;
 +}
 +
 +/* fallback to posix_fallocate() if fallocate() is not supported */
 +if (ret == -ENOSYS || ret == -EOPNOTSUPP) {
 +ret = posix_fallocate(fd, offset, length);
 +}
 +
 +return ret;
 +}

Why is this Linux-specific #ifdef necessary?  glibc will try to use the
fallocate(2) system call, if possible.

Stefan



Re: [Qemu-devel] [PATCH 00/13 v7] Make 'dump-guest-memory' dump in kdump-compressed format

2014-01-17 Thread Qiao Nuohan

On 01/17/2014 04:50 PM, Christian Borntraeger wrote:

What is the reason for that? Looking over the patch set, there seem to
be no architecture folder involved and others like s390x also have the
dump-guest-memory support.
[...]


I found dump-guest-memory have support s390/ppc. I do not restrict the guest
should only be x86/x86_64, actually, I get no knowledge about other arch, so 
what I can do now is just make x86/x86_64 works and be tested.


--
Regards
Qiao Nuohan



Re: [Qemu-devel] [RFC PATCH v4 0/4] qemu-img: add preallocation=full

2014-01-17 Thread Stefan Hajnoczi
On Fri, Dec 27, 2013 at 11:05:50AM +0800, Hu Tao wrote:
 This series implements full image preallocation to create a non-sparse image
 file at creation time, both for raw and qcow2 format. The purpose is to avoid
 performance deterioration of the guest cause by sparse image.
 
 v4:  - remove bdrv_preallocate and make preallocation a bdrv_create_file 
 option
  - prealloc_mode - PreallocMode and add it to QAPI
  - fix return value in raw_preallocate2
 
 Hu Tao (4):
   qapi: introduce PreallocMode
   raw,qcow2: don't convert file size to sector size
   raw-posix: Add full image preallocation option
   qcow2: Add full image preallocation option
 
  block/qcow2.c | 64 
 +--
  block/raw-posix.c | 50 +--
  qapi-schema.json  | 12 +++
  3 files changed, 113 insertions(+), 13 deletions(-)

I've thought more about the lack of progress information during
posix_fallocate(3).  Users creating 10s or 100s of GB images may have to
wait for a long time if posix_fallocate(3) is implemented in userspace
(effectively dd if=/dev/zero of=vm.img).

But I think adding preallocate=full is a good starting point.  Modern
file systems implement it efficiently so the wait time should be
negligible.  If creating images takes too long we could consider a
fancier interface later.

I left comments on specific patches.



Re: [Qemu-devel] [PATCH 0/9] QMP: Introduce incremental drive-backup with in-memory dirty bitmap

2014-01-17 Thread Stefan Hajnoczi
On Mon, Jan 13, 2014 at 06:39:39PM +0800, Fam Zheng wrote:
 This implements incremental backup.
 
 A few new QMP commands related to dirty bitmap are added:
 
 dirty-bitmap-add *
 dirty-bitmap-disable *
 dirty-bitmap-remove
 
 (*: also supported as transactions)
 
 As their name implies, they manipulate a block device's dirty bitmap. This
 doesn't interfere with dirty bitmap used for migration, backup, mirror, etc,
 which don't have a name and are invisible to user. Only named bitmaps (created
 by dirty-bitmap-add) can be disabled/removed by user.
 
 They are added to support user controlled write tracking, so as to determine
 the range of date for incremental backup.
 
 A new sync mode for drive-backup is introduced:
 
 drive-backup device=.. mode=.. sync=dirty-bitmap bitmap=bitmap0
 
 Which will scan dirty bitmap bitmap0 and only copy all dirty sectors to
 target.
 
 Now, let's see the usage with a simple example:
 
 # Start the guest
 vm = VM()
 vm.start()
 
 # Fake some guest writes with qemu-io, this is before creating dirty
 # bitmap, so it won't be copied
 vm.hmp('qemu-io ide0-hd0 write -P 0xa 512k 1M')
 
 # Create a dirty bitmap to track writes
 vm.qmp(dirty-bitmap-add, device=ide0-hd0, name=dirty-0)
 
 # Fake some more guest writes with qemu-io, this will be copied
 vm.hmp('qemu-io ide0-hd0 write -P 0xa 512M 1M')
 
 # Now disable the first dirty bitmap, do the backup according to it,
 # at meantime continue to track dirty with a new dirty bitmap
 vm.qmp(transaction, actions=[
 {
 'type': 'dirty-bitmap-disable', 'data': {
 'device': 'ide0-hd0',
 'name': 'dirty-0'
 }
 }, {
 'type': 'dirty-bitmap-add', 'data': {
 'device': 'ide0-hd0',
 'name': 'dirty-1'
 }
 }, {
 'type': 'drive-backup', 'data': {
 'device': 'ide0-hd0',
 'target': '/tmp/incre.qcow2',
 'bitmap': 'dirty-0',
 'sync': 'dirty-bitmap'
 }
 }
 ])
 
 # Once backup job started, the dirty bitmap can be removed (actually only
 # hidden from user since it is still referenced by block job
 vm.qmp(dirty-bitmap-remove, device=ide0-hd0, name=dirty-0)

I'm interested in the lifecycle of a dirty bitmap (but haven't reviewed
the patches yet).  In particular, what happens if a bitmap is added to
more than one drive?  Is there a more elegant way to handle the disable,
drive-backup, remove step (we only need to explicitly disable because we
still need the bitmap name for the drive-backup command)?  Also what
happens if we add the bitmap again after disabling?

No need to answer all these questions, but it suggests the interface
exposes a bit of complexity.  Maybe it's possible to make it simpler and
easier to use?

 P.S. Persistent dirty bitmap could be built on top of this series, but is not
 yet implemented, because the storage format and integrity measures are not
 quite clear for now. The discussion is still open and any questions, ideas, 
 use
 cases and concerns are all welcome!

It's desirable to keep dirty bitmaps in separate files.  That way they
can be used with any image format (raw, qcow2, etc).

For performance, the dirty bitmap is maintained in memory.  Keeping the
persistent dirty bitmap consistent would be very expensive since it
requires an fdatasync(2) before each write request.

I think it's reasonable to only write out the in-memory bitmap on
shutdown or live migration.  If the QEMU process crashes it is not safe
to trust the dirty bitmap; a full backup must be performed.

The persistent bitmap file must contain:
1. Bitmap granularity (e.g. 64 KB)
2. The actual bitmap (1 TB disk @ 64 KB granularity = 2 MB bitmap)
3. Flags including a clean bit

When QEMU activates the persistent dirty bitmap, it clears the clean
flag.  When QEMU deactivates and finishes writing out the dirty bitmap,
it sets the clean flag.

The clean flag is used to tell whether the persistent bitmap file is
safe to use again.

The file contains no information about the VM or disk image.  It's up to
the user or management tool to keep track of files.



Re: [Qemu-devel] [PATCH 1/6] qemu-fd-exchange: provide common methods for exchange fd

2014-01-17 Thread Daniel P. Berrange
On Wed, Jan 08, 2014 at 05:12:51PM +0800, Lei Li wrote:
 Signed-off-by: Lei Li li...@linux.vnet.ibm.com
 ---
  include/qemu/fd-exchange.h |   25 +++
  util/Makefile.objs |1 +
  util/qemu-fd-exchange.c|   97 
 
  3 files changed, 123 insertions(+), 0 deletions(-)
  create mode 100644 include/qemu/fd-exchange.h
  create mode 100644 util/qemu-fd-exchange.c
 
 diff --git a/include/qemu/fd-exchange.h b/include/qemu/fd-exchange.h
 new file mode 100644
 index 000..6929026
 --- /dev/null
 +++ b/include/qemu/fd-exchange.h
 @@ -0,0 +1,25 @@
 +/*
 + * Internel common methods for exchange of FD
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2.  See
 + * the COPYING file in the top-level directory.
 + *
 + */
 +
 +#ifndef FD_EXCHANGE_H
 +#define FD_EXCHANGE_H
 +
 +#include sys/socket.h
 +
 +union MsgControl {
 +struct cmsghdr cmsg;
 +char control[CMSG_SPACE(sizeof(int))];
 +};
 +
 +ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
 +  const void *buf, size_t len);
 +
 +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
 +  void *buf, size_t len);
 +
 +#endif
 diff --git a/util/Makefile.objs b/util/Makefile.objs
 index af3e5cb..2fb42bf 100644
 --- a/util/Makefile.objs
 +++ b/util/Makefile.objs
 @@ -13,3 +13,4 @@ util-obj-y += hexdump.o
  util-obj-y += crc32c.o
  util-obj-y += throttle.o
  util-obj-y += getauxval.o
 +util-obj-y += qemu-fd-exchange.o
 diff --git a/util/qemu-fd-exchange.c b/util/qemu-fd-exchange.c
 new file mode 100644
 index 000..70a3206
 --- /dev/null
 +++ b/util/qemu-fd-exchange.c
 @@ -0,0 +1,97 @@
 +/*
 + * Internel common methods for exchange of FD
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2.  See
 + * the COPYING file in the top-level directory.
 + *
 + */
 +
 +#include qemu/fd-exchange.h
 +#include qemu-common.h
 +
 +
 +ssize_t qemu_send_with_fd(int sockfd, int passed_fd,
 +  const void *buf, size_t len)
 +{
 +struct msghdr msg;
 +struct iovec iov;
 +struct cmsghdr *cmsg;
 +union MsgControl msg_control;
 +int retval;
 +
 +iov.iov_base = (int *)buf;
 +iov.iov_len = len;
 +
 +memset(msg, 0, sizeof(msg));
 +msg.msg_iov = iov;
 +msg.msg_iovlen = len;
 +msg.msg_control = msg_control;
 +msg.msg_controllen = sizeof(msg_control);
 +
 +if (passed_fd  0) {
 +*(int *)buf = passed_fd;

You are casting 'char *buf' to an 'int *' but many of the
callers only pass in a pointer to a 'char buf[1]'. So you
are overflowing the array and also likely causing alignment
violations on ARM platforms.

 +} else {
 +msg.msg_control = msg_control;
 +msg.msg_controllen = sizeof(msg_control);
 +
 +cmsg = msg_control.cmsg;
 +cmsg-cmsg_len = CMSG_LEN(sizeof(passed_fd));
 +cmsg-cmsg_level = SOL_SOCKET;
 +cmsg-cmsg_type = SCM_RIGHTS;
 +memcpy(CMSG_DATA(cmsg), passed_fd, sizeof(passed_fd));
 +
 +}
 +
 +do {
 +retval = sendmsg(sockfd, msg, 0);
 +} while (retval  0  errno == EINTR);
 +
 +return retval;
 +}
 +
 +ssize_t qemu_recv_with_fd(int sockfd, int *passed_fd,
 +  void *buf, size_t len)
 +{
 +struct iovec iov;
 +struct msghdr msg;
 +struct cmsghdr *cmsg;
 +union MsgControl msg_control;
 +int retval;
 +int data = *(int *)buf;
 +
 +iov.iov_base = buf;
 +iov.iov_len = len;
 +
 +memset(msg, 0, sizeof(msg));
 +msg.msg_iov = iov;
 +msg.msg_iovlen = 1;
 +msg.msg_control = msg_control;
 +msg.msg_controllen = sizeof(msg_control);
 +
 +do {
 +retval = recvmsg(sockfd, msg, 0);
 +} while (retval  0  errno == EINTR);
 +
 +if (retval = 0) {
 +return retval;
 +}
 +
 +if (data != *(int *)buf) {
 +*passed_fd = data;
 +return 0;
 +}

Again cast issues

 +
 +for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
 +if (cmsg-cmsg_len != CMSG_LEN(sizeof(int)) ||
 +cmsg-cmsg_level != SOL_SOCKET ||
 +cmsg-cmsg_type != SCM_RIGHTS) {
 +continue;
 +}
 +
 +memcpy(passed_fd, CMSG_DATA(cmsg), sizeof(*passed_fd));
 +return 0;
 +}
 +
 +*passed_fd = -ENFILE;
 +return retval;
 +}
 -- 

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-devel] [PATCH 2/9] qapi: Add optional field name to block dirty bitmap

2014-01-17 Thread Stefan Hajnoczi
On Mon, Jan 13, 2014 at 06:39:41PM +0800, Fam Zheng wrote:
 diff --git a/block.c b/block.c
 index 6ad0368..16ef61b 100644
 --- a/block.c
 +++ b/block.c
 @@ -52,6 +52,7 @@
  struct BdrvDirtyBitmap {
  HBitmap *bitmap;
  int refcnt;
 +char name[1024];

Is there a reason for a fixed-size buffer?  g_strdup() is nicer than
arbitrary limits, especially since the lifecycle of BdrvDirtyBitmap is
well-defined and you just need to add a g_free().

 +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
 +  int granularity,
 +  const char *name,
 +  Error **errp)
  {
  int64_t bitmap_size;
  BdrvDirtyBitmap *bitmap;
  
  assert((granularity  (granularity - 1)) == 0);
  
 +if (name  bdrv_find_dirty_bitmap(bs, name)) {
 +if (errp) {
 +error_setg(errp, Bitmap already exists: %s, name);
 +}

error_setg() already checks that errp != NULL, you can drop the if
statement.

 diff --git a/block/mirror.c b/block/mirror.c
 index 2932bab..cc0665b 100644
 --- a/block/mirror.c
 +++ b/block/mirror.c
 @@ -598,7 +598,7 @@ static void mirror_start_job(BlockDriverState *bs, 
 BlockDriverState *target,
  s-granularity = granularity;
  s-buf_size = MAX(buf_size, granularity);
  
 -s-dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity);
 +s-dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);

No error return means the coroutine will be started even if creating the
dirty bitmap fails.  I didn't check what happens but this definitely
makes the reader wonder if the error will be handled cleanly.

 diff --git a/qapi-schema.json b/qapi-schema.json
 index b9b051c..e91143a 100644
 --- a/qapi-schema.json
 +++ b/qapi-schema.json
 @@ -920,6 +920,8 @@
  #
  # Block dirty bitmap information.
  #
 +# @name: the name of dirty bitmap

Since 2.0



[Qemu-devel] device_add error handling regressed in v1.5.0

2014-01-17 Thread Markus Armbruster
Watch this:

$ upstream-qemu -nodefaults -S -display none -monitor stdio
QEMU 1.7.50 monitor - type 'help' for more information
(qemu) device_add e1000,netdev=xxx
Property 'e1000.netdev' can't find value 'xxx'
(qemu) info qtree
bus: main-system-bus
  type System
  dev: hpet, id 
gpio-in 2
gpio-out 1
timers = 3
msi = off
hpet-intcap = 4
irq 32
mmio fed0/0400
  dev: ioapic, id 
gpio-in 24
irq 0
mmio fec0/1000
  dev: i440FX-pcihost, id 
pci-hole64-size = 16777216.000T
short_root_bus = 0
irq 0
bus: pci.0
  type PCI
  dev: e1000, id 
mac = 00:00:00:00:00:00
vlan = null
netdev = null
bootindex = -1
autonegotiation = on
mitigation = on
addr = unset
romfile = null
rombar = 1
multifunction = off
command_serr_enable = on
Segmentation fault (core dumped)

Even though device_add failed, it still created a node in the qtree!

Same issue observed with scsi-hd.  Looks like a qdev problem, not a
device problem.

git-bisect blames this one:

e0a83fc2c1582dc8d4453849852ebe6c258b7c3a is the first bad commit
commit e0a83fc2c1582dc8d4453849852ebe6c258b7c3a
Author: Paolo Bonzini pbonz...@redhat.com
Date:   Tue Apr 2 15:50:00 2013 +0200

qom: do nothing on unparent of object without parent

Otherwise, device_unparent will fail to get a canonical path of
the object.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Message-id: 1364910600-3418-1-git-send-email-pbonz...@redhat.com
Signed-off-by: Anthony Liguori aligu...@us.ibm.com



Re: [Qemu-devel] device_add error handling regressed in v1.5.0

2014-01-17 Thread Paolo Bonzini
Il 17/01/2014 11:23, Markus Armbruster ha scritto:
 e0a83fc2c1582dc8d4453849852ebe6c258b7c3a is the first bad commit
 commit e0a83fc2c1582dc8d4453849852ebe6c258b7c3a
 Author: Paolo Bonzini pbonz...@redhat.com
 Date:   Tue Apr 2 15:50:00 2013 +0200
 
 qom: do nothing on unparent of object without parent
 
 Otherwise, device_unparent will fail to get a canonical path of
 the object.
 
 Signed-off-by: Paolo Bonzini pbonz...@redhat.com
 Message-id: 1364910600-3418-1-git-send-email-pbonz...@redhat.com
 Signed-off-by: Anthony Liguori aligu...@us.ibm.com
 

I think Amos had a fix for this.

Paolo



Re: [Qemu-devel] [PATCH 1/2] kvm: initialize qemu_host_page_size

2014-01-17 Thread Paolo Bonzini
Il 16/01/2014 07:21, Alexey Kardashevskiy ha scritto:
 There is a HOST_PAGE_ALIGN macro which makes sense for KVM accelerator
 but it uses qemu_host_page_size/qemu_host_page_mask which initialized
 for TCG only.
 
 This moves qemu_host_page_size/qemu_host_page_mask initialization from
 TCG's page_init() and adds a call for it from kvm_init().
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  include/exec/exec-all.h |  1 +
  kvm-all.c   |  1 +
  translate-all.c | 14 --
  3 files changed, 10 insertions(+), 6 deletions(-)
 
 diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
 index ea90b64..3b03cbf 100644
 --- a/include/exec/exec-all.h
 +++ b/include/exec/exec-all.h
 @@ -81,6 +81,7 @@ void cpu_gen_init(void);
  int cpu_gen_code(CPUArchState *env, struct TranslationBlock *tb,
   int *gen_code_size_ptr);
  bool cpu_restore_state(CPUArchState *env, uintptr_t searched_pc);
 +void page_size_init(void);
  
  void QEMU_NORETURN cpu_resume_from_signal(CPUArchState *env1, void *puc);
  void QEMU_NORETURN cpu_io_recompile(CPUArchState *env, uintptr_t retaddr);
 diff --git a/kvm-all.c b/kvm-all.c
 index 0bfb060..edf2365 100644
 --- a/kvm-all.c
 +++ b/kvm-all.c
 @@ -1360,6 +1360,7 @@ int kvm_init(void)
   * page size for the system though.
   */
  assert(TARGET_PAGE_SIZE = getpagesize());
 +page_size_init();
  
  #ifdef KVM_CAP_SET_GUEST_DEBUG
  QTAILQ_INIT(s-kvm_sw_breakpoints);
 diff --git a/translate-all.c b/translate-all.c
 index 105c25a..543e1ff 100644
 --- a/translate-all.c
 +++ b/translate-all.c
 @@ -289,17 +289,15 @@ static inline void map_exec(void *addr, long size)
  }
  #endif
  
 -static void page_init(void)
 +void page_size_init(void)
  {
  /* NOTE: we can always suppose that qemu_host_page_size =
 TARGET_PAGE_SIZE */
  #ifdef _WIN32
 -{
 -SYSTEM_INFO system_info;
 +SYSTEM_INFO system_info;
  
 -GetSystemInfo(system_info);
 -qemu_real_host_page_size = system_info.dwPageSize;
 -}
 +GetSystemInfo(system_info);
 +qemu_real_host_page_size = system_info.dwPageSize;
  #else
  qemu_real_host_page_size = getpagesize();
  #endif
 @@ -310,7 +308,11 @@ static void page_init(void)
  qemu_host_page_size = TARGET_PAGE_SIZE;
  }
  qemu_host_page_mask = ~(qemu_host_page_size - 1);
 +}
  
 +static void page_init(void)
 +{
 +page_size_init();
  #if defined(CONFIG_BSD)  defined(CONFIG_USER_ONLY)
  {
  #ifdef HAVE_KINFO_GETVMMAP
 

Acked-by: Paolo Bonzini pbonz...@redhat.com



[Qemu-devel] [PATCH] target-ppc: ppc64 target's virtio can be either endian. (v3)

2014-01-17 Thread Greg Kurz
We base it on the OS endian, as reflected by the endianness of the
interrupt vectors (handled through the ILE bit in the LPCR register).

This patch implements virtio_get_byteswap() over LPCR.

Using first_cpu to fetch the registers from KVM may look arbitrary
and awkward, but it is okay because KVM sets/unsets the ILE bit on
all CPUs.

Changes in v3:
-  dropped the explicit calls to kvm_[get|put]_one_reg as LPCR is
   now properly handled by the generic SPR code.
Changes in v2:
- call cpu_synchronize_state() instead of kvm_arch_get_registers().

Suggested-by: Benjamin Herrenschmidt b...@kernel.crashing.org
Signed-off-by: Rusty Russell ru...@rustcorp.com.au
Signed-off-by: Greg Kurz gk...@linux.vnet.ibm.com
Reviewed-by: Alexander Graf ag...@suse.de
---
 target-ppc/misc_helper.c |   12 
 1 file changed, 12 insertions(+)

diff --git a/target-ppc/misc_helper.c b/target-ppc/misc_helper.c
index 616aab6..e8fc8a3 100644
--- a/target-ppc/misc_helper.c
+++ b/target-ppc/misc_helper.c
@@ -20,6 +20,8 @@
 #include helper.h
 
 #include helper_regs.h
+#include hw/virtio/virtio.h
+#include sysemu/kvm.h
 
 /*/
 /* SPR accesses */
@@ -116,3 +118,13 @@ void ppc_store_msr(CPUPPCState *env, target_ulong value)
 {
 hreg_store_msr(env, value, 0);
 }
+
+bool virtio_get_byteswap(void)
+{
+PowerPCCPU *cp = POWERPC_CPU(first_cpu);
+CPUPPCState *env = cp-env;
+
+cpu_synchronize_state(first_cpu);
+
+return env-spr[SPR_LPCR]  LPCR_ILE;
+}




[Qemu-devel] [PATCH v3 05/29] block: Detect unaligned length in bdrv_qiov_is_aligned()

2014-01-17 Thread Kevin Wolf
For an O_DIRECT request to succeed, it's not only necessary that all
base addresses in the qiov are aligned, but also that each length in it
is aligned.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block.c b/block.c
index 9a8562d..bf9ac5f 100644
--- a/block.c
+++ b/block.c
@@ -4566,6 +4566,9 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, 
QEMUIOVector *qiov)
 if ((uintptr_t) qiov-iov[i].iov_base % bs-buffer_alignment) {
 return false;
 }
+if (qiov-iov[i].iov_len % bs-buffer_alignment) {
+return false;
+}
 }
 
 return true;
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 03/29] block: Update BlockLimits when they might have changed

2014-01-17 Thread Kevin Wolf
When reopening with different flags, or when backing files disappear
from the chain, the limits may change. Make sure they get updated in
these cases.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 5 -
 block/stream.c| 2 ++
 include/block/block.h | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 20f85cb..9a8562d 100644
--- a/block.c
+++ b/block.c
@@ -479,7 +479,7 @@ int bdrv_create_file(const char* filename, 
QEMUOptionParameter *options,
 return ret;
 }
 
-static int bdrv_refresh_limits(BlockDriverState *bs)
+int bdrv_refresh_limits(BlockDriverState *bs)
 {
 BlockDriver *drv = bs-drv;
 
@@ -1464,6 +1464,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state)
 reopen_state-bs-enable_write_cache = !!(reopen_state-flags 
   BDRV_O_CACHE_WB);
 reopen_state-bs-read_only = !(reopen_state-flags  BDRV_O_RDWR);
+
+bdrv_refresh_limits(reopen_state-bs);
 }
 
 /*
@@ -2261,6 +2263,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, 
BlockDriverState *top,
 }
 new_top_bs-backing_hd = base_bs;
 
+bdrv_refresh_limits(new_top_bs);
 
 QSIMPLEQ_FOREACH_SAFE(intermediate_state, states_to_delete, entry, next) {
 /* so that bdrv_close() does not recursively close the chain */
diff --git a/block/stream.c b/block/stream.c
index 46bec7d..dd0b4ac 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -75,6 +75,8 @@ static void close_unused_images(BlockDriverState *top, 
BlockDriverState *base,
 unused-backing_hd = NULL;
 bdrv_unref(unused);
 }
+
+bdrv_refresh_limits(top);
 }
 
 static void coroutine_fn stream_run(void *opaque)
diff --git a/include/block/block.h b/include/block/block.h
index 36efaea..64fb319 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -249,6 +249,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
 void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
+int bdrv_refresh_limits(BlockDriverState *bs);
 int bdrv_commit(BlockDriverState *bs);
 int bdrv_commit_all(void);
 int bdrv_change_backing_file(BlockDriverState *bs,
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 00/29] block: Support for 512b-on-4k emulation

2014-01-17 Thread Kevin Wolf
This patch series adds code to the block layer that allows performing
I/O requests in smaller granularities than required by the host backend
(most importantly, O_DIRECT restrictions). It achieves this for reads
by rounding the request to host-side block boundary, and for writes by
performing a read-modify-write cycle (and serialising requests
touching the same block so that the RMW doesn't write back stale data).

Originally I intended to reuse a lot of code from Paolo's previous
patch series, however as I tried to integrate pread/pwrite, which
already do a very similar thing (except for considering concurrency),
and because I wanted to implement zero-copy, most of this series ended
up being new code.

Zero-copy is possible in a common case because while XFS defauls to a
4k sector size and therefore 4k on-disk O_DIRECT alignment for 512E
disks, it still only has a 512 byte memory alignment requirement.
(Unfortunately the XFS_IOC_DIOINFO ioctl claims 4k even for memory, but
we know that the value is wrong and can probe it.)


Changes in v2 - v3:
- Fixed I/O throttling bypass by converting to byte granularity [Wenchao]
- Made 'bytes' argument to tracked_request_overlaps() unsigned [Max]
- Fixed a corruption bug that came from using outdated RMW buffers after
  waiting for another request and added some assertions to check the
  assumptions [Peter]
- Fixed bytes vs. sectors error in zero-after-EOF code of
  bdrv_co_do_preadv [Max]
- Removed orphaned protoype in block.h [Max]
- A qemu-iotests case and some infrastructure to support it

Changes in v1 - v2:
- Fixed overlap_bytes calculation in mark_request_serialising()
- Fixed wait_serialising_requests() deadlock
- iscsi: Set bs-request_alignment [Peter]
- iscsi: Query block limits only in iscsi_open() when no other request
  are in flight, and in iscsi_refresh_limits() copy the stored values
  into bs-bl [Peter]

Changes in RFC - v1:
- Moved opt_mem_alignment into BlockLimits [Paolo]
- Changed BlockLimits in turn to work a bit more like the
  .bdrv_opt_mem_align() callback of the RFC; allows updating the
  BlockLimits later when the chain changes or bdrv_reopen() toggles
  O_DIRECT
- Fixed a typo in a commit message [Eric]


Kevin Wolf (26):
  block: Move initialisation of BlockLimits to bdrv_refresh_limits()
  block: Inherit opt_transfer_length
  block: Update BlockLimits when they might have changed
  qemu_memalign: Allow small alignments
  block: Detect unaligned length in bdrv_qiov_is_aligned()
  block: Don't use guest sector size for qemu_blockalign()
  block: Introduce bdrv_aligned_preadv()
  block: Introduce bdrv_co_do_preadv()
  block: Introduce bdrv_aligned_pwritev()
  block: write: Handle COR dependency after I/O throttling
  block: Introduce bdrv_co_do_pwritev()
  block: Switch BdrvTrackedRequest to byte granularity
  block: Allow waiting for overlapping requests between begin/end
  block: Make zero-after-EOF work with larger alignment
  block: Generalise and optimise COR serialisation
  block: Make overlap range for serialisation dynamic
  block: Allow wait_serialising_requests() at any point
  block: Align requests in bdrv_co_do_pwritev()
  block: Assert serialisation assumptions in pwritev
  block: Change coroutine wrapper to byte granularity
  block: Make bdrv_pread() a bdrv_prwv_co() wrapper
  block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper
  blkdebug: Make required alignment configurable
  qemu-io: New command 'sleep'
  qemu-iotests: Test pwritev RMW logic
  block: Switch bdrv_io_limits_intercept() to byte granularity

Paolo Bonzini (3):
  block: rename buffer_alignment to guest_block_size
  raw: Probe required direct I/O alignment
  iscsi: Set bs-request_alignment

 block.c| 644 +++--
 block/backup.c |   7 +-
 block/blkdebug.c   |  24 ++
 block/iscsi.c  |  47 ++--
 block/qcow2.c  |  11 +-
 block/qed.c|  11 +-
 block/raw-posix.c  | 102 +--
 block/raw-win32.c  |  41 +++
 block/stream.c |   2 +
 block/vmdk.c   |  22 +-
 hw/block/virtio-blk.c  |   2 +-
 hw/ide/core.c  |   2 +-
 hw/scsi/scsi-disk.c|   2 +-
 hw/scsi/scsi-generic.c |   2 +-
 include/block/block.h  |  15 +-
 include/block/block_int.h  |  27 +-
 qemu-io-cmds.c |  42 +++
 tests/qemu-iotests/077 | 278 +++
 tests/qemu-iotests/077.out | 202 ++
 tests/qemu-iotests/group   |   1 +
 util/oslib-posix.c |   5 +
 21 files changed, 1234 insertions(+), 255 deletions(-)
 create mode 100755 tests/qemu-iotests/077
 create mode 100644 tests/qemu-iotests/077.out

-- 
1.8.1.4




[Qemu-devel] [PATCH v3 04/29] qemu_memalign: Allow small alignments

2014-01-17 Thread Kevin Wolf
The functions used by qemu_memalign() require an alignment that is at
least sizeof(void*). Adjust it if it is too small.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 util/oslib-posix.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e00a44c..54f8932 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -85,6 +85,11 @@ void *qemu_oom_check(void *ptr)
 void *qemu_memalign(size_t alignment, size_t size)
 {
 void *ptr;
+
+if (alignment  sizeof(void*)) {
+alignment = sizeof(void*);
+}
+
 #if defined(_POSIX_C_SOURCE)  !defined(__sun__)
 int ret;
 ret = posix_memalign(ptr, alignment, size);
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 07/29] block: rename buffer_alignment to guest_block_size

2014-01-17 Thread Kevin Wolf
From: Paolo Bonzini pbonz...@redhat.com

The alignment field is now set to the value that is promised to the
guest, rather than required by the host.  The next patches will make
QEMU aware of the host-provided values, so make this clear.

The alignment is also not about memory buffers, but about the sectors on
the disk, change the documentation of the field.

At this point, the field is set by the device emulation, but completely
ignored by the block layer.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 10 +-
 hw/block/virtio-blk.c |  2 +-
 hw/ide/core.c |  2 +-
 hw/scsi/scsi-disk.c   |  2 +-
 hw/scsi/scsi-generic.c|  2 +-
 include/block/block.h |  2 +-
 include/block/block_int.h |  4 ++--
 7 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/block.c b/block.c
index 04c409a..b738abe 100644
--- a/block.c
+++ b/block.c
@@ -812,7 +812,7 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockDriverState *file,
 }
 
 bs-open_flags = flags;
-bs-buffer_alignment = 512;
+bs-guest_block_size = 512;
 bs-zero_beyond_eof = true;
 open_flags = bdrv_open_flags(bs, flags);
 bs-read_only = !(open_flags  BDRV_O_RDWR);
@@ -1648,7 +1648,7 @@ static void bdrv_move_feature_fields(BlockDriverState 
*bs_dest,
 bs_dest-dev_ops= bs_src-dev_ops;
 bs_dest-dev_opaque = bs_src-dev_opaque;
 bs_dest-dev= bs_src-dev;
-bs_dest-buffer_alignment   = bs_src-buffer_alignment;
+bs_dest-guest_block_size   = bs_src-guest_block_size;
 bs_dest-copy_on_read   = bs_src-copy_on_read;
 
 bs_dest-enable_write_cache = bs_src-enable_write_cache;
@@ -1800,7 +1800,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev)
 bs-dev = NULL;
 bs-dev_ops = NULL;
 bs-dev_opaque = NULL;
-bs-buffer_alignment = 512;
+bs-guest_block_size = 512;
 }
 
 /* TODO change to return DeviceState * when all users are qdevified */
@@ -4561,9 +4561,9 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
 return NULL;
 }
 
-void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
+void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
 {
-bs-buffer_alignment = align;
+bs-guest_block_size = align;
 }
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 19d0961..8a568e5 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -731,7 +731,7 @@ static void virtio_blk_device_realize(DeviceState *dev, 
Error **errp)
 register_savevm(dev, virtio-blk, virtio_blk_id++, 2,
 virtio_blk_save, virtio_blk_load, s);
 bdrv_set_dev_ops(s-bs, virtio_block_ops, s);
-bdrv_set_buffer_alignment(s-bs, s-conf-logical_block_size);
+bdrv_set_guest_block_size(s-bs, s-conf-logical_block_size);
 
 bdrv_iostatus_enable(s-bs);
 
diff --git a/hw/ide/core.c b/hw/ide/core.c
index e1f4c33..036cd4a 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -2103,7 +2103,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, 
IDEDriveKind kind,
 s-smart_selftest_count = 0;
 if (kind == IDE_CD) {
 bdrv_set_dev_ops(bs, ide_cd_block_ops, s);
-bdrv_set_buffer_alignment(bs, 2048);
+bdrv_set_guest_block_size(bs, 2048);
 } else {
 if (!bdrv_is_inserted(s-bs)) {
 error_report(Device needs media, but drive is empty);
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index bce617c..6491091 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2254,7 +2254,7 @@ static int scsi_initfn(SCSIDevice *dev)
 } else {
 bdrv_set_dev_ops(s-qdev.conf.bs, scsi_disk_block_ops, s);
 }
-bdrv_set_buffer_alignment(s-qdev.conf.bs, s-qdev.blocksize);
+bdrv_set_guest_block_size(s-qdev.conf.bs, s-qdev.blocksize);
 
 bdrv_iostatus_enable(s-qdev.conf.bs);
 add_boot_device_path(s-qdev.conf.bootindex, dev-qdev, NULL);
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 8f195be..f08b64e 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -210,7 +210,7 @@ static void scsi_read_complete(void * opaque, int ret)
 s-blocksize = ldl_be_p(r-buf[8]);
 s-max_lba = ldq_be_p(r-buf[0]);
 }
-bdrv_set_buffer_alignment(s-conf.bs, s-blocksize);
+bdrv_set_guest_block_size(s-conf.bs, s-blocksize);
 
 scsi_req_data(r-req, len);
 if (!r-req.io_canceled) {
diff --git a/include/block/block.h b/include/block/block.h
index cf63ee2..05252d5 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -422,7 +422,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
 /* Returns the alignment in bytes that is required so that no bounce buffer
  * is required throughout the stack */
 size_t 

[Qemu-devel] [PATCH v3 09/29] block: Introduce bdrv_aligned_preadv()

2014-01-17 Thread Kevin Wolf
This separates the part of bdrv_co_do_readv() that needs to happen
before the request is modified to match the backend alignment, and a
part that needs to be executed afterwards and passes the request to the
BlockDriver.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 61 +++--
 1 file changed, 43 insertions(+), 18 deletions(-)

diff --git a/block.c b/block.c
index 25ae896..0c2baf5 100644
--- a/block.c
+++ b/block.c
@@ -2705,26 +2705,24 @@ err:
 }
 
 /*
- * Handle a read request in coroutine context
+ * Forwards an already correctly aligned request to the BlockDriver. This
+ * handles copy on read and zeroing after EOF; any other features must be
+ * implemented by the caller.
  */
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
-BdrvRequestFlags flags)
+static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
+int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
 {
 BlockDriver *drv = bs-drv;
 BdrvTrackedRequest req;
 int ret;
 
-if (!drv) {
-return -ENOMEDIUM;
-}
-if (bdrv_check_request(bs, sector_num, nb_sectors)) {
-return -EIO;
-}
+int64_t sector_num = offset  BDRV_SECTOR_BITS;
+unsigned int nb_sectors = bytes  BDRV_SECTOR_BITS;
 
-if (bs-copy_on_read) {
-flags |= BDRV_REQ_COPY_ON_READ;
-}
+assert((offset  (BDRV_SECTOR_SIZE - 1)) == 0);
+assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
+
+/* Handle Copy on Read and associated serialisation */
 if (flags  BDRV_REQ_COPY_ON_READ) {
 bs-copy_on_read_in_flight++;
 }
@@ -2733,11 +2731,6 @@ static int coroutine_fn 
bdrv_co_do_readv(BlockDriverState *bs,
 wait_for_overlapping_requests(bs, sector_num, nb_sectors);
 }
 
-/* throttling disk I/O */
-if (bs-io_limits_enabled) {
-bdrv_io_limits_intercept(bs, nb_sectors, false);
-}
-
 tracked_request_begin(req, bs, sector_num, nb_sectors, false);
 
 if (flags  BDRV_REQ_COPY_ON_READ) {
@@ -2754,6 +2747,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState 
*bs,
 }
 }
 
+/* Forward the request to the BlockDriver */
 if (!(bs-zero_beyond_eof  bs-growable)) {
 ret = drv-bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 } else {
@@ -2794,6 +2788,37 @@ out:
 return ret;
 }
 
+/*
+ * Handle a read request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+BdrvRequestFlags flags)
+{
+BlockDriver *drv = bs-drv;
+int ret;
+
+if (!drv) {
+return -ENOMEDIUM;
+}
+if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+return -EIO;
+}
+
+if (bs-copy_on_read) {
+flags |= BDRV_REQ_COPY_ON_READ;
+}
+
+/* throttling disk I/O */
+if (bs-io_limits_enabled) {
+bdrv_io_limits_intercept(bs, nb_sectors, false);
+}
+
+ret = bdrv_aligned_preadv(bs, sector_num  BDRV_SECTOR_BITS,
+ nb_sectors  BDRV_SECTOR_BITS, qiov, flags);
+return ret;
+}
+
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 10/29] block: Introduce bdrv_co_do_preadv()

2014-01-17 Thread Kevin Wolf
Similar to bdrv_pread(), which aligns byte-aligned request to 512 byte
sectors, bdrv_co_do_preadv() takes a byte-aligned request and aligns it
to the alignment specified in bs-request_alignment.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c | 64 ++--
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 0c2baf5..f378a10 100644
--- a/block.c
+++ b/block.c
@@ -2791,17 +2791,23 @@ out:
 /*
  * Handle a read request in coroutine context
  */
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags)
 {
 BlockDriver *drv = bs-drv;
+/* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+uint64_t align = MAX(BDRV_SECTOR_SIZE, bs-request_alignment);
+uint8_t *head_buf = NULL;
+uint8_t *tail_buf = NULL;
+QEMUIOVector local_qiov;
+bool use_local_qiov = false;
 int ret;
 
 if (!drv) {
 return -ENOMEDIUM;
 }
-if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+if (bdrv_check_byte_request(bs, offset, bytes)) {
 return -EIO;
 }
 
@@ -2811,14 +2817,60 @@ static int coroutine_fn 
bdrv_co_do_readv(BlockDriverState *bs,
 
 /* throttling disk I/O */
 if (bs-io_limits_enabled) {
-bdrv_io_limits_intercept(bs, nb_sectors, false);
+/* TODO Switch to byte granularity */
+bdrv_io_limits_intercept(bs, bytes  BDRV_SECTOR_BITS, false);
+}
+
+/* Align read if necessary by padding qiov */
+if (offset  (align - 1)) {
+head_buf = qemu_blockalign(bs, align);
+qemu_iovec_init(local_qiov, qiov-niov + 2);
+qemu_iovec_add(local_qiov, head_buf, offset  (align - 1));
+qemu_iovec_concat(local_qiov, qiov, 0, qiov-size);
+use_local_qiov = true;
+
+bytes += offset  (align - 1);
+offset = offset  ~(align - 1);
+}
+
+if ((offset + bytes)  (align - 1)) {
+if (!use_local_qiov) {
+qemu_iovec_init(local_qiov, qiov-niov + 1);
+qemu_iovec_concat(local_qiov, qiov, 0, qiov-size);
+use_local_qiov = true;
+}
+tail_buf = qemu_blockalign(bs, align);
+qemu_iovec_add(local_qiov, tail_buf,
+   align - ((offset + bytes)  (align - 1)));
+
+bytes = ROUND_UP(bytes, align);
+}
+
+ret = bdrv_aligned_preadv(bs, offset, bytes,
+  use_local_qiov ? local_qiov : qiov,
+  flags);
+
+if (use_local_qiov) {
+qemu_iovec_destroy(local_qiov);
+qemu_vfree(head_buf);
+qemu_vfree(tail_buf);
 }
 
-ret = bdrv_aligned_preadv(bs, sector_num  BDRV_SECTOR_BITS,
- nb_sectors  BDRV_SECTOR_BITS, qiov, flags);
 return ret;
 }
 
+static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
+int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+BdrvRequestFlags flags)
+{
+if (nb_sectors  0 || nb_sectors  (UINT_MAX  BDRV_SECTOR_BITS)) {
+return -EINVAL;
+}
+
+return bdrv_co_do_preadv(bs, sector_num  BDRV_SECTOR_BITS,
+ nb_sectors  BDRV_SECTOR_BITS, qiov, flags);
+}
+
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 08/29] raw: Probe required direct I/O alignment

2014-01-17 Thread Kevin Wolf
From: Paolo Bonzini pbonz...@redhat.com

Add a bs-request_alignment field that contains the required
offset/length alignment for I/O requests and fill it in the raw block
drivers. Use ioctls if possible, else see what alignment it takes for
O_DIRECT to succeed.

While at it, also expose the memory alignment requirements, which may be
(and in practice are) different from the disk alignment requirements.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   |   3 ++
 block/raw-posix.c | 102 ++
 block/raw-win32.c |  41 +++
 include/block/block_int.h |   3 ++
 4 files changed, 132 insertions(+), 17 deletions(-)

diff --git a/block.c b/block.c
index b738abe..25ae896 100644
--- a/block.c
+++ b/block.c
@@ -813,6 +813,7 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockDriverState *file,
 
 bs-open_flags = flags;
 bs-guest_block_size = 512;
+bs-request_alignment = 512;
 bs-zero_beyond_eof = true;
 open_flags = bdrv_open_flags(bs, flags);
 bs-read_only = !(open_flags  BDRV_O_RDWR);
@@ -881,6 +882,8 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockDriverState *file,
 }
 
 bdrv_refresh_limits(bs);
+assert(bdrv_opt_mem_align(bs) != 0);
+assert(bs-request_alignment != 0);
 
 #ifndef _WIN32
 if (bs-is_temporary) {
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 0676037..126a634 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -127,6 +127,8 @@ typedef struct BDRVRawState {
 int fd;
 int type;
 int open_flags;
+size_t buf_align;
+
 #if defined(__linux__)
 /* linux floppy specific */
 int64_t fd_open_time;
@@ -213,6 +215,76 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif
 
+static void raw_probe_alignment(BlockDriverState *bs)
+{
+BDRVRawState *s = bs-opaque;
+char *buf;
+unsigned int sector_size;
+
+/* For /dev/sg devices the alignment is not really used.
+   With buffered I/O, we don't have any restrictions. */
+if (bs-sg || !(s-open_flags  O_DIRECT)) {
+bs-request_alignment = 1;
+s-buf_align = 1;
+return;
+}
+
+/* Try a few ioctls to get the right size */
+bs-request_alignment = 0;
+s-buf_align = 0;
+
+#ifdef BLKSSZGET
+if (ioctl(s-fd, BLKSSZGET, sector_size) = 0) {
+bs-request_alignment = sector_size;
+}
+#endif
+#ifdef DKIOCGETBLOCKSIZE
+if (ioctl(s-fd, DKIOCGETBLOCKSIZE, sector_size) = 0) {
+bs-request_alignment = sector_size;
+}
+#endif
+#ifdef DIOCGSECTORSIZE
+if (ioctl(s-fd, DIOCGSECTORSIZE, sector_size) = 0) {
+bs-request_alignment = sector_size;
+}
+#endif
+#ifdef CONFIG_XFS
+if (s-is_xfs) {
+struct dioattr da;
+if (xfsctl(NULL, s-fd, XFS_IOC_DIOINFO, da) = 0) {
+bs-request_alignment = da.d_miniosz;
+/* The kernel returns wrong information for d_mem */
+/* s-buf_align = da.d_mem; */
+}
+}
+#endif
+
+/* If we could not get the sizes so far, we can only guess them */
+if (!s-buf_align) {
+size_t align;
+buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
+for (align = 512; align = MAX_BLOCKSIZE; align = 1) {
+if (pread(s-fd, buf + align, MAX_BLOCKSIZE, 0) = 0) {
+s-buf_align = align;
+break;
+}
+}
+qemu_vfree(buf);
+}
+
+if (!bs-request_alignment) {
+size_t align;
+buf = qemu_memalign(s-buf_align, MAX_BLOCKSIZE);
+for (align = 512; align = MAX_BLOCKSIZE; align = 1) {
+if (pread(s-fd, buf, align, 0) = 0) {
+bs-request_alignment = align;
+break;
+}
+}
+qemu_vfree(buf);
+}
+}
+
 static void raw_parse_flags(int bdrv_flags, int *open_flags)
 {
 assert(open_flags != NULL);
@@ -463,7 +535,6 @@ static int raw_reopen_prepare(BDRVReopenState *state,
 return ret;
 }
 
-
 static void raw_reopen_commit(BDRVReopenState *state)
 {
 BDRVRawReopenState *raw_s = state-opaque;
@@ -499,23 +570,15 @@ static void raw_reopen_abort(BDRVReopenState *state)
 state-opaque = NULL;
 }
 
+static int raw_refresh_limits(BlockDriverState *bs)
+{
+BDRVRawState *s = bs-opaque;
 
-/* XXX: use host sector size if necessary with:
-#ifdef DIOCGSECTORSIZE
-{
-unsigned int sectorsize = 512;
-if (!ioctl(fd, DIOCGSECTORSIZE, sectorsize) 
-sectorsize  bufsize)
-bufsize = sectorsize;
-}
-#endif
-#ifdef CONFIG_COCOA
-uint32_t blockSize = 512;
-if ( !ioctl( fd, DKIOCGETBLOCKSIZE, blockSize )  blockSize  
bufsize) {
-bufsize = blockSize;
-}
-#endif
-*/
+raw_probe_alignment(bs);
+bs-bl.opt_mem_alignment = s-buf_align;
+
+

[Qemu-devel] [PATCH v3 01/29] block: Move initialisation of BlockLimits to bdrv_refresh_limits()

2014-01-17 Thread Kevin Wolf
This function separates filling the BlockLimits from bdrv_open(), which
allows it to call it from other operations which may change the limits
(e.g. modifications to the backing file chain or bdrv_reopen)

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 19 +++
 block/iscsi.c | 46 +-
 block/qcow2.c | 11 ++-
 block/qed.c   | 11 ++-
 block/vmdk.c  | 22 ++
 include/block/block_int.h |  2 ++
 6 files changed, 88 insertions(+), 23 deletions(-)

diff --git a/block.c b/block.c
index 64e7d22..99e69da 100644
--- a/block.c
+++ b/block.c
@@ -479,6 +479,19 @@ int bdrv_create_file(const char* filename, 
QEMUOptionParameter *options,
 return ret;
 }
 
+static int bdrv_refresh_limits(BlockDriverState *bs)
+{
+BlockDriver *drv = bs-drv;
+
+memset(bs-bl, 0, sizeof(bs-bl));
+
+if (drv  drv-bdrv_refresh_limits) {
+return drv-bdrv_refresh_limits(bs);
+}
+
+return 0;
+}
+
 /*
  * Create a uniquely-named empty temporary file.
  * Return 0 upon success, otherwise a negative errno value.
@@ -833,6 +846,8 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockDriverState *file,
 goto free_and_fail;
 }
 
+bdrv_refresh_limits(bs);
+
 #ifndef _WIN32
 if (bs-is_temporary) {
 assert(bs-filename[0] != '\0');
@@ -1018,6 +1033,10 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*options, Error **errp)
 }
 pstrcpy(bs-backing_file, sizeof(bs-backing_file),
 bs-backing_hd-file-filename);
+
+/* Recalculate the BlockLimits with the backing file */
+bdrv_refresh_limits(bs);
+
 return 0;
 }
 
diff --git a/block/iscsi.c b/block/iscsi.c
index c0ea0c4..3202dc5 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1265,23 +1265,6 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
sizeof(struct scsi_inquiry_block_limits));
 scsi_free_scsi_task(task);
 task = NULL;
-
-if (iscsilun-bl.max_unmap  0x) {
-bs-bl.max_discard = sector_lun2qemu(iscsilun-bl.max_unmap,
- iscsilun);
-}
-bs-bl.discard_alignment = sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
-   iscsilun);
-
-if (iscsilun-bl.max_ws_len  0x) {
-bs-bl.max_write_zeroes = sector_lun2qemu(iscsilun-bl.max_ws_len,
-  iscsilun);
-}
-bs-bl.write_zeroes_alignment = 
sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
-iscsilun);
-
-bs-bl.opt_transfer_length = sector_lun2qemu(iscsilun-bl.opt_xfer_len,
- iscsilun);
 }
 
 #if defined(LIBISCSI_FEATURE_NOP_COUNTER)
@@ -1326,6 +1309,34 @@ static void iscsi_close(BlockDriverState *bs)
 memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
+static int iscsi_refresh_limits(BlockDriverState *bs)
+{
+IscsiLun *iscsilun = bs-opaque;
+
+/* We don't actually refresh here, but just return data queried in
+ * iscsi_open(): iscsi targets don't change their limits. */
+if (iscsilun-lbp.lbpu || iscsilun-lbp.lbpws) {
+if (iscsilun-bl.max_unmap  0x) {
+bs-bl.max_discard = sector_lun2qemu(iscsilun-bl.max_unmap,
+ iscsilun);
+}
+bs-bl.discard_alignment = sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
+   iscsilun);
+
+if (iscsilun-bl.max_ws_len  0x) {
+bs-bl.max_write_zeroes = sector_lun2qemu(iscsilun-bl.max_ws_len,
+  iscsilun);
+}
+bs-bl.write_zeroes_alignment = 
sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
+iscsilun);
+
+bs-bl.opt_transfer_length = sector_lun2qemu(iscsilun-bl.opt_xfer_len,
+ iscsilun);
+}
+
+return 0;
+}
+
 static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 {
 IscsiLun *iscsilun = bs-opaque;
@@ -1438,6 +1449,7 @@ static BlockDriver bdrv_iscsi = {
 .bdrv_getlength  = iscsi_getlength,
 .bdrv_get_info   = iscsi_get_info,
 .bdrv_truncate   = iscsi_truncate,
+.bdrv_refresh_limits = iscsi_refresh_limits,
 
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
 .bdrv_co_get_block_status = iscsi_co_get_block_status,
diff --git a/block/qcow2.c b/block/qcow2.c
index 8ec9db1..6562994 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -718,7 +718,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, 
int flags,
 }
 
 qemu_opts_del(opts);
-

[Qemu-devel] [PATCH v3 06/29] block: Don't use guest sector size for qemu_blockalign()

2014-01-17 Thread Kevin Wolf
bs-buffer_alignment is set by the device emulation and contains the
logical block size of the guest device. This isn't something that the
block layer should know, and even less something to use for determining
the right alignment of buffers to be used for the host.

The new BlockLimits field opt_mem_alignment tells the qemu block layer
the optimal alignment to be used so that no bounce buffer must be used
in the driver.

This patch may change the buffer alignment from 4k to 512 for all
callers that used qemu_blockalign() with the top-level image format
BlockDriverState. The value was never propagated to other levels in the
tree, so in particular raw-posix never required anything else than 512.

While on disks with 4k sectors direct I/O requires a 4k alignment,
memory may still be okay when aligned to 512 byte boundaries. This is
what must have happened in practice, because otherwise this would
already have failed earlier. Therefore I don't expect regressions even
with this intermediate state. Later, raw-posix can implement the hook
and expose a different memory alignment requirement.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 23 ---
 include/block/block.h |  3 +++
 include/block/block_int.h |  3 +++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index bf9ac5f..04c409a 100644
--- a/block.c
+++ b/block.c
@@ -214,6 +214,16 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
 qemu_co_queue_next(bs-throttled_reqs[is_write]);
 }
 
+size_t bdrv_opt_mem_align(BlockDriverState *bs)
+{
+if (!bs || !bs-drv) {
+/* 4k should be on the safe side */
+return 4096;
+}
+
+return bs-bl.opt_mem_alignment;
+}
+
 /* check if the path starts with protocol: */
 static int path_has_protocol(const char *path)
 {
@@ -493,6 +503,9 @@ int bdrv_refresh_limits(BlockDriverState *bs)
 if (bs-file) {
 bdrv_refresh_limits(bs-file);
 bs-bl.opt_transfer_length = bs-file-bl.opt_transfer_length;
+bs-bl.opt_mem_alignment = bs-file-bl.opt_mem_alignment;
+} else {
+bs-bl.opt_mem_alignment = 512;
 }
 
 if (bs-backing_hd) {
@@ -500,6 +513,9 @@ int bdrv_refresh_limits(BlockDriverState *bs)
 bs-bl.opt_transfer_length =
 MAX(bs-bl.opt_transfer_length,
 bs-backing_hd-bl.opt_transfer_length);
+bs-bl.opt_mem_alignment =
+MAX(bs-bl.opt_mem_alignment,
+bs-backing_hd-bl.opt_mem_alignment);
 }
 
 /* Then let the driver override it */
@@ -4552,7 +4568,7 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int 
align)
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
-return qemu_memalign((bs  bs-buffer_alignment) ? bs-buffer_alignment : 
512, size);
+return qemu_memalign(bdrv_opt_mem_align(bs), size);
 }
 
 /*
@@ -4561,12 +4577,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
 int i;
+size_t alignment = bdrv_opt_mem_align(bs);
 
 for (i = 0; i  qiov-niov; i++) {
-if ((uintptr_t) qiov-iov[i].iov_base % bs-buffer_alignment) {
+if ((uintptr_t) qiov-iov[i].iov_base % alignment) {
 return false;
 }
-if (qiov-iov[i].iov_len % bs-buffer_alignment) {
+if (qiov-iov[i].iov_len % alignment) {
 return false;
 }
 }
diff --git a/include/block/block.h b/include/block/block.h
index 64fb319..cf63ee2 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -419,6 +419,9 @@ void bdrv_img_create(const char *filename, const char *fmt,
  char *options, uint64_t img_size, int flags,
  Error **errp, bool quiet);
 
+/* Returns the alignment in bytes that is required so that no bounce buffer
+ * is required throughout the stack */
+size_t bdrv_opt_mem_align(BlockDriverState *bs);
 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 3dfe724..a602e14 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -252,6 +252,9 @@ typedef struct BlockLimits {
 
 /* optimal transfer length in sectors */
 int opt_transfer_length;
+
+/* memory alignment so that no bounce buffer is needed */
+size_t opt_mem_alignment;
 } BlockLimits;
 
 /*
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 16/29] block: Make zero-after-EOF work with larger alignment

2014-01-17 Thread Kevin Wolf
Odd file sizes could make bdrv_aligned_preadv() shorten the request in
non-aligned ways. Fix it by rounding to the required alignment instead
of 512 bytes.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 18dcd43..d7156ce 100644
--- a/block.c
+++ b/block.c
@@ -2730,7 +2730,7 @@ err:
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
-QEMUIOVector *qiov, int flags)
+int64_t align, QEMUIOVector *qiov, int flags)
 {
 BlockDriver *drv = bs-drv;
 int ret;
@@ -2778,7 +2778,8 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 }
 
 total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE);
-max_nb_sectors = MAX(0, total_sectors - sector_num);
+max_nb_sectors = MAX(0, ROUND_UP(total_sectors - sector_num,
+ align  BDRV_SECTOR_BITS));
 if (max_nb_sectors  0) {
 ret = drv-bdrv_co_readv(bs, sector_num,
  MIN(nb_sectors, max_nb_sectors), qiov);
@@ -2864,7 +2865,7 @@ static int coroutine_fn 
bdrv_co_do_preadv(BlockDriverState *bs,
 }
 
 tracked_request_begin(req, bs, offset, bytes, false);
-ret = bdrv_aligned_preadv(bs, req, offset, bytes,
+ret = bdrv_aligned_preadv(bs, req, offset, bytes, align,
   use_local_qiov ? local_qiov : qiov,
   flags);
 tracked_request_end(req);
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 02/29] block: Inherit opt_transfer_length

2014-01-17 Thread Kevin Wolf
When there is a format driver between the backend, it's not guaranteed
that exposing the opt_transfer_length for the format driver results in
the optimal requests (because of fragmentation etc.), but it can't make
things worse, so let's just do it.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 99e69da..20f85cb 100644
--- a/block.c
+++ b/block.c
@@ -485,7 +485,25 @@ static int bdrv_refresh_limits(BlockDriverState *bs)
 
 memset(bs-bl, 0, sizeof(bs-bl));
 
-if (drv  drv-bdrv_refresh_limits) {
+if (!drv) {
+return 0;
+}
+
+/* Take some limits from the children as a default */
+if (bs-file) {
+bdrv_refresh_limits(bs-file);
+bs-bl.opt_transfer_length = bs-file-bl.opt_transfer_length;
+}
+
+if (bs-backing_hd) {
+bdrv_refresh_limits(bs-backing_hd);
+bs-bl.opt_transfer_length =
+MAX(bs-bl.opt_transfer_length,
+bs-backing_hd-bl.opt_transfer_length);
+}
+
+/* Then let the driver override it */
+if (drv-bdrv_refresh_limits) {
 return drv-bdrv_refresh_limits(bs);
 }
 
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 11/29] block: Introduce bdrv_aligned_pwritev()

2014-01-17 Thread Kevin Wolf
This separates the part of bdrv_co_do_writev() that needs to happen
before the request is modified to match the backend alignment, and a
part that needs to be executed afterwards and passes the request to the
BlockDriver.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 62 +-
 1 file changed, 41 insertions(+), 21 deletions(-)

diff --git a/block.c b/block.c
index f378a10..70b72f0 100644
--- a/block.c
+++ b/block.c
@@ -2964,34 +2964,20 @@ static int coroutine_fn 
bdrv_co_do_write_zeroes(BlockDriverState *bs,
 }
 
 /*
- * Handle a write request in coroutine context
+ * Forwards an already correctly aligned write request to the BlockDriver.
  */
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
-BdrvRequestFlags flags)
+static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
+int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
 {
 BlockDriver *drv = bs-drv;
 BdrvTrackedRequest req;
 int ret;
 
-if (!bs-drv) {
-return -ENOMEDIUM;
-}
-if (bs-read_only) {
-return -EACCES;
-}
-if (bdrv_check_request(bs, sector_num, nb_sectors)) {
-return -EIO;
-}
-
-if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, sector_num, nb_sectors);
-}
+int64_t sector_num = offset  BDRV_SECTOR_BITS;
+unsigned int nb_sectors = bytes  BDRV_SECTOR_BITS;
 
-/* throttling disk I/O */
-if (bs-io_limits_enabled) {
-bdrv_io_limits_intercept(bs, nb_sectors, true);
-}
+assert((offset  (BDRV_SECTOR_SIZE - 1)) == 0);
+assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
 
 tracked_request_begin(req, bs, sector_num, nb_sectors, true);
 
@@ -3023,6 +3009,40 @@ static int coroutine_fn 
bdrv_co_do_writev(BlockDriverState *bs,
 return ret;
 }
 
+/*
+ * Handle a write request in coroutine context
+ */
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+BdrvRequestFlags flags)
+{
+int ret;
+
+if (!bs-drv) {
+return -ENOMEDIUM;
+}
+if (bs-read_only) {
+return -EACCES;
+}
+if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+return -EIO;
+}
+
+if (bs-copy_on_read_in_flight) {
+wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+}
+
+/* throttling disk I/O */
+if (bs-io_limits_enabled) {
+bdrv_io_limits_intercept(bs, nb_sectors, true);
+}
+
+ret = bdrv_aligned_pwritev(bs, sector_num  BDRV_SECTOR_BITS,
+   nb_sectors  BDRV_SECTOR_BITS, qiov, flags);
+
+return ret;
+}
+
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 13/29] block: Introduce bdrv_co_do_pwritev()

2014-01-17 Thread Kevin Wolf
This is going to become the bdrv_co_do_preadv() equivalent for writes.
In this patch, however, just a function taking byte offsets is created,
it doesn't align anything yet.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index fe55217..328f592 100644
--- a/block.c
+++ b/block.c
@@ -3016,8 +3016,8 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 /*
  * Handle a write request in coroutine context
  */
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags)
 {
 int ret;
@@ -3028,21 +3028,33 @@ static int coroutine_fn 
bdrv_co_do_writev(BlockDriverState *bs,
 if (bs-read_only) {
 return -EACCES;
 }
-if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+if (bdrv_check_byte_request(bs, offset, bytes)) {
 return -EIO;
 }
 
 /* throttling disk I/O */
 if (bs-io_limits_enabled) {
-bdrv_io_limits_intercept(bs, nb_sectors, true);
+/* TODO Switch to byte granularity */
+bdrv_io_limits_intercept(bs, bytes  BDRV_SECTOR_BITS, true);
 }
 
-ret = bdrv_aligned_pwritev(bs, sector_num  BDRV_SECTOR_BITS,
-   nb_sectors  BDRV_SECTOR_BITS, qiov, flags);
+ret = bdrv_aligned_pwritev(bs, offset, bytes, qiov, flags);
 
 return ret;
 }
 
+static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
+int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+BdrvRequestFlags flags)
+{
+if (nb_sectors  0 || nb_sectors  (INT_MAX  BDRV_SECTOR_BITS)) {
+return -EINVAL;
+}
+
+return bdrv_co_do_pwritev(bs, sector_num  BDRV_SECTOR_BITS,
+  nb_sectors  BDRV_SECTOR_BITS, qiov, flags);
+}
+
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
 int nb_sectors, QEMUIOVector *qiov)
 {
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 12/29] block: write: Handle COR dependency after I/O throttling

2014-01-17 Thread Kevin Wolf
First waiting for all COR requests to complete and calling the
throttling function afterwards means that the request could be delayed
and we still need to wait for the COR request even if it was issued only
after the throttled write request.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index 70b72f0..fe55217 100644
--- a/block.c
+++ b/block.c
@@ -2979,6 +2979,10 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 assert((offset  (BDRV_SECTOR_SIZE - 1)) == 0);
 assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
 
+if (bs-copy_on_read_in_flight) {
+wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+}
+
 tracked_request_begin(req, bs, sector_num, nb_sectors, true);
 
 ret = notifier_with_return_list_notify(bs-before_write_notifiers, req);
@@ -3028,10 +3032,6 @@ static int coroutine_fn 
bdrv_co_do_writev(BlockDriverState *bs,
 return -EIO;
 }
 
-if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, sector_num, nb_sectors);
-}
-
 /* throttling disk I/O */
 if (bs-io_limits_enabled) {
 bdrv_io_limits_intercept(bs, nb_sectors, true);
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 15/29] block: Allow waiting for overlapping requests between begin/end

2014-01-17 Thread Kevin Wolf
Previously, it was not possible to use wait_for_overlapping_requests()
between tracked_request_begin()/end() because it would wait for itself.

Ignore the current request in the overlap check and run more of the
bdrv_co_do_preadv/pwritev code with a BdrvTrackedRequest present.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 38 --
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/block.c b/block.c
index 85f28ab..18dcd43 100644
--- a/block.c
+++ b/block.c
@@ -2106,7 +2106,7 @@ static bool tracked_request_overlaps(BdrvTrackedRequest 
*req,
 }
 
 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-int64_t offset, unsigned int bytes)
+BdrvTrackedRequest *self, int64_t offset, unsigned int bytes)
 {
 BdrvTrackedRequest *req;
 int64_t cluster_offset;
@@ -2124,6 +2124,9 @@ static void coroutine_fn 
wait_for_overlapping_requests(BlockDriverState *bs,
 do {
 retry = false;
 QLIST_FOREACH(req, bs-tracked_requests, list) {
+if (req == self) {
+continue;
+}
 if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
 /* Hitting this means there was a reentrant request, for
  * example, a block driver issuing nested requests.  This must
@@ -2726,10 +2729,10 @@ err:
  * implemented by the caller.
  */
 static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
-int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
+BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+QEMUIOVector *qiov, int flags)
 {
 BlockDriver *drv = bs-drv;
-BdrvTrackedRequest req;
 int ret;
 
 int64_t sector_num = offset  BDRV_SECTOR_BITS;
@@ -2744,11 +2747,9 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 }
 
 if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, offset, bytes);
+wait_for_overlapping_requests(bs, req, offset, bytes);
 }
 
-tracked_request_begin(req, bs, offset, bytes, false);
-
 if (flags  BDRV_REQ_COPY_ON_READ) {
 int pnum;
 
@@ -2795,8 +2796,6 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 }
 
 out:
-tracked_request_end(req);
-
 if (flags  BDRV_REQ_COPY_ON_READ) {
 bs-copy_on_read_in_flight--;
 }
@@ -2812,6 +2811,8 @@ static int coroutine_fn 
bdrv_co_do_preadv(BlockDriverState *bs,
 BdrvRequestFlags flags)
 {
 BlockDriver *drv = bs-drv;
+BdrvTrackedRequest req;
+
 /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
 uint64_t align = MAX(BDRV_SECTOR_SIZE, bs-request_alignment);
 uint8_t *head_buf = NULL;
@@ -2862,9 +2863,11 @@ static int coroutine_fn 
bdrv_co_do_preadv(BlockDriverState *bs,
 bytes = ROUND_UP(bytes, align);
 }
 
-ret = bdrv_aligned_preadv(bs, offset, bytes,
+tracked_request_begin(req, bs, offset, bytes, false);
+ret = bdrv_aligned_preadv(bs, req, offset, bytes,
   use_local_qiov ? local_qiov : qiov,
   flags);
+tracked_request_end(req);
 
 if (use_local_qiov) {
 qemu_iovec_destroy(local_qiov);
@@ -2983,10 +2986,10 @@ static int coroutine_fn 
bdrv_co_do_write_zeroes(BlockDriverState *bs,
  * Forwards an already correctly aligned write request to the BlockDriver.
  */
 static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
-int64_t offset, unsigned int bytes, QEMUIOVector *qiov, int flags)
+BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
+QEMUIOVector *qiov, int flags)
 {
 BlockDriver *drv = bs-drv;
-BdrvTrackedRequest req;
 int ret;
 
 int64_t sector_num = offset  BDRV_SECTOR_BITS;
@@ -2996,12 +2999,10 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
 
 if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, offset, bytes);
+wait_for_overlapping_requests(bs, req, offset, bytes);
 }
 
-tracked_request_begin(req, bs, offset, bytes, true);
-
-ret = notifier_with_return_list_notify(bs-before_write_notifiers, req);
+ret = notifier_with_return_list_notify(bs-before_write_notifiers, req);
 
 if (ret  0) {
 /* Do nothing, write notifier decided to fail this request */
@@ -3024,8 +3025,6 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 bs-total_sectors = MAX(bs-total_sectors, sector_num + nb_sectors);
 }
 
-tracked_request_end(req);
-
 return ret;
 }
 
@@ -3036,6 +3035,7 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags)
 {
+BdrvTrackedRequest req;
 int ret;
 
 if (!bs-drv) {
@@ -3054,7 +3054,9 @@ 

[Qemu-devel] [PATCH v3 20/29] block: Align requests in bdrv_co_do_pwritev()

2014-01-17 Thread Kevin Wolf
This patch changes bdrv_co_do_pwritev() to actually be what its name
promises. If requests aren't properly aligned, it performs a RMW.

Requests touching the same block are serialised against the RMW request.
Further optimisation of this is possible by differentiating types of
requests (concurrent reads should actually be okay here).

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 86 -
 1 file changed, 85 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 55e8c69..859e1aa 100644
--- a/block.c
+++ b/block.c
@@ -3055,6 +3055,12 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 BdrvRequestFlags flags)
 {
 BdrvTrackedRequest req;
+/* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */
+uint64_t align = MAX(BDRV_SECTOR_SIZE, bs-request_alignment);
+uint8_t *head_buf = NULL;
+uint8_t *tail_buf = NULL;
+QEMUIOVector local_qiov;
+bool use_local_qiov = false;
 int ret;
 
 if (!bs-drv) {
@@ -3073,10 +3079,88 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 bdrv_io_limits_intercept(bs, bytes  BDRV_SECTOR_BITS, true);
 }
 
+/*
+ * Align write if necessary by performing a read-modify-write cycle.
+ * Pad qiov with the read parts and be sure to have a tracked request not
+ * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
+ */
 tracked_request_begin(req, bs, offset, bytes, true);
-ret = bdrv_aligned_pwritev(bs, req, offset, bytes, qiov, flags);
+
+if (offset  (align - 1)) {
+QEMUIOVector head_qiov;
+struct iovec head_iov;
+
+mark_request_serialising(req, align);
+wait_serialising_requests(req);
+
+head_buf = qemu_blockalign(bs, align);
+head_iov = (struct iovec) {
+.iov_base   = head_buf,
+.iov_len= align,
+};
+qemu_iovec_init_external(head_qiov, head_iov, 1);
+
+ret = bdrv_aligned_preadv(bs, req, offset  ~(align - 1), align,
+  align, head_qiov, 0);
+if (ret  0) {
+goto fail;
+}
+
+qemu_iovec_init(local_qiov, qiov-niov + 2);
+qemu_iovec_add(local_qiov, head_buf, offset  (align - 1));
+qemu_iovec_concat(local_qiov, qiov, 0, qiov-size);
+use_local_qiov = true;
+
+bytes += offset  (align - 1);
+offset = offset  ~(align - 1);
+}
+
+if ((offset + bytes)  (align - 1)) {
+QEMUIOVector tail_qiov;
+struct iovec tail_iov;
+size_t tail_bytes;
+
+mark_request_serialising(req, align);
+wait_serialising_requests(req);
+
+tail_buf = qemu_blockalign(bs, align);
+tail_iov = (struct iovec) {
+.iov_base   = tail_buf,
+.iov_len= align,
+};
+qemu_iovec_init_external(tail_qiov, tail_iov, 1);
+
+ret = bdrv_aligned_preadv(bs, req, (offset + bytes)  ~(align - 1), 
align,
+  align, tail_qiov, 0);
+if (ret  0) {
+goto fail;
+}
+
+if (!use_local_qiov) {
+qemu_iovec_init(local_qiov, qiov-niov + 1);
+qemu_iovec_concat(local_qiov, qiov, 0, qiov-size);
+use_local_qiov = true;
+}
+
+tail_bytes = (offset + bytes)  (align - 1);
+qemu_iovec_add(local_qiov, tail_buf + tail_bytes, align - tail_bytes);
+
+bytes = ROUND_UP(bytes, align);
+}
+
+ret = bdrv_aligned_pwritev(bs, req, offset, bytes,
+   use_local_qiov ? local_qiov : qiov,
+   flags);
+
+fail:
 tracked_request_end(req);
 
+if (use_local_qiov) {
+qemu_iovec_destroy(local_qiov);
+qemu_vfree(head_buf);
+qemu_vfree(tail_buf);
+}
+
 return ret;
 }
 
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 21/29] block: Assert serialisation assumptions in pwritev

2014-01-17 Thread Kevin Wolf
If a request calls wait_serialising_requests() and actually has to wait
in this function (i.e. a coroutine yield), other requests can run and
previously read data (like the head or tail buffer) could become
outdated. In this case, we would have to restart from the beginning to
read in the updated data.

However, we're lucky and don't actually need to do that: A request can
only wait in the first call of wait_serialising_requests() because we
mark it as serialising before that call, so any later requests would
wait. So as we don't wait in practice, we don't have to reload the data.

This is an important assumption that may not be broken or data
corruption will happen. Document it with some assertions.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index 859e1aa..53d9bd5 100644
--- a/block.c
+++ b/block.c
@@ -2123,14 +2123,15 @@ static bool tracked_request_overlaps(BdrvTrackedRequest 
*req,
 return true;
 }
 
-static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
+static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
 BlockDriverState *bs = self-bs;
 BdrvTrackedRequest *req;
 bool retry;
+bool waited = false;
 
 if (!bs-serialising_in_flight) {
-return;
+return false;
 }
 
 do {
@@ -2156,11 +2157,14 @@ static void coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self)
 qemu_co_queue_wait(req-wait_queue);
 self-waiting_for = NULL;
 retry = true;
+waited = true;
 break;
 }
 }
 }
 } while (retry);
+
+return waited;
 }
 
 /*
@@ -3011,6 +3015,7 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 QEMUIOVector *qiov, int flags)
 {
 BlockDriver *drv = bs-drv;
+bool waited;
 int ret;
 
 int64_t sector_num = offset  BDRV_SECTOR_BITS;
@@ -3019,7 +3024,8 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 assert((offset  (BDRV_SECTOR_SIZE - 1)) == 0);
 assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
 
-wait_serialising_requests(req);
+waited = wait_serialising_requests(req);
+assert(!waited || !req-serialising);
 
 ret = notifier_with_return_list_notify(bs-before_write_notifiers, req);
 
@@ -3119,9 +3125,11 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 QEMUIOVector tail_qiov;
 struct iovec tail_iov;
 size_t tail_bytes;
+bool waited;
 
 mark_request_serialising(req, align);
-wait_serialising_requests(req);
+waited = wait_serialising_requests(req);
+assert(!waited || !use_local_qiov);
 
 tail_buf = qemu_blockalign(bs, align);
 tail_iov = (struct iovec) {
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 22/29] block: Change coroutine wrapper to byte granularity

2014-01-17 Thread Kevin Wolf
Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 48 ++--
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/block.c b/block.c
index 53d9bd5..2a7d1b3 100644
--- a/block.c
+++ b/block.c
@@ -69,11 +69,11 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState 
*bs,
 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
  int64_t sector_num, int nb_sectors,
  QEMUIOVector *iov);
-static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
+int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags);
-static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
+static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
+int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 BdrvRequestFlags flags);
 static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
int64_t sector_num,
@@ -2374,8 +2374,7 @@ static int bdrv_check_request(BlockDriverState *bs, 
int64_t sector_num,
 
 typedef struct RwCo {
 BlockDriverState *bs;
-int64_t sector_num;
-int nb_sectors;
+int64_t offset;
 QEMUIOVector *qiov;
 bool is_write;
 int ret;
@@ -2387,34 +2386,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 RwCo *rwco = opaque;
 
 if (!rwco-is_write) {
-rwco-ret = bdrv_co_do_readv(rwco-bs, rwco-sector_num,
- rwco-nb_sectors, rwco-qiov,
- rwco-flags);
-} else {
-rwco-ret = bdrv_co_do_writev(rwco-bs, rwco-sector_num,
-  rwco-nb_sectors, rwco-qiov,
+rwco-ret = bdrv_co_do_preadv(rwco-bs, rwco-offset,
+  rwco-qiov-size, rwco-qiov,
   rwco-flags);
+} else {
+rwco-ret = bdrv_co_do_pwritev(rwco-bs, rwco-offset,
+   rwco-qiov-size, rwco-qiov,
+   rwco-flags);
 }
 }
 
 /*
  * Process a vectored synchronous request using coroutines
  */
-static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num,
-   QEMUIOVector *qiov, bool is_write,
-   BdrvRequestFlags flags)
+static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset,
+QEMUIOVector *qiov, bool is_write,
+BdrvRequestFlags flags)
 {
 Coroutine *co;
 RwCo rwco = {
 .bs = bs,
-.sector_num = sector_num,
-.nb_sectors = qiov-size  BDRV_SECTOR_BITS,
+.offset = offset,
 .qiov = qiov,
 .is_write = is_write,
 .ret = NOT_DONE,
 .flags = flags,
 };
-assert((qiov-size  (BDRV_SECTOR_SIZE - 1)) == 0);
 
 /**
  * In sync call context, when the vcpu is blocked, this throttling timer
@@ -2453,7 +2450,8 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t 
sector_num, uint8_t *buf,
 };
 
 qemu_iovec_init_external(qiov, iov, 1);
-return bdrv_rwv_co(bs, sector_num, qiov, is_write, flags);
+return bdrv_prwv_co(bs, sector_num  BDRV_SECTOR_BITS,
+qiov, is_write, flags);
 }
 
 /* return  0 if error. See bdrv_write() for the return codes */
@@ -2491,7 +2489,7 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
 
 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
 {
-return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
+return bdrv_prwv_co(bs, sector_num  BDRV_SECTOR_BITS, qiov, true, 0);
 }
 
 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
@@ -4611,9 +4609,15 @@ int bdrv_flush(BlockDriverState *bs)
 return rwco.ret;
 }
 
+typedef struct DiscardCo {
+BlockDriverState *bs;
+int64_t sector_num;
+int nb_sectors;
+int ret;
+} DiscardCo;
 static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 {
-RwCo *rwco = opaque;
+DiscardCo *rwco = opaque;
 
 rwco-ret = bdrv_co_discard(rwco-bs, rwco-sector_num, rwco-nb_sectors);
 }
@@ -4697,7 +4701,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, 
int64_t sector_num,
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
 {
 Coroutine *co;
-RwCo rwco = {
+DiscardCo rwco = {
 .bs = bs,
 .sector_num = sector_num,
 .nb_sectors = nb_sectors,
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 18/29] block: Make overlap range for serialisation dynamic

2014-01-17 Thread Kevin Wolf
Copy on Read wants to serialise with all requests touching the same
cluster, so wait_serialising_requests() rounded to cluster boundaries.
Other users like alignment RMW will have different requirements, though
(requests touching the same sector), so make it dynamic.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 53 ---
 include/block/block_int.h |  4 
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/block.c b/block.c
index efa8979..e72966a 100644
--- a/block.c
+++ b/block.c
@@ -2051,6 +2051,8 @@ static void tracked_request_begin(BdrvTrackedRequest *req,
 .is_write   = is_write,
 .co = qemu_coroutine_self(),
 .serialising= false,
+.overlap_offset = offset,
+.overlap_bytes  = bytes,
 };
 
 qemu_co_queue_init(req-wait_queue);
@@ -2058,12 +2060,19 @@ static void tracked_request_begin(BdrvTrackedRequest 
*req,
 QLIST_INSERT_HEAD(bs-tracked_requests, req, list);
 }
 
-static void mark_request_serialising(BdrvTrackedRequest *req)
+static void mark_request_serialising(BdrvTrackedRequest *req, size_t align)
 {
+int64_t overlap_offset = req-offset  ~(align - 1);
+int overlap_bytes = ROUND_UP(req-offset + req-bytes, align)
+  - overlap_offset;
+
 if (!req-serialising) {
 req-bs-serialising_in_flight++;
 req-serialising = true;
 }
+
+req-overlap_offset = MIN(req-overlap_offset, overlap_offset);
+req-overlap_bytes = MAX(req-overlap_bytes, overlap_bytes);
 }
 
 /**
@@ -2087,20 +2096,16 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
 }
 }
 
-static void round_bytes_to_clusters(BlockDriverState *bs,
-int64_t offset, unsigned int bytes,
-int64_t *cluster_offset,
-unsigned int *cluster_bytes)
+static int bdrv_get_cluster_size(BlockDriverState *bs)
 {
 BlockDriverInfo bdi;
+int ret;
 
-if (bdrv_get_info(bs, bdi)  0 || bdi.cluster_size == 0) {
-*cluster_offset = offset;
-*cluster_bytes = bytes;
+ret = bdrv_get_info(bs, bdi);
+if (ret  0 || bdi.cluster_size == 0) {
+return bs-request_alignment;
 } else {
-*cluster_offset = QEMU_ALIGN_DOWN(offset, bdi.cluster_size);
-*cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes,
-   bdi.cluster_size);
+return bdi.cluster_size;
 }
 }
 
@@ -2108,11 +2113,11 @@ static bool tracked_request_overlaps(BdrvTrackedRequest 
*req,
  int64_t offset, unsigned int bytes)
 {
 /*    */
-if (offset = req-offset + req-bytes) {
+if (offset = req-overlap_offset + req-overlap_bytes) {
 return false;
 }
 /*    */
-if (req-offset = offset + bytes) {
+if (req-overlap_offset = offset + bytes) {
 return false;
 }
 return true;
@@ -2122,30 +2127,21 @@ static void coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self)
 {
 BlockDriverState *bs = self-bs;
 BdrvTrackedRequest *req;
-int64_t cluster_offset;
-unsigned int cluster_bytes;
 bool retry;
 
 if (!bs-serialising_in_flight) {
 return;
 }
 
-/* If we touch the same cluster it counts as an overlap.  This guarantees
- * that allocating writes will be serialized and not race with each other
- * for the same cluster.  For example, in copy-on-read it ensures that the
- * CoR read and write operations are atomic and guest writes cannot
- * interleave between them.
- */
-round_bytes_to_clusters(bs, self-offset, self-bytes,
-cluster_offset, cluster_bytes);
-
 do {
 retry = false;
 QLIST_FOREACH(req, bs-tracked_requests, list) {
 if (req == self || (!req-serialising  !self-serialising)) {
 continue;
 }
-if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
+if (tracked_request_overlaps(req, self-overlap_offset,
+ self-overlap_bytes))
+{
 /* Hitting this means there was a reentrant request, for
  * example, a block driver issuing nested requests.  This must
  * never happen since it means deadlock.
@@ -2761,7 +2757,12 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 
 /* Handle Copy on Read and associated serialisation */
 if (flags  BDRV_REQ_COPY_ON_READ) {
-mark_request_serialising(req);
+/* If we touch the same cluster it counts as an overlap.  This
+ * guarantees that allocating writes will be serialized and not race
+ * with each other for the same cluster.  For example, in copy-on-read
+

Re: [Qemu-devel] Question on pointers in the qemu user space emulation

2014-01-17 Thread Christopher Covington
Hi Erik,

On 01/17/2014 01:33 AM, Erik de Castro Lopo wrote:
 Hi all,
 
 I'm currently working on implementing a missing part of a linux-user
 syscall. This syscall includes a function pointer for a callback.
 
 If one has a 64 bit user space emulation running on a 32 bit host,
 how does one handle the fact that the pointer might be 64 bits?
 
 Does the fact that the 32 bit host can only ever give out 32 bit
 addreses to the 64 bit guest just cancel out the possibility of
 any problems?

Not that I know anything about QEMU internals yet, but just for fun here's my
armchair philosophizing. My interpretation of the scenario you describe is
that some function exists in a 64-bit instruction set architecture. QEMU/TCG
has translated it to the host's native 32-bit ISA for actual execution. It
seems like you should be exclusively communicating the address of the 32-bit
translated version to the host kernel. I don't think the host kernel could do
anything useful with a pointer to the foreign ISA version, even if it got the
address right.

Regards,
Christopher

-- 
Employee of Qualcomm Innovation Center, Inc.
Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
hosted by the Linux Foundation.



[Qemu-devel] [PATCH v3 24/29] block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper

2014-01-17 Thread Kevin Wolf
Instead of implementing the alignment adjustment here, use the now
existing functionality of bdrv_co_do_pwritev().

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c   | 64 ---
 include/block/block.h |  1 -
 2 files changed, 9 insertions(+), 56 deletions(-)

diff --git a/block.c b/block.c
index 4dc4711..812b1b2 100644
--- a/block.c
+++ b/block.c
@@ -2487,11 +2487,6 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
 return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 }
 
-int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov)
-{
-return bdrv_prwv_co(bs, sector_num  BDRV_SECTOR_BITS, qiov, true, 0);
-}
-
 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
   int nb_sectors, BdrvRequestFlags flags)
 {
@@ -2565,70 +2560,29 @@ int bdrv_pread(BlockDriverState *bs, int64_t offset, 
void *buf, int bytes)
 
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
 {
-uint8_t tmp_buf[BDRV_SECTOR_SIZE];
-int len, nb_sectors, count;
-int64_t sector_num;
 int ret;
 
-count = qiov-size;
-
-/* first write to align to sector start */
-len = (BDRV_SECTOR_SIZE - offset)  (BDRV_SECTOR_SIZE - 1);
-if (len  count)
-len = count;
-sector_num = offset  BDRV_SECTOR_BITS;
-if (len  0) {
-if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1))  0)
-return ret;
-qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset  (BDRV_SECTOR_SIZE - 1)),
-  len);
-if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1))  0)
-return ret;
-count -= len;
-if (count == 0)
-return qiov-size;
-sector_num++;
-}
-
-/* write the sectors in place */
-nb_sectors = count  BDRV_SECTOR_BITS;
-if (nb_sectors  0) {
-QEMUIOVector qiov_inplace;
-
-qemu_iovec_init(qiov_inplace, qiov-niov);
-qemu_iovec_concat(qiov_inplace, qiov, len,
-  nb_sectors  BDRV_SECTOR_BITS);
-ret = bdrv_writev(bs, sector_num, qiov_inplace);
-qemu_iovec_destroy(qiov_inplace);
-if (ret  0) {
-return ret;
-}
-
-sector_num += nb_sectors;
-len = nb_sectors  BDRV_SECTOR_BITS;
-count -= len;
+ret = bdrv_prwv_co(bs, offset, qiov, true, 0);
+if (ret  0) {
+return ret;
 }
 
-/* add data from the last sector */
-if (count  0) {
-if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1))  0)
-return ret;
-qemu_iovec_to_buf(qiov, qiov-size - count, tmp_buf, count);
-if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1))  0)
-return ret;
-}
 return qiov-size;
 }
 
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
-const void *buf, int count1)
+const void *buf, int bytes)
 {
 QEMUIOVector qiov;
 struct iovec iov = {
 .iov_base   = (void *) buf,
-.iov_len= count1,
+.iov_len= bytes,
 };
 
+if (bytes  0) {
+return -EINVAL;
+}
+
 qemu_iovec_init_external(qiov, iov, 1);
 return bdrv_pwritev(bs, offset, qiov);
 }
diff --git a/include/block/block.h b/include/block/block.h
index 05252d5..7e40ccc 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -220,7 +220,6 @@ BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState 
*bs, int64_t sector_num
 int nb_sectors, BdrvRequestFlags flags,
 BlockDriverCompletionFunc *cb, void 
*opaque);
 int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
-int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
void *buf, int count);
 int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 29/29] block: Switch bdrv_io_limits_intercept() to byte granularity

2014-01-17 Thread Kevin Wolf
Request sizes used to be rounded down to the next sector boundary,
allowing to bypass the I/O limit. Now all requests are accounted for
with their exact byte size.

Reported-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/block.c b/block.c
index 12af7fb..d3e3291 100644
--- a/block.c
+++ b/block.c
@@ -188,7 +188,7 @@ void bdrv_io_limits_enable(BlockDriverState *bs)
  * @is_write:   is the IO a write
  */
 static void bdrv_io_limits_intercept(BlockDriverState *bs,
- int nb_sectors,
+ unsigned int bytes,
  bool is_write)
 {
 /* does this io must wait */
@@ -201,9 +201,8 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
 }
 
 /* the IO will be executed, do the accounting */
-throttle_account(bs-throttle_state,
- is_write,
- nb_sectors * BDRV_SECTOR_SIZE);
+throttle_account(bs-throttle_state, is_write, bytes);
+
 
 /* if the next request must wait - do nothing */
 if (throttle_schedule_timer(bs-throttle_state, is_write)) {
@@ -2788,8 +2787,7 @@ static int coroutine_fn 
bdrv_co_do_preadv(BlockDriverState *bs,
 
 /* throttling disk I/O */
 if (bs-io_limits_enabled) {
-/* TODO Switch to byte granularity */
-bdrv_io_limits_intercept(bs, bytes  BDRV_SECTOR_BITS, false);
+bdrv_io_limits_intercept(bs, bytes, false);
 }
 
 /* Align read if necessary by padding qiov */
@@ -3013,8 +3011,7 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 
 /* throttling disk I/O */
 if (bs-io_limits_enabled) {
-/* TODO Switch to byte granularity */
-bdrv_io_limits_intercept(bs, bytes  BDRV_SECTOR_BITS, true);
+bdrv_io_limits_intercept(bs, bytes, true);
 }
 
 /*
-- 
1.8.1.4




[Qemu-devel] [Bug 423910] Re: openbios-sparc has no installation candidate

2014-01-17 Thread Ken Sharp
Both are available in Debian.
http://packages.debian.org/search?keywords=openbios

I imagine there's a status page for Ubuntu somewhere showing packaging
problems but I wouldn't know where to look for that.

** Tags removed: no-installation-candidate openbios-sparc package-manager qemu
** Tags added: needs-packaging openbios

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/423910

Title:
  openbios-sparc has no installation candidate

Status in QEMU:
  Invalid
Status in “openbios-sparc” package in Ubuntu:
  Confirmed
Status in “qemu-kvm” package in Ubuntu:
  Fix Released

Bug description:
  When installing Qemu through apt-get, openbios-sparc is recommended,
  but,

  Package openbios-sparc is not available, but is referred to by another 
package.
  This may mean that the package is missing, has been obsoleted, or
  is only available from another source
  E: Package openbios-sparc has no installation candidate

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/423910/+subscriptions



[Qemu-devel] [Bug 423910] Re: openbios-sparc has no installation candidate

2014-01-17 Thread Ken Sharp
(Actually PPC is available in Saucy but that's by-the-by anyway)

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/423910

Title:
  openbios-sparc has no installation candidate

Status in QEMU:
  Invalid
Status in “openbios-sparc” package in Ubuntu:
  Confirmed
Status in “qemu-kvm” package in Ubuntu:
  Fix Released

Bug description:
  When installing Qemu through apt-get, openbios-sparc is recommended,
  but,

  Package openbios-sparc is not available, but is referred to by another 
package.
  This may mean that the package is missing, has been obsoleted, or
  is only available from another source
  E: Package openbios-sparc has no installation candidate

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/423910/+subscriptions



[Qemu-devel] [PATCH v3 26/29] blkdebug: Make required alignment configurable

2014-01-17 Thread Kevin Wolf
The new 'align' option of blkdebug can be used in order to emulate
backends with a required 4k alignment on hosts which only really require
512 byte alignment.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block/blkdebug.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index ebc5f13..dc4ba46 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -346,6 +346,11 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_STRING,
 .help = [internal use only, will be removed],
 },
+{
+.name = align,
+.type = QEMU_OPT_SIZE,
+.help = Required alignment in bytes,
+},
 { /* end of list */ }
 },
 };
@@ -357,6 +362,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 QemuOpts *opts;
 Error *local_err = NULL;
 const char *filename, *config;
+uint64_t align;
 int ret;
 
 opts = qemu_opts_create(runtime_opts, NULL, 0, error_abort);
@@ -394,6 +400,16 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto fail;
 }
 
+/* Set request alignment */
+align = qemu_opt_get_size(opts, align, bs-request_alignment);
+if (align  0  align  INT_MAX  !(align  (align - 1))) {
+bs-request_alignment = align;
+} else {
+error_setg(errp, Invalid alignment);
+ret = -EINVAL;
+goto fail;
+}
+
 ret = 0;
 fail:
 qemu_opts_del(opts);
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 27/29] qemu-io: New command 'sleep'

2014-01-17 Thread Kevin Wolf
There is no easy way to check that a request correctly waits for a
different request. With a sleep command we can at least approximate it.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
 qemu-io-cmds.c | 42 ++
 1 file changed, 42 insertions(+)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 85e4982..978a3a0 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -12,6 +12,7 @@
 #include block/block_int.h
 #include block/qapi.h
 #include qemu/main-loop.h
+#include qemu/timer.h
 
 #define CMD_NOFILE_OK   0x01
 
@@ -2038,6 +2039,46 @@ static const cmdinfo_t abort_cmd = {
.oneline= simulate a program crash using abort(3),
 };
 
+static void sleep_cb(void *opaque)
+{
+bool *expired = opaque;
+*expired = true;
+}
+
+static int sleep_f(BlockDriverState *bs, int argc, char **argv)
+{
+char *endptr;
+long ms;
+struct QEMUTimer *timer;
+bool expired = false;
+
+ms = strtol(argv[1], endptr, 0);
+if (ms  0 || *endptr != '\0') {
+printf(%s is not a valid number\n, argv[1]);
+return 0;
+}
+
+timer = timer_new_ns(QEMU_CLOCK_HOST, sleep_cb, expired);
+timer_mod(timer, qemu_clock_get_ns(QEMU_CLOCK_HOST) + SCALE_MS * ms);
+
+while (!expired) {
+main_loop_wait(false);
+}
+
+timer_free(timer);
+
+return 0;
+}
+
+static const cmdinfo_t sleep_cmd = {
+   .name   = sleep,
+   .argmin = 1,
+   .argmax = 1,
+   .cfunc  = sleep_f,
+   .flags  = CMD_NOFILE_OK,
+   .oneline= waits for the given value in milliseconds,
+};
+
 static void help_oneline(const char *cmd, const cmdinfo_t *ct)
 {
 if (cmd) {
@@ -2151,4 +2192,5 @@ static void __attribute((constructor)) 
init_qemuio_commands(void)
 qemuio_add_command(resume_cmd);
 qemuio_add_command(wait_break_cmd);
 qemuio_add_command(abort_cmd);
+qemuio_add_command(sleep_cmd);
 }
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 19/29] block: Allow wait_serialising_requests() at any point

2014-01-17 Thread Kevin Wolf
We can only have a single wait_serialising_requests() call per request
because otherwise we can run into deadlocks where requests are waiting
for each other. The same is true when wait_serialising_requests() is not
at the very beginning of a request, so that other requests can be issued
between the start of the tracking and wait_serialising_requests().

Fix this by changing wait_serialising_requests() to ignore requests that
are already (directly or indirectly) waiting for the calling request.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 13 ++---
 include/block/block_int.h |  2 ++
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index e72966a..55e8c69 100644
--- a/block.c
+++ b/block.c
@@ -2148,9 +2148,16 @@ static void coroutine_fn 
wait_serialising_requests(BdrvTrackedRequest *self)
  */
 assert(qemu_coroutine_self() != req-co);
 
-qemu_co_queue_wait(req-wait_queue);
-retry = true;
-break;
+/* If the request is already (indirectly) waiting for us, or
+ * will wait for us as soon as it wakes up, then just go on
+ * (instead of producing a deadlock in the former case). */
+if (!req-waiting_for) {
+self-waiting_for = req;
+qemu_co_queue_wait(req-wait_queue);
+self-waiting_for = NULL;
+retry = true;
+break;
+}
 }
 }
 } while (retry);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index ccd2c68..fdf0e0b 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -68,6 +68,8 @@ typedef struct BdrvTrackedRequest {
 QLIST_ENTRY(BdrvTrackedRequest) list;
 Coroutine *co; /* owner, used for deadlock detection */
 CoQueue wait_queue; /* coroutines blocked on this request */
+
+struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
 struct BlockDriver {
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 28/29] qemu-iotests: Test pwritev RMW logic

2014-01-17 Thread Kevin Wolf
Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c|   7 ++
 block/blkdebug.c   |   8 ++
 include/block/block.h  |   8 ++
 tests/qemu-iotests/077 | 278 +
 tests/qemu-iotests/077.out | 202 
 tests/qemu-iotests/group   |   1 +
 6 files changed, 504 insertions(+)
 create mode 100755 tests/qemu-iotests/077
 create mode 100644 tests/qemu-iotests/077.out

diff --git a/block.c b/block.c
index 812b1b2..12af7fb 100644
--- a/block.c
+++ b/block.c
@@ -2961,10 +2961,13 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 if (ret  0) {
 /* Do nothing, write notifier decided to fail this request */
 } else if (flags  BDRV_REQ_ZERO_WRITE) {
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO);
 ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
 } else {
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV);
 ret = drv-bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 }
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE);
 
 if (ret == 0  !bs-enable_write_cache) {
 ret = bdrv_co_flush(bs);
@@ -3035,11 +3038,13 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 };
 qemu_iovec_init_external(head_qiov, head_iov, 1);
 
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD);
 ret = bdrv_aligned_preadv(bs, req, offset  ~(align - 1), align,
   align, head_qiov, 0);
 if (ret  0) {
 goto fail;
 }
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
 
 qemu_iovec_init(local_qiov, qiov-niov + 2);
 qemu_iovec_add(local_qiov, head_buf, offset  (align - 1));
@@ -3067,11 +3072,13 @@ static int coroutine_fn 
bdrv_co_do_pwritev(BlockDriverState *bs,
 };
 qemu_iovec_init_external(tail_qiov, tail_iov, 1);
 
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL);
 ret = bdrv_aligned_preadv(bs, req, (offset + bytes)  ~(align - 1), 
align,
   align, tail_qiov, 0);
 if (ret  0) {
 goto fail;
 }
+BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
 
 if (!use_local_qiov) {
 qemu_iovec_init(local_qiov, qiov-niov + 1);
diff --git a/block/blkdebug.c b/block/blkdebug.c
index dc4ba46..6657671 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -186,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = {
 
 [BLKDBG_FLUSH_TO_OS]= flush_to_os,
 [BLKDBG_FLUSH_TO_DISK]  = flush_to_disk,
+
+[BLKDBG_PWRITEV_RMW_HEAD]   = pwritev_rmw.head,
+[BLKDBG_PWRITEV_RMW_AFTER_HEAD] = pwritev_rmw.after_head,
+[BLKDBG_PWRITEV_RMW_TAIL]   = pwritev_rmw.tail,
+[BLKDBG_PWRITEV_RMW_AFTER_TAIL] = pwritev_rmw.after_tail,
+[BLKDBG_PWRITEV]= pwritev,
+[BLKDBG_PWRITEV_ZERO]   = pwritev_zero,
+[BLKDBG_PWRITEV_DONE]   = pwritev_done,
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)
diff --git a/include/block/block.h b/include/block/block.h
index 7e40ccc..127fc2f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -518,6 +518,14 @@ typedef enum {
 BLKDBG_FLUSH_TO_OS,
 BLKDBG_FLUSH_TO_DISK,
 
+BLKDBG_PWRITEV_RMW_HEAD,
+BLKDBG_PWRITEV_RMW_AFTER_HEAD,
+BLKDBG_PWRITEV_RMW_TAIL,
+BLKDBG_PWRITEV_RMW_AFTER_TAIL,
+BLKDBG_PWRITEV,
+BLKDBG_PWRITEV_ZERO,
+BLKDBG_PWRITEV_DONE,
+
 BLKDBG_EVENT_MAX,
 } BlkDebugEvent;
 
diff --git a/tests/qemu-iotests/077 b/tests/qemu-iotests/077
new file mode 100755
index 000..58bfc8f
--- /dev/null
+++ b/tests/qemu-iotests/077
@@ -0,0 +1,278 @@
+#!/bin/bash
+#
+# Test concurrent pread/pwrite
+#
+# Copyright (C) 2014 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see http://www.gnu.org/licenses/.
+#
+
+# creator
+owner=kw...@redhat.com
+
+seq=`basename $0`
+echo QA output created by $seq
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_test_img
+}
+trap _cleanup; exit \$status 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt generic
+_supported_proto generic
+_supported_os Linux
+
+CLUSTER_SIZE=4k
+size=128M
+

[Qemu-devel] [PATCH v3 14/29] block: Switch BdrvTrackedRequest to byte granularity

2014-01-17 Thread Kevin Wolf
Signed-off-by: Kevin Wolf kw...@redhat.com
---
 block.c   | 52 +++
 block/backup.c|  7 ++-
 include/block/block_int.h |  4 ++--
 3 files changed, 42 insertions(+), 21 deletions(-)

diff --git a/block.c b/block.c
index 328f592..85f28ab 100644
--- a/block.c
+++ b/block.c
@@ -2037,13 +2037,13 @@ static void tracked_request_end(BdrvTrackedRequest *req)
  */
 static void tracked_request_begin(BdrvTrackedRequest *req,
   BlockDriverState *bs,
-  int64_t sector_num,
-  int nb_sectors, bool is_write)
+  int64_t offset,
+  unsigned int bytes, bool is_write)
 {
 *req = (BdrvTrackedRequest){
 .bs = bs,
-.sector_num = sector_num,
-.nb_sectors = nb_sectors,
+.offset = offset,
+.bytes = bytes,
 .is_write = is_write,
 .co = qemu_coroutine_self(),
 };
@@ -2074,25 +2074,43 @@ void bdrv_round_to_clusters(BlockDriverState *bs,
 }
 }
 
+static void round_bytes_to_clusters(BlockDriverState *bs,
+int64_t offset, unsigned int bytes,
+int64_t *cluster_offset,
+unsigned int *cluster_bytes)
+{
+BlockDriverInfo bdi;
+
+if (bdrv_get_info(bs, bdi)  0 || bdi.cluster_size == 0) {
+*cluster_offset = offset;
+*cluster_bytes = bytes;
+} else {
+*cluster_offset = QEMU_ALIGN_DOWN(offset, bdi.cluster_size);
+*cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes,
+   bdi.cluster_size);
+}
+}
+
 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
- int64_t sector_num, int nb_sectors) {
+ int64_t offset, unsigned int bytes)
+{
 /*    */
-if (sector_num = req-sector_num + req-nb_sectors) {
+if (offset = req-offset + req-bytes) {
 return false;
 }
 /*    */
-if (req-sector_num = sector_num + nb_sectors) {
+if (req-offset = offset + bytes) {
 return false;
 }
 return true;
 }
 
 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors)
+int64_t offset, unsigned int bytes)
 {
 BdrvTrackedRequest *req;
-int64_t cluster_sector_num;
-int cluster_nb_sectors;
+int64_t cluster_offset;
+unsigned int cluster_bytes;
 bool retry;
 
 /* If we touch the same cluster it counts as an overlap.  This guarantees
@@ -2101,14 +2119,12 @@ static void coroutine_fn 
wait_for_overlapping_requests(BlockDriverState *bs,
  * CoR read and write operations are atomic and guest writes cannot
  * interleave between them.
  */
-bdrv_round_to_clusters(bs, sector_num, nb_sectors,
-   cluster_sector_num, cluster_nb_sectors);
+round_bytes_to_clusters(bs, offset, bytes, cluster_offset, 
cluster_bytes);
 
 do {
 retry = false;
 QLIST_FOREACH(req, bs-tracked_requests, list) {
-if (tracked_request_overlaps(req, cluster_sector_num,
- cluster_nb_sectors)) {
+if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
 /* Hitting this means there was a reentrant request, for
  * example, a block driver issuing nested requests.  This must
  * never happen since it means deadlock.
@@ -2728,10 +2744,10 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 }
 
 if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+wait_for_overlapping_requests(bs, offset, bytes);
 }
 
-tracked_request_begin(req, bs, sector_num, nb_sectors, false);
+tracked_request_begin(req, bs, offset, bytes, false);
 
 if (flags  BDRV_REQ_COPY_ON_READ) {
 int pnum;
@@ -2980,10 +2996,10 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
 
 if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+wait_for_overlapping_requests(bs, offset, bytes);
 }
 
-tracked_request_begin(req, bs, sector_num, nb_sectors, true);
+tracked_request_begin(req, bs, offset, bytes, true);
 
 ret = notifier_with_return_list_notify(bs-before_write_notifiers, req);
 
diff --git a/block/backup.c b/block/backup.c
index 0198514..15a2e55 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -181,8 +181,13 @@ static int coroutine_fn backup_before_write_notify(
 void *opaque)
 {
 BdrvTrackedRequest *req = opaque;
+int64_t sector_num = req-offset  BDRV_SECTOR_BITS;

[Qemu-devel] [PATCH v3 17/29] block: Generalise and optimise COR serialisation

2014-01-17 Thread Kevin Wolf
Change the API so that specific requests can be marked serialising. Only
these requests are checked for overlaps then.

This means that during a Copy on Read operation, not all requests
overlapping other requests are serialised any more, but only those that
actually overlap with the specific COR request.

Also remove COR from function and variable names because this
functionality can be useful in other contexts.

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c   | 48 ---
 include/block/block_int.h |  5 +++--
 2 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/block.c b/block.c
index d7156ce..efa8979 100644
--- a/block.c
+++ b/block.c
@@ -2028,6 +2028,10 @@ int bdrv_commit_all(void)
  */
 static void tracked_request_end(BdrvTrackedRequest *req)
 {
+if (req-serialising) {
+req-bs-serialising_in_flight--;
+}
+
 QLIST_REMOVE(req, list);
 qemu_co_queue_restart_all(req-wait_queue);
 }
@@ -2042,10 +2046,11 @@ static void tracked_request_begin(BdrvTrackedRequest 
*req,
 {
 *req = (BdrvTrackedRequest){
 .bs = bs,
-.offset = offset,
-.bytes = bytes,
-.is_write = is_write,
-.co = qemu_coroutine_self(),
+.offset = offset,
+.bytes  = bytes,
+.is_write   = is_write,
+.co = qemu_coroutine_self(),
+.serialising= false,
 };
 
 qemu_co_queue_init(req-wait_queue);
@@ -2053,6 +2058,14 @@ static void tracked_request_begin(BdrvTrackedRequest 
*req,
 QLIST_INSERT_HEAD(bs-tracked_requests, req, list);
 }
 
+static void mark_request_serialising(BdrvTrackedRequest *req)
+{
+if (!req-serialising) {
+req-bs-serialising_in_flight++;
+req-serialising = true;
+}
+}
+
 /**
  * Round a region to cluster boundaries
  */
@@ -2105,26 +2118,31 @@ static bool tracked_request_overlaps(BdrvTrackedRequest 
*req,
 return true;
 }
 
-static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-BdrvTrackedRequest *self, int64_t offset, unsigned int bytes)
+static void coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 {
+BlockDriverState *bs = self-bs;
 BdrvTrackedRequest *req;
 int64_t cluster_offset;
 unsigned int cluster_bytes;
 bool retry;
 
+if (!bs-serialising_in_flight) {
+return;
+}
+
 /* If we touch the same cluster it counts as an overlap.  This guarantees
  * that allocating writes will be serialized and not race with each other
  * for the same cluster.  For example, in copy-on-read it ensures that the
  * CoR read and write operations are atomic and guest writes cannot
  * interleave between them.
  */
-round_bytes_to_clusters(bs, offset, bytes, cluster_offset, 
cluster_bytes);
+round_bytes_to_clusters(bs, self-offset, self-bytes,
+cluster_offset, cluster_bytes);
 
 do {
 retry = false;
 QLIST_FOREACH(req, bs-tracked_requests, list) {
-if (req == self) {
+if (req == self || (!req-serialising  !self-serialising)) {
 continue;
 }
 if (tracked_request_overlaps(req, cluster_offset, cluster_bytes)) {
@@ -2743,12 +2761,10 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 
 /* Handle Copy on Read and associated serialisation */
 if (flags  BDRV_REQ_COPY_ON_READ) {
-bs-copy_on_read_in_flight++;
+mark_request_serialising(req);
 }
 
-if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, req, offset, bytes);
-}
+wait_serialising_requests(req);
 
 if (flags  BDRV_REQ_COPY_ON_READ) {
 int pnum;
@@ -2797,10 +2813,6 @@ static int coroutine_fn 
bdrv_aligned_preadv(BlockDriverState *bs,
 }
 
 out:
-if (flags  BDRV_REQ_COPY_ON_READ) {
-bs-copy_on_read_in_flight--;
-}
-
 return ret;
 }
 
@@ -2999,9 +3011,7 @@ static int coroutine_fn 
bdrv_aligned_pwritev(BlockDriverState *bs,
 assert((offset  (BDRV_SECTOR_SIZE - 1)) == 0);
 assert((bytes  (BDRV_SECTOR_SIZE - 1)) == 0);
 
-if (bs-copy_on_read_in_flight) {
-wait_for_overlapping_requests(bs, req, offset, bytes);
-}
+wait_serialising_requests(req);
 
 ret = notifier_with_return_list_notify(bs-before_write_notifiers, req);
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 97a4d23..d8443df 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -60,6 +60,7 @@ typedef struct BdrvTrackedRequest {
 int64_t offset;
 unsigned int bytes;
 bool is_write;
+bool serialising;
 QLIST_ENTRY(BdrvTrackedRequest) list;
 Coroutine *co; /* owner, used for deadlock detection */
 CoQueue wait_queue; /* coroutines blocked on this request */
@@ -296,8 +297,8 @@ struct BlockDriverState {
 /* 

[Qemu-devel] [PATCH v3 23/29] block: Make bdrv_pread() a bdrv_prwv_co() wrapper

2014-01-17 Thread Kevin Wolf
Instead of implementing the alignment adjustment here, use the now
existing functionality of bdrv_co_do_preadv().

Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block.c | 49 +
 1 file changed, 13 insertions(+), 36 deletions(-)

diff --git a/block.c b/block.c
index 2a7d1b3..4dc4711 100644
--- a/block.c
+++ b/block.c
@@ -2541,49 +2541,26 @@ int bdrv_make_zero(BlockDriverState *bs, 
BdrvRequestFlags flags)
 }
 }
 
-int bdrv_pread(BlockDriverState *bs, int64_t offset,
-   void *buf, int count1)
+int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes)
 {
-uint8_t tmp_buf[BDRV_SECTOR_SIZE];
-int len, nb_sectors, count;
-int64_t sector_num;
+QEMUIOVector qiov;
+struct iovec iov = {
+.iov_base = (void *)buf,
+.iov_len = bytes,
+};
 int ret;
 
-count = count1;
-/* first read to align to sector start */
-len = (BDRV_SECTOR_SIZE - offset)  (BDRV_SECTOR_SIZE - 1);
-if (len  count)
-len = count;
-sector_num = offset  BDRV_SECTOR_BITS;
-if (len  0) {
-if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1))  0)
-return ret;
-memcpy(buf, tmp_buf + (offset  (BDRV_SECTOR_SIZE - 1)), len);
-count -= len;
-if (count == 0)
-return count1;
-sector_num++;
-buf += len;
+if (bytes  0) {
+return -EINVAL;
 }
 
-/* read the sectors in place */
-nb_sectors = count  BDRV_SECTOR_BITS;
-if (nb_sectors  0) {
-if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors))  0)
-return ret;
-sector_num += nb_sectors;
-len = nb_sectors  BDRV_SECTOR_BITS;
-buf += len;
-count -= len;
+qemu_iovec_init_external(qiov, iov, 1);
+ret = bdrv_prwv_co(bs, offset, qiov, false, 0);
+if (ret  0) {
+return ret;
 }
 
-/* add data from the last sector */
-if (count  0) {
-if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1))  0)
-return ret;
-memcpy(buf, tmp_buf, count);
-}
-return count1;
+return bytes;
 }
 
 int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov)
-- 
1.8.1.4




[Qemu-devel] [PATCH v3 25/29] iscsi: Set bs-request_alignment

2014-01-17 Thread Kevin Wolf
From: Paolo Bonzini pbonz...@redhat.com

The iSCSI backend already gets the block size from the READ CAPACITY
command it sends.  Save it so that the generic block layer gets it
too.

Signed-off-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Kevin Wolf kw...@redhat.com
Reviewed-by: Max Reitz mre...@redhat.com
---
 block/iscsi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index 3202dc5..23c3fc4 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1217,6 +1217,7 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto out;
 }
 bs-total_sectors = sector_lun2qemu(iscsilun-num_blocks, iscsilun);
+bs-request_alignment = iscsilun-block_size;
 
 /* Medium changer or tape. We dont have any emulation for this so this must
  * be sg ioctl compatible. We force it to be sg, otherwise qemu will try
-- 
1.8.1.4




Re: [Qemu-devel] [PULL 36/42] qapi: QMP interface for blkdebug and blkverify

2014-01-17 Thread Andreas Färber
Am 16.01.2014 11:03, schrieb Kevin Wolf:
 Am 15.01.2014 um 16:19 hat Eric Blake geschrieben:
 On 01/15/2014 03:22 AM, Kevin Wolf wrote:
 From: Max Reitz mre...@redhat.com

 Add structures to support blkdebug and blkverify in blockdev-add.

 Signed-off-by: Max Reitz mre...@redhat.com
 Signed-off-by: Kevin Wolf kw...@redhat.com
 ---
  qapi-schema.json | 113 
 +--
  1 file changed, 109 insertions(+), 4 deletions(-)

 Sorry for not noticing this sooner, but we still have some interface
 problems that need to be ironed out.
 
 Nothing that should hold up this pull request, there are just a few
 improvements that can be done in a follow-up.
 
 diff --git a/qapi-schema.json b/qapi-schema.json
 index f27c48a..35f7b34 100644
 --- a/qapi-schema.json
 +++ b/qapi-schema.json
 @@ -4201,6 +4201,113 @@
  '*pass-discard-other': 'bool' } }
  
  ##
 +# @BlkdebugEvent
 +#
 +# Trigger events supported by blkdebug.
 +##
 +{ 'enum': 'BlkdebugEvent',

 Missing a 'Since: 2.0' designation; would be worth adding that in a
 followup patch.
 
 Ack.
 
 +  'data': [ 'l1_update', 'l1_grow.alloc_table', 'l1_grow.write_table',
 +'l1_grow.activate_table', 'l2_load', 'l2_update',
 +'l2_update_compressed', 'l2_alloc.cow_read', 'l2_alloc.write',
 +'read_aio', 'read_backing_aio', 'read_compressed', 'write_aio',
 +'write_compressed', 'vmstate_load', 'vmstate_save', 'cow_read',
 +'cow_write', 'reftable_load', 'reftable_grow', 
 'reftable_update',
 +'refblock_load', 'refblock_update', 'refblock_update_part',
 +'refblock_alloc', 'refblock_alloc.hookup', 
 'refblock_alloc.write',
 +'refblock_alloc.write_blocks', 'refblock_alloc.write_table',
 +'refblock_alloc.switch_table', 'cluster_alloc',
 +'cluster_alloc_bytes', 'cluster_free', 'flush_to_os',
 +'flush_to_disk' ] }

 Do we want to prefer '-' over '_' in all these names?
 
 These are existing names from the blkdebug configuration file. If we
 wanted to have '-' in QMP and '_' in the config file, we'd have to have
 some conversion somewhere. Uglier than having underscores here, imho.
[snip]

FWIW for -cpu we added code to do an automatic conversion from '_' to
'-' so that we could name QOM properties the new way while keeping
command line compatibility. As a side-effect, both became possible IIUC.

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg



Re: [Qemu-devel] [PULL 10/42] qtest: Fix the bug about disable vnc causes make check fail

2014-01-17 Thread Andreas Färber
Am 15.01.2014 11:22, schrieb Kevin Wolf:
 From: Kewei Yu kewe...@gmail.com
 
 When we disable vnc from ./configure, QEMU can't use the vnc option.
 So qtest can't use the vnc -none , otherwise make check fails.
 If QEMU uses -display none, -vnc none is excrescent, So we just need to 
 drop it.
 
 Signed-off-by: Kewei Yu kewe...@gmail.com
 Reviewed-by: Paolo Bonzini pbonz...@redhat.com
 Signed-off-by: Kevin Wolf kw...@redhat.com

If the pull does get respun, 'Fix make check failing for
--disable-vnc' would be better English. ;)

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg



Re: [Qemu-devel] Question on pointers in the qemu user space emulation

2014-01-17 Thread Peter Maydell
On 17 January 2014 06:33, Erik de Castro Lopo mle+to...@mega-nerd.com wrote:
 I'm currently working on implementing a missing part of a linux-user
 syscall. This syscall includes a function pointer for a callback.

Which syscall? Callbacks from the kernel are pretty tricky.
Basically you need to register a host function as the callback
with the host kernel, and stash the guest function pointer somewhere
so that when the callback comes in from the host kernel you can
arrange to interrupt the guest and restart it at the desired
location.

Pretty much the only situation we support this for is the special
case of signal handlers. In fact I wasn't even aware there was
any other kind of kernel-to-userspace callback...

thanks
-- PMM



Re: [Qemu-devel] [PATCH 1/2] kvm: initialize qemu_host_page_size

2014-01-17 Thread Alex Williamson
On Fri, 2014-01-17 at 13:55 +0100, Paolo Bonzini wrote:
 Il 16/01/2014 07:21, Alexey Kardashevskiy ha scritto:
  There is a HOST_PAGE_ALIGN macro which makes sense for KVM accelerator
  but it uses qemu_host_page_size/qemu_host_page_mask which initialized
  for TCG only.
  
  This moves qemu_host_page_size/qemu_host_page_mask initialization from
  TCG's page_init() and adds a call for it from kvm_init().
  
  Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
  ---
   include/exec/exec-all.h |  1 +
   kvm-all.c   |  1 +
   translate-all.c | 14 --
   3 files changed, 10 insertions(+), 6 deletions(-)
  
  diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
  index ea90b64..3b03cbf 100644
  --- a/include/exec/exec-all.h
  +++ b/include/exec/exec-all.h
  @@ -81,6 +81,7 @@ void cpu_gen_init(void);
   int cpu_gen_code(CPUArchState *env, struct TranslationBlock *tb,
int *gen_code_size_ptr);
   bool cpu_restore_state(CPUArchState *env, uintptr_t searched_pc);
  +void page_size_init(void);
   
   void QEMU_NORETURN cpu_resume_from_signal(CPUArchState *env1, void *puc);
   void QEMU_NORETURN cpu_io_recompile(CPUArchState *env, uintptr_t retaddr);
  diff --git a/kvm-all.c b/kvm-all.c
  index 0bfb060..edf2365 100644
  --- a/kvm-all.c
  +++ b/kvm-all.c
  @@ -1360,6 +1360,7 @@ int kvm_init(void)
* page size for the system though.
*/
   assert(TARGET_PAGE_SIZE = getpagesize());
  +page_size_init();
   
   #ifdef KVM_CAP_SET_GUEST_DEBUG
   QTAILQ_INIT(s-kvm_sw_breakpoints);
  diff --git a/translate-all.c b/translate-all.c
  index 105c25a..543e1ff 100644
  --- a/translate-all.c
  +++ b/translate-all.c
  @@ -289,17 +289,15 @@ static inline void map_exec(void *addr, long size)
   }
   #endif
   
  -static void page_init(void)
  +void page_size_init(void)
   {
   /* NOTE: we can always suppose that qemu_host_page_size =
  TARGET_PAGE_SIZE */
   #ifdef _WIN32
  -{
  -SYSTEM_INFO system_info;
  +SYSTEM_INFO system_info;
   
  -GetSystemInfo(system_info);
  -qemu_real_host_page_size = system_info.dwPageSize;
  -}
  +GetSystemInfo(system_info);
  +qemu_real_host_page_size = system_info.dwPageSize;
   #else
   qemu_real_host_page_size = getpagesize();
   #endif
  @@ -310,7 +308,11 @@ static void page_init(void)
   qemu_host_page_size = TARGET_PAGE_SIZE;
   }
   qemu_host_page_mask = ~(qemu_host_page_size - 1);
  +}
   
  +static void page_init(void)
  +{
  +page_size_init();
   #if defined(CONFIG_BSD)  defined(CONFIG_USER_ONLY)
   {
   #ifdef HAVE_KINFO_GETVMMAP
  
 
 Acked-by: Paolo Bonzini pbonz...@redhat.com

How should this go in?  With your ack I could include it in my vfio tree
with patch 2/2.  Sound ok?  Thanks,

Alex




[Qemu-devel] [PULL 3/3] xen_pt: Fix passthrough of device with ROM.

2014-01-17 Thread Stefano Stabellini
From: Anthony PERARD anthony.per...@citrix.com

QEMU does not need and should not allocate memory for the ROM of a
passthrough PCI device. So this patch initialize the particular region
like any other PCI BAR of a passthrough device.

When a guest will access the ROM, Xen will take care of the IO, QEMU
will not be involved in it.

Xen set a limit of memory available for each guest, allocating memory
for a ROM can hit this limit.

Signed-off-by: Anthony PERARD anthony.per...@citrix.com
Signed-off-by: Stefano Stabellini stefano.stabell...@eu.citrix.com
Reported-and-Tested-by: Konrad Rzeszutek Wilk konrad.w...@oracle.com
---
 hw/xen/xen_pt.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c
index eee4354..be4220b 100644
--- a/hw/xen/xen_pt.c
+++ b/hw/xen/xen_pt.c
@@ -440,8 +440,8 @@ static int xen_pt_register_regions(XenPCIPassthroughState 
*s)
 
 s-bases[PCI_ROM_SLOT].access.maddr = d-rom.base_addr;
 
-memory_region_init_rom_device(s-rom, OBJECT(s), NULL, NULL,
-  xen-pci-pt-rom, d-rom.size);
+memory_region_init_io(s-rom, OBJECT(s), ops, s-dev,
+  xen-pci-pt-rom, d-rom.size);
 pci_register_bar(s-dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
  s-rom);
 
-- 
1.7.10.4




[Qemu-devel] [PULL 2/3] xen_pt: Fix debug output.

2014-01-17 Thread Stefano Stabellini
From: Anthony PERARD anthony.per...@citrix.com

Signed-off-by: Anthony PERARD anthony.per...@citrix.com
Signed-off-by: Stefano Stabellini stefano.stabell...@eu.citrix.com
Reviewed-by: Konrad Rzeszutek Wilk konrad.w...@oracle.com
---
 hw/xen/xen_pt.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/xen/xen_pt.c b/hw/xen/xen_pt.c
index d58cb61..eee4354 100644
--- a/hw/xen/xen_pt.c
+++ b/hw/xen/xen_pt.c
@@ -420,8 +420,8 @@ static int xen_pt_register_regions(XenPCIPassthroughState 
*s)
   xen-pci-pt-bar, r-size);
 pci_register_bar(s-dev, i, type, s-bar[i]);
 
-XEN_PT_LOG(s-dev, IO region %i registered (size=0x%lxPRIx64
-base_addr=0x%lxPRIx64 type: %#x)\n,
+XEN_PT_LOG(s-dev, IO region %i registered (size=0x%08PRIx64
+base_addr=0x%08PRIx64 type: %#x)\n,
i, r-size, r-base_addr, type);
 }
 
-- 
1.7.10.4




[Qemu-devel] [Bug 1254786] Re: qemu-m68k-static: illegal instruction ebc0 during debootstrap second stage

2014-01-17 Thread Ken Sharp
Debian currently runs on the 68020, 68030, 68040 and 68060 processors
http://www.debian.org/ports/m68k/

From 2006:
https://lists.debian.org/debian-devel-announce/2006/01/msg5.html

So I really don't know. :-/

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1254786

Title:
  qemu-m68k-static: illegal instruction ebc0 during debootstrap second
  stage

Status in QEMU:
  New
Status in “qemu-linaro” package in Ubuntu:
  New

Bug description:
  Host: Ubuntu Precise amd64
  Guest: Debian (ports) sid m68k

  $ sudo qemu-debootstrap --no-check-gpg --arch=m68k sid m68k 
http://ftp.debian-ports.org/debian
  I: Running command: debootstrap --arch m68k --foreign --no-check-gpg sid m68k 
http://ftp.debian-ports.org/debian
  [...]
  I: Running command: chroot m68k /debootstrap/debootstrap --second-stage
  qemu: fatal: Illegal instruction: ebc0 @ f67e5662
  D0 = 6ef5   A0 = f67fbf58   F0 =  (   0)
  D1 = 010a   A1 =    F1 =  (   0)
  D2 = 000f   A2 =    F2 =  (   0)
  D3 =    A3 = f67e   F3 =  (   0)
  D4 =    A4 =    F4 =  (   0)
  D5 =    A5 = f67fc000   F5 =  (   0)
  D6 =    A6 = f6fff7cc   F6 =  (   0)
  D7 =    A7 = f6fff580   F7 =  (   0)
  PC = f67e5662   SR =  - FPRESULT =0
  Aborted (core dumped)

  ProblemType: Bug
  DistroRelease: Ubuntu 12.04
  Package: qemu-user-static 1.0.50-2012.03-0ubuntu2.1
  ProcVersionSignature: Ubuntu 3.8.0-33.48~precise1-generic 3.8.13.11
  Uname: Linux 3.8.0-33-generic x86_64
  NonfreeKernelModules: wl
  ApportVersion: 2.0.1-0ubuntu17.6
  Architecture: amd64
  Date: Mon Nov 25 16:08:26 2013
  Dependencies:
   
  InstallationMedia: Ubuntu 12.04.3 LTS Precise Pangolin - Release amd64 
(20130820.1)
  MarkForUpload: True
  ProcEnviron:
   LANGUAGE=en_GB:en
   TERM=xterm
   PATH=(custom, no user)
   LANG=en_GB.UTF-8
   SHELL=/bin/bash
  SourcePackage: qemu-linaro
  UpgradeStatus: No upgrade log present (probably fresh install)

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1254786/+subscriptions



[Qemu-devel] [PULL 1/3] xenfb: map framebuffer read-only and handle unmap errors

2014-01-17 Thread Stefano Stabellini
The framebuffer is needlessly mapped (PROT_READ | PROT_WRITE), map it
PROT_READ instead.

The framebuffer is unmapped by replacing the framebuffer pages with
anonymous shared memory, calling mmap. Check for return errors and print
a warning.

Signed-off-by: Stefano Stabellini stefano.stabell...@eu.citrix.com
---
 hw/display/xenfb.c |7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/display/xenfb.c b/hw/display/xenfb.c
index f0333a0..cb9d456 100644
--- a/hw/display/xenfb.c
+++ b/hw/display/xenfb.c
@@ -495,7 +495,7 @@ static int xenfb_map_fb(struct XenFB *xenfb)
 munmap(map, n_fbdirs * XC_PAGE_SIZE);
 
 xenfb-pixels = xc_map_foreign_pages(xen_xc, xenfb-c.xendev.dom,
-PROT_READ | PROT_WRITE, fbmfns, 
xenfb-fbpages);
+PROT_READ, fbmfns, xenfb-fbpages);
 if (xenfb-pixels == NULL)
goto out;
 
@@ -903,6 +903,11 @@ static void fb_disconnect(struct XenDevice *xendev)
 fb-pixels = mmap(fb-pixels, fb-fbpages * XC_PAGE_SIZE,
   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON,
   -1, 0);
+if (fb-pixels == MAP_FAILED) {
+xen_be_printf(xendev, 0,
+Couldn't replace the framebuffer with anonymous memory 
errno=%d\n,
+errno);
+}
 common_unbind(fb-c);
 fb-feature_update = 0;
 fb-bug_trigger= 0;
-- 
1.7.10.4




[Qemu-devel] [PULL 0/3] xen-170114

2014-01-17 Thread Stefano Stabellini
The following changes since commit 1cf892ca2689c84960b4ce4d2723b6bee453711c:

  SPARC: Fix LEON3 power down instruction (2014-01-15 15:37:33 +1000)

are available in the git repository at:

  git://xenbits.xen.org/people/sstabellini/qemu-dm.git xen-170114

for you to fetch changes up to 794798e36eda77802ce7cc7d7d6b1c65751e8a76:

  xen_pt: Fix passthrough of device with ROM. (2014-01-17 15:29:33 +)


Anthony PERARD (2):
  xen_pt: Fix debug output.
  xen_pt: Fix passthrough of device with ROM.

Stefano Stabellini (1):
  xenfb: map framebuffer read-only and handle unmap errors

 hw/display/xenfb.c |7 ++-
 hw/xen/xen_pt.c|8 
 2 files changed, 10 insertions(+), 5 deletions(-)



Re: [Qemu-devel] [PATCH 0/2] block: commits of snapshots larger than backing files

2014-01-17 Thread Jeff Cody
On Fri, Jan 17, 2014 at 03:17:10PM +0800, Stefan Hajnoczi wrote:
 On Mon, Jan 13, 2014 at 03:18:44PM -0500, Jeff Cody wrote:
  If a snapshot is larger than a backing file, then the offline bdrv_commit 
  and
  the live active layer commit will fail with an i/o error (usually).  A live
  commit of a non-active layer will complete successfully, as it runs
  bdrv_truncate() on the backing image to resize it to the larger size.
  
  For both bdrv_commit() and commit_active_start(), this series will resize
  the underlying base image if needed.  If the resize fails, an error will
  be returned.
 
 This got me thinking about the opposite case: when the snapshot is
 smaller than the backing file.  We leave the backing file with its
 original size.  In practice this is safe because the partition and
 volume metadata should show the smaller size.  If the user really wants
 to shrink the device they can still truncate after completing the
 commit operation.
 
 Can you update the QEMU documentation to explicitly cover both snapshot
   backing and snapshot  backing cases?


Yes, no problem.




Re: [Qemu-devel] [PATCH 1/2] kvm: initialize qemu_host_page_size

2014-01-17 Thread Paolo Bonzini
Il 17/01/2014 16:34, Alex Williamson ha scritto:
 Acked-by: Paolo Bonzini pbonz...@redhat.com
 
 How should this go in?  With your ack I could include it in my vfio tree
 with patch 2/2.  Sound ok?  Thanks,

Yup, thanks!

This is just a small enabler, the real meat is in patch 2 so it makes
sense for you to pick up both.

Paolo



Re: [Qemu-devel] [PATCH 2/2] block: resize backing image during active layer commit, if needed

2014-01-17 Thread Jeff Cody
On Wed, Jan 15, 2014 at 01:58:29PM +0800, Fam Zheng wrote:
 On Mon, 01/13 15:18, Jeff Cody wrote:
  If the top image to commit is the active layer, and also larger than
  the base image, then an I/O error will likely be returned during
  block-commit.
  
  For instance, if we have a base image with a virtual size 10G, and a
  active layer image of size 20G, then committing the snapshot via
  'block-commit' will likely fail.
  
  This will automatically attempt to resize the base image, if the
  active layer image to be committed is larger.
  
  Signed-off-by: Jeff Cody jc...@redhat.com
  ---
   block/mirror.c | 13 +
   1 file changed, 13 insertions(+)
  
  diff --git a/block/mirror.c b/block/mirror.c
  index 2932bab..c4e42fa 100644
  --- a/block/mirror.c
  +++ b/block/mirror.c
  @@ -630,9 +630,22 @@ void commit_active_start(BlockDriverState *bs, 
  BlockDriverState *base,
BlockDriverCompletionFunc *cb,
void *opaque, Error **errp)
   {
  +int64_t length;
   if (bdrv_reopen(base, bs-open_flags, errp)) {
   return;
   }
 
 base is already reopened here.
 
  +
  +length = bdrv_getlength(bs);
  +
  +if (length  bdrv_getlength(base)) {
  +if (bdrv_truncate(base, length)  0) {
  +error_setg(errp, Top image %s is larger than base image %s, 
  and 
  + resize of base image failed.,
  + bs-filename, base-filename);
  +return;
 
 Should we restore open flags for base?


Good catch.  Yes, I think we should; and I think we should also do it
after the call to mirror_start_job, if errp is set.  I'll add that in
for v2.

  +}
  +}
  +
   bdrv_ref(base);
   mirror_start_job(bs, base, speed, 0, 0,
on_error, on_error, cb, opaque, errp,
  -- 
  1.8.3.1
  



Re: [Qemu-devel] linux-user: interrupting syscalls

2014-01-17 Thread Peter Maydell
This is resurrecting a two year old waffly proposal (full text
here: http://lists.gnu.org/archive/html/qemu-devel/2011-12/msg00384.html)
because I realised my idea was broken.

On 4 December 2011 16:29, Peter Maydell peter.mayd...@linaro.org wrote:
 When run natively, this program will block until you send it a
 SIGUSR1; the signal handler will write to the pipe and cause the read
 to complete. Run in linux-user mode, we deadlock, because qemu does
 not run the guest signal handler when in the middle of emulating a
 system call -- it merely queues it to be run when the syscall
 finishes. For cases like this where the event that causes the syscall
 to complete is actually triggered by the guest signal handler, this
 doesn't work.  (There is a real-world instance of this problem in the
 Boehm garbage collector, where a signal handler posts to a semaphore
 which is being waited on by the mainline code.)

 It's not sufficient to simply force all syscalls to be non-restartable
 (and then to take the signal when the syscall returns EINTR), because
 of the following race condition:
  * qemu enters do_syscall on behalf of main thread
  * do_syscall is about to call the underlying syscall, when...
  * the signal arrives (and we queue it)
  * do_syscall then calls the host syscall, which will block. Oops.

 To fix this I think we need to have linux-user's signal handler
 wrapper do a siglongjmp if a signal arrives while we're inside
 do_syscall(). This allows us to properly interrupt whether we'd
 got to the point of making the host syscall or not.

This won't work, because there's no way to tell the difference
between we longjmped out of a write syscall before the kernel
wrote anything and we longjmped out just after the kernel
returned from a write syscall having written the buffer, so
you can't tell if you need to repeat the syscall or not.

I'm no longer sure if there's any fix for these races at all :-(

thanks
-- PMM



Re: [Qemu-devel] [PATCH] block: do not allow read-only=on and snapshot=on to be used together

2014-01-17 Thread Kevin Wolf
Am 16.01.2014 um 20:20 hat Jeff Cody geschrieben:
 On Thu, Jan 16, 2014 at 10:39:30AM +0100, Kevin Wolf wrote:
  Am 14.01.2014 um 20:12 hat Jeff Cody geschrieben:
   Having both read-only=on and snapshot=on together does not make sense;
   currently, the read-only argument is effectively ignored for the
   temporary snapshot.  To prevent confusion, disallow the usage of both
   'snapshot=on' and 'read-only=on'.
   
   Signed-off-by: Jeff Cody jc...@redhat.com
  
  I believe the reason why this was allowed was so that you can use a
  read-only file with -snapshot. It might not be necessary any more since
  I switched -snapshot implementation to modify the options QDict instead
  of manually doing a second bdrv_open().
  
  Did you test that this still works now?
 
 Yes, that still works.

Great. :-)

  The other question is about this code in bdrv_open_flags():
  
  /*
   * Snapshots should be writable.
   */
  if (bs-is_temporary) {
  open_flags |= BDRV_O_RDWR;
  }
  
  Is this dead code now because the flag is always already set?
 
 
 Yes, that ends up being dead code.  From later in blockdev_init():
 
 bdrv_flags |= ro ? 0 : BDRV_O_RDWR;
 
 QINCREF(bs_opts);
 ret = bdrv_open(dinfo-bdrv, file, bs_opts, bdrv_flags, drv, error);
 
 
 And ro is set from the read-only QemuOpts, that we check in this
 patch in conjunction with snapshot.  So if read-only=on is not
 specified, it is opened r/w by default.

Good, that was my impression as well. Can you send a follow-up patch to
remove the dead code?

Kevin



[Qemu-devel] KVM and variable-endianness guest CPUs

2014-01-17 Thread Peter Maydell
[This seemed like a good jumping off point for this question.]

On 16 January 2014 17:51, Alexander Graf ag...@suse.de wrote:
 Am 16.01.2014 um 18:41 schrieb Peter Maydell peter.mayd...@linaro.org:
 Also see my remarks on the previous patch series suggesting
 that we should look at this in a more holistic way than
 just randomly fixing small bits of things. A good place
 to start would be what should the semantics of stl_p()
 be for a QEMU where the CPU is currently operating with
 a reversed endianness to the TARGET_WORDS_BIGENDIAN
 setting?.

 That'd open a giant can of worms that I'd rather not open.

Yeah, but you kind of have to open that can, because stl_p()
is used in the code path for KVM MMIO accesses to devices.

Specifically, the KVM API says here's a uint8_t[] byte
array and a length, and the current QEMU code treats that
as this is a byte array written as if the guest CPU
(a) were in TARGET_WORDS_BIGENDIAN order and (b) wrote its
I/O access to this buffer rather than to the device.

The KVM API docs don't actually specify the endianness
semantics of the byte array, but I think that that really
needs to be nailed down. I can think of a couple of options:
 * always LE
 * always BE
   [these first two are non-starters because they would
   break either x86 or PPC existing code]
 * always the endianness the guest is at the time
 * always some arbitrary endianness based purely on the
   endianness the KVM implementation used historically
 * always the endianness of the host QEMU binary
 * something else?

Any preferences? Current QEMU code basically assumes
always the endianness of TARGET_WORDS_BIGENDIAN,
which is pretty random.

thanks
-- PMM



[Qemu-devel] [PATCH 3/8] target-arm: A64: Add SIMD scalar 3 same add, sub and compare ops

2014-01-17 Thread Peter Maydell
Implement the add, sub and compare ops from the SIMD scalar three same
group.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 131 -
 1 file changed, 130 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 78f5eb9..2b3f3a3 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5501,6 +5501,58 @@ static void 
disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
 unsupported_encoding(s, insn);
 }
 
+static void handle_3same_64(DisasContext *s, int opcode, bool u,
+TCGv_i64 tcg_rd, TCGv_i64 tcg_rn, TCGv_i64 tcg_rm)
+{
+/* Handle 64x64-64 opcodes which are shared between the scalar
+ * and vector 3-same groups. We cover every opcode where size == 3
+ * is valid in either the three-reg-same (integer, not pairwise)
+ * or scalar-three-reg-same groups. (Some opcodes are not yet
+ * implemented.)
+ */
+TCGCond cond;
+
+switch (opcode) {
+case 0x6: /* CMGT, CMHI */
+/* 64 bit integer comparison, result = test ? (2^64 - 1) : 0.
+ * We implement this using setcond (!test) and subtracting 1.
+ */
+cond = u ? TCG_COND_GTU : TCG_COND_GT;
+do_cmop:
+tcg_gen_setcond_i64(tcg_invert_cond(cond), tcg_rd, tcg_rn, tcg_rm);
+tcg_gen_subi_i64(tcg_rd, tcg_rd, 1);
+break;
+case 0x7: /* CMGE, CMHS */
+cond = u ? TCG_COND_GEU : TCG_COND_GE;
+goto do_cmop;
+case 0x11: /* CMTST, CMEQ */
+if (u) {
+cond = TCG_COND_EQ;
+goto do_cmop;
+}
+/* CMTST : test is if (X  Y != 0). */
+tcg_gen_and_i64(tcg_rd, tcg_rn, tcg_rm);
+tcg_gen_setcondi_i64(TCG_COND_EQ, tcg_rd, tcg_rd, 0);
+tcg_gen_subi_i64(tcg_rd, tcg_rd, 1);
+break;
+case 0x10: /* ADD, SUB */
+if (u) {
+tcg_gen_sub_i64(tcg_rd, tcg_rn, tcg_rm);
+} else {
+tcg_gen_add_i64(tcg_rd, tcg_rn, tcg_rm);
+}
+break;
+case 0x1: /* SQADD */
+case 0x5: /* SQSUB */
+case 0x8: /* SSHL, USHL */
+case 0x9: /* SQSHL, UQSHL */
+case 0xa: /* SRSHL, URSHL */
+case 0xb: /* SQRSHL, UQRSHL */
+default:
+g_assert_not_reached();
+}
+}
+
 /* C3.6.11 AdvSIMD scalar three same
  *  31 30  29 28   24 23  22  21 20  16 1511  10 95 40
  * +-+---+---+--+---+--++---+--+--+
@@ -5509,7 +5561,84 @@ static void 
disas_simd_scalar_three_reg_diff(DisasContext *s, uint32_t insn)
  */
 static void disas_simd_scalar_three_reg_same(DisasContext *s, uint32_t insn)
 {
-unsupported_encoding(s, insn);
+int rd = extract32(insn, 0, 5);
+int rn = extract32(insn, 5, 5);
+int opcode = extract32(insn, 11, 5);
+int rm = extract32(insn, 16, 5);
+int size = extract32(insn, 22, 2);
+bool u = extract32(insn, 29, 1);
+TCGv_i64 tcg_rn;
+TCGv_i64 tcg_rm;
+TCGv_i64 tcg_rd;
+
+if (opcode = 0x18) {
+/* Floating point: U, size[1] and opcode indicate operation */
+int fpopcode = opcode | (extract32(size, 1, 1)  5) | (u  6);
+switch (fpopcode) {
+case 0x1b: /* FMULX */
+case 0x1c: /* FCMEQ */
+case 0x1f: /* FRECPS */
+case 0x3f: /* FRSQRTS */
+case 0x5c: /* FCMGE */
+case 0x5d: /* FACGE */
+case 0x7a: /* FABD  */
+case 0x7c: /* FCMGT */
+case 0x7d: /* FACGT */
+unsupported_encoding(s, insn);
+return;
+default:
+unallocated_encoding(s);
+return;
+}
+}
+
+switch (opcode) {
+case 0x1: /* SQADD, UQADD */
+case 0x5: /* SQSUB, UQSUB */
+case 0x8: /* SSHL, USHL */
+case 0xa: /* SRSHL, URSHL */
+unsupported_encoding(s, insn);
+return;
+case 0x6: /* CMGT, CMHI */
+case 0x7: /* CMGE, CMHS */
+case 0x11: /* CMTST, CMEQ */
+case 0x10: /* ADD, SUB (vector) */
+if (size != 3) {
+unallocated_encoding(s);
+return;
+}
+break;
+case 0x9: /* SQSHL, UQSHL */
+case 0xb: /* SQRSHL, UQRSHL */
+unsupported_encoding(s, insn);
+return;
+case 0x16: /* SQDMULH, SQRDMULH (vector) */
+if (size != 1  size != 2) {
+unallocated_encoding(s);
+return;
+}
+unsupported_encoding(s, insn);
+return;
+default:
+unallocated_encoding(s);
+return;
+}
+
+tcg_rn = read_fp_dreg(s, rn);   /* op1 */
+tcg_rm = read_fp_dreg(s, rm);   /* op2 */
+tcg_rd = tcg_temp_new_i64();
+
+/* For the moment we only support the opcodes which are
+ * 64-bit-width only. The size != 3 cases will
+ * be handled later when the relevant ops are implemented.
+ */
+handle_3same_64(s, opcode, u, tcg_rd, tcg_rn, tcg_rm);
+
+

[Qemu-devel] [PATCH 1/8] target-arm: A64: Add SIMD three-different multiply accumulate insns

2014-01-17 Thread Peter Maydell
Add support for the multiply-accumulate instructions from the
SIMD three-different instructions group (C3.6.15):
 * skeleton decode of unallocated encodings and split of
   the group into its three sub-parts
 * framework for handling the 64x64-128 widening subpart
 * implementation of the multiply-accumulate instructions
   SMLAL, SMLAL2, UMLAL, UMLAL2, SMLSL, SMLSL2, UMLSL, UMLSL2,
   UMULL, UMULL2, SMULL, SMULL2

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 208 -
 1 file changed, 207 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 003bd9b..c5062e1 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5545,6 +5545,154 @@ static void disas_simd_shift_imm(DisasContext *s, 
uint32_t insn)
 unsupported_encoding(s, insn);
 }
 
+static void handle_3rd_widening(DisasContext *s, int is_q, int is_u, int size,
+int opcode, int rd, int rn, int rm)
+{
+/* 3-reg-different widening insns: 64 x 64 - 128 */
+TCGv_i64 tcg_res[2];
+int pass, accop;
+
+tcg_res[0] = tcg_temp_new_i64();
+tcg_res[1] = tcg_temp_new_i64();
+
+/* Does this op do an adding accumulate, a subtracting accumulate,
+ * or no accumulate at all?
+ */
+switch (opcode) {
+case 5:
+case 8:
+case 9:
+accop = 1;
+break;
+case 10:
+case 11:
+accop = -1;
+break;
+default:
+accop = 0;
+break;
+}
+
+if (accop != 0) {
+read_vec_element(s, tcg_res[0], rd, 0, MO_64);
+read_vec_element(s, tcg_res[1], rd, 1, MO_64);
+}
+
+/* size == 2 means two 32x32-64 operations; this is worth special
+ * casing because we can generally handle it inline.
+ */
+if (size == 2) {
+for (pass = 0; pass  2; pass++) {
+TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+TCGv_i64 tcg_passres;
+TCGMemOp memop = MO_32 | (is_u ? 0 : MO_SIGN);
+
+int elt = pass + is_q * 2;
+
+read_vec_element(s, tcg_op1, rn, elt, memop);
+read_vec_element(s, tcg_op2, rm, elt, memop);
+
+if (accop == 0) {
+tcg_passres = tcg_res[pass];
+} else {
+tcg_passres = tcg_temp_new_i64();
+}
+
+switch (opcode) {
+case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+tcg_gen_mul_i64(tcg_passres, tcg_op1, tcg_op2);
+break;
+default:
+g_assert_not_reached();
+}
+
+if (accop  0) {
+tcg_gen_add_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+tcg_temp_free_i64(tcg_passres);
+} else if (accop  0) {
+tcg_gen_sub_i64(tcg_res[pass], tcg_res[pass], tcg_passres);
+tcg_temp_free_i64(tcg_passres);
+}
+
+tcg_temp_free_i64(tcg_op1);
+tcg_temp_free_i64(tcg_op2);
+}
+} else {
+/* size 0 or 1, generally helper functions */
+for (pass = 0; pass  2; pass++) {
+TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+TCGv_i64 tcg_passres;
+int elt = pass + is_q * 2;
+
+read_vec_element(s, tcg_tmp, rn, elt, MO_32);
+tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
+read_vec_element(s, tcg_tmp, rm, elt, MO_32);
+tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
+tcg_temp_free_i64(tcg_tmp);
+
+if (accop == 0) {
+tcg_passres = tcg_res[pass];
+} else {
+tcg_passres = tcg_temp_new_i64();
+}
+
+switch (opcode) {
+case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
+case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
+case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
+if (size == 0) {
+if (is_u) {
+gen_helper_neon_mull_u8(tcg_passres, tcg_op1, tcg_op2);
+} else {
+gen_helper_neon_mull_s8(tcg_passres, tcg_op1, tcg_op2);
+}
+} else {
+if (is_u) {
+gen_helper_neon_mull_u16(tcg_passres, tcg_op1, 
tcg_op2);
+} else {
+gen_helper_neon_mull_s16(tcg_passres, tcg_op1, 
tcg_op2);
+}
+}
+break;
+default:
+g_assert_not_reached();
+}
+tcg_temp_free_i32(tcg_op1);
+tcg_temp_free_i32(tcg_op2);
+

[Qemu-devel] [PATCH 0/8] target-arm: A64 Neon instructions, set 2

2014-01-17 Thread Peter Maydell
This is the second set of patches for A64 Neon. The last patch
set did a complete coverage of some of the smaller and simpler
instruction groupings; this patch set attacks a few of the
larger groupings (3-same; scalar 3-same; 3-different; shift-imm;
scalar shift-imm) and doesn't attempt complete coverage of them.
My rule of thumb was to (a) implement enough instructions to
demonstrate that the general decode and set of sub-functions/loops
was adequate (b) aim to cover at least as much as the SuSE tree.

Remaining things in SuSE 1.6 tree not yet implemented:
 3-reg-same MLA, MLS, MUL, PMUL, SMAX, UMAX, SMIN, UMIN,
SSHL, USHL, SQSHL, UQSHL, SRSHL, URSHL
 2-reg-misc XTN, FABS, FNEG, NOT

thanks
-- PMM

Alex Bennée (2):
  target-arm: A64: Add logic ops from SIMD 3 same group
  target-arm: A64: Add SIMD shift by immediate

Peter Maydell (6):
  target-arm: A64: Add SIMD three-different multiply accumulate insns
  target-arm: A64: Add SIMD three-different ABDL instructions
  target-arm: A64: Add SIMD scalar 3 same add, sub and compare ops
  target-arm: A64: Add top level decode for SIMD 3-same group
  target-arm: A64: Add integer ops from SIMD 3-same group
  target-arm: A64: Add simple SIMD 3-same floating point ops

 target-arm/translate-a64.c | 1190 +++-
 1 file changed, 1186 insertions(+), 4 deletions(-)

-- 
1.8.5



[Qemu-devel] [PATCH 8/8] target-arm: A64: Add SIMD shift by immediate

2014-01-17 Thread Peter Maydell
From: Alex Bennée alex.ben...@linaro.org

This implements a subset of the AdvSIMD shift operations (namely all the
none saturating or narrowing ones). The actual shift generation code
itself is common for both the scalar and vector cases but wrapped with
either vector element iteration or the fp reg access.

The rounding operations need to take special care to correctly reflect
the result of adding rounding bits on high bits as the intermediates do
not truncate.

Signed-off-by: Alex Bennée alex.ben...@linaro.org
Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 385 -
 1 file changed, 383 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 9980759..bfcce09 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5479,15 +5479,224 @@ static void disas_simd_scalar_pairwise(DisasContext 
*s, uint32_t insn)
 unsupported_encoding(s, insn);
 }
 
+/*
+ * Common SSHR[RA]/USHR[RA] - Shift right (optional rounding/accumulate)
+ *
+ * This code is handles the common shifting code and is used by both
+ * the vector and scalar code.
+ */
+static void handle_shri_with_rndacc(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+TCGv_i64 tcg_rnd, bool accumulate,
+bool is_u, int size, int shift)
+{
+bool extended_result = false;
+bool round = !TCGV_IS_UNUSED_I64(tcg_rnd);
+int ext_lshift = 0;
+TCGv_i64 tcg_src_hi;
+
+if (round  size == 3) {
+extended_result = true;
+ext_lshift = 64 - shift;
+tcg_src_hi = tcg_temp_new_i64();
+} else if (shift == 64) {
+if (!accumulate  is_u) {
+/* result is zero */
+tcg_gen_movi_i64(tcg_res, 0);
+return;
+}
+}
+
+/* Deal with the rounding step */
+if (round) {
+if (extended_result) {
+TCGv_i64 tcg_zero = tcg_const_i64(0);
+if (!is_u) {
+/* take care of sign extending tcg_res */
+tcg_gen_sari_i64(tcg_src_hi, tcg_src, 63);
+tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_src_hi,
+ tcg_rnd, tcg_zero);
+} else {
+tcg_gen_add2_i64(tcg_src, tcg_src_hi,
+ tcg_src, tcg_zero,
+ tcg_rnd, tcg_zero);
+}
+tcg_temp_free_i64(tcg_zero);
+} else {
+tcg_gen_add_i64(tcg_src, tcg_src, tcg_rnd);
+}
+}
+
+/* Now do the shift right */
+if (round  extended_result) {
+/* extended case, 64 bit precision required */
+if (ext_lshift == 0) {
+/* special case, only high bits matter */
+tcg_gen_mov_i64(tcg_src, tcg_src_hi);
+} else {
+tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+tcg_gen_shli_i64(tcg_src_hi, tcg_src_hi, ext_lshift);
+tcg_gen_or_i64(tcg_src, tcg_src, tcg_src_hi);
+}
+} else {
+if (is_u) {
+if (shift == 64) {
+/* essentially shifting in 64 zeros */
+tcg_gen_movi_i64(tcg_src, 0);
+} else {
+tcg_gen_shri_i64(tcg_src, tcg_src, shift);
+}
+} else {
+if (shift == 64) {
+/* effectively extending the sign-bit */
+tcg_gen_sari_i64(tcg_src, tcg_src, 63);
+} else {
+tcg_gen_sari_i64(tcg_src, tcg_src, shift);
+}
+}
+}
+
+if (accumulate) {
+tcg_gen_add_i64(tcg_res, tcg_res, tcg_src);
+} else {
+tcg_gen_mov_i64(tcg_res, tcg_src);
+}
+
+if (extended_result) {
+tcg_temp_free(tcg_src_hi);
+}
+}
+
+/* Common SHL/SLI - Shift left with an optional insert */
+static void handle_shli_with_ins(TCGv_i64 tcg_res, TCGv_i64 tcg_src,
+ bool insert, int shift)
+{
+tcg_gen_shli_i64(tcg_src, tcg_src, shift);
+if (insert) {
+/* SLI */
+uint64_t mask = (1ULL  shift) - 1;
+tcg_gen_andi_i64(tcg_res, tcg_res, mask);
+tcg_gen_or_i64(tcg_res, tcg_res, tcg_src);
+} else {
+tcg_gen_mov_i64(tcg_res, tcg_src);
+}
+}
+
+/* SSHR[RA]/USHR[RA] - Scalar shift right (optional rounding/accumulate) */
+static void handle_scalar_simd_shri(DisasContext *s,
+bool is_u, int immh, int immb,
+int opcode, int rn, int rd)
+{
+const int size = 3;
+int immhb = immh  3 | immb;
+int shift = 2 * (8  size) - immhb;
+bool accumulate = false;
+bool round = false;
+TCGv_i64 tcg_rn;
+TCGv_i64 tcg_rd;
+TCGv_i64 tcg_round;
+
+if (!extract32(immh, 3, 1)) {
+unallocated_encoding(s);
+return;
+}
+
+switch 

[Qemu-devel] [PATCH 6/8] target-arm: A64: Add integer ops from SIMD 3-same group

2014-01-17 Thread Peter Maydell
Add some of the integer operations in the SIMD 3-same group:
specifically, the comparisons, addition and subtraction.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 141 -
 1 file changed, 140 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 82f8e8e..893c05b 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -6009,7 +6009,146 @@ static void disas_simd_3same_float(DisasContext *s, 
uint32_t insn)
 /* Integer op subgroup of C3.6.16. */
 static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
 {
-unsupported_encoding(s, insn);
+int is_q = extract32(insn, 30, 1);
+int u = extract32(insn, 29, 1);
+int size = extract32(insn, 22, 2);
+int opcode = extract32(insn, 11, 5);
+int rm = extract32(insn, 16, 5);
+int rn = extract32(insn, 5, 5);
+int rd = extract32(insn, 0, 5);
+int pass;
+
+switch (opcode) {
+case 0x13: /* MUL, PMUL */
+if (u  size != 0) {
+unallocated_encoding(s);
+return;
+}
+/* fall through */
+case 0x0: /* SHADD, UHADD */
+case 0x2: /* SRHADD, URHADD */
+case 0x4: /* SHSUB, UHSUB */
+case 0xc: /* SMAX, UMAX */
+case 0xd: /* SMIN, UMIN */
+case 0xe: /* SABD, UABD */
+case 0xf: /* SABA, UABA */
+case 0x12: /* MLA, MLS */
+if (size == 3) {
+unallocated_encoding(s);
+return;
+}
+break;
+case 0x1: /* SQADD */
+case 0x5: /* SQSUB */
+case 0x8: /* SSHL, USHL */
+case 0x9: /* SQSHL, UQSHL */
+case 0xa: /* SRSHL, URSHL */
+case 0xb: /* SQRSHL, UQRSHL */
+if (size == 3  !is_q) {
+unallocated_encoding(s);
+return;
+}
+unsupported_encoding(s, insn);
+return;
+default:
+if (size == 3  !is_q) {
+unallocated_encoding(s);
+return;
+}
+break;
+}
+
+if (size == 3) {
+for (pass = 0; pass  (is_q ? 2 : 1); pass++) {
+TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+read_vec_element(s, tcg_op1, rn, pass, MO_64);
+read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+handle_3same_64(s, opcode, u, tcg_res, tcg_op1, tcg_op2);
+
+write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+tcg_temp_free_i64(tcg_res);
+tcg_temp_free_i64(tcg_op1);
+tcg_temp_free_i64(tcg_op2);
+}
+} else {
+for (pass = 0; pass  (is_q ? 4 : 2); pass++) {
+TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+TCGv_i32 tcg_res = tcg_temp_new_i32();
+TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+typedef void NeonGenFn(TCGv_i32, TCGv_i32, TCGv_i32);
+NeonGenFn *genfn;
+
+read_vec_element(s, tcg_tmp, rn, pass, MO_32);
+tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
+read_vec_element(s, tcg_tmp, rm, pass, MO_32);
+tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
+
+switch (opcode) {
+case 0x6: /* CMGT, CMHI */
+{
+NeonGenFn *ceqtstfns[3][2] = {
+{ gen_helper_neon_cgt_s8, gen_helper_neon_cgt_u8 },
+{ gen_helper_neon_cgt_s16, gen_helper_neon_cgt_u16 },
+{ gen_helper_neon_cgt_s32, gen_helper_neon_cgt_u32 },
+};
+genfn = ceqtstfns[size][u];
+break;
+}
+case 0x7: /* CMGE, CMHS */
+{
+NeonGenFn *ceqtstfns[3][2] = {
+{ gen_helper_neon_cge_s8, gen_helper_neon_cge_u8 },
+{ gen_helper_neon_cge_s16, gen_helper_neon_cge_u16 },
+{ gen_helper_neon_cge_s32, gen_helper_neon_cge_u32 },
+};
+genfn = ceqtstfns[size][u];
+break;
+}
+case 0x10: /* ADD, SUB */
+{
+NeonGenFn *addfns[3][2] = {
+{ gen_helper_neon_add_u8, gen_helper_neon_sub_u8 },
+{ gen_helper_neon_add_u16, gen_helper_neon_sub_u16 },
+{ tcg_gen_add_i32, tcg_gen_sub_i32 },
+};
+genfn = addfns[size][u];
+break;
+}
+case 0x11: /* CMTST, CMEQ */
+{
+NeonGenFn *ceqtstfns[3][2] = {
+{ gen_helper_neon_tst_u8, gen_helper_neon_ceq_u8 },
+{ gen_helper_neon_tst_u16, gen_helper_neon_ceq_u16 },
+{ gen_helper_neon_tst_u32, gen_helper_neon_ceq_u32 },
+};
+genfn = ceqtstfns[size][u];
+break;
+ 

[Qemu-devel] [PATCH 4/8] target-arm: A64: Add top level decode for SIMD 3-same group

2014-01-17 Thread Peter Maydell
Add top level decode for the A64 SIMD three regs same group
(C3.6.16), splitting it into the pairwise, logical, float and
integer subgroups.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 45 -
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 2b3f3a3..a92d69d 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5922,6 +5922,30 @@ static void disas_simd_three_reg_diff(DisasContext *s, 
uint32_t insn)
 }
 }
 
+/* Logic op (opcode == 3) subgroup of C3.6.16. */
+static void disas_simd_3same_logic(DisasContext *s, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
+/* Pairwise op subgroup of C3.6.16. */
+static void disas_simd_3same_pair(DisasContext *s, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
+/* Floating point op subgroup of C3.6.16. */
+static void disas_simd_3same_float(DisasContext *s, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
+/* Integer op subgroup of C3.6.16. */
+static void disas_simd_3same_int(DisasContext *s, uint32_t insn)
+{
+unsupported_encoding(s, insn);
+}
+
 /* C3.6.16 AdvSIMD three same
  *  31  30  29  28   24 23  22  21 20  16 1511  10 95 40
  * +---+---+---+---+--+---+--++---+--+--+
@@ -5930,7 +5954,26 @@ static void disas_simd_three_reg_diff(DisasContext *s, 
uint32_t insn)
  */
 static void disas_simd_three_reg_same(DisasContext *s, uint32_t insn)
 {
-unsupported_encoding(s, insn);
+int opcode = extract32(insn, 11, 5);
+
+switch (opcode) {
+case 0x3: /* logic ops */
+disas_simd_3same_logic(s, insn);
+break;
+case 0x17: /* ADDP */
+case 0x14: /* SMAXP, UMAXP */
+case 0x15: /* SMINP, UMINP */
+/* Pairwise operations */
+disas_simd_3same_pair(s, insn);
+break;
+case 0x18 ... 0x31:
+/* floating point ops, sz[1] and U are part of opcode */
+disas_simd_3same_float(s, insn);
+break;
+default:
+disas_simd_3same_int(s, insn);
+break;
+}
 }
 
 /* C3.6.17 AdvSIMD two reg misc
-- 
1.8.5




[Qemu-devel] [PATCH 7/8] target-arm: A64: Add simple SIMD 3-same floating point ops

2014-01-17 Thread Peter Maydell
Implement a simple subset of the SIMD 3-same floating point
operations. This includes a common helper function used for both
scalar and vector ops; FABD is the only currently implemented
shared op.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 191 -
 1 file changed, 189 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index 893c05b..9980759 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5553,6 +5553,132 @@ static void handle_3same_64(DisasContext *s, int 
opcode, bool u,
 }
 }
 
+/* Handle the 3-same-operands float operations; shared by the scalar
+ * and vector encodings. The caller must filter out any encodings
+ * not allocated for the encoding it is dealing with.
+ */
+static void handle_3same_float(DisasContext *s, int size, int elements,
+   int fpopcode, int rd, int rn, int rm)
+{
+int pass;
+TCGv_ptr fpst = get_fpstatus_ptr();
+
+for (pass = 0; pass  elements; pass++) {
+if (size) {
+/* Double */
+TCGv_i64 tcg_op1 = tcg_temp_new_i64();
+TCGv_i64 tcg_op2 = tcg_temp_new_i64();
+TCGv_i64 tcg_res = tcg_temp_new_i64();
+
+read_vec_element(s, tcg_op1, rn, pass, MO_64);
+read_vec_element(s, tcg_op2, rm, pass, MO_64);
+
+switch (fpopcode) {
+case 0x18: /* FMAXNM */
+gen_helper_vfp_maxnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x1a: /* FADD */
+gen_helper_vfp_addd(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x1e: /* FMAX */
+gen_helper_vfp_maxd(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x38: /* FMINNM */
+gen_helper_vfp_minnumd(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x3a: /* FSUB */
+gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x3e: /* FMIN */
+gen_helper_vfp_mind(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x5b: /* FMUL */
+gen_helper_vfp_muld(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x5f: /* FDIV */
+gen_helper_vfp_divd(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x7a: /* FABD */
+gen_helper_vfp_subd(tcg_res, tcg_op1, tcg_op2, fpst);
+gen_helper_vfp_absd(tcg_res, tcg_res);
+break;
+default:
+g_assert_not_reached();
+}
+
+write_vec_element(s, tcg_res, rd, pass, MO_64);
+
+tcg_temp_free_i64(tcg_res);
+tcg_temp_free_i64(tcg_op1);
+tcg_temp_free_i64(tcg_op2);
+} else {
+/* Single */
+TCGv_i32 tcg_op1 = tcg_temp_new_i32();
+TCGv_i32 tcg_op2 = tcg_temp_new_i32();
+TCGv_i32 tcg_res = tcg_temp_new_i32();
+TCGv_i64 tcg_tmp = tcg_temp_new_i64();
+
+read_vec_element(s, tcg_tmp, rn, pass, MO_32);
+tcg_gen_trunc_i64_i32(tcg_op1, tcg_tmp);
+read_vec_element(s, tcg_tmp, rm, pass, MO_32);
+tcg_gen_trunc_i64_i32(tcg_op2, tcg_tmp);
+
+switch (fpopcode) {
+case 0x1a: /* FADD */
+gen_helper_vfp_adds(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x1e: /* FMAX */
+gen_helper_vfp_maxs(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x18: /* FMAXNM */
+gen_helper_vfp_maxnums(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x38: /* FMINNM */
+gen_helper_vfp_minnums(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x3a: /* FSUB */
+gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x3e: /* FMIN */
+gen_helper_vfp_mins(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x5b: /* FMUL */
+gen_helper_vfp_muls(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x5f: /* FDIV */
+gen_helper_vfp_divs(tcg_res, tcg_op1, tcg_op2, fpst);
+break;
+case 0x7a: /* FABD */
+gen_helper_vfp_subs(tcg_res, tcg_op1, tcg_op2, fpst);
+gen_helper_vfp_abss(tcg_res, tcg_res);
+break;
+default:
+g_assert_not_reached();
+}
+
+tcg_gen_extu_i32_i64(tcg_tmp, tcg_res);
+if (elements == 1) {
+/* scalar single so clear high part */
+write_vec_element(s, tcg_tmp, rd, pass, 

Re: [Qemu-devel] KVM and variable-endianness guest CPUs

2014-01-17 Thread Peter Maydell
On 17 January 2014 17:53, Peter Maydell peter.mayd...@linaro.org wrote:
 Specifically, the KVM API says here's a uint8_t[] byte
 array and a length, and the current QEMU code treats that
 as this is a byte array written as if the guest CPU
 (a) were in TARGET_WORDS_BIGENDIAN order and (b) wrote its
 I/O access to this buffer rather than to the device.

 The KVM API docs don't actually specify the endianness
 semantics of the byte array, but I think that that really
 needs to be nailed down. I can think of a couple of options:
  * always LE
  * always BE
[these first two are non-starters because they would
break either x86 or PPC existing code]
  * always the endianness the guest is at the time
  * always some arbitrary endianness based purely on the
endianness the KVM implementation used historically
  * always the endianness of the host QEMU binary
  * something else?

 Any preferences? Current QEMU code basically assumes
 always the endianness of TARGET_WORDS_BIGENDIAN,
 which is pretty random.

Having thought a little more about this, my opinion is:

 * we should specify that the byte order of the mmio.data
   array is host kernel endianness (ie same endianness
   as the QEMU process itself) [this is what it actually
   is, I think, for all the cases that work today]
 * we should fix the code path in QEMU for handling
   mmio.data which currently has the implicit assumption
   that when using KVM TARGET_WORDS_BIGENDIAN is the same
   as the QEMU host process endianness (because it's using
   load/store functions which swap if TARGET_WORDS_BIGENDIAN
   is different from HOST_WORDS_BIGENDIAN)

thanks
-- PMM



[Qemu-devel] [PATCH 2/8] target-arm: A64: Add SIMD three-different ABDL instructions

2014-01-17 Thread Peter Maydell
Implement the absolute-difference instructions in the SIMD
three-different group: SABAL, SABAL2, UABAL, UABAL2, SABDL,
SABDL2, UABDL, UABDL2.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/translate-a64.c | 35 +--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index c5062e1..78f5eb9 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -5600,6 +5600,21 @@ static void handle_3rd_widening(DisasContext *s, int 
is_q, int is_u, int size,
 }
 
 switch (opcode) {
+case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+{
+TCGv_i64 tcg_tmp1 = tcg_temp_new_i64();
+TCGv_i64 tcg_tmp2 = tcg_temp_new_i64();
+
+tcg_gen_sub_i64(tcg_tmp1, tcg_op1, tcg_op2);
+tcg_gen_sub_i64(tcg_tmp2, tcg_op2, tcg_op1);
+tcg_gen_movcond_i64(is_u ? TCG_COND_GEU : TCG_COND_GE,
+tcg_passres,
+tcg_op1, tcg_op2, tcg_tmp1, tcg_tmp2);
+tcg_temp_free_i64(tcg_tmp1);
+tcg_temp_free_i64(tcg_tmp2);
+break;
+}
 case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
 case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
 case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
@@ -5642,6 +5657,22 @@ static void handle_3rd_widening(DisasContext *s, int 
is_q, int is_u, int size,
 }
 
 switch (opcode) {
+case 5: /* SABAL, SABAL2, UABAL, UABAL2 */
+case 7: /* SABDL, SABDL2, UABDL, UABDL2 */
+if (size == 0) {
+if (is_u) {
+gen_helper_neon_abdl_u16(tcg_passres, tcg_op1, 
tcg_op2);
+} else {
+gen_helper_neon_abdl_s16(tcg_passres, tcg_op1, 
tcg_op2);
+}
+} else {
+if (is_u) {
+gen_helper_neon_abdl_u32(tcg_passres, tcg_op1, 
tcg_op2);
+} else {
+gen_helper_neon_abdl_s32(tcg_passres, tcg_op1, 
tcg_op2);
+}
+}
+break;
 case 8: /* SMLAL, SMLAL2, UMLAL, UMLAL2 */
 case 10: /* SMLSL, SMLSL2, UMLSL, UMLSL2 */
 case 12: /* UMULL, UMULL2, SMULL, SMULL2 */
@@ -5741,10 +5772,10 @@ static void disas_simd_three_reg_diff(DisasContext *s, 
uint32_t insn)
 /* fall through */
 case 0:
 case 2:
-case 5:
-case 7:
 unsupported_encoding(s, insn);
 break;
+case 5:
+case 7:
 case 8:
 case 10:
 case 12:
-- 
1.8.5




[Qemu-devel] Exposing and calculating CPU APIC IDs (was Re: [RFC 1/3] target-i386: moving registers of vmstate from cpu_exec_init() to x86_cpu_realizefn())

2014-01-17 Thread Eduardo Habkost
On Wed, Jan 15, 2014 at 03:37:04PM +0100, Igor Mammedov wrote:
 On Wed, 15 Jan 2014 20:24:01 +0800
 Chen Fan chen.fan.f...@cn.fujitsu.com wrote:
  On Tue, 2014-01-14 at 11:40 +0100, Igor Mammedov wrote:
   On Tue, 14 Jan 2014 17:27:20 +0800
   Chen Fan chen.fan.f...@cn.fujitsu.com wrote:
   
the intend of this patch is to register cpu vmstates with apic id 
instead of cpu
index, due to the property setting of apic_id is behind the cpu 
initialization. so
we move the registers of cpu vmstate from cpu_exec_init() to 
x86_cpu_realizefn() to
let the set apicid as the parameter.

Signed-off-by: Chen Fan chen.fan.f...@cn.fujitsu.com
---
 exec.c| 5 +
 target-i386/cpu.c | 9 +
 2 files changed, 14 insertions(+)

diff --git a/exec.c b/exec.c
index 7e49e8e..9be5855 100644
--- a/exec.c
+++ b/exec.c
@@ -438,7 +438,9 @@ CPUState *qemu_get_cpu(int index)
 void cpu_exec_init(CPUArchState *env)
 {
 CPUState *cpu = ENV_GET_CPU(env);
+#if !defined(TARGET_I386)
 CPUClass *cc = CPU_GET_CLASS(cpu);
+#endif
 CPUState *some_cpu;
 int cpu_index;
 
@@ -460,6 +462,8 @@ void cpu_exec_init(CPUArchState *env)
 #if defined(CONFIG_USER_ONLY)
 cpu_list_unlock();
 #endif
+
+#if !defined(TARGET_I386)
 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) {
 vmstate_register(NULL, cpu_index, vmstate_cpu_common, cpu);
 }
@@ -472,6 +476,7 @@ void cpu_exec_init(CPUArchState *env)
 if (cc-vmsd != NULL) {
 vmstate_register(NULL, cpu_index, cc-vmsd, cpu);
 }
+#endif /* !TARGET_I386 */
 }
 
 #if defined(TARGET_HAS_ICE)
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 967529a..dada6f6 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -2552,6 +2552,7 @@ static void x86_cpu_apic_realize(X86CPU *cpu, 
Error **errp)
 static void x86_cpu_realizefn(DeviceState *dev, Error **errp)
 {
 CPUState *cs = CPU(dev);
+CPUClass *cc = CPU_GET_CLASS(cs);
 X86CPU *cpu = X86_CPU(dev);
 X86CPUClass *xcc = X86_CPU_GET_CLASS(dev);
 CPUX86State *env = cpu-env;
@@ -2615,6 +2616,14 @@ static void x86_cpu_realizefn(DeviceState *dev, 
Error **errp)
 cpu_reset(cs);
 
 xcc-parent_realize(dev, local_err);
+
+if (qdev_get_vmsd(DEVICE(cs)) == NULL) {
+vmstate_register(NULL, env-cpuid_apic_id, 
vmstate_cpu_common, cs);
+}
+
+if (cc-vmsd != NULL) {
+vmstate_register(NULL, env-cpuid_apic_id, cc-vmsd, cs);
+}
   how about doing it in common CPUclass.realize()
   you can use get_arch_id() for getting CPU id, it returns cpu_index by 
   default
   and apic_id for target-i386.
  
  Thanks for your kind suggestion, does this mean we can directly move
  vmstate_register to cpu_common_realizefn()? 
 yep, that is a gist of it.
 
 There is more to the issue with discontinuous CPUs, a lot of code still
 use cpu_index and the way it's allocated is not compatible with discontinuous
 CPUs, so this issue should be fixed as well.
 
 Also you propose to use a raw apic id with CLI/user interface.
 I recall there were objections to it since APIC ID contains topology
 information and it's not trivial for user to get it right.
 The last idea that was discussed to fix it was not expose APIC ID to
 user but rather introduce QOM hierarchy like:
   /machine/node/N/socket/X/core/Y/thread/Z
 and use it in user interface as a means to specify an arbitrary CPU
 and let QEMU calculate APIC ID based on this path.
 
 But nobody took on implementing it yet.

We're taking so long to get a decent interface implemented, that part of
me is considering exposing the APIC ID directly like suggested before,
and requiring libvirt to calculate topology-aware APIC IDs[1] to
properly implement CPU hotplug (and possibly for other tasks).

Another part of me is hoping that the libvirt developers ask us to
please not do that, so I can use it as argument against exposing the
APIC IDs directly the next time we discuss this.  :)


[1] See target-i386/topology.h for references

-- 
Eduardo



Re: [Qemu-devel] Question on pointers in the qemu user space emulation

2014-01-17 Thread Erik de Castro Lopo
Peter Maydell wrote:

 On 17 January 2014 06:33, Erik de Castro Lopo mle+to...@mega-nerd.com wrote:
  I'm currently working on implementing a missing part of a linux-user
  syscall. This syscall includes a function pointer for a callback.
 
 Which syscall? Callbacks from the kernel are pretty tricky.
 Basically you need to register a host function as the callback
 with the host kernel, and stash the guest function pointer somewhere
 so that when the callback comes in from the host kernel you can
 arrange to interrupt the guest and restart it at the desired
 location.
 
 Pretty much the only situation we support this for is the special
 case of signal handlers. In fact I wasn't even aware there was
 any other kind of kernel-to-userspace callback...

The syscall is kind of signal related.

When I implemented the POSIX timer syscalls a little while ago I got
them working for my specific use case. Since then someone pointed
out that the implementation was not complete and I'd like to fix
that. The ticket is here:

https://bugs.launchpad.net/qemu/+bug/1042388#27

and the guest user space test case here:


https://bugs.launchpad.net/qemu/+bug/1042388/+attachment/3948443/+files/timer_test.c

Erik
-- 
--
Erik de Castro Lopo
http://www.mega-nerd.com/



[Qemu-devel] [PULL 0/7] vfio pull request

2014-01-17 Thread Alex Williamson
Hi Anthony,

The following changes since commit 1cf892ca2689c84960b4ce4d2723b6bee453711c:

  SPARC: Fix LEON3 power down instruction (2014-01-15 15:37:33 +1000)

are available in the git repository at:

  git://github.com/awilliam/qemu-vfio.git tags/vfio-pci-for-qemu-20140117.0

for you to fetch changes up to 8d7b5a1da0e06aa7addd7f084d9ec9d433c4bafb:

  vfio: fix mapping of MSIX bar (2014-01-17 11:12:56 -0700)


vfio-pci updates include:
 - Destroy MemoryRegions on device teardown
 - Print warnings around PCI option ROM failures
 - Skip bogus mappings from 64bit BAR sizing
 - Act on DMA mapping failures
 - Fix alignment to avoid MSI-X table mapping


Alex Williamson (3):
  vfio: Destroy memory regions
  vfio: Filter out bogus mappings
  vfio-pci: Fail initfn on DMA mapping errors

Alexey Kardashevskiy (2):
  kvm: initialize qemu_host_page_size
  vfio: fix mapping of MSIX bar

Bandan Das (2):
  vfio: warn if host device rom can't be read
  vfio: Do not reattempt a failed rom read

 hw/misc/vfio.c  | 76 ++---
 include/exec/exec-all.h |  1 +
 kvm-all.c   |  1 +
 translate-all.c | 14 +
 4 files changed, 76 insertions(+), 16 deletions(-)



[Qemu-devel] [PULL 1/7] vfio: Destroy memory regions

2014-01-17 Thread Alex Williamson
Somehow this has been lurking for a while; we remove our subregions
from the base BAR and VGA region mappings, but we don't destroy them,
creating a leak and more serious problems when we try to migrate after
removing these devices.  Add the trivial bit of final cleanup to
remove these entirely.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/misc/vfio.c |4 
 1 file changed, 4 insertions(+)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 9aecaa8..ec9f41b 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1968,6 +1968,7 @@ static void vfio_vga_quirk_teardown(VFIODevice *vdev)
 while (!QLIST_EMPTY(vdev-vga.region[i].quirks)) {
 VFIOQuirk *quirk = QLIST_FIRST(vdev-vga.region[i].quirks);
 memory_region_del_subregion(vdev-vga.region[i].mem, quirk-mem);
+memory_region_destroy(quirk-mem);
 QLIST_REMOVE(quirk, next);
 g_free(quirk);
 }
@@ -1990,6 +1991,7 @@ static void vfio_bar_quirk_teardown(VFIODevice *vdev, int 
nr)
 while (!QLIST_EMPTY(bar-quirks)) {
 VFIOQuirk *quirk = QLIST_FIRST(bar-quirks);
 memory_region_del_subregion(bar-mem, quirk-mem);
+memory_region_destroy(quirk-mem);
 QLIST_REMOVE(quirk, next);
 g_free(quirk);
 }
@@ -2412,10 +2414,12 @@ static void vfio_unmap_bar(VFIODevice *vdev, int nr)
 
 memory_region_del_subregion(bar-mem, bar-mmap_mem);
 munmap(bar-mmap, memory_region_size(bar-mmap_mem));
+memory_region_destroy(bar-mmap_mem);
 
 if (vdev-msix  vdev-msix-table_bar == nr) {
 memory_region_del_subregion(bar-mem, vdev-msix-mmap_mem);
 munmap(vdev-msix-mmap, memory_region_size(vdev-msix-mmap_mem));
+memory_region_destroy(vdev-msix-mmap_mem);
 }
 
 memory_region_destroy(bar-mem);




[Qemu-devel] [PULL 3/7] vfio: Do not reattempt a failed rom read

2014-01-17 Thread Alex Williamson
From: Bandan Das b...@redhat.com

During lazy rom loading, if rom read fails, and the
guest attempts a read again, vfio will again attempt it.
Add a boolean to prevent this. There could be a case where
a failed rom read might succeed the next time because of
a device reset or such, but it's best to exclude unpredictable
behavior

Signed-off-by: Bandan Das b...@redhat.com
Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/misc/vfio.c |6 ++
 1 file changed, 6 insertions(+)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index ef615fc..30b1a78 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -191,6 +191,7 @@ typedef struct VFIODevice {
 bool has_flr;
 bool has_pm_reset;
 bool needs_reset;
+bool rom_read_failed;
 } VFIODevice;
 
 typedef struct VFIOGroup {
@@ -1125,6 +1126,7 @@ static void vfio_pci_load_rom(VFIODevice *vdev)
 vdev-rom_offset = reg_info.offset;
 
 if (!vdev-rom_size) {
+vdev-rom_read_failed = true;
 error_report(vfio-pci: Cannot read device rom at 
 %04x:%02x:%02x.%x\n,
 vdev-host.domain, vdev-host.bus, vdev-host.slot,
@@ -1163,6 +1165,9 @@ static uint64_t vfio_rom_read(void *opaque, hwaddr addr, 
unsigned size)
 /* Load the ROM lazily when the guest tries to read it */
 if (unlikely(!vdev-rom)) {
 vfio_pci_load_rom(vdev);
+if (unlikely(!vdev-rom  !vdev-rom_read_failed)) {
+vfio_pci_load_rom(vdev);
+}
 }
 
 memcpy(val, vdev-rom + addr,
@@ -1230,6 +1235,7 @@ static void vfio_pci_size_rom(VFIODevice *vdev)
  PCI_BASE_ADDRESS_SPACE_MEMORY, vdev-pdev.rom);
 
 vdev-pdev.has_rom = true;
+vdev-rom_read_failed = false;
 }
 
 static void vfio_vga_write(void *opaque, hwaddr addr,




[Qemu-devel] [PULL 6/7] kvm: initialize qemu_host_page_size

2014-01-17 Thread Alex Williamson
From: Alexey Kardashevskiy a...@ozlabs.ru

There is a HOST_PAGE_ALIGN macro which makes sense for KVM accelerator
but it uses qemu_host_page_size/qemu_host_page_mask which initialized
for TCG only.

This moves qemu_host_page_size/qemu_host_page_mask initialization from
TCG's page_init() and adds a call for it from kvm_init().

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Acked-by: Paolo Bonzini pbonz...@redhat.com
Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 include/exec/exec-all.h |1 +
 kvm-all.c   |1 +
 translate-all.c |   14 --
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index ea90b64..3b03cbf 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -81,6 +81,7 @@ void cpu_gen_init(void);
 int cpu_gen_code(CPUArchState *env, struct TranslationBlock *tb,
  int *gen_code_size_ptr);
 bool cpu_restore_state(CPUArchState *env, uintptr_t searched_pc);
+void page_size_init(void);
 
 void QEMU_NORETURN cpu_resume_from_signal(CPUArchState *env1, void *puc);
 void QEMU_NORETURN cpu_io_recompile(CPUArchState *env, uintptr_t retaddr);
diff --git a/kvm-all.c b/kvm-all.c
index 0bfb060..edf2365 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1360,6 +1360,7 @@ int kvm_init(void)
  * page size for the system though.
  */
 assert(TARGET_PAGE_SIZE = getpagesize());
+page_size_init();
 
 #ifdef KVM_CAP_SET_GUEST_DEBUG
 QTAILQ_INIT(s-kvm_sw_breakpoints);
diff --git a/translate-all.c b/translate-all.c
index 105c25a..543e1ff 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -289,17 +289,15 @@ static inline void map_exec(void *addr, long size)
 }
 #endif
 
-static void page_init(void)
+void page_size_init(void)
 {
 /* NOTE: we can always suppose that qemu_host_page_size =
TARGET_PAGE_SIZE */
 #ifdef _WIN32
-{
-SYSTEM_INFO system_info;
+SYSTEM_INFO system_info;
 
-GetSystemInfo(system_info);
-qemu_real_host_page_size = system_info.dwPageSize;
-}
+GetSystemInfo(system_info);
+qemu_real_host_page_size = system_info.dwPageSize;
 #else
 qemu_real_host_page_size = getpagesize();
 #endif
@@ -310,7 +308,11 @@ static void page_init(void)
 qemu_host_page_size = TARGET_PAGE_SIZE;
 }
 qemu_host_page_mask = ~(qemu_host_page_size - 1);
+}
 
+static void page_init(void)
+{
+page_size_init();
 #if defined(CONFIG_BSD)  defined(CONFIG_USER_ONLY)
 {
 #ifdef HAVE_KINFO_GETVMMAP




[Qemu-devel] [PULL 2/7] vfio: warn if host device rom can't be read

2014-01-17 Thread Alex Williamson
From: Bandan Das b...@redhat.com

If the device rom can't be read, report an error to the
user. This alerts the user that the device has a bad
state that is causing rom read failure or option rom
loading has been disabled from the device boot menu
(among other reasons).

Signed-off-by: Bandan Das b...@redhat.com
Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/misc/vfio.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index ec9f41b..ef615fc 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -1125,6 +1125,13 @@ static void vfio_pci_load_rom(VFIODevice *vdev)
 vdev-rom_offset = reg_info.offset;
 
 if (!vdev-rom_size) {
+error_report(vfio-pci: Cannot read device rom at 
+%04x:%02x:%02x.%x\n,
+vdev-host.domain, vdev-host.bus, vdev-host.slot,
+vdev-host.function);
+error_printf(Device option ROM contents are probably invalid 
+(check dmesg).\nSkip option ROM probe with rombar=0, 
+or load from file with romfile=\n);
 return;
 }
 




[Qemu-devel] [PULL 4/7] vfio: Filter out bogus mappings

2014-01-17 Thread Alex Williamson
Since 57271d63 we now see spurious mappings with the upper bits set
if 64bit PCI BARs are sized while enabled.  The guest writes a mask
of 0x to the lower BAR to size it, then restores it, then
writes the same mask to the upper BAR resulting in a spurious BAR
mapping into the last 4G of the 64bit address space.  Most
architectures do not support or make use of the full 64bits address
space for PCI BARs, so we filter out mappings with the high bit set.
Long term, we probably need to think about vfio telling us the
address width limitations of the IOMMU.

Signed-off-by: Alex Williamson alex.william...@redhat.com
Reviewed-by: Michael S. Tsirkin m...@redhat.com
---
 hw/misc/vfio.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 30b1a78..d304213 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -2156,7 +2156,14 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
 
 static bool vfio_listener_skipped_section(MemoryRegionSection *section)
 {
-return !memory_region_is_ram(section-mr);
+return !memory_region_is_ram(section-mr) ||
+   /*
+* Sizing an enabled 64-bit BAR can cause spurious mappings to
+* addresses in the upper part of the 64-bit address space.  These
+* are never accessed by the CPU and beyond the address width of
+* some IOMMU hardware.  TODO: VFIO should tell us the IOMMU width.
+*/
+   section-offset_within_address_space  (1ULL  63);
 }
 
 static void vfio_listener_region_add(MemoryListener *listener,




[Qemu-devel] [PULL 5/7] vfio-pci: Fail initfn on DMA mapping errors

2014-01-17 Thread Alex Williamson
The vfio-pci initfn will currently succeed even if DMA mappings fail.
A typical reason for failure is if the user does not have sufficient
privilege to lock all the memory for the guest.  In this case, the
device gets attached, but can only access a portion of guest memory
and is extremely unlikely to work.

DMA mappings are done via a MemoryListener, which provides no direct
error return path.  We therefore stuff the errno into our container
structure and check for error after registration completes.  We can
also test for mapping errors during runtime, but our only option for
resolution at that point is to kill the guest with a hw_error.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/misc/vfio.c |   44 ++--
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index d304213..432547c 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -135,12 +135,18 @@ enum {
 
 struct VFIOGroup;
 
+typedef struct VFIOType1 {
+MemoryListener listener;
+int error;
+bool initialized;
+} VFIOType1;
+
 typedef struct VFIOContainer {
 int fd; /* /dev/vfio/vfio, empowered by the attached groups */
 struct {
 /* enable abstraction to support various iommu backends */
 union {
-MemoryListener listener; /* Used by type1 iommu */
+VFIOType1 type1;
 };
 void (*release)(struct VFIOContainer *);
 } iommu_data;
@@ -2170,7 +2176,7 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
  MemoryRegionSection *section)
 {
 VFIOContainer *container = container_of(listener, VFIOContainer,
-iommu_data.listener);
+iommu_data.type1.listener);
 hwaddr iova, end;
 void *vaddr;
 int ret;
@@ -2212,6 +2218,19 @@ static void vfio_listener_region_add(MemoryListener 
*listener,
 error_report(vfio_dma_map(%p, 0x%HWADDR_PRIx, 
  0x%HWADDR_PRIx, %p) = %d (%m),
  container, iova, end - iova, vaddr, ret);
+
+/*
+ * On the initfn path, store the first error in the container so we
+ * can gracefully fail.  Runtime, there's not much we can do other
+ * than throw a hardware error.
+ */
+if (!container-iommu_data.type1.initialized) {
+if (!container-iommu_data.type1.error) {
+container-iommu_data.type1.error = ret;
+}
+} else {
+hw_error(vfio: DMA mapping failed, unable to continue\n);
+}
 }
 }
 
@@ -2219,7 +2238,7 @@ static void vfio_listener_region_del(MemoryListener 
*listener,
  MemoryRegionSection *section)
 {
 VFIOContainer *container = container_of(listener, VFIOContainer,
-iommu_data.listener);
+iommu_data.type1.listener);
 hwaddr iova, end;
 int ret;
 
@@ -2264,7 +2283,7 @@ static MemoryListener vfio_memory_listener = {
 
 static void vfio_listener_release(VFIOContainer *container)
 {
-memory_listener_unregister(container-iommu_data.listener);
+memory_listener_unregister(container-iommu_data.type1.listener);
 }
 
 /*
@@ -3236,10 +3255,23 @@ static int vfio_connect_container(VFIOGroup *group)
 return -errno;
 }
 
-container-iommu_data.listener = vfio_memory_listener;
+container-iommu_data.type1.listener = vfio_memory_listener;
 container-iommu_data.release = vfio_listener_release;
 
-memory_listener_register(container-iommu_data.listener, 
address_space_memory);
+memory_listener_register(container-iommu_data.type1.listener,
+ address_space_memory);
+
+if (container-iommu_data.type1.error) {
+ret = container-iommu_data.type1.error;
+vfio_listener_release(container);
+g_free(container);
+close(fd);
+error_report(vfio: memory listener initialization failed for 
container\n);
+return ret;
+}
+
+container-iommu_data.type1.initialized = true;
+
 } else {
 error_report(vfio: No available IOMMU models);
 g_free(container);




[Qemu-devel] [PULL 7/7] vfio: fix mapping of MSIX bar

2014-01-17 Thread Alex Williamson
From: Alexey Kardashevskiy a...@ozlabs.ru

VFIO virtualizes MSIX table for the guest but not mapping the part of
a BAR which contains an MSIX table. Since vfio_mmap_bar() mmaps chunks
before and after the MSIX table, they have to be aligned to the host
page size which may be TARGET_PAGE_MASK (4K) or 64K in case of PPC64.

This fixes boundaries calculations to use the real host page size.

Without the patch, the chunk before MSIX table may overlap with the MSIX
table and mmap will fail in the host kernel. The result will be serious
slowdown as the whole BAR will be emulated by QEMU.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 hw/misc/vfio.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/misc/vfio.c b/hw/misc/vfio.c
index 432547c..8a1f1a1 100644
--- a/hw/misc/vfio.c
+++ b/hw/misc/vfio.c
@@ -2544,7 +2544,7 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
  * potentially insert a direct-mapped subregion before and after it.
  */
 if (vdev-msix  vdev-msix-table_bar == nr) {
-size = vdev-msix-table_offset  TARGET_PAGE_MASK;
+size = vdev-msix-table_offset  qemu_host_page_mask;
 }
 
 strncat(name,  mmap, sizeof(name) - strlen(name) - 1);
@@ -2556,8 +2556,8 @@ static void vfio_map_bar(VFIODevice *vdev, int nr)
 if (vdev-msix  vdev-msix-table_bar == nr) {
 unsigned start;
 
-start = TARGET_PAGE_ALIGN(vdev-msix-table_offset +
-  (vdev-msix-entries * PCI_MSIX_ENTRY_SIZE));
+start = HOST_PAGE_ALIGN(vdev-msix-table_offset +
+(vdev-msix-entries * PCI_MSIX_ENTRY_SIZE));
 
 size = start  bar-size ? bar-size - start : 0;
 strncat(name,  msix-hi, sizeof(name) - strlen(name) - 1);




[Qemu-devel] [PATCH v3] target-ppc: gdbstub allow byte swapping for, reading/writing registers

2014-01-17 Thread Thomas Falcon

This patch allows registers to be properly read from and written to
when using the gdbstub to debug a ppc guest running in little
endian mode.  It accomplishes this goal by byte swapping the values of
any registers if the MSR:LE value is set.

Signed-off-by: Thomas Falcon tlfal...@linux.vnet.ibm.com
---
Differences from v2:

Fixed formatting issues
Added logic to ensure only FP registers have a guaranteed size of 8 bytes

---
 target-ppc/cpu-qom.h|  2 ++
 target-ppc/gdbstub.c| 45 
+

 target-ppc/translate_init.c |  4 ++--
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/target-ppc/cpu-qom.h b/target-ppc/cpu-qom.h
index 72b2232..992963f 100644
--- a/target-ppc/cpu-qom.h
+++ b/target-ppc/cpu-qom.h
@@ -109,7 +109,9 @@ void ppc_cpu_dump_statistics(CPUState *cpu, FILE *f,
  fprintf_function cpu_fprintf, int flags);
 hwaddr ppc_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
 int ppc_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
+int ppc_cpu_gdb_read_register_wrap(CPUState *cpu, uint8_t *buf, int reg);
 int ppc_cpu_gdb_write_register(CPUState *cpu, uint8_t *buf, int reg);
+int ppc_cpu_gdb_write_register_wrap(CPUState *cpu, uint8_t *buf, int reg);
 int ppc64_cpu_write_elf64_qemunote(WriteCoreDumpFunction f,
CPUState *cpu, void *opaque);
 int ppc64_cpu_write_elf64_note(WriteCoreDumpFunction f, CPUState *cs,
diff --git a/target-ppc/gdbstub.c b/target-ppc/gdbstub.c
index 1c91090..cc4eac5 100644
--- a/target-ppc/gdbstub.c
+++ b/target-ppc/gdbstub.c
@@ -21,6 +21,51 @@
 #include qemu-common.h
 #include exec/gdbstub.h

+/* The following functions are used to ensure the correct
+ * transfer of registers between a little endian ppc target
+ * and a big endian host by checking the LE bit in the Machine State 
Register

+ */
+
+int ppc_cpu_gdb_read_register_wrap(CPUState *cs, uint8_t *mem_buf, int n)
+{
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+CPUPPCState *env = cpu-env;
+
+int len = ppc_cpu_gdb_read_register(cs, mem_buf, n), i;
+if (msr_le) {
+uint8_t tmp;
+for (i = 0; i  len/2 ; i++) {
+tmp = *(mem_buf + i);
+*(mem_buf + i) = *(mem_buf + len - 1 - i);
+*(mem_buf + len - 1 - i) = tmp;
+}
+}
+return len;
+}
+
+int ppc_cpu_gdb_write_register_wrap(CPUState *cs, uint8_t *mem_buf, int n)
+{
+PowerPCCPU *cpu = POWERPC_CPU(cs);
+CPUPPCState *env = cpu-env;
+if (msr_le) {
+int len = 0, i = 0;
+if (n  31  n  64) {
+len = 8;
+} else if (n == 66) {
+len = 4;
+} else {
+len = sizeof(target_ulong);
+}
+uint8_t tmp;
+for (i = 0; i  len/2; i++) {
+tmp = *(mem_buf + i);
+*(mem_buf+i) = *(mem_buf + len - 1 - i);
+*(mem_buf + len - 1 - i) = tmp;
+}
+}
+return ppc_cpu_gdb_write_register(cs, mem_buf, n);
+}
+
 /* Old gdb always expects FP registers.  Newer (xml-aware) gdb only
  * expects whatever the target description contains.  Due to a
  * historical mishap the FP registers appear in between core integer
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index c030a20..41ea4b7 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -8655,8 +8655,8 @@ static void ppc_cpu_class_init(ObjectClass *oc, 
void *data)

 cc-dump_state = ppc_cpu_dump_state;
 cc-dump_statistics = ppc_cpu_dump_statistics;
 cc-set_pc = ppc_cpu_set_pc;
-cc-gdb_read_register = ppc_cpu_gdb_read_register;
-cc-gdb_write_register = ppc_cpu_gdb_write_register;
+cc-gdb_read_register = ppc_cpu_gdb_read_register_wrap;
+cc-gdb_write_register = ppc_cpu_gdb_write_register_wrap;
 #ifndef CONFIG_USER_ONLY
 cc-get_phys_page_debug = ppc_cpu_get_phys_page_debug;
 cc-vmsd = vmstate_ppc_cpu;
--
1.8.3.1




[Qemu-devel] RFC: ACPI, HPET._CRS, MacOSX vs. WinXP

2014-01-17 Thread Gabriel L. Somlo
On Fri, Jan 10, 2014 at 05:13:11PM +0100, Igor Mammedov wrote:
  On Fri, Jan 10, 2014 at 01:37:14PM +0100, Paolo Bonzini wrote:
   Il 09/01/2014 22:44, Gabriel L. Somlo ha scritto:
1. hardcode IRQNoFlags(){2, 8} and require -no-hpet to prevent XP
   from bluescreening. Basically, this means we don't support XP on
   a VM where HPET is enabled.

2. conditionally insert IRQNoFlags(){2, 8} if _OSI(Darwin) returns
   0x, [...] if we want to run OS X on piix+smp
   (all other combinations of (piix vs. q35) x (up vs.  smp) work fine
...
 there is harder route to get a clue why XP BSODs,
 one can use AMLI debugger to see what is happening in XP on boot
 http://msdn.microsoft.com/en-us/library/windows/hardware/ff537808%28v=vs.85%29.aspx
 that was how I found out about not supported ConcatenateResTemplate first.

After doing a bit of research, here's what I was able to find out:

1. IRQNoFlags {2, 8} was a mistake/typo I made early on, real Apple
   hardware (and *only* Apple hardware) has IRQNoFlags {0, 8} included
   with HPET._CRS

2. Both 0 and 8 are already spoken for, by the system timer (TMR), and
   by the RTC, respectively.

3. WinXP will also bluescreen on QEMU if HPET._CRS has IRQNoFlags {0, 8}
   added; in fact, it will bluescreen if *any* IRQNoFlags statement is
   included with the HPET DSDT node, as far as I was able to tell.

4. I acpidump-ed the DSDT on a bunch of machines, including a MacPro5,1,
   a MacBookPro2,2, a MacBookPro9,1, a Dell Latitude D630, and a Dell R410.
   Here's what I found:

 - All Macs (and only Macs) have IRQNoFlags {0, 8} in HPET._CRS.
 - my Dells (and I suspect most non-Apple machines) don't have
   IRQNoFlags in HPET_CRS at all.

 - On Macs, there's no TMR DSDT node at all.
 - My Latitude D630 laptop has a TMR node, with IRQNoFlags {2} in _CRS
 - the R410 also has a TMR node, with IRQNoFlags {0} in _CRS.

 - On Macs, RTC has no IRQNoFlags in its _CRS
 - Both my dells have IRQNoFlags {8} in RTC._CRS

5. I was able to boot XPsp2 on the MacBookPro2,2 straight from the DVD
   with a zeroed-out hard drive. It installs and works fine, and when
   I pull up the Device Manager, neither the RTC or the TMR devices have
   any IRQs listed under Properties/Resources.
   What is even more interesting, the HPET does NOT show up in the device
   tree *at all* !!!

6. On the MacBookPro9,1, XP bluescreens during install, in a similar
   mannner to how it bluescreens on QEMU if HPET._CRS contains IRQNoFlags.
   Bootcamp doesn't support anything older than Windows 7 on those machines,
   so I don't think there's anything I can do to get XP running and look
   at the device tree on that machine.

7. On QEMU, XP does indeed show the HPET alongside the TMR and RTC in
   its device tree, obviously without any IRQ resources under properties,
   since the only way it boots is if HPET._CRS doesn't include IRQNoFlags.

8. Windows 7, while it boots and runs fine when HPET._CRS contains
   IRQNoFlags, will show an unresolved IRQ conflict between the HPET and
   the RTC in the device tree. Commenting out IRQNoFlags from the RTC
   in QEMU does NOT solve that (still shows up as a conflict in the
   device tree).

9. I followed Igor's advice and ran a debug session, but unlike with
   ConcatenateResTemplate, XP didn't choke on AML byte code itself, but
   appears to die of a memory access violation:

 *** Fatal System Error: 0x007e (0x...)
 ...
 Probably caused by: ACPI.sys (ACPI!AcpiArbCrackPRT+113)

 80527bdc cc   int 3

   Poking around with various !amli debugger commands does not show
   anything AML-related as abnormal, so I think the problem is that
   either XP specifically expects the HPET._CRS buffer to be of a
   certain hardcoded size (which doesn't include any extra room for
   IRQNoFlags), or attempts to somehow process the values given via
   IRQNoFlags (0 and 8) in ways that cause it to then kill itself.

   Hard to tell without knowing more about XP internals (e.g., without
   XP kernel and/or acpi.sys source code).

At this point, conditionally inserting IRQNoFlags {0,8} only for OS X
sounds less of a hack and more like the right thing to do. I would be
comfortable using ConcatenateResTemplate since XP never gets a chance
to attempt to interpret that bytecode, which gets exercised only by
the _OSI(Darwin) branch. I tested this, and both XP and OSX seem
happy with it.

Please let me know what you think.

Thanks,
--Gabriel

diff --git a/hw/i386/acpi-dsdt-hpet.dsl b/hw/i386/acpi-dsdt-hpet.dsl
index dfde174..0cf7fbf 100644
--- a/hw/i386/acpi-dsdt-hpet.dsl
+++ b/hw/i386/acpi-dsdt-hpet.dsl
@@ -38,14 +38,21 @@ Scope(\_SB) {
 }
 Return (0x0F)
 }
-Name(_CRS, ResourceTemplate() {
-#if 0   /* This makes WinXP BSOD for not yet figured reasons. */
-IRQNoFlags() {2, 8}
-#endif
+Name(RESP, ResourceTemplate() {
 Memory32Fixed(ReadOnly,
 

Re: [Qemu-devel] [PATCHv6 5/6] qemu-iotests: fix expected output of test 067

2014-01-17 Thread Max Reitz

On 15.01.2014 09:01, Fam Zheng wrote:

On Wed, 01/15 08:01, Peter Lieven wrote:

On 15.01.2014 07:54, Fam Zheng wrote:

On Mon, 01/13 11:21, Peter Lieven wrote:

Signed-off-by: Peter Lieven p...@kamp.de
---
  tests/qemu-iotests/067.out |8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/067.out b/tests/qemu-iotests/067.out
index 8d271cc..79ed90f 100644
--- a/tests/qemu-iotests/067.out
+++ b/tests/qemu-iotests/067.out
@@ -12,7 +12,7 @@ QMP_VERSION
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_DELETED, 
data: {path: /machine/peripheral/virtio0/virtio-backend}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_DELETED, data: 
{device: virtio0, path: /machine/peripheral/virtio0}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: 
RESET}
-{return: [{io-status: ok, device: ide1-cd0, locked: false, removable: true, tray_open: false, type: unknown}, {device: floppy0, locked: false, 
removable: true, tray_open: false, type: unknown}, {device: sd0, locked: false, removable: true, tray_open: false, type: unknown}]}
+{return: [{io-status: ok, device: disk, locked: false, removable: false, inserted: {iops_rd: 0, image: {virtual-size: 134217728, filename: TEST_DIR/t.qcow2, cluster-size: 65536, format: qcow2, actual-size: SIZE, format-specific: {type: qcow2, data: {compat: 1.1, lazy-refcounts: false}}, dirty-flag: false}, iops_wr: 0, ro: false, backing_file_depth: 0, drv: 
qcow2, iops: 0, bps_wr: 0, encrypted: false, bps: 0, bps_rd: 0, file: TEST_DIR/t.qcow2, encryption_key_missing: false}, type: unknown}, {io-status: ok, device: ide1-cd0, locked: false, removable: true, tray_open: false, type: unknown}, {device: floppy0, locked: false, removable: true, tray_open: false, type: unknown}, {device: sd0, locked: false, 
removable: true, tray_open: false, type: unknown}]}
  {return: {}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: 
SHUTDOWN}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_TRAY_MOVED, data: 
{device: ide1-cd0, tray-open: true}}
@@ -31,7 +31,7 @@ QMP_VERSION
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_DELETED, 
data: {path: /machine/peripheral/virtio0/virtio-backend}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_DELETED, data: 
{device: virtio0, path: /machine/peripheral/virtio0}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: 
RESET}
-{return: [{io-status: ok, device: ide1-cd0, locked: false, removable: true, tray_open: false, type: unknown}, {device: floppy0, locked: false, 
removable: true, tray_open: false, type: unknown}, {device: sd0, locked: false, removable: true, tray_open: false, type: unknown}]}
+{return: [{io-status: ok, device: disk, locked: false, removable: false, inserted: {iops_rd: 0, image: {virtual-size: 134217728, filename: TEST_DIR/t.qcow2, cluster-size: 65536, format: qcow2, actual-size: SIZE, format-specific: {type: qcow2, data: {compat: 1.1, lazy-refcounts: false}}, dirty-flag: false}, iops_wr: 0, ro: false, backing_file_depth: 0, drv: 
qcow2, iops: 0, bps_wr: 0, encrypted: false, bps: 0, bps_rd: 0, file: TEST_DIR/t.qcow2, encryption_key_missing: false}, type: unknown}, {io-status: ok, device: ide1-cd0, locked: false, removable: true, tray_open: false, type: unknown}, {device: floppy0, locked: false, removable: true, tray_open: false, type: unknown}, {device: sd0, locked: false, 
removable: true, tray_open: false, type: unknown}]}
  {return: {}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: 
SHUTDOWN}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_TRAY_MOVED, data: 
{device: ide1-cd0, tray-open: true}}
@@ -51,7 +51,7 @@ QMP_VERSION
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_DELETED, 
data: {path: /machine/peripheral/virtio0/virtio-backend}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: DEVICE_DELETED, data: 
{device: virtio0, path: /machine/peripheral/virtio0}}
  {timestamp: {seconds:  TIMESTAMP, microseconds:  TIMESTAMP}, event: 
RESET}
-{return: [{io-status: ok, device: ide1-cd0, locked: false, removable: true, tray_open: false, type: unknown}, {device: floppy0, locked: false, 
removable: true, tray_open: false, type: unknown}, {device: sd0, locked: false, removable: true, tray_open: false, type: unknown}]}
+{return: [{io-status: ok, device: ide1-cd0, locked: false, removable: true, tray_open: false, type: unknown}, {device: floppy0, locked: false, removable: true, tray_open: false, type: unknown}, {device: sd0, locked: false, removable: true, tray_open: false, type: unknown}, {io-status: ok, device: disk, locked: false, removable: false, 
inserted: {iops_rd: 0, image: {virtual-size: 134217728, filename: TEST_DIR/t.qcow2, cluster-size: 65536, format: qcow2, actual-size: SIZE, format-specific: {type: qcow2, data: 

Re: [Qemu-devel] [PATCH v3 04/29] qemu_memalign: Allow small alignments

2014-01-17 Thread Benoît Canet
Le Friday 17 Jan 2014 à 15:14:54 (+0100), Kevin Wolf a écrit :
 The functions used by qemu_memalign() require an alignment that is at
 least sizeof(void*). Adjust it if it is too small.
 
 Signed-off-by: Kevin Wolf kw...@redhat.com
 Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
 Reviewed-by: Max Reitz mre...@redhat.com
 ---
  util/oslib-posix.c | 5 +
  1 file changed, 5 insertions(+)
 
 diff --git a/util/oslib-posix.c b/util/oslib-posix.c
 index e00a44c..54f8932 100644
 --- a/util/oslib-posix.c
 +++ b/util/oslib-posix.c
 @@ -85,6 +85,11 @@ void *qemu_oom_check(void *ptr)
  void *qemu_memalign(size_t alignment, size_t size)
  {
  void *ptr;
 +
 +if (alignment  sizeof(void*)) {
 +alignment = sizeof(void*);
 +}
 +
  #if defined(_POSIX_C_SOURCE)  !defined(__sun__)
  int ret;
  ret = posix_memalign(ptr, alignment, size);
 -- 
 1.8.1.4
 
 
Reviewed-by: Benoît Canet ben...@irqsave.net



Re: [Qemu-devel] [PATCH v3 03/29] block: Update BlockLimits when they might have changed

2014-01-17 Thread Benoît Canet
Le Friday 17 Jan 2014 à 15:14:53 (+0100), Kevin Wolf a écrit :
 When reopening with different flags, or when backing files disappear
 from the chain, the limits may change. Make sure they get updated in
 these cases.
 
 Signed-off-by: Kevin Wolf kw...@redhat.com
 Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
 Reviewed-by: Max Reitz mre...@redhat.com
 ---
  block.c   | 5 -
  block/stream.c| 2 ++
  include/block/block.h | 1 +
  3 files changed, 7 insertions(+), 1 deletion(-)
 
 diff --git a/block.c b/block.c
 index 20f85cb..9a8562d 100644
 --- a/block.c
 +++ b/block.c
 @@ -479,7 +479,7 @@ int bdrv_create_file(const char* filename, 
 QEMUOptionParameter *options,
  return ret;
  }
  
 -static int bdrv_refresh_limits(BlockDriverState *bs)
 +int bdrv_refresh_limits(BlockDriverState *bs)
  {
  BlockDriver *drv = bs-drv;
  
 @@ -1464,6 +1464,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state)
  reopen_state-bs-enable_write_cache = !!(reopen_state-flags 
BDRV_O_CACHE_WB);
  reopen_state-bs-read_only = !(reopen_state-flags  BDRV_O_RDWR);
 +
 +bdrv_refresh_limits(reopen_state-bs);
  }
  
  /*
 @@ -2261,6 +2263,7 @@ int bdrv_drop_intermediate(BlockDriverState *active, 
 BlockDriverState *top,
  }
  new_top_bs-backing_hd = base_bs;
  
 +bdrv_refresh_limits(new_top_bs);
  
  QSIMPLEQ_FOREACH_SAFE(intermediate_state, states_to_delete, entry, 
 next) {
  /* so that bdrv_close() does not recursively close the chain */
 diff --git a/block/stream.c b/block/stream.c
 index 46bec7d..dd0b4ac 100644
 --- a/block/stream.c
 +++ b/block/stream.c
 @@ -75,6 +75,8 @@ static void close_unused_images(BlockDriverState *top, 
 BlockDriverState *base,
  unused-backing_hd = NULL;
  bdrv_unref(unused);
  }
 +
 +bdrv_refresh_limits(top);
  }
  
  static void coroutine_fn stream_run(void *opaque)
 diff --git a/include/block/block.h b/include/block/block.h
 index 36efaea..64fb319 100644
 --- a/include/block/block.h
 +++ b/include/block/block.h
 @@ -249,6 +249,7 @@ int bdrv_truncate(BlockDriverState *bs, int64_t offset);
  int64_t bdrv_getlength(BlockDriverState *bs);
  int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
  void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
 +int bdrv_refresh_limits(BlockDriverState *bs);
  int bdrv_commit(BlockDriverState *bs);
  int bdrv_commit_all(void);
  int bdrv_change_backing_file(BlockDriverState *bs,
 -- 
 1.8.1.4
 
Reviewed-by: Benoît Canet ben...@irqsave.net

 



Re: [Qemu-devel] [PATCH v3 01/29] block: Move initialisation of BlockLimits to bdrv_refresh_limits()

2014-01-17 Thread Benoît Canet
Le Friday 17 Jan 2014 à 15:14:51 (+0100), Kevin Wolf a écrit :
 This function separates filling the BlockLimits from bdrv_open(), which
 allows it to call it from other operations which may change the limits
 (e.g. modifications to the backing file chain or bdrv_reopen)
 
 Signed-off-by: Kevin Wolf kw...@redhat.com
 Reviewed-by: Max Reitz mre...@redhat.com
 ---
  block.c   | 19 +++
  block/iscsi.c | 46 +-
  block/qcow2.c | 11 ++-
  block/qed.c   | 11 ++-
  block/vmdk.c  | 22 ++
  include/block/block_int.h |  2 ++
  6 files changed, 88 insertions(+), 23 deletions(-)
 
 diff --git a/block.c b/block.c
 index 64e7d22..99e69da 100644
 --- a/block.c
 +++ b/block.c
 @@ -479,6 +479,19 @@ int bdrv_create_file(const char* filename, 
 QEMUOptionParameter *options,
  return ret;
  }
  
 +static int bdrv_refresh_limits(BlockDriverState *bs)
 +{
 +BlockDriver *drv = bs-drv;
 +
 +memset(bs-bl, 0, sizeof(bs-bl));
 +
 +if (drv  drv-bdrv_refresh_limits) {
 +return drv-bdrv_refresh_limits(bs);
 +}
 +
 +return 0;
 +}
 +
  /*
   * Create a uniquely-named empty temporary file.
   * Return 0 upon success, otherwise a negative errno value.
 @@ -833,6 +846,8 @@ static int bdrv_open_common(BlockDriverState *bs, 
 BlockDriverState *file,
  goto free_and_fail;
  }
  
 +bdrv_refresh_limits(bs);
 +
  #ifndef _WIN32
  if (bs-is_temporary) {
  assert(bs-filename[0] != '\0');
 @@ -1018,6 +1033,10 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
 *options, Error **errp)
  }
  pstrcpy(bs-backing_file, sizeof(bs-backing_file),
  bs-backing_hd-file-filename);
 +
 +/* Recalculate the BlockLimits with the backing file */
 +bdrv_refresh_limits(bs);
 +
  return 0;
  }
  
 diff --git a/block/iscsi.c b/block/iscsi.c
 index c0ea0c4..3202dc5 100644
 --- a/block/iscsi.c
 +++ b/block/iscsi.c
 @@ -1265,23 +1265,6 @@ static int iscsi_open(BlockDriverState *bs, QDict 
 *options, int flags,
 sizeof(struct scsi_inquiry_block_limits));
  scsi_free_scsi_task(task);
  task = NULL;
 -
 -if (iscsilun-bl.max_unmap  0x) {
 -bs-bl.max_discard = sector_lun2qemu(iscsilun-bl.max_unmap,
 - iscsilun);
 -}
 -bs-bl.discard_alignment = 
 sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
 -   iscsilun);
 -
 -if (iscsilun-bl.max_ws_len  0x) {
 -bs-bl.max_write_zeroes = 
 sector_lun2qemu(iscsilun-bl.max_ws_len,
 -  iscsilun);
 -}
 -bs-bl.write_zeroes_alignment = 
 sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
 -iscsilun);
 -
 -bs-bl.opt_transfer_length = 
 sector_lun2qemu(iscsilun-bl.opt_xfer_len,
 - iscsilun);
  }
  
  #if defined(LIBISCSI_FEATURE_NOP_COUNTER)
 @@ -1326,6 +1309,34 @@ static void iscsi_close(BlockDriverState *bs)
  memset(iscsilun, 0, sizeof(IscsiLun));
  }
  
 +static int iscsi_refresh_limits(BlockDriverState *bs)
 +{
 +IscsiLun *iscsilun = bs-opaque;
 +
 +/* We don't actually refresh here, but just return data queried in
 + * iscsi_open(): iscsi targets don't change their limits. */
 +if (iscsilun-lbp.lbpu || iscsilun-lbp.lbpws) {
 +if (iscsilun-bl.max_unmap  0x) {
 +bs-bl.max_discard = sector_lun2qemu(iscsilun-bl.max_unmap,
 + iscsilun);
 +}
 +bs-bl.discard_alignment = 
 sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
 +   iscsilun);
 +
 +if (iscsilun-bl.max_ws_len  0x) {
 +bs-bl.max_write_zeroes = 
 sector_lun2qemu(iscsilun-bl.max_ws_len,
 +  iscsilun);
 +}
 +bs-bl.write_zeroes_alignment = 
 sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
 +iscsilun);
 +
 +bs-bl.opt_transfer_length = 
 sector_lun2qemu(iscsilun-bl.opt_xfer_len,
 + iscsilun);
Why is bl.opt_transfer_length filled only inside the test for unmap and write 
same ?
Is there a relationship ?

 +}
 +
 +return 0;
 +}
 +
  static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
  {
  IscsiLun *iscsilun = bs-opaque;
 @@ -1438,6 +1449,7 @@ static BlockDriver bdrv_iscsi = {
  .bdrv_getlength  = iscsi_getlength,
  .bdrv_get_info   = iscsi_get_info,
  .bdrv_truncate   = iscsi_truncate,
 +.bdrv_refresh_limits = iscsi_refresh_limits,
  
  #if defined(LIBISCSI_FEATURE_IOVECTOR)
  

Re: [Qemu-devel] [PATCH v3 14/29] block: Switch BdrvTrackedRequest to byte granularity

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c   | 52 +++
  block/backup.c|  7 ++-
  include/block/block_int.h |  4 ++--
  3 files changed, 42 insertions(+), 21 deletions(-)


Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 16/29] block: Make zero-after-EOF work with larger alignment

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

Odd file sizes could make bdrv_aligned_preadv() shorten the request in
non-aligned ways. Fix it by rounding to the required alignment instead
of 512 bytes.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c | 7 ---
  1 file changed, 4 insertions(+), 3 deletions(-)


Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 21/29] block: Assert serialisation assumptions in pwritev

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

If a request calls wait_serialising_requests() and actually has to wait
in this function (i.e. a coroutine yield), other requests can run and
previously read data (like the head or tail buffer) could become
outdated. In this case, we would have to restart from the beginning to
read in the updated data.

However, we're lucky and don't actually need to do that: A request can
only wait in the first call of wait_serialising_requests() because we
mark it as serialising before that call, so any later requests would
wait. So as we don't wait in practice, we don't have to reload the data.

This is an important assumption that may not be broken or data
corruption will happen. Document it with some assertions.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c | 16 
  1 file changed, 12 insertions(+), 4 deletions(-)


Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 24/29] block: Make bdrv_pwrite() a bdrv_prwv_co() wrapper

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

Instead of implementing the alignment adjustment here, use the now
existing functionality of bdrv_co_do_pwritev().

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c   | 64 ---
  include/block/block.h |  1 -
  2 files changed, 9 insertions(+), 56 deletions(-)


Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 26/29] blkdebug: Make required alignment configurable

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

The new 'align' option of blkdebug can be used in order to emulate
backends with a required 4k alignment on hosts which only really require
512 byte alignment.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block/blkdebug.c | 16 
  1 file changed, 16 insertions(+)


The patch itself is okay, but will have to be rebased on the 
blkdebug/blkverify series – specifically, we'll need to add this new 
option to qapi-schema.json (BlockdevOptionsBlkdebug).



diff --git a/block/blkdebug.c b/block/blkdebug.c
index ebc5f13..dc4ba46 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -346,6 +346,11 @@ static QemuOptsList runtime_opts = {
  .type = QEMU_OPT_STRING,
  .help = [internal use only, will be removed],
  },
+{
+.name = align,
+.type = QEMU_OPT_SIZE,
+.help = Required alignment in bytes,
+},
  { /* end of list */ }
  },
  };
@@ -357,6 +362,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
  QemuOpts *opts;
  Error *local_err = NULL;
  const char *filename, *config;
+uint64_t align;
  int ret;
  
  opts = qemu_opts_create(runtime_opts, NULL, 0, error_abort);

@@ -394,6 +400,16 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
  goto fail;
  }
  
+/* Set request alignment */

+align = qemu_opt_get_size(opts, align, bs-request_alignment);
+if (align  0  align  INT_MAX  !(align  (align - 1))) {


Hm, I like that test to check whether align is a power of two. ;-)

Max


+bs-request_alignment = align;
+} else {
+error_setg(errp, Invalid alignment);
+ret = -EINVAL;
+goto fail;
+}
+
  ret = 0;
  fail:
  qemu_opts_del(opts);





Re: [Qemu-devel] [PATCH v3 27/29] qemu-io: New command 'sleep'

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

There is no easy way to check that a request correctly waits for a
different request. With a sleep command we can at least approximate it.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  qemu-io-cmds.c | 42 ++
  1 file changed, 42 insertions(+)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 85e4982..978a3a0 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -12,6 +12,7 @@
  #include block/block_int.h
  #include block/qapi.h
  #include qemu/main-loop.h
+#include qemu/timer.h
  
  #define CMD_NOFILE_OK   0x01
  
@@ -2038,6 +2039,46 @@ static const cmdinfo_t abort_cmd = {

 .oneline= simulate a program crash using abort(3),
  };
  
+static void sleep_cb(void *opaque)

+{
+bool *expired = opaque;
+*expired = true;
+}
+
+static int sleep_f(BlockDriverState *bs, int argc, char **argv)
+{
+char *endptr;
+long ms;
+struct QEMUTimer *timer;
+bool expired = false;
+
+ms = strtol(argv[1], endptr, 0);
+if (ms  0 || *endptr != '\0') {
+printf(%s is not a valid number\n, argv[1]);
+return 0;
+}
+
+timer = timer_new_ns(QEMU_CLOCK_HOST, sleep_cb, expired);
+timer_mod(timer, qemu_clock_get_ns(QEMU_CLOCK_HOST) + SCALE_MS * ms);
+
+while (!expired) {


I don't know whether compilers don't optimize accesses to variables 
whose addresses have been given to other functions, but shouldn't 
expired be marked volatile just to be sure?


Max


+main_loop_wait(false);
+}
+
+timer_free(timer);
+
+return 0;
+}
+
+static const cmdinfo_t sleep_cmd = {
+   .name   = sleep,
+   .argmin = 1,
+   .argmax = 1,
+   .cfunc  = sleep_f,
+   .flags  = CMD_NOFILE_OK,
+   .oneline= waits for the given value in milliseconds,
+};
+
  static void help_oneline(const char *cmd, const cmdinfo_t *ct)
  {
  if (cmd) {
@@ -2151,4 +2192,5 @@ static void __attribute((constructor)) 
init_qemuio_commands(void)
  qemuio_add_command(resume_cmd);
  qemuio_add_command(wait_break_cmd);
  qemuio_add_command(abort_cmd);
+qemuio_add_command(sleep_cmd);
  }





Re: [Qemu-devel] [PATCH v3 29/29] block: Switch bdrv_io_limits_intercept() to byte granularity

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

Request sizes used to be rounded down to the next sector boundary,
allowing to bypass the I/O limit. Now all requests are accounted for
with their exact byte size.

Reported-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c | 13 +
  1 file changed, 5 insertions(+), 8 deletions(-)


Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 10/29] block: Introduce bdrv_co_do_preadv()

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

Similar to bdrv_pread(), which aligns byte-aligned request to 512 byte
sectors, bdrv_co_do_preadv() takes a byte-aligned request and aligns it
to the alignment specified in bs-request_alignment.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c | 64 ++--
  1 file changed, 58 insertions(+), 6 deletions(-)


Together with patch 29:

Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 13/29] block: Introduce bdrv_co_do_pwritev()

2014-01-17 Thread Max Reitz

On 17.01.2014 15:15, Kevin Wolf wrote:

This is going to become the bdrv_co_do_preadv() equivalent for writes.
In this patch, however, just a function taking byte offsets is created,
it doesn't align anything yet.

Signed-off-by: Kevin Wolf kw...@redhat.com
---
  block.c | 24 ++--
  1 file changed, 18 insertions(+), 6 deletions(-)


Together with patch 29:

Reviewed-by: Max Reitz mre...@redhat.com



Re: [Qemu-devel] [PATCH v3 02/29] block: Inherit opt_transfer_length

2014-01-17 Thread Benoît Canet
Le Friday 17 Jan 2014 à 15:14:52 (+0100), Kevin Wolf a écrit :
 When there is a format driver between the backend, it's not guaranteed
 that exposing the opt_transfer_length for the format driver results in
 the optimal requests (because of fragmentation etc.), but it can't make
 things worse, so let's just do it.
 
 Signed-off-by: Kevin Wolf kw...@redhat.com
 Reviewed-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
 Reviewed-by: Max Reitz mre...@redhat.com
 ---
  block.c | 20 +++-
  1 file changed, 19 insertions(+), 1 deletion(-)
 
 diff --git a/block.c b/block.c
 index 99e69da..20f85cb 100644
 --- a/block.c
 +++ b/block.c
 @@ -485,7 +485,25 @@ static int bdrv_refresh_limits(BlockDriverState *bs)
  
  memset(bs-bl, 0, sizeof(bs-bl));
  
 -if (drv  drv-bdrv_refresh_limits) {
 +if (!drv) {
 +return 0;
 +}
 +
 +/* Take some limits from the children as a default */
 +if (bs-file) {
 +bdrv_refresh_limits(bs-file);
 +bs-bl.opt_transfer_length = bs-file-bl.opt_transfer_length;
 +}
 +
 +if (bs-backing_hd) {
 +bdrv_refresh_limits(bs-backing_hd);
 +bs-bl.opt_transfer_length =
 +MAX(bs-bl.opt_transfer_length,
 +bs-backing_hd-bl.opt_transfer_length);
 +}
 +
 +/* Then let the driver override it */
 +if (drv-bdrv_refresh_limits) {
  return drv-bdrv_refresh_limits(bs);
  }

Reviewed-by: Benoît Canet ben...@irqsave.net
  
 -- 
 1.8.1.4
 
 



  1   2   >