Re: [Qemu-devel] [PATCH] sun4u: ensure kernel_top is always initialised

2018-08-12 Thread Thomas Huth
On 08/10/2018 01:23 PM, Mark Cave-Ayland wrote:
> Valgrind reports that when loading a non-ELF kernel, kernel_top may be used
> uninitialised when checking for an initrd.
> 
> Since there are no known non-ELF kernels for SPARC64 then we can simply
> initialise kernel_top to 0 and then skip the initrd load process if it hasn't
> been set by load_elf().
> 
> Reported-by: Thomas Huth 
> Signed-off-by: Mark Cave-Ayland 
> ---
>  hw/sparc64/sun4u.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/sparc64/sun4u.c b/hw/sparc64/sun4u.c
> index 74b748497e..d16843b30e 100644
> --- a/hw/sparc64/sun4u.c
> +++ b/hw/sparc64/sun4u.c
> @@ -139,7 +139,7 @@ static uint64_t sun4u_load_kernel(const char 
> *kernel_filename,
>  unsigned int i;
>  long kernel_size;
>  uint8_t *ptr;
> -uint64_t kernel_top;
> +uint64_t kernel_top = 0;
>  
>  linux_boot = (kernel_filename != NULL);
>  
> @@ -172,7 +172,7 @@ static uint64_t sun4u_load_kernel(const char 
> *kernel_filename,
>  }
>  /* load initrd above kernel */
>  *initrd_size = 0;
> -if (initrd_filename) {
> +if (initrd_filename && kernel_top) {
>  *initrd_addr = TARGET_PAGE_ALIGN(kernel_top);
>  
>  *initrd_size = load_image_targphys(initrd_filename,
> 

Reviewed-by: Thomas Huth 




Re: [Qemu-devel] [PATCH] ppc: add DBCR based debugging

2018-08-12 Thread David Gibson
On Tue, Aug 07, 2018 at 11:29:48AM +0200, Roman Kapl wrote:
> Add support for DBCR (debug control register) based debugging as used on
> BookE ppc. So far supports only branch and single-step events, but these are
> the important ones. GDB in Linux guest can now do single-stepping.
> 
> Signed-off-by: Roman Kapl 
> ---
>  target/ppc/cpu.h|   5 ++
>  target/ppc/excp_helper.c|   3 +-
>  target/ppc/translate.c  | 107 
> ++--
>  target/ppc/translate_init.inc.c |  17 +++
>  4 files changed, 104 insertions(+), 28 deletions(-)
> 
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index 4edcf62cf7..ec149349e2 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -481,6 +481,11 @@ struct ppc_slb_t {
>  #define msr_ts   ((env->msr >> MSR_TS1)  & 3)
>  #define msr_tm   ((env->msr >> MSR_TM)   & 1)
>  
> +#define DBCR0_ICMP (1 << 27)
> +#define DBCR0_BRT (1 << 26)
> +#define DBSR_ICMP (1 << 27)
> +#define DBSR_BRT (1 << 26)
> +
>  /* Hypervisor bit is more specific */
>  #if defined(TARGET_PPC64)
>  #define MSR_HVB (1ULL << MSR_SHV)
> diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
> index d6e97a90e0..3463efaf4e 100644
> --- a/target/ppc/excp_helper.c
> +++ b/target/ppc/excp_helper.c
> @@ -359,8 +359,7 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
> excp_model, int excp)
>  default:
>  break;
>  }
> -/* XXX: TODO */
> -cpu_abort(cs, "Debug exception is not implemented yet !\n");
> +/* DBSR already modified by caller */

This is unconditionally enabling the exception for every platform,
which doesn't seem right.  I think it would be better to add the
specific exception models which use the DBCR to the switch statement
above this.

>  break;
>  case POWERPC_EXCP_SPEU:  /* SPE/embedded floating-point unavailable  
> */
>  env->spr[SPR_BOOKE_ESR] = ESR_SPV;
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index 9eaa10b421..69cd45dd81 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -211,6 +211,7 @@ struct DisasContext {
>  bool gtse;
>  ppc_spr_t *spr_cb; /* Needed to check rights for mfspr/mtspr */
>  int singlestep_enabled;
> +uint32_t flags;
>  uint64_t insns_flags;
>  uint64_t insns_flags2;
>  };
> @@ -251,6 +252,17 @@ struct opc_handler_t {
>  #endif
>  };
>  
> +/* SPR load/store helpers */
> +static inline void gen_load_spr(TCGv t, int reg)
> +{
> +tcg_gen_ld_tl(t, cpu_env, offsetof(CPUPPCState, spr[reg]));
> +}
> +
> +static inline void gen_store_spr(int reg, TCGv t)
> +{
> +tcg_gen_st_tl(t, cpu_env, offsetof(CPUPPCState, spr[reg]));
> +}
> +
>  static inline void gen_set_access_type(DisasContext *ctx, int access_type)
>  {
>  if (ctx->need_access_type && ctx->access_type != access_type) {
> @@ -313,6 +325,38 @@ static void gen_exception_nip(DisasContext *ctx, 
> uint32_t excp,
>  ctx->exception = (excp);
>  }
>  
> +/* Translates the EXCP_TRACE/BRANCH to an appropriate exception depending
> + * on processor version (BookE vs regular)

It's not obvious what "regular" means in this context.

> + */
> +static uint32_t gen_prep_dbgex(DisasContext *ctx, uint32_t excp)
> +{
> +if ((ctx->singlestep_enabled & CPU_SINGLE_STEP)
> +&& (excp == POWERPC_EXCP_BRANCH)) {
> +/* Trace excpt. has priority */
> +excp = POWERPC_EXCP_TRACE;
> +}
> +if (ctx->flags & POWERPC_FLAG_DE) {
> +target_ulong dbsr = 0;
> +switch (excp) {
> +case POWERPC_EXCP_TRACE:
> +dbsr = DBCR0_ICMP;
> +break;
> +case POWERPC_EXCP_BRANCH:
> +dbsr = DBCR0_BRT;
> +break;
> +}
> +TCGv t0 = tcg_temp_new();
> +gen_load_spr(t0, SPR_BOOKE_DBSR);
> +tcg_gen_ori_tl(t0, t0, dbsr);
> +gen_store_spr(SPR_BOOKE_DBSR, t0);
> +tcg_temp_free(t0);
> +return POWERPC_EXCP_DEBUG;
> +} else {
> +return excp;
> +}
> +return POWERPC_EXCP_NONE;
> +}
> +
>  static void gen_debug_exception(DisasContext *ctx)
>  {
>  TCGv_i32 t0;
> @@ -575,17 +619,6 @@ typedef struct opcode_t {
>  }
>  #endif
>  
> -/* SPR load/store helpers */
> -static inline void gen_load_spr(TCGv t, int reg)
> -{
> -tcg_gen_ld_tl(t, cpu_env, offsetof(CPUPPCState, spr[reg]));
> -}
> -
> -static inline void gen_store_spr(int reg, TCGv t)
> -{
> -tcg_gen_st_tl(t, cpu_env, offsetof(CPUPPCState, spr[reg]));
> -}
> -

You move these around, but the new code doesn't seem to use them, so
I'm not sure why.

>  /* Invalid instruction */
>  static void gen_invalid(DisasContext *ctx)
>  {
> @@ -3602,6 +3635,24 @@ static inline bool use_goto_tb(DisasContext *ctx, 
> target_ulong dest)
>  #endif
>  }
>  
> +static void gen_lookup_and_goto_ptr(DisasContext *ctx)
> +{
> +int sse = ctx->singlestep_enabled;
> +if (unlikely(sse)) {
> +if (sse & 

[Qemu-devel] [Bug 1785972] Re: v3.0.0-rc4: VM fails to start after vcpuhotunplug, managedsave sequence

2018-08-12 Thread Satheesh Rajendran
Applied to ppc-for-3.1.

https://lists.gnu.org/archive/html/qemu-devel/2018-08/msg01317.html

** Changed in: qemu
   Status: Confirmed => Fix Committed

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1785972

Title:
  v3.0.0-rc4: VM fails to start after vcpuhotunplug, managedsave
  sequence

Status in QEMU:
  Fix Committed

Bug description:
  VM fails to start after vcpu hot un-plug, managedsave sequence

  Host info:
  Kernel: 4.18.0-rc8-2-g1236568ee3cb

  qemu: commit 6ad90805383e6d04b3ff49681b8519a48c9f4410 (HEAD -> master, tag: 
v3.0.0-rc4)
  QEMU emulator version 2.12.94 (v3.0.0-rc4-dirty)

  libvirt: commit 087de2f5a3dffb27d2eeb0c50a86d5d6984e5a5e (HEAD -> master)
  libvirtd (libvirt) 4.6.0

  Guest Kernel: 4.18.0-rc8-2-g1236568ee3cb

  
  Steps to reproduce:
  1. Start a guest(VM) with 2 current , 4 max vcpus
  virsh start vm1
  Domain vm1 started

  2. Hotplug 2 vcpus
  virsh setvcpus vm1 4 --live

  3. Hot unplug 2 vcpus
  virsh setvcpus vm1 2 --live

  4. Managedsave the VM
  virsh managedsave vm1

  Domain vm1 state saved by libvirt

  5. Start the VM ---Fails to start
  virsh start vm1

  error: Failed to start domain avocado-vt-vm1
  error: internal error: qemu unexpectedly closed the monitor: 
2018-08-08T06:27:53.853818Z qemu: Unknown savevm section or instance 
'spapr_cpu' 2
  2018-08-08T06:27:53.854949Z qemu: load of migration failed: Invalid argument


  Testcase:
  avocado run 
libvirt_vcpu_plug_unplug.positive_test.vcpu_set.live.vm_operate.managedsave_with_unplug
 --vt-type libvirt --vt-extra-params 
emulator_path=/usr/share/avocado-plugins-vt/bin/qemu create_vm_libvirt=yes 
kill_vm_libvirt=yes env_cleanup=yes smp=8 backup_image_before_testing=no 
libvirt_controller=virtio-scsi scsi_hba=virtio-scsi-pci drive_format=scsi-hd 
use_os_variant=no restore_image_after_testing=no vga=none display=nographic 
kernel=/home/kvmci/linux/vmlinux kernel_args='root=/dev/sda2 rw console=tty0 
console=ttyS0,115200 init=/sbin/init  initcall_debug' 
take_regular_screendumps=no --vt-guest-os JeOS.27.ppc64le
  JOB ID : 1f869477ad87e7d7e7ef631ae08965f41a74
  JOB LOG: /root/avocado/job-results/job-2018-08-08T02.42-1f86947/job.log
   (1/1) 
type_specific.io-github-autotest-libvirt.libvirt_vcpu_plug_unplug.positive_test.vcpu_set.live.vm_operate.managedsave_with_unplug:
 ERROR (91.58 s)
  RESULTS: PASS 0 | ERROR 1 | FAIL 0 | SKIP 0 | WARN 0 | INTERRUPT 0 | 
CANCEL 0
  JOB TIME   : 95.89 s

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1785972/+subscriptions



[Qemu-devel] [Bug 1785972] Re: v3.0.0-rc4: VM fails to start after vcpuhotunplug, managedsave sequence

2018-08-12 Thread Satheesh Rajendran
This commit https://lists.gnu.org/archive/html/qemu-
devel/2018-08/msg01281.html from ~bharata-rao, fixes the issue.

** Changed in: qemu
   Status: New => Confirmed

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1785972

Title:
  v3.0.0-rc4: VM fails to start after vcpuhotunplug, managedsave
  sequence

Status in QEMU:
  Fix Committed

Bug description:
  VM fails to start after vcpu hot un-plug, managedsave sequence

  Host info:
  Kernel: 4.18.0-rc8-2-g1236568ee3cb

  qemu: commit 6ad90805383e6d04b3ff49681b8519a48c9f4410 (HEAD -> master, tag: 
v3.0.0-rc4)
  QEMU emulator version 2.12.94 (v3.0.0-rc4-dirty)

  libvirt: commit 087de2f5a3dffb27d2eeb0c50a86d5d6984e5a5e (HEAD -> master)
  libvirtd (libvirt) 4.6.0

  Guest Kernel: 4.18.0-rc8-2-g1236568ee3cb

  
  Steps to reproduce:
  1. Start a guest(VM) with 2 current , 4 max vcpus
  virsh start vm1
  Domain vm1 started

  2. Hotplug 2 vcpus
  virsh setvcpus vm1 4 --live

  3. Hot unplug 2 vcpus
  virsh setvcpus vm1 2 --live

  4. Managedsave the VM
  virsh managedsave vm1

  Domain vm1 state saved by libvirt

  5. Start the VM ---Fails to start
  virsh start vm1

  error: Failed to start domain avocado-vt-vm1
  error: internal error: qemu unexpectedly closed the monitor: 
2018-08-08T06:27:53.853818Z qemu: Unknown savevm section or instance 
'spapr_cpu' 2
  2018-08-08T06:27:53.854949Z qemu: load of migration failed: Invalid argument


  Testcase:
  avocado run 
libvirt_vcpu_plug_unplug.positive_test.vcpu_set.live.vm_operate.managedsave_with_unplug
 --vt-type libvirt --vt-extra-params 
emulator_path=/usr/share/avocado-plugins-vt/bin/qemu create_vm_libvirt=yes 
kill_vm_libvirt=yes env_cleanup=yes smp=8 backup_image_before_testing=no 
libvirt_controller=virtio-scsi scsi_hba=virtio-scsi-pci drive_format=scsi-hd 
use_os_variant=no restore_image_after_testing=no vga=none display=nographic 
kernel=/home/kvmci/linux/vmlinux kernel_args='root=/dev/sda2 rw console=tty0 
console=ttyS0,115200 init=/sbin/init  initcall_debug' 
take_regular_screendumps=no --vt-guest-os JeOS.27.ppc64le
  JOB ID : 1f869477ad87e7d7e7ef631ae08965f41a74
  JOB LOG: /root/avocado/job-results/job-2018-08-08T02.42-1f86947/job.log
   (1/1) 
type_specific.io-github-autotest-libvirt.libvirt_vcpu_plug_unplug.positive_test.vcpu_set.live.vm_operate.managedsave_with_unplug:
 ERROR (91.58 s)
  RESULTS: PASS 0 | ERROR 1 | FAIL 0 | SKIP 0 | WARN 0 | INTERRUPT 0 | 
CANCEL 0
  JOB TIME   : 95.89 s

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1785972/+subscriptions



[Qemu-devel] [PATCH v2] file-posix: Skip effectiveless OFD lock operations

2018-08-12 Thread Fam Zheng
If we know we've already locked the bytes, don't do it again; similarly
don't unlock a byte if we haven't locked it. This doesn't change the
behavior, but fixes a corner case explained below.

Libvirt had an error handling bug that an image can get its (ownership,
file mode, SELinux) permissions changed (RHBZ 1584982) by mistake behind
QEMU. Specifically, an image in use by Libvirt VM has:

$ ls -lhZ b.img
-rw-r--r--. qemu qemu system_u:object_r:svirt_image_t:s0:c600,c690 b.img

Trying to attach it a second time won't work because of image locking.
And after the error, it becomes:

$ ls -lhZ b.img
-rw-r--r--. root root system_u:object_r:virt_image_t:s0 b.img

Then, we won't be able to do OFD lock operations with the existing fd.
In other words, the code such as in blk_detach_dev:

blk_set_perm(blk, 0, BLK_PERM_ALL, _abort);

can abort() QEMU, out of environmental changes.

This patch is an easy fix for that problem, and the change is
reasonable in its own right, so do it.

Signed-off-by: Fam Zheng 

---

v2: For s == NULL, unlock all bits. [Kevin]
---
 block/file-posix.c | 41 +++--
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index fe83cbf0eb..73ae00c8c5 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -680,23 +680,42 @@ typedef enum {
  * file; if @unlock == true, also unlock the unneeded bytes.
  * @shared_perm_lock_bits is the mask of all permissions that are NOT shared.
  */
-static int raw_apply_lock_bytes(int fd,
+static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 uint64_t perm_lock_bits,
 uint64_t shared_perm_lock_bits,
 bool unlock, Error **errp)
 {
 int ret;
 int i;
+uint64_t locked_perm, locked_shared_perm;
+
+if (s) {
+locked_perm = s->perm;
+locked_shared_perm = ~s->shared_perm & BLK_PERM_ALL;
+} else {
+/*
+ * We don't have the previous bits, just lock/unlock for each of the
+ * requested bits.
+ */
+if (unlock) {
+locked_perm = BLK_PERM_ALL;
+locked_shared_perm = BLK_PERM_ALL;
+} else {
+locked_perm = 0;
+locked_shared_perm = 0;
+}
+}
 
 PERM_FOREACH(i) {
 int off = RAW_LOCK_PERM_BASE + i;
-if (perm_lock_bits & (1ULL << i)) {
+uint64_t bit = (1ULL << i);
+if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 ret = qemu_lock_fd(fd, off, 1, false);
 if (ret) {
 error_setg(errp, "Failed to lock byte %d", off);
 return ret;
 }
-} else if (unlock) {
+} else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 ret = qemu_unlock_fd(fd, off, 1);
 if (ret) {
 error_setg(errp, "Failed to unlock byte %d", off);
@@ -706,13 +725,15 @@ static int raw_apply_lock_bytes(int fd,
 }
 PERM_FOREACH(i) {
 int off = RAW_LOCK_SHARED_BASE + i;
-if (shared_perm_lock_bits & (1ULL << i)) {
+uint64_t bit = (1ULL << i);
+if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 ret = qemu_lock_fd(fd, off, 1, false);
 if (ret) {
 error_setg(errp, "Failed to lock byte %d", off);
 return ret;
 }
-} else if (unlock) {
+} else if (unlock && (locked_shared_perm & bit) &&
+   !(shared_perm_lock_bits & bit)) {
 ret = qemu_unlock_fd(fd, off, 1);
 if (ret) {
 error_setg(errp, "Failed to unlock byte %d", off);
@@ -788,7 +809,7 @@ static int raw_handle_perm_lock(BlockDriverState *bs,
 
 switch (op) {
 case RAW_PL_PREPARE:
-ret = raw_apply_lock_bytes(s->lock_fd, s->perm | new_perm,
+ret = raw_apply_lock_bytes(s, s->lock_fd, s->perm | new_perm,
~s->shared_perm | ~new_shared,
false, errp);
 if (!ret) {
@@ -800,7 +821,7 @@ static int raw_handle_perm_lock(BlockDriverState *bs,
 op = RAW_PL_ABORT;
 /* fall through to unlock bytes. */
 case RAW_PL_ABORT:
-raw_apply_lock_bytes(s->lock_fd, s->perm, ~s->shared_perm,
+raw_apply_lock_bytes(s, s->lock_fd, s->perm, ~s->shared_perm,
  true, _err);
 if (local_err) {
 /* Theoretically the above call only unlocks bytes and it cannot
@@ -810,7 +831,7 @@ static int raw_handle_perm_lock(BlockDriverState *bs,
 }
 break;
 case RAW_PL_COMMIT:
-raw_apply_lock_bytes(s->lock_fd, new_perm, ~new_shared,
+raw_apply_lock_bytes(s, s->lock_fd, new_perm, ~new_shared,
  true, _err);
 if (local_err) {
 /* Theoretically the above call only 

[Qemu-devel] [PATCH 17/17] iotests: Add test for active mirror with COR

2018-08-12 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/151 | 56 ++
 tests/qemu-iotests/151.out |  4 +--
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/151 b/tests/qemu-iotests/151
index fe53b9f446..e5515c2d37 100755
--- a/tests/qemu-iotests/151
+++ b/tests/qemu-iotests/151
@@ -114,6 +114,62 @@ class TestActiveMirror(iotests.QMPTestCase):
 def testActiveIOFlushed(self):
 self.doActiveIO(True)
 
+def testCOR(self):
+# Fill the source image
+self.vm.hmp_qemu_io('source',
+'write -P 1 0 %i' % self.image_len);
+
+# Start some background requests
+for offset in range(0, self.image_len / 2, 1024 * 1024):
+self.vm.hmp_qemu_io('source', 'aio_write -P 2 %i 1M' % offset)
+
+# Start the block job
+result = self.vm.qmp('blockdev-mirror',
+ job_id='mirror',
+ filter_node_name='mirror-node',
+ device='source-node',
+ target='target-node',
+ sync='full',
+ copy_mode='write-blocking')
+self.assert_qmp(result, 'return', {})
+
+# Stop background request processing
+result = self.vm.qmp('job-pause', id='mirror')
+
+result = self.vm.qmp('blockdev-add',
+ node_name='cor-node',
+ driver='copy-on-read',
+ file='mirror-node')
+self.assert_qmp(result, 'return', {})
+
+# Now read everything, thus synchronously copying it to the
+# target
+for offset in range(0, self.image_len, 1024 * 1024):
+self.vm.hmp_qemu_io('cor-node', 'read %i 1M' % offset)
+
+# Resuming the job should make it READY immediately, because
+# the read requests from cor-node above should have copied
+# everything to target already.
+# Set the job speed to 1 before resuming it, so that we can
+# see that the READY event really comes immediately.
+
+result = self.vm.qmp('block-job-set-speed',
+ device='mirror',
+ speed=1)
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('job-resume', id='mirror')
+self.assert_qmp(result, 'return', {})
+
+# Wait for the READY event; if we get a timeout here, reading
+# from cor-node has failed to copy something to target
+self.vm.event_wait(name='BLOCK_JOB_READY', timeout=5.0)
+
+self.vm.hmp_qemu_io('source', 'aio_flush')
+self.potential_writes_in_flight = False
+
+self.complete_and_wait(drive='mirror', wait_ready=False)
+
 
 
 if __name__ == '__main__':
diff --git a/tests/qemu-iotests/151.out b/tests/qemu-iotests/151.out
index fbc63e62f8..8d7e996700 100644
--- a/tests/qemu-iotests/151.out
+++ b/tests/qemu-iotests/151.out
@@ -1,5 +1,5 @@
-..
+...
 --
-Ran 2 tests
+Ran 3 tests
 
 OK
-- 
2.17.1




[Qemu-devel] [PATCH 16/17] mirror: Support COR with write-blocking

2018-08-12 Thread Max Reitz
This patch adds a .bdrv_co_block_status() implementation for the mirror
block job that reports an area as allocated iff source and target are
in sync.  This allows putting a copy-on-read node on top of a mirror
node which automatically copies all data read from the source to the
target.

To make this perform better, bdrv_mirror_top_do_write() is modified to
ignore BDRV_REQ_WRITE_UNCHANGED requests regarding the source, and only
write them to the target (in write-blocking mode).  Otherwise, using COR
would result in all data read from the source that is not in sync with
the target to be re-written to the source (which is not the intention of
COR).

Signed-off-by: Max Reitz 
---
 block/mirror.c | 88 ++
 1 file changed, 75 insertions(+), 13 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index cba7de610e..625297fec1 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1304,29 +1304,40 @@ static int coroutine_fn 
bdrv_mirror_top_do_write(BlockDriverState *bs,
 MirrorBDSOpaque *s = bs->opaque;
 int ret = 0;
 bool copy_to_target;
+bool write_to_source;
 
 copy_to_target = s->job->ret >= 0 &&
  s->job->copy_mode == MIRROR_COPY_MODE_WRITE_BLOCKING;
 
+/* WRITE_UNCHANGED requests are allocating writes, which in this
+ * case means that we should ensure the target is in sync with the
+ * source (by writing the data to the target).  Therefore, if we
+ * do write to the target (only in write-blocking mode), skip
+ * writing the (unchanged) data to the source. */
+write_to_source = s->job->copy_mode != MIRROR_COPY_MODE_WRITE_BLOCKING ||
+  !(flags & BDRV_REQ_WRITE_UNCHANGED);
+
 if (copy_to_target) {
 op = active_write_prepare(s->job, offset, bytes);
 }
 
-switch (method) {
-case MIRROR_METHOD_COPY:
-ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
-break;
+if (write_to_source) {
+switch (method) {
+case MIRROR_METHOD_COPY:
+ret = bdrv_co_pwritev(bs->backing, offset, bytes, qiov, flags);
+break;
 
-case MIRROR_METHOD_ZERO:
-ret = bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
-break;
+case MIRROR_METHOD_ZERO:
+ret = bdrv_co_pwrite_zeroes(bs->backing, offset, bytes, flags);
+break;
 
-case MIRROR_METHOD_DISCARD:
-ret = bdrv_co_pdiscard(bs->backing, offset, bytes);
-break;
+case MIRROR_METHOD_DISCARD:
+ret = bdrv_co_pdiscard(bs->backing, offset, bytes);
+break;
 
-default:
-abort();
+default:
+abort();
+}
 }
 
 if (ret < 0) {
@@ -1403,6 +1414,57 @@ static int coroutine_fn 
bdrv_mirror_top_pdiscard(BlockDriverState *bs,
 NULL, 0);
 }
 
+/**
+ * Allocation status is determined by whether source and target are in
+ * sync:
+ * - If they are (dirty bitmap is clean), the data is considered to be
+ *   allocated in this layer.  Then, return BDRV_BLOCK_RAW so the
+ *   request is forwarded to the source.
+ * - Dirty (unsynced) areas are considered unallocated.  For those,
+ *   return 0.
+ */
+static int coroutine_fn bdrv_mirror_top_block_status(BlockDriverState *bs,
+ bool want_zero,
+ int64_t offset,
+ int64_t bytes,
+ int64_t *pnum,
+ int64_t *map,
+ BlockDriverState **file)
+{
+MirrorBDSOpaque *s = bs->opaque;
+BdrvDirtyBitmapIter *iter;
+uint64_t dirty_offset, clean_offset;
+int ret;
+
+*map = offset;
+*file = bs->backing->bs;
+
+iter = bdrv_dirty_iter_new(s->job->dirty_bitmap);
+bdrv_set_dirty_iter(iter, offset);
+
+bdrv_dirty_bitmap_lock(s->job->dirty_bitmap);
+dirty_offset = bdrv_dirty_iter_next(iter);
+bdrv_dirty_iter_free(iter);
+if (dirty_offset > offset) {
+/* Clean area */
+*pnum = MIN(dirty_offset - offset, bytes);
+ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
+goto out;
+}
+
+/* Dirty area, find next clean area */
+clean_offset = bdrv_dirty_bitmap_next_zero(s->job->dirty_bitmap, offset);
+bdrv_dirty_bitmap_unlock(s->job->dirty_bitmap);
+
+assert(clean_offset > offset);
+*pnum = MIN(clean_offset - offset, bytes);
+ret = 0;
+
+out:
+bdrv_dirty_bitmap_unlock(s->job->dirty_bitmap);
+return ret;
+}
+
 static void bdrv_mirror_top_refresh_filename(BlockDriverState *bs)
 {
 if (bs->backing == NULL) {
@@ -1442,7 +1504,7 @@ static BlockDriver bdrv_mirror_top = {
 .bdrv_co_pwrite_zeroes  = bdrv_mirror_top_pwrite_zeroes,
 .bdrv_co_pdiscard   = 

[Qemu-devel] [PATCH 14/17] mirror: Inline mirror_iteration_done()

2018-08-12 Thread Max Reitz
mirror_co_perform() is the sole user of that function, and it looks a
bit weird now.  This patch inlines it into mirror_co_perform().

Signed-off-by: Max Reitz 
---
 block/mirror.c | 53 ++
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 62fd499799..053c37b6a6 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -154,35 +154,6 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp 
*self,
 }
 }
 
-static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
-{
-MirrorBlockJob *s = op->s;
-int64_t chunk_num;
-int nb_chunks;
-
-trace_mirror_iteration_done(s, op->offset, op->bytes, ret);
-
-s->in_flight--;
-s->bytes_in_flight -= op->bytes;
-
-chunk_num = op->offset / s->granularity;
-nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
-
-bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
-QTAILQ_REMOVE(>ops_in_flight, op, next);
-if (ret >= 0) {
-if (s->cow_bitmap) {
-bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
-}
-if (!s->initial_zeroing_ongoing) {
-job_progress_update(>common.job, op->bytes);
-}
-}
-
-qemu_co_queue_restart_all(>waiting_requests);
-g_free(op);
-}
-
 /* Clip bytes relative to offset to not exceed end-of-file */
 static inline int64_t mirror_clip_bytes(MirrorBlockJob *s,
 int64_t offset,
@@ -373,6 +344,8 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 MirrorOp *op = opaque;
 MirrorBlockJob *s = op->s;
 AioContext *aio_context;
+int64_t chunk_num;
+int nb_chunks;
 bool failed_on_read = false;
 int ret;
 
@@ -403,7 +376,27 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 }
 }
 
-mirror_iteration_done(op, ret);
+trace_mirror_iteration_done(s, op->offset, op->bytes, ret);
+
+s->in_flight--;
+s->bytes_in_flight -= op->bytes;
+
+chunk_num = op->offset / s->granularity;
+nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
+
+bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks);
+QTAILQ_REMOVE(>ops_in_flight, op, next);
+if (ret >= 0) {
+if (s->cow_bitmap) {
+bitmap_set(s->cow_bitmap, chunk_num, nb_chunks);
+}
+if (!s->initial_zeroing_ongoing) {
+job_progress_update(>common.job, op->bytes);
+}
+}
+
+qemu_co_queue_restart_all(>waiting_requests);
+g_free(op);
 
 aio_context_release(aio_context);
 }
-- 
2.17.1




[Qemu-devel] [PATCH 15/17] mirror: Release AioCtx before queue_restart_all()

2018-08-12 Thread Max Reitz
Calling qemu_co_queue_restart_all() with a held AioContext looks a bit
strange.  There is no reason why we would hold the context any longer
(as this coroutine is not going to perform any further operations that
would necessitate it), so release it before restarting the waiting
requests.

Signed-off-by: Max Reitz 
---
 block/mirror.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 053c37b6a6..cba7de610e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -395,10 +395,10 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 }
 }
 
+aio_context_release(aio_context);
+
 qemu_co_queue_restart_all(>waiting_requests);
 g_free(op);
-
-aio_context_release(aio_context);
 }
 
 /* If mirror_method == MIRROR_METHOD_COPY, *offset and *bytes will be
-- 
2.17.1




[Qemu-devel] [PATCH 13/17] mirror: Linearize mirror_co_read()

2018-08-12 Thread Max Reitz
This function inlines mirror_read_complete() and mirror_write_complete()
into mirror_co_read() (which is thus renamed to mirror_co_copy()).  In
addition, freeing of the I/O vector is removed from
mirror_iteration_done() and put into an own function mirror_free_qiov()
(which is called by mirror_co_copy()).

Signed-off-by: Max Reitz 
---
 block/mirror.c | 118 -
 1 file changed, 48 insertions(+), 70 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index f3df7dd984..62fd499799 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -154,26 +154,16 @@ static void coroutine_fn 
mirror_wait_on_conflicts(MirrorOp *self,
 }
 }
 
-static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret,
-   QEMUIOVector *qiov)
+static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
 {
 MirrorBlockJob *s = op->s;
-struct iovec *iov;
 int64_t chunk_num;
-int i, nb_chunks;
+int nb_chunks;
 
 trace_mirror_iteration_done(s, op->offset, op->bytes, ret);
 
 s->in_flight--;
 s->bytes_in_flight -= op->bytes;
-if (qiov) {
-iov = qiov->iov;
-for (i = 0; i < qiov->niov; i++) {
-MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
-QSIMPLEQ_INSERT_TAIL(>buf_free, buf, next);
-s->buf_free_count++;
-}
-}
 
 chunk_num = op->offset / s->granularity;
 nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
@@ -188,53 +178,11 @@ static void coroutine_fn mirror_iteration_done(MirrorOp 
*op, int ret,
 job_progress_update(>common.job, op->bytes);
 }
 }
-if (qiov) {
-qemu_iovec_destroy(qiov);
-}
 
 qemu_co_queue_restart_all(>waiting_requests);
 g_free(op);
 }
 
-static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret,
-   QEMUIOVector *qiov)
-{
-MirrorBlockJob *s = op->s;
-
-if (ret < 0) {
-BlockErrorAction action;
-
-bdrv_set_dirty_bitmap(s->dirty_bitmap, op->offset, op->bytes);
-action = mirror_error_action(s, false, -ret);
-if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
-s->ret = ret;
-}
-}
-mirror_iteration_done(op, ret, qiov);
-}
-
-static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret,
-  QEMUIOVector *qiov)
-{
-MirrorBlockJob *s = op->s;
-
-if (ret < 0) {
-BlockErrorAction action;
-
-bdrv_set_dirty_bitmap(s->dirty_bitmap, op->offset, op->bytes);
-action = mirror_error_action(s, true, -ret);
-if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
-s->ret = ret;
-}
-
-mirror_iteration_done(op, ret, qiov);
-} else {
-ret = blk_co_pwritev(s->target, op->offset,
- qiov->size, qiov, 0);
-mirror_write_complete(op, ret, qiov);
-}
-}
-
 /* Clip bytes relative to offset to not exceed end-of-file */
 static inline int64_t mirror_clip_bytes(MirrorBlockJob *s,
 int64_t offset,
@@ -322,6 +270,19 @@ static void coroutine_fn 
mirror_co_alloc_qiov(MirrorBlockJob *s,
 }
 }
 
+static void mirror_free_qiov(MirrorBlockJob *s, QEMUIOVector *qiov)
+{
+struct iovec *iov = qiov->iov;
+int i;
+
+for (i = 0; i < qiov->niov; i++) {
+MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+QSIMPLEQ_INSERT_TAIL(>buf_free, buf, next);
+s->buf_free_count++;
+}
+qemu_iovec_destroy(qiov);
+}
+
 /*
  * Restrict *bytes to how much we can actually handle, and align the
  * [*offset, *bytes] range to clusters if COW is needed.
@@ -351,24 +312,41 @@ static void mirror_align_for_copy(MirrorBlockJob *s,
 assert(QEMU_IS_ALIGNED(*bytes, BDRV_SECTOR_SIZE));
 }
 
-/* Perform a mirror copy operation. */
-static void coroutine_fn mirror_co_read(void *opaque)
+/*
+ * Perform a mirror copy operation.
+ *
+ * On error, -errno is returned and *failed_on_read is set to the
+ * appropriate value.
+ */
+static int coroutine_fn mirror_co_copy(MirrorBlockJob *s,
+   int64_t offset, int64_t bytes,
+   bool *failed_on_read)
 {
-MirrorOp *op = opaque;
-MirrorBlockJob *s = op->s;
 QEMUIOVector qiov;
-uint64_t ret;
+int ret;
 
-mirror_co_alloc_qiov(s, , op->offset, op->bytes);
+mirror_co_alloc_qiov(s, , offset, bytes);
 
 /* Copy the dirty cluster.  */
 s->in_flight++;
-s->bytes_in_flight += op->bytes;
-trace_mirror_one_iteration(s, op->offset, op->bytes);
+s->bytes_in_flight += bytes;
+trace_mirror_one_iteration(s, offset, bytes);
+
+ret = bdrv_co_preadv(s->mirror_top_bs->backing, offset, bytes, , 0);
+if (ret < 0) {
+*failed_on_read = true;
+goto fail;
+}
 
-ret = 

[Qemu-devel] [PATCH 09/17] mirror: Lock AioContext in mirror_co_perform()

2018-08-12 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 block/mirror.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 85b08086cc..6330269156 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -196,7 +196,6 @@ static void coroutine_fn mirror_write_complete(MirrorOp 
*op, int ret)
 {
 MirrorBlockJob *s = op->s;
 
-aio_context_acquire(blk_get_aio_context(s->common.blk));
 if (ret < 0) {
 BlockErrorAction action;
 
@@ -207,14 +206,12 @@ static void coroutine_fn mirror_write_complete(MirrorOp 
*op, int ret)
 }
 }
 mirror_iteration_done(op, ret);
-aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
 static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret)
 {
 MirrorBlockJob *s = op->s;
 
-aio_context_acquire(blk_get_aio_context(s->common.blk));
 if (ret < 0) {
 BlockErrorAction action;
 
@@ -230,7 +227,6 @@ static void coroutine_fn mirror_read_complete(MirrorOp *op, 
int ret)
  op->qiov.size, >qiov, 0);
 mirror_write_complete(op, ret);
 }
-aio_context_release(blk_get_aio_context(s->common.blk));
 }
 
 /* Clip bytes relative to offset to not exceed end-of-file */
@@ -387,12 +383,16 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 {
 MirrorOp *op = opaque;
 MirrorBlockJob *s = op->s;
+AioContext *aio_context;
 int ret;
 
+aio_context = blk_get_aio_context(s->common.blk);
+aio_context_acquire(aio_context);
+
 switch (op->mirror_method) {
 case MIRROR_METHOD_COPY:
 mirror_co_read(opaque);
-return;
+goto done;
 case MIRROR_METHOD_ZERO:
 ret = mirror_co_zero(s, op->offset, op->bytes);
 break;
@@ -404,6 +404,9 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 }
 
 mirror_write_complete(op, ret);
+
+done:
+aio_context_release(aio_context);
 }
 
 /* If mirror_method == MIRROR_METHOD_COPY, *offset and *bytes will be
-- 
2.17.1




[Qemu-devel] [PATCH 11/17] mirror: Inline mirror_write_complete(), part 1

2018-08-12 Thread Max Reitz
Eventually, we want to inline mirror_write_complete() fully into
mirror_co_perform().  This patch does the inlining, but we cannot remove
the function yet, as it is still required by mirror_co_read().

Signed-off-by: Max Reitz 
---
 block/mirror.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 2a131d8b99..66746cf075 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -407,7 +407,17 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 abort();
 }
 
-mirror_write_complete(op, ret);
+if (ret < 0) {
+BlockErrorAction action;
+
+bdrv_set_dirty_bitmap(s->dirty_bitmap, op->offset, op->bytes);
+action = mirror_error_action(s, false, -ret);
+if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) {
+s->ret = ret;
+}
+}
+
+mirror_iteration_done(op, ret);
 
 done:
 aio_context_release(aio_context);
-- 
2.17.1




[Qemu-devel] [PATCH 12/17] mirror: Put QIOV locally into mirror_co_read

2018-08-12 Thread Max Reitz
We only need the I/O vector for data copying operations, so we do not
need to put it into the MirrorOp structure and can keep it locally in
mirror_co_read().

Signed-off-by: Max Reitz 
---
 block/mirror.c | 43 +--
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 66746cf075..f3df7dd984 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -95,7 +95,6 @@ struct MirrorOp {
 MirrorBlockJob *s;
 MirrorMethod mirror_method;
 
-QEMUIOVector qiov;
 int64_t offset;
 int64_t bytes;
 
@@ -155,7 +154,8 @@ static void coroutine_fn mirror_wait_on_conflicts(MirrorOp 
*self,
 }
 }
 
-static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret)
+static void coroutine_fn mirror_iteration_done(MirrorOp *op, int ret,
+   QEMUIOVector *qiov)
 {
 MirrorBlockJob *s = op->s;
 struct iovec *iov;
@@ -166,11 +166,13 @@ static void coroutine_fn mirror_iteration_done(MirrorOp 
*op, int ret)
 
 s->in_flight--;
 s->bytes_in_flight -= op->bytes;
-iov = op->qiov.iov;
-for (i = 0; i < op->qiov.niov; i++) {
-MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
-QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
-s->buf_free_count++;
+if (qiov) {
+iov = qiov->iov;
+for (i = 0; i < qiov->niov; i++) {
+MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
+s->buf_free_count++;
+}
 }
 
 chunk_num = op->offset / s->granularity;
@@ -186,13 +188,16 @@ static void coroutine_fn mirror_iteration_done(MirrorOp 
*op, int ret)
 job_progress_update(>common.job, op->bytes);
 }
 }
-qemu_iovec_destroy(&op->qiov);
+if (qiov) {
+qemu_iovec_destroy(qiov);
+}
 
qemu_co_queue_restart_all(&op->waiting_requests);
 g_free(op);
 }
 
-static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret)
+static void coroutine_fn mirror_write_complete(MirrorOp *op, int ret,
+   QEMUIOVector *qiov)
 {
 MirrorBlockJob *s = op->s;
 
@@ -205,10 +210,11 @@ static void coroutine_fn mirror_write_complete(MirrorOp 
*op, int ret)
 s->ret = ret;
 }
 }
-mirror_iteration_done(op, ret);
+mirror_iteration_done(op, ret, qiov);
 }
 
-static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret)
+static void coroutine_fn mirror_read_complete(MirrorOp *op, int ret,
+  QEMUIOVector *qiov)
 {
 MirrorBlockJob *s = op->s;
 
@@ -221,11 +227,11 @@ static void coroutine_fn mirror_read_complete(MirrorOp 
*op, int ret)
 s->ret = ret;
 }
 
-mirror_iteration_done(op, ret);
+mirror_iteration_done(op, ret, qiov);
 } else {
-ret = blk_co_pwritev(s->target, op->offset,
- op->qiov.size, &op->qiov, 0);
-mirror_write_complete(op, ret);
+ qiov->size, qiov, 0);
+mirror_write_complete(op, ret, qiov);
 }
 }
 
@@ -350,9 +356,10 @@ static void coroutine_fn mirror_co_read(void *opaque)
 {
 MirrorOp *op = opaque;
 MirrorBlockJob *s = op->s;
+QEMUIOVector qiov;
 uint64_t ret;
 
-mirror_co_alloc_qiov(s, &op->qiov, op->offset, op->bytes);
+mirror_co_alloc_qiov(s, &qiov, op->offset, op->bytes);
 
  /* Copy the dirty cluster.  */
  s->in_flight++;
 @@ -360,8 +367,8 @@ static void coroutine_fn mirror_co_read(void *opaque)
  trace_mirror_one_iteration(s, op->offset, op->bytes);
  
  ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes,
- &op->qiov, 0);
-mirror_read_complete(op, ret);
+ &qiov, 0);
+mirror_read_complete(op, ret, &qiov);
 }
 
 static int coroutine_fn mirror_co_zero(MirrorBlockJob *s,
@@ -417,7 +424,7 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 }
 }
 
-mirror_iteration_done(op, ret);
+mirror_iteration_done(op, ret, NULL);
 
 done:
 aio_context_release(aio_context);
-- 
2.17.1




[Qemu-devel] [PATCH 07/17] mirror: Make mirror_co_zero() nicer

2018-08-12 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 block/mirror.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 89452ad371..df8e0242dc 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -364,17 +364,14 @@ static void coroutine_fn mirror_co_read(void *opaque)
 mirror_read_complete(op, ret);
 }
 
-static void coroutine_fn mirror_co_zero(void *opaque)
+static int coroutine_fn mirror_co_zero(MirrorBlockJob *s,
+   int64_t offset, int64_t bytes)
 {
-MirrorOp *op = opaque;
-int ret;
-
-op->s->in_flight++;
-op->s->bytes_in_flight += op->bytes;
+s->in_flight++;
+s->bytes_in_flight += bytes;
 
-ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
-   op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
-mirror_write_complete(op, ret);
+return blk_co_pwrite_zeroes(s->target, offset, bytes,
+s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
 }
 
 static void coroutine_fn mirror_co_discard(void *opaque)
@@ -392,20 +389,24 @@ static void coroutine_fn mirror_co_discard(void *opaque)
 static void coroutine_fn mirror_co_perform(void *opaque)
 {
 MirrorOp *op = opaque;
+MirrorBlockJob *s = op->s;
+int ret;
 
 switch (op->mirror_method) {
 case MIRROR_METHOD_COPY:
 mirror_co_read(opaque);
 return;
 case MIRROR_METHOD_ZERO:
-mirror_co_zero(opaque);
-return;
+ret = mirror_co_zero(s, op->offset, op->bytes);
+break;
 case MIRROR_METHOD_DISCARD:
 mirror_co_discard(opaque);
 return;
 default:
 abort();
 }
+
+mirror_write_complete(op, ret);
 }
 
 /* If mirror_method == MIRROR_METHOD_COPY, *offset and *bytes will be
-- 
2.17.1




[Qemu-devel] [PATCH 04/17] mirror: Remove bytes_handled, part 1

2018-08-12 Thread Max Reitz
By moving the mirror_align_for_copy() call from mirror_co_read() to
mirror_perform(), we can drop bytes_handled from MirrorOp.

mirror_align_for_copy() takes a uint64_t * for @bytes, so this commit
changes mirror_perform()'s @bytes parameter to uint64_t, too; but it
still asserts that its value does not exceed UINT_MAX (necessary because
it may be assigned to bytes_handled, which is returned and may thus not
exceed UINT_MAX).  This assertion is removed in a later patch.

Signed-off-by: Max Reitz 
---
 block/mirror.c | 40 +---
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 34cb8293b2..3234b8b687 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -91,10 +91,6 @@ struct MirrorOp {
 int64_t offset;
 uint64_t bytes;
 
-/* The pointee is set by mirror_co_read(), mirror_co_zero(), and
- * mirror_co_discard() before yielding for the first time */
-int64_t *bytes_handled;
-
 bool is_pseudo_op;
 bool is_active_write;
 CoQueue waiting_requests;
@@ -343,14 +339,7 @@ static void mirror_align_for_copy(MirrorBlockJob *s,
 assert(QEMU_IS_ALIGNED(*bytes, BDRV_SECTOR_SIZE));
 }
 
-/* Perform a mirror copy operation.
- *
- * *op->bytes_handled is set to the number of bytes copied after and
- * including offset, excluding any bytes copied prior to offset due
- * to alignment.  This will be op->bytes if no alignment is necessary,
- * or (new_end - op->offset) if the tail is rounded up or down due to
- * alignment or buffer limit.
- */
+/* Perform a mirror copy operation. */
 static void coroutine_fn mirror_co_read(void *opaque)
 {
 MirrorOp *op = opaque;
@@ -358,7 +347,6 @@ static void coroutine_fn mirror_co_read(void *opaque)
 int nb_chunks;
 uint64_t ret;
 
-mirror_align_for_copy(s, &op->offset, &op->bytes, op->bytes_handled);
 nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
 
 while (s->buf_free_count < nb_chunks) {
@@ -396,7 +384,6 @@ static void coroutine_fn mirror_co_zero(void *opaque)
 
 op->s->in_flight++;
 op->s->bytes_in_flight += op->bytes;
-*op->bytes_handled = op->bytes;
 
 ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
@@ -410,25 +397,33 @@ static void coroutine_fn mirror_co_discard(void *opaque)
 
 op->s->in_flight++;
 op->s->bytes_in_flight += op->bytes;
-*op->bytes_handled = op->bytes;
 
 ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes);
 mirror_write_complete(op, ret);
 }
 
 static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
-   unsigned bytes, MirrorMethod mirror_method)
+   uint64_t bytes, MirrorMethod mirror_method)
 {
 MirrorOp *op;
 Coroutine *co;
-int64_t bytes_handled = -1;
+int64_t bytes_handled;
+
+/* FIXME: Drop this assertion */
+assert(bytes <= UINT_MAX);
+
+if (mirror_method == MIRROR_METHOD_COPY) {
+mirror_align_for_copy(s, &offset, &bytes, &bytes_handled);
+assert(bytes_handled <= UINT_MAX);
+} else {
+bytes_handled = bytes;
+}
 
 op = g_new(MirrorOp, 1);
 *op = (MirrorOp){
 .s  = s,
 .offset = offset,
 .bytes  = bytes,
-.bytes_handled  = &bytes_handled,
  };
  qemu_co_queue_init(&op->waiting_requests);
 
@@ -448,16 +443,7 @@ static unsigned mirror_perform(MirrorBlockJob *s, int64_t 
offset,
 
QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
 qemu_coroutine_enter(co);
-/* At this point, ownership of op has been moved to the coroutine
- * and the object may already be freed */
-
-/* Assert that this value has been set */
-assert(bytes_handled >= 0);
 
-/* Same assertion as in mirror_co_read() (and for mirror_co_read()
- * and mirror_co_discard(), bytes_handled == op->bytes, which
- * is the @bytes parameter given to this function) */
-assert(bytes_handled <= UINT_MAX);
 return bytes_handled;
 }
 
-- 
2.17.1




[Qemu-devel] [PATCH 10/17] mirror: Create mirror_co_alloc_qiov()

2018-08-12 Thread Max Reitz
By pulling this function out of mirror_co_read(), the latter becomes
simpler.  This is important so the following patches can make it more
complicated again.

Signed-off-by: Max Reitz 
---
 block/mirror.c | 44 
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 6330269156..2a131d8b99 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -293,6 +293,29 @@ static inline void coroutine_fn
 mirror_co_wait_for_any_operation(s, false);
 }
 
+/* Allocate a QEMUIOVector from the mirror buffer pool (s->buf_free) */
+static void coroutine_fn mirror_co_alloc_qiov(MirrorBlockJob *s,
+  QEMUIOVector *qiov,
+  int64_t offset, uint64_t bytes)
+{
+int nb_chunks = DIV_ROUND_UP(bytes, s->granularity);
+
+while (s->buf_free_count < nb_chunks) {
+trace_mirror_yield_in_flight(s, offset, s->in_flight);
+mirror_co_wait_for_free_in_flight_slot(s);
+}
+
+qemu_iovec_init(qiov, nb_chunks);
+while (nb_chunks-- > 0) {
+MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
+size_t remaining = bytes - qiov->size;
+
+QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
+s->buf_free_count--;
+qemu_iovec_add(qiov, buf, MIN(s->granularity, remaining));
+}
+}
+
 /*
  * Restrict *bytes to how much we can actually handle, and align the
  * [*offset, *bytes] range to clusters if COW is needed.
@@ -327,28 +350,9 @@ static void coroutine_fn mirror_co_read(void *opaque)
 {
 MirrorOp *op = opaque;
 MirrorBlockJob *s = op->s;
-int nb_chunks;
 uint64_t ret;
 
-nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
-
-while (s->buf_free_count < nb_chunks) {
-trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
-mirror_co_wait_for_free_in_flight_slot(s);
-}
-
-/* Now make a QEMUIOVector taking enough granularity-sized chunks
- * from s->buf_free.
- */
-qemu_iovec_init(&op->qiov, nb_chunks);
-while (nb_chunks-- > 0) {
-MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
-size_t remaining = op->bytes - op->qiov.size;
-
-QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
-s->buf_free_count--;
-qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining));
-}
+mirror_co_alloc_qiov(s, &op->qiov, op->offset, op->bytes);
 
 /* Copy the dirty cluster.  */
 s->in_flight++;
-- 
2.17.1




[Qemu-devel] [PATCH 06/17] mirror: Create mirror_co_perform()

2018-08-12 Thread Max Reitz
While very simple now, this function will be fattened in future patches.

Signed-off-by: Max Reitz 
---
 block/mirror.c | 48 +---
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index f05404e557..89452ad371 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -85,8 +85,16 @@ typedef struct MirrorBDSOpaque {
 MirrorBlockJob *job;
 } MirrorBDSOpaque;
 
+typedef enum MirrorMethod {
+MIRROR_METHOD_COPY,
+MIRROR_METHOD_ZERO,
+MIRROR_METHOD_DISCARD,
+} MirrorMethod;
+
 struct MirrorOp {
 MirrorBlockJob *s;
+MirrorMethod mirror_method;
+
 QEMUIOVector qiov;
 int64_t offset;
 int64_t bytes;
@@ -98,12 +106,6 @@ struct MirrorOp {
 QTAILQ_ENTRY(MirrorOp) next;
 };
 
-typedef enum MirrorMethod {
-MIRROR_METHOD_COPY,
-MIRROR_METHOD_ZERO,
-MIRROR_METHOD_DISCARD,
-} MirrorMethod;
-
 static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
 int error)
 {
@@ -387,6 +389,25 @@ static void coroutine_fn mirror_co_discard(void *opaque)
 mirror_write_complete(op, ret);
 }
 
+static void coroutine_fn mirror_co_perform(void *opaque)
+{
+MirrorOp *op = opaque;
+
+switch (op->mirror_method) {
+case MIRROR_METHOD_COPY:
+mirror_co_read(opaque);
+return;
+case MIRROR_METHOD_ZERO:
+mirror_co_zero(opaque);
+return;
+case MIRROR_METHOD_DISCARD:
+mirror_co_discard(opaque);
+return;
+default:
+abort();
+}
+}
+
 /* If mirror_method == MIRROR_METHOD_COPY, *offset and *bytes will be
  * aligned as necessary */
 static void mirror_perform(MirrorBlockJob *s, int64_t *offset, int64_t *bytes,
@@ -402,24 +423,13 @@ static void mirror_perform(MirrorBlockJob *s, int64_t 
*offset, int64_t *bytes,
 op = g_new(MirrorOp, 1);
 *op = (MirrorOp){
 .s  = s,
+.mirror_method  = mirror_method,
 .offset = *offset,
 .bytes  = *bytes,
 };
qemu_co_queue_init(&op->waiting_requests);
 
-switch (mirror_method) {
-case MIRROR_METHOD_COPY:
-co = qemu_coroutine_create(mirror_co_read, op);
-break;
-case MIRROR_METHOD_ZERO:
-co = qemu_coroutine_create(mirror_co_zero, op);
-break;
-case MIRROR_METHOD_DISCARD:
-co = qemu_coroutine_create(mirror_co_discard, op);
-break;
-default:
-abort();
-}
+co = qemu_coroutine_create(mirror_co_perform, op);
 
QTAILQ_INSERT_TAIL(&s->ops_in_flight, op, next);
 qemu_coroutine_enter(co);
-- 
2.17.1




[Qemu-devel] [PATCH 08/17] mirror: Make mirror_co_discard() nicer

2018-08-12 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 block/mirror.c | 17 +++--
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index df8e0242dc..85b08086cc 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -374,16 +374,13 @@ static int coroutine_fn mirror_co_zero(MirrorBlockJob *s,
 s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
 }
 
-static void coroutine_fn mirror_co_discard(void *opaque)
+static int coroutine_fn mirror_co_discard(MirrorBlockJob *s,
+  int64_t offset, int64_t bytes)
 {
-MirrorOp *op = opaque;
-int ret;
-
-op->s->in_flight++;
-op->s->bytes_in_flight += op->bytes;
+s->in_flight++;
+s->bytes_in_flight += bytes;
 
-ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes);
-mirror_write_complete(op, ret);
+return blk_co_pdiscard(s->target, offset, bytes);
 }
 
 static void coroutine_fn mirror_co_perform(void *opaque)
@@ -400,8 +397,8 @@ static void coroutine_fn mirror_co_perform(void *opaque)
 ret = mirror_co_zero(s, op->offset, op->bytes);
 break;
 case MIRROR_METHOD_DISCARD:
-mirror_co_discard(opaque);
-return;
+ret = mirror_co_discard(s, op->offset, op->bytes);
+break;
 default:
 abort();
 }
-- 
2.17.1




[Qemu-devel] [PATCH 00/17] mirror: Mainly coroutine refinements

2018-08-12 Thread Max Reitz
This series is based on v2 of my "block: Deal with filters" series:

Based-on: <20180809223117.7846-1-mre...@redhat.com>


The bulk of this series (14 of the patches here, in fact) makes more of
the coroutined mirror I started with my active mirror series.  For this,
mirror_perform() is translated into a coroutine version that actually
looks like coroutines are a nice thing to have (a single AioContext
lock, a common clean-up path, and a mirror_co_copy() that does both read
and write in a single function (like it is supposed to)).

Another thing I have wanted to implement is a read-write-blocking mode.
write-blocking (active mirror) copies data to the target whenever you
write data to the mirror node -- so once the mirror bitmap is clean, it
will stay clean, and completing the job is guaranteed to be basically
instantaneous.  read-write-blocking would also copy data whenever you
read from the mirror node, because we have already read the data right
now, so we might as well copy it to the target now.

Instead of implementing it actually as a new copy-mode (which is how I
used to do, but I decided that was boring), I opted for implementing an
interesting .bdrv_co_block_status() function that allows using COR for
the purpose: You simply put a COR node on top of your write-blocking
mirror node, and then all the data read will be copied to the target
through COR (as long as it is not already cleanly mirrored).

The benefit I see with this approach is that is just looks nice.  Why
implement an actual new copy-mode when we can just use an existing block
layer function for the purpose?

The drawbacks I see are these:
(1) mirror in write-blocking mode has to interpret
BDRV_REQ_WRITE_UNCHANGED in a special manner, or this doesn't really
work.  With patch 16 of this series, such requests are not written
to the source, but only to the target.  You might find that weird.

(2) We currently don't have a good way of inserting a COR node at
runtime...

(3) You may break your data if you do funny stuff.  Now this is of
course the interesting point.
Mirror works (by design) even when you attach a parent to the source
directly, i.e., not going through the mirror node.  That is because
mirror uses the source's dirty bitmap functionality and does not
build its own.  But if you do this with COR/write-blocking, then the
COR driver may issue a request while some other parent is issuing
another request immediately to the source, which may cause source
and target to go out of sync.
But...  Technically this is already an issue with write-blocking
alone.  We reset the bitmap only in do_sync_target_write(), after we
have already written to the source.  So if another of source's
parents writes to source between our write and the bitmap reset, we
lose that information.
So I suppose when using write-blocking, the mirror driver may not
share the WRITE permission?

(4) The same issue exists with COR itself, actually...  It can only
share the WRITE permission because COR requests are serializing, but
that only works on a single layer.  With a filter between COR and
the allocating node, you get the same issue of someone being able to
slip in a request between the COR read and the COR allocation.
Maybe we actually want COR not to share WRITE either?

So in my opinion the main drawback is that this design reveals current
weaknesses and bugs of the block layer.


Max Reitz (17):
  iotests: Try reading while mirroring in 156
  mirror: Make wait_for_any_operation() coroutine_fn
  mirror: Pull *_align_for_copy() from *_co_read()
  mirror: Remove bytes_handled, part 1
  mirror: Remove bytes_handled, part 2
  mirror: Create mirror_co_perform()
  mirror: Make mirror_co_zero() nicer
  mirror: Make mirror_co_discard() nicer
  mirror: Lock AioContext in mirror_co_perform()
  mirror: Create mirror_co_alloc_qiov()
  mirror: Inline mirror_write_complete(), part 1
  mirror: Put QIOV locally into mirror_co_read
  mirror: Linearize mirror_co_read()
  mirror: Inline mirror_iteration_done()
  mirror: Release AioCtx before queue_restart_all()
  mirror: Support COR with write-blocking
  iotests: Add test for active mirror with COR

 block/mirror.c | 483 -
 tests/qemu-iotests/151 |  56 +
 tests/qemu-iotests/151.out |   4 +-
 tests/qemu-iotests/156 |   7 +
 tests/qemu-iotests/156.out |   3 +
 5 files changed, 334 insertions(+), 219 deletions(-)

-- 
2.17.1




[Qemu-devel] [PATCH 05/17] mirror: Remove bytes_handled, part 2

2018-08-12 Thread Max Reitz
Remove mirror_perform()'s return value, instead aligning the @offset and
@bytes it receives.  We needed the return value for two reasons:

(1) So that "offset += io_bytes" would result in an aligned offset, and
(2) so that io_bytes_acct is kind of aligned (the tail is aligned, but
the head is not).

(2) does not make sense.  If we want to account for the bytes we have to
copy, we should not have used the returned io_bytes as the basis but its
original value before the mirror_perform() call.  If we want to account
for the bytes we have actually copied, we should not disregard the
potential alignment head, but bytes_handled only includes the tail, so
that was wrong, too.
Let's go for the fully aligned value (the number of bytes actually
copied), because apparently we do want some alignment (or we would have
just added io_bytes before this patch instead of bytes_handled).

(1) can be achieved by simply letting mirror_perform() return the
aligned values to its callers, which is what this patch does.

Signed-off-by: Max Reitz 
---
 block/mirror.c | 56 +++---
 1 file changed, 17 insertions(+), 39 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 3234b8b687..f05404e557 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -89,7 +89,7 @@ struct MirrorOp {
 MirrorBlockJob *s;
 QEMUIOVector qiov;
 int64_t offset;
-uint64_t bytes;
+int64_t bytes;
 
 bool is_pseudo_op;
 bool is_active_write;
@@ -239,13 +239,10 @@ static inline int64_t mirror_clip_bytes(MirrorBlockJob *s,
 return MIN(bytes, s->bdev_length - offset);
 }
 
-/* Round offset and/or bytes to target cluster if COW is needed, and
- * return the offset of the adjusted tail against original. */
-static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset,
-uint64_t *bytes)
+/* Round offset and/or bytes to target cluster if COW is needed */
+static void mirror_cow_align(MirrorBlockJob *s, int64_t *offset, int64_t 
*bytes)
 {
 bool need_cow;
-int ret = 0;
 int64_t align_offset = *offset;
 int64_t align_bytes = *bytes;
 int max_bytes = s->granularity * s->max_iov;
@@ -268,11 +265,8 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t 
*offset,
  * that doesn't matter because it's already the end of source image. */
 align_bytes = mirror_clip_bytes(s, align_offset, align_bytes);
 
-ret = align_offset + align_bytes - (*offset + *bytes);
 *offset = align_offset;
 *bytes = align_bytes;
-assert(ret >= 0);
-return ret;
 }
 
 static inline void coroutine_fn
@@ -304,16 +298,9 @@ static inline void coroutine_fn
 /*
  * Restrict *bytes to how much we can actually handle, and align the
  * [*offset, *bytes] range to clusters if COW is needed.
- *
- * *bytes_handled is set to the number of bytes copied after and
- * including offset, excluding any bytes copied prior to offset due
- * to alignment.  This will be *bytes if no alignment is necessary, or
- * (new_end - *offset) if the tail is rounded up or down due to
- * alignment or buffer limit.
  */
 static void mirror_align_for_copy(MirrorBlockJob *s,
-  int64_t *offset, uint64_t *bytes,
-  int64_t *bytes_handled)
+  int64_t *offset, int64_t *bytes)
 {
 uint64_t max_bytes;
 
@@ -323,13 +310,11 @@ static void mirror_align_for_copy(MirrorBlockJob *s,
 *bytes = MIN(s->buf_size, MIN(max_bytes, *bytes));
 assert(*bytes);
 assert(*bytes < BDRV_REQUEST_MAX_BYTES);
-*bytes_handled = *bytes;
 
 if (s->cow_bitmap) {
-*bytes_handled += mirror_cow_align(s, offset, bytes);
+mirror_cow_align(s, offset, bytes);
 }
-/* Cannot exceed BDRV_REQUEST_MAX_BYTES + INT_MAX */
-assert(*bytes_handled <= UINT_MAX);
+
 assert(*bytes <= s->buf_size);
 /* The offset is granularity-aligned because:
  * 1) Caller passes in aligned values;
@@ -402,28 +387,23 @@ static void coroutine_fn mirror_co_discard(void *opaque)
 mirror_write_complete(op, ret);
 }
 
-static unsigned mirror_perform(MirrorBlockJob *s, int64_t offset,
-   uint64_t bytes, MirrorMethod mirror_method)
+/* If mirror_method == MIRROR_METHOD_COPY, *offset and *bytes will be
+ * aligned as necessary */
+static void mirror_perform(MirrorBlockJob *s, int64_t *offset, int64_t *bytes,
+   MirrorMethod mirror_method)
 {
 MirrorOp *op;
 Coroutine *co;
-int64_t bytes_handled;
-
-/* FIXME: Drop this assertion */
-assert(bytes <= UINT_MAX);
 
 if (mirror_method == MIRROR_METHOD_COPY) {
-mirror_align_for_copy(s, &offset, &bytes, &bytes_handled);
-assert(bytes_handled <= UINT_MAX);
-} else {
-bytes_handled = bytes;
+mirror_align_for_copy(s, offset, bytes);
 }
 
 op = g_new(MirrorOp, 1);
 *op = (MirrorOp){
 .s  = s,
-.offset = 

[Qemu-devel] [PATCH 03/17] mirror: Pull *_align_for_copy() from *_co_read()

2018-08-12 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 block/mirror.c | 54 +-
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index c28b6159d5..34cb8293b2 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -305,42 +305,60 @@ static inline void coroutine_fn
 mirror_co_wait_for_any_operation(s, false);
 }
 
-/* Perform a mirror copy operation.
+/*
+ * Restrict *bytes to how much we can actually handle, and align the
+ * [*offset, *bytes] range to clusters if COW is needed.
  *
- * *op->bytes_handled is set to the number of bytes copied after and
+ * *bytes_handled is set to the number of bytes copied after and
  * including offset, excluding any bytes copied prior to offset due
- * to alignment.  This will be op->bytes if no alignment is necessary,
- * or (new_end - op->offset) if the tail is rounded up or down due to
+ * to alignment.  This will be *bytes if no alignment is necessary, or
+ * (new_end - *offset) if the tail is rounded up or down due to
  * alignment or buffer limit.
  */
-static void coroutine_fn mirror_co_read(void *opaque)
+static void mirror_align_for_copy(MirrorBlockJob *s,
+  int64_t *offset, uint64_t *bytes,
+  int64_t *bytes_handled)
 {
-MirrorOp *op = opaque;
-MirrorBlockJob *s = op->s;
-int nb_chunks;
-uint64_t ret;
 uint64_t max_bytes;
 
 max_bytes = s->granularity * s->max_iov;
 
 /* We can only handle as much as buf_size at a time. */
-op->bytes = MIN(s->buf_size, MIN(max_bytes, op->bytes));
-assert(op->bytes);
-assert(op->bytes < BDRV_REQUEST_MAX_BYTES);
-*op->bytes_handled = op->bytes;
+*bytes = MIN(s->buf_size, MIN(max_bytes, *bytes));
+assert(*bytes);
+assert(*bytes < BDRV_REQUEST_MAX_BYTES);
+*bytes_handled = *bytes;
 
 if (s->cow_bitmap) {
-*op->bytes_handled += mirror_cow_align(s, &op->offset, &op->bytes);
+*bytes_handled += mirror_cow_align(s, offset, bytes);
 }
 /* Cannot exceed BDRV_REQUEST_MAX_BYTES + INT_MAX */
-assert(*op->bytes_handled <= UINT_MAX);
-assert(op->bytes <= s->buf_size);
+assert(*bytes_handled <= UINT_MAX);
+assert(*bytes <= s->buf_size);
 /* The offset is granularity-aligned because:
  * 1) Caller passes in aligned values;
  * 2) mirror_cow_align is used only when target cluster is larger. */
-assert(QEMU_IS_ALIGNED(op->offset, s->granularity));
+assert(QEMU_IS_ALIGNED(*offset, s->granularity));
 /* The range is sector-aligned, since bdrv_getlength() rounds up. */
-assert(QEMU_IS_ALIGNED(op->bytes, BDRV_SECTOR_SIZE));
+assert(QEMU_IS_ALIGNED(*bytes, BDRV_SECTOR_SIZE));
+}
+
+/* Perform a mirror copy operation.
+ *
+ * *op->bytes_handled is set to the number of bytes copied after and
+ * including offset, excluding any bytes copied prior to offset due
+ * to alignment.  This will be op->bytes if no alignment is necessary,
+ * or (new_end - op->offset) if the tail is rounded up or down due to
+ * alignment or buffer limit.
+ */
+static void coroutine_fn mirror_co_read(void *opaque)
+{
+MirrorOp *op = opaque;
+MirrorBlockJob *s = op->s;
+int nb_chunks;
+uint64_t ret;
+
+mirror_align_for_copy(s, &op->offset, &op->bytes, op->bytes_handled);
 nb_chunks = DIV_ROUND_UP(op->bytes, s->granularity);
 
 while (s->buf_free_count < nb_chunks) {
-- 
2.17.1




[Qemu-devel] [PATCH 01/17] iotests: Try reading while mirroring in 156

2018-08-12 Thread Max Reitz
Currently, we never test whether we can read from the source while
mirroring (that means, whether we can read from the mirror BDS).  Add
such a test to 156 because it fits well.

Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/156 | 7 +++
 tests/qemu-iotests/156.out | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/tests/qemu-iotests/156 b/tests/qemu-iotests/156
index 0a9a09802e..99f7326158 100755
--- a/tests/qemu-iotests/156
+++ b/tests/qemu-iotests/156
@@ -106,6 +106,13 @@ _send_qemu_cmd $QEMU_HANDLE \
   'qemu-io source \"write -P 4 192k 64k\"' } }" \
 'return'
 
+# Read it back (proving that we can read while mirroring)
+_send_qemu_cmd $QEMU_HANDLE \
+"{ 'execute': 'human-monitor-command',
+   'arguments': { 'command-line':
+  'qemu-io source \"read -P 4 192k 64k\"' } }" \
+'return'
+
 # Copy source backing chain to the target before completing the job
 cp "$TEST_IMG.backing" "$TEST_IMG.target.backing"
 cp "$TEST_IMG" "$TEST_IMG.target"
diff --git a/tests/qemu-iotests/156.out b/tests/qemu-iotests/156.out
index 34c057b626..a591bd3a1e 100644
--- a/tests/qemu-iotests/156.out
+++ b/tests/qemu-iotests/156.out
@@ -20,6 +20,9 @@ Formatting 'TEST_DIR/t.IMGFMT.target.overlay', fmt=IMGFMT 
size=1048576 backing_f
 wrote 65536/65536 bytes at offset 196608
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 {"return": ""}
+read 65536/65536 bytes at offset 196608
+64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+{"return": ""}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "source"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "source"}}
-- 
2.17.1




[Qemu-devel] [PATCH 02/17] mirror: Make wait_for_any_operation() coroutine_fn

2018-08-12 Thread Max Reitz
mirror_wait_for_any_operation() calls qemu_co_queue_wait(), which is a
coroutine_fn (technically it is a macro which resolves to a
coroutine_fn).  Therefore, this function needs to be a coroutine_fn as
well.

This patch makes it and all of its callers coroutine_fns.

Signed-off-by: Max Reitz 
---
 block/mirror.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 85f5742eae..c28b6159d5 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -279,7 +279,8 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t 
*offset,
 return ret;
 }
 
-static inline void mirror_wait_for_any_operation(MirrorBlockJob *s, bool 
active)
+static inline void coroutine_fn
+mirror_co_wait_for_any_operation(MirrorBlockJob *s, bool active)
 {
 MirrorOp *op;
 
@@ -297,10 +298,11 @@ static inline void 
mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
 abort();
 }
 
-static inline void mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
+static inline void coroutine_fn
+mirror_co_wait_for_free_in_flight_slot(MirrorBlockJob *s)
 {
 /* Only non-active operations use up in-flight slots */
-mirror_wait_for_any_operation(s, false);
+mirror_co_wait_for_any_operation(s, false);
 }
 
 /* Perform a mirror copy operation.
@@ -343,7 +345,7 @@ static void coroutine_fn mirror_co_read(void *opaque)
 
 while (s->buf_free_count < nb_chunks) {
 trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
-mirror_wait_for_free_in_flight_slot(s);
+mirror_co_wait_for_free_in_flight_slot(s);
 }
 
 /* Now make a QEMUIOVector taking enough granularity-sized chunks
@@ -549,7 +551,7 @@ static uint64_t coroutine_fn 
mirror_iteration(MirrorBlockJob *s)
 
 while (s->in_flight >= MAX_IN_FLIGHT) {
 trace_mirror_yield_in_flight(s, offset, s->in_flight);
-mirror_wait_for_free_in_flight_slot(s);
+mirror_co_wait_for_free_in_flight_slot(s);
 }
 
 if (s->ret < 0) {
@@ -600,10 +602,10 @@ static void mirror_free_init(MirrorBlockJob *s)
  * mirror_resume() because mirror_run() will begin iterating again
  * when the job is resumed.
  */
-static void mirror_wait_for_all_io(MirrorBlockJob *s)
+static void coroutine_fn mirror_co_wait_for_all_io(MirrorBlockJob *s)
 {
 while (s->in_flight > 0) {
-mirror_wait_for_free_in_flight_slot(s);
+mirror_co_wait_for_free_in_flight_slot(s);
 }
 }
 
@@ -762,7 +764,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 if (s->in_flight >= MAX_IN_FLIGHT) {
 trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
s->in_flight);
-mirror_wait_for_free_in_flight_slot(s);
+mirror_co_wait_for_free_in_flight_slot(s);
 continue;
 }
 
@@ -770,7 +772,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 offset += bytes;
 }
 
-mirror_wait_for_all_io(s);
+mirror_co_wait_for_all_io(s);
 s->initial_zeroing_ongoing = false;
 }
 
@@ -916,7 +918,7 @@ static void coroutine_fn mirror_run(void *opaque)
 /* Do not start passive operations while there are active
  * writes in progress */
 while (s->in_active_write_counter) {
-mirror_wait_for_any_operation(s, true);
+mirror_co_wait_for_any_operation(s, true);
 }
 
 if (s->ret < 0) {
@@ -942,7 +944,7 @@ static void coroutine_fn mirror_run(void *opaque)
 if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
 (cnt == 0 && s->in_flight > 0)) {
 trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
-mirror_wait_for_free_in_flight_slot(s);
+mirror_co_wait_for_free_in_flight_slot(s);
 continue;
 } else if (cnt != 0) {
 delay_ns = mirror_iteration(s);
@@ -1028,7 +1030,7 @@ immediate_exit:
 assert(ret < 0 || ((s->common.job.force_cancel || !s->synced) &&
job_is_cancelled(&s->common.job)));
 assert(need_drain);
-mirror_wait_for_all_io(s);
+mirror_co_wait_for_all_io(s);
 }
 
 assert(s->in_flight == 0);
@@ -1098,11 +1100,11 @@ static void mirror_complete(Job *job, Error **errp)
 job_enter(job);
 }
 
-static void mirror_pause(Job *job)
+static void coroutine_fn mirror_pause(Job *job)
 {
 MirrorBlockJob *s = container_of(job, MirrorBlockJob, common.job);
 
-mirror_wait_for_all_io(s);
+mirror_co_wait_for_all_io(s);
 }
 
 static bool mirror_drained_poll(BlockJob *job)
-- 
2.17.1




Re: [Qemu-devel] [PATCH 0/2] mac: don't use legacy fw_cfg_init_mem() functions

2018-08-12 Thread David Gibson
On Fri, Aug 10, 2018 at 11:27:55AM +0100, Mark Cave-Ayland wrote:
> Here are a couple of patches to switch the fw_cfg initialisation for
> the Mac Old World and New World machines over to use qdev rather than
> using the legacy fw_cfg_init_mem() function.
> 
> Signed-off-by: Mark Cave-Ayland 

Applied to ppc-for-3.1, thanks.

> 
> 
> Mark Cave-Ayland (2):
>   mac_oldworld: don't use legacy fw_cfg_init_mem() function
>   mac_newworld: don't use legacy fw_cfg_init_mem() function
> 
>  hw/ppc/mac_newworld.c | 12 +++-
>  hw/ppc/mac_oldworld.c | 12 +++-
>  2 files changed, 22 insertions(+), 2 deletions(-)
> 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH] spapr_pci: factorize the use of SPAPR_MACHINE_GET_CLASS()

2018-08-12 Thread David Gibson
On Fri, Aug 10, 2018 at 10:00:26AM +0200, Cédric Le Goater wrote:
> It should save us some CPU cycles as these routines perform a lot of
> checks.

I don't think any of these is particularly fastpath, but sure, applied
to ppc-for-3.1.

> 
> Signed-off-by: Cédric Le Goater 
> ---
>  hw/ppc/spapr_pci.c | 11 ++-
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
> index 3791ced6c536..5cd676e4430d 100644
> --- a/hw/ppc/spapr_pci.c
> +++ b/hw/ppc/spapr_pci.c
> @@ -267,6 +267,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, 
> sPAPRMachineState *spapr,
>  target_ulong args, uint32_t nret,
>  target_ulong rets)
>  {
> +sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
>  uint32_t config_addr = rtas_ld(args, 0);
>  uint64_t buid = rtas_ldq(args, 1);
>  unsigned int func = rtas_ld(args, 3);
> @@ -334,7 +335,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, 
> sPAPRMachineState *spapr,
>  return;
>  }
>  
> -if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +if (!smc->legacy_irq_allocation) {
>  spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
>  }
>  spapr_irq_free(spapr, msi->first_irq, msi->num);
> @@ -375,7 +376,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, 
> sPAPRMachineState *spapr,
>  }
>  
>  /* Allocate MSIs */
> -if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +if (smc->legacy_irq_allocation) {
>  irq = spapr_irq_find(spapr, req_num, ret_intr_type == RTAS_TYPE_MSI,
>   &err);
>  } else {
> @@ -401,7 +402,7 @@ static void rtas_ibm_change_msi(PowerPCCPU *cpu, 
> sPAPRMachineState *spapr,
>  
>  /* Release previous MSIs */
>  if (msi) {
> -if (!SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +if (!smc->legacy_irq_allocation) {
>  spapr_irq_msi_free(spapr, msi->first_irq, msi->num);
>  }
>  spapr_irq_free(spapr, msi->first_irq, msi->num);
> @@ -1558,6 +1559,7 @@ static void spapr_phb_realize(DeviceState *dev, Error 
> **errp)
>  sPAPRMachineState *spapr =
>  (sPAPRMachineState *) object_dynamic_cast(qdev_get_machine(),
>TYPE_SPAPR_MACHINE);
> +sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
>  SysBusDevice *s = SYS_BUS_DEVICE(dev);
>  sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(s);
>  PCIHostState *phb = PCI_HOST_BRIDGE(s);
> @@ -1575,7 +1577,6 @@ static void spapr_phb_realize(DeviceState *dev, Error 
> **errp)
>  }
>  
>  if (sphb->index != (uint32_t)-1) {
> -sPAPRMachineClass *smc = SPAPR_MACHINE_GET_CLASS(spapr);
>  Error *local_err = NULL;
>  
>  smc->phb_placement(spapr, sphb->index,
> @@ -1720,7 +1721,7 @@ static void spapr_phb_realize(DeviceState *dev, Error 
> **errp)
>  uint32_t irq = SPAPR_IRQ_PCI_LSI + sphb->index * PCI_NUM_PINS + i;
>  Error *local_err = NULL;
>  
> -if (SPAPR_MACHINE_GET_CLASS(spapr)->legacy_irq_allocation) {
> +if (smc->legacy_irq_allocation) {
>  irq = spapr_irq_findone(spapr, _err);
>  if (local_err) {
>  error_propagate(errp, local_err);

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH] 40p: don't use legacy fw_cfg_init_mem() function

2018-08-12 Thread David Gibson
On Fri, Aug 10, 2018 at 01:04:18PM +0100, Mark Cave-Ayland wrote:
> Instead initialise the device via qdev to allow us to set device properties
> directly as required.
> 
> Signed-off-by: Mark Cave-Ayland 

Applied to ppc-for-3.1, thanks.

> ---
>  hw/ppc/prep.c | 13 +++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/ppc/prep.c b/hw/ppc/prep.c
> index 3401570d98..9cf4a2adc3 100644
> --- a/hw/ppc/prep.c
> +++ b/hw/ppc/prep.c
> @@ -706,7 +706,7 @@ static void ibm_40p_init(MachineState *machine)
>  uint16_t cmos_checksum;
>  PowerPCCPU *cpu;
>  DeviceState *dev;
> -SysBusDevice *pcihost;
> +SysBusDevice *pcihost, *s;
>  Nvram *m48t59 = NULL;
>  PCIBus *pci_bus;
>  ISABus *isa_bus;
> @@ -799,7 +799,16 @@ static void ibm_40p_init(MachineState *machine)
>  }
>  
>  /* Prepare firmware configuration for OpenBIOS */
> -fw_cfg = fw_cfg_init_mem(CFG_ADDR, CFG_ADDR + 2);
> +dev = qdev_create(NULL, TYPE_FW_CFG_MEM);
> +fw_cfg = FW_CFG(dev);
> +qdev_prop_set_uint32(dev, "data_width", 1);
> +qdev_prop_set_bit(dev, "dma_enabled", false);
> +object_property_add_child(OBJECT(qdev_get_machine()), TYPE_FW_CFG,
> +  OBJECT(fw_cfg), NULL);
> +qdev_init_nofail(dev);
> +s = SYS_BUS_DEVICE(dev);
> +sysbus_mmio_map(s, 0, CFG_ADDR);
> +sysbus_mmio_map(s, 1, CFG_ADDR + 2);
>  
>  if (machine->kernel_filename) {
>  /* load kernel */

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH] qemu-doc: mark ppc/prep machine as deprecated

2018-08-12 Thread David Gibson
On Sat, Aug 11, 2018 at 12:46:45AM +0200, Hervé Poussineau wrote:
> 40p machine type should be used instead.
> 
> Signed-off-by: Hervé Poussineau 
> ---
>  qemu-deprecated.texi | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/qemu-deprecated.texi b/qemu-deprecated.texi
> index 9920a85adc..9c7ff3fe2d 100644
> --- a/qemu-deprecated.texi
> +++ b/qemu-deprecated.texi
> @@ -218,6 +218,12 @@ support page sizes < 4096 any longer.
>  These machine types are very old and likely can not be used for live 
> migration
>  from old QEMU versions anymore. A newer machine type should be used instead.
>  
> +@subsection prep (PowerPC) (since 3.1)
> +
> +This machine type uses an unmaintained firmware, broken in lots of ways,
> +and unable to start post-2004 operating systems. 40p machine type should be
> +used instead.
> +
>  @section Device options
>  
>  @subsection Block device options

Applied to ppc-for-3.1, thanks.

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


Re: [Qemu-devel] [PATCH] fw_cfg: ignore suffixes in the bootdevice list dependent on machine class

2018-08-12 Thread David Gibson
On Fri, Aug 10, 2018 at 09:57:13AM -0300, Eduardo Habkost wrote:
> On Fri, Aug 10, 2018 at 01:40:27PM +0100, Mark Cave-Ayland wrote:
> > For the older machines (such as Mac and SPARC) the DT nodes representing
> > bootdevices for disk nodes are irregular for mainly historical reasons.
> > 
> > Since the majority of bootdevice nodes for these machines either do not 
> > have a
> > separate disk node or require different (custom) names then it is much 
> > easier
> > for processing to just disable all suffixes for a particular machine.
> > 
> > Introduce a new ignore_boot_device_suffixes MachineClass property to control
> > bootdevice suffix generation, defaulting to false in order to preserve
> > compatibility.
> > 
> > Suggested-by: Eduardo Habkost 
> > Signed-off-by: Mark Cave-Ayland 
> 
> Thanks, I'm queueing this for 3.1:
>   https://github.com/ehabkost/qemu.git machine-for-3.1

Thanks.

Acked-by: David Gibson 


-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: PGP signature


[Qemu-devel] Question about dirty page statistics for live migration

2018-08-12 Thread Li Qiang
Hello Dave, Juan and all,

It is useful to get the dirty page rates in guest to evaluate the guest
loads
so that we can make a decide to live migrate it or not. So I think we can
add a on-demand qmp for showing the dirty page rates.

I found someone has done this work in here:
-->https://github.com/grivon/yabusame-qemu-dpt
and here:
https://github.com/btrplace/qemu-patch

But seems not go to the upstream.

I want to know your opinions about adding this qmp.

Thanks,
Li Qiang


Re: [Qemu-devel] [PATCH] file-posix: Skip effectiveless OFD lock operations

2018-08-12 Thread Fam Zheng
On Fri, 08/10 14:14, Kevin Wolf wrote:
> Am 18.07.2018 um 10:43 hat Fam Zheng geschrieben:
> > If we know we've already locked the bytes, don't do it again; similarly
> > don't unlock a byte if we haven't locked it. This doesn't change the
> > behavior, but fixes a corner case explained below.
> > 
> > Libvirt had an error handling bug that an image can get its (ownership,
> > file mode, SELinux) permissions changed (RHBZ 1584982) by mistake behind
> > QEMU. Specifically, an image in use by Libvirt VM has:
> > 
> > $ ls -lhZ b.img
> > -rw-r--r--. qemu qemu system_u:object_r:svirt_image_t:s0:c600,c690 b.img
> > 
> > Trying to attach it a second time won't work because of image locking.
> > And after the error, it becomes:
> > 
> > $ ls -lhZ b.img
> > -rw-r--r--. root root system_u:object_r:virt_image_t:s0 b.img
> > 
> > Then, we won't be able to do OFD lock operations with the existing fd.
> > In other words, the code such as in blk_detach_dev:
> > 
> > blk_set_perm(blk, 0, BLK_PERM_ALL, _abort);
> > 
> > can abort() QEMU, out of environmental changes.
> > 
> > This patch is an easy fix to this and the change is regardlessly
> > reasonable, so do it.
> > 
> > Signed-off-by: Fam Zheng 
> > ---
> >  block/file-posix.c | 27 +--
> >  1 file changed, 17 insertions(+), 10 deletions(-)
> > 
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 60af4b3d51..45d44c9947 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -680,23 +680,28 @@ typedef enum {
> >   * file; if @unlock == true, also unlock the unneeded bytes.
> >   * @shared_perm_lock_bits is the mask of all permissions that are NOT 
> > shared.
> >   */
> > -static int raw_apply_lock_bytes(int fd,
> > +static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
> >  uint64_t perm_lock_bits,
> >  uint64_t shared_perm_lock_bits,
> >  bool unlock, Error **errp)
> >  {
> >  int ret;
> >  int i;
> > +uint64_t locked_perm, locked_shared_perm;
> > +
> > +locked_perm = s ? s->perm : 0;
> > +locked_shared_perm = s ? ~s->shared_perm & BLK_PERM_ALL : 0;
> 
> For the s == NULL case, using 0 is okay for locking because we will
> always consider the bit as previously unlocked, so we will lock it.
> 
> For unlocking, however, we'll also see it as previously unlocked, so we
> will never actually unlock anything any more.
> 
> Am I missing something?

You are right. Though s == NULL only happens in raw_co_create and the fd will be
closed before the function returns, I agree for the correctness of this function
it's better to do a blanket unlock when unlocking. Will respin.

Fam



Re: [Qemu-devel] [PATCH v6 3/4] spapr: introduce a IRQ controller backend to the machine

2018-08-12 Thread David Gibson

On Fri, Aug 10, 2018 at 09:47:49AM +0200, Cédric Le Goater wrote:
> On 08/10/2018 02:46 AM, David Gibson wrote:
> > On Mon, Jul 30, 2018 at 04:11:33PM +0200, Cédric Le Goater wrote:
> >> This proposal moves all the related IRQ routines of the sPAPR machine
> >> behind a sPAPR IRQ backend interface 'spapr_irq' to prepare for future
> >> changes. First of which will be to increase the size of the IRQ number
> >> space, then, will follow a new backend for the POWER9 XIVE IRQ controller.
> >>
> >> Signed-off-by: Cédric Le Goater 
> > 
> > Applied to ppc-for-3.1.
> 
> I see that you have applied patch 2/4 and not 3/4. I suppose you 
> are still reviewing 3/4.

Actually, I just forgot to push my tree out.  3/4 should be in there
now.
> 
> 
> Btw, it would be good if we could change the way the interrupt 
> device objects (ICSState and soon sPAPRXive) are allocated and 
> use object_initialize() instead of object_new(). That can come 
> later for XICS but for XIVE, it would be better to get the initial 
> patch right. This means that the holding C struct is common to 
> the QEMU and KVM devices.  
> 
> Thanks,
> 
> C. 
>  
> >> ---
> >>  include/hw/ppc/spapr.h |  11 +-
> >>  include/hw/ppc/spapr_irq.h |  22 
> >>  hw/ppc/spapr.c | 180 +
> >>  hw/ppc/spapr_cpu_core.c|   1 +
> >>  hw/ppc/spapr_irq.c | 230 +
> >>  5 files changed, 259 insertions(+), 185 deletions(-)
> >>
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index 73067f5ee8aa..ad4d7cfd97b0 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -4,7 +4,6 @@
> >>  #include "qemu/units.h"
> >>  #include "sysemu/dma.h"
> >>  #include "hw/boards.h"
> >> -#include "hw/ppc/xics.h"
> >>  #include "hw/ppc/spapr_drc.h"
> >>  #include "hw/mem/pc-dimm.h"
> >>  #include "hw/ppc/spapr_ovec.h"
> >> @@ -16,6 +15,7 @@ struct sPAPRNVRAM;
> >>  typedef struct sPAPREventLogEntry sPAPREventLogEntry;
> >>  typedef struct sPAPREventSource sPAPREventSource;
> >>  typedef struct sPAPRPendingHPT sPAPRPendingHPT;
> >> +typedef struct ICSState ICSState;
> >>  
> >>  #define HPTE64_V_HPTE_DIRTY 0x0040ULL
> >>  #define SPAPR_ENTRY_POINT   0x100
> >> @@ -110,6 +110,7 @@ struct sPAPRMachineClass {
> >>unsigned n_dma, uint32_t *liobns, Error **errp);
> >>  sPAPRResizeHPT resize_hpt_default;
> >>  sPAPRCapabilities default_caps;
> >> +sPAPRIrq *irq;
> >>  };
> >>  
> >>  /**
> >> @@ -780,14 +781,6 @@ int spapr_get_vcpu_id(PowerPCCPU *cpu);
> >>  void spapr_set_vcpu_id(PowerPCCPU *cpu, int cpu_index, Error **errp);
> >>  PowerPCCPU *spapr_find_cpu(int vcpu_id);
> >>  
> >> -int spapr_irq_find(sPAPRMachineState *spapr, int num, bool align,
> >> -   Error **errp);
> >> -#define spapr_irq_findone(spapr, errp) spapr_irq_find(spapr, 1, false, 
> >> errp)
> >> -int spapr_irq_claim(sPAPRMachineState *spapr, int irq, bool lsi, Error 
> >> **errp);
> >> -void spapr_irq_free(sPAPRMachineState *spapr, int irq, int num);
> >> -qemu_irq spapr_qirq(sPAPRMachineState *spapr, int irq);
> >> -
> >> -
> >>  int spapr_caps_pre_load(void *opaque);
> >>  int spapr_caps_pre_save(void *opaque);
> >>  
> >> diff --git a/include/hw/ppc/spapr_irq.h b/include/hw/ppc/spapr_irq.h
> >> index 6f7f50548809..0e98c4474bb2 100644
> >> --- a/include/hw/ppc/spapr_irq.h
> >> +++ b/include/hw/ppc/spapr_irq.h
> >> @@ -29,4 +29,26 @@ int spapr_irq_msi_alloc(sPAPRMachineState *spapr, 
> >> uint32_t num, bool align,
> >>  void spapr_irq_msi_free(sPAPRMachineState *spapr, int irq, uint32_t num);
> >>  void spapr_irq_msi_reset(sPAPRMachineState *spapr);
> >>  
> >> +typedef struct sPAPRIrq {
> >> +uint32_tnr_irqs;
> >> +
> >> +void (*init)(sPAPRMachineState *spapr, Error **errp);
> >> +int (*claim)(sPAPRMachineState *spapr, int irq, bool lsi, Error 
> >> **errp);
> >> +void (*free)(sPAPRMachineState *spapr, int irq, int num);
> >> +qemu_irq (*qirq)(sPAPRMachineState *spapr, int irq);
> >> +void (*print_info)(sPAPRMachineState *spapr, Monitor *mon);
> >> +} sPAPRIrq;
> >> +
> >> +extern sPAPRIrq spapr_irq_xics;
> >> +
> >> +int spapr_irq_claim(sPAPRMachineState *spapr, int irq, bool lsi, Error 
> >> **errp);
> >> +void spapr_irq_free(sPAPRMachineState *spapr, int irq, int num);
> >> +qemu_irq spapr_qirq(sPAPRMachineState *spapr, int irq);
> >> +
> >> +/*
> >> + * XICS legacy routines
> >> + */
> >> +int spapr_irq_find(sPAPRMachineState *spapr, int num, bool align, Error 
> >> **errp);
> >> +#define spapr_irq_findone(spapr, errp) spapr_irq_find(spapr, 1, false, 
> >> errp)
> >> +
> >>  #endif
> >> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> >> index 792e24453d8b..d9f8cca49208 100644
> >> --- a/hw/ppc/spapr.c
> >> +++ b/hw/ppc/spapr.c
> >> @@ -54,7 +54,6 @@
> >>  #include "hw/ppc/spapr.h"
> >>  #include "hw/ppc/spapr_vio.h"
> >>  #include "hw/pci-host/spapr.h"
> >> -#include 

Re: [Qemu-devel] [PATCH v7 6/9] qcow2: Increase the default upper limit on the L2 cache size

2018-08-12 Thread Max Reitz
On 2018-08-10 14:00, Alberto Garcia wrote:
> On Fri 10 Aug 2018 08:26:44 AM CEST, Leonid Bloch wrote:
>> The upper limit on the L2 cache size is increased from 1 MB to 32 MB.
>> This is done in order to allow default full coverage with the L2 cache
>> for images of up to 256 GB in size (was 8 GB). Note, that only the
>> needed amount to cover the full image is allocated. The value which is
>> changed here is just the upper limit on the L2 cache size, beyond which
>> it will not grow, even if the size of the image will require it to.
>>
>> Signed-off-by: Leonid Bloch 
> 
> Reviewed-by: Alberto Garcia 
> 
>> -#define DEFAULT_L2_CACHE_MAX_SIZE (1 * MiB)
>> +#define DEFAULT_L2_CACHE_MAX_SIZE (32 * MiB)
> 
> The patch looks perfect to me now and I'm fine with this change, but
> this is quite an increase from the previous default value. If anyone
> thinks that this is too aggressive (or too little :)) I'm all ears.

This is just noise from the sidelines (so nothing too serious), but
anyway, I don't like it very much.

My first point is that the old limit doesn't mean you can only use 8 GB
qcow2 images.  You can use more, you just can't access more than 8 GB
randomly.  I know I'm naive, but I think that the number of use cases
where you need random IOPS spread out over more than 8 GB of an image
are limited.

My second point is that qemu still allocated 128 MB of RAM by default.
Using 1/4th of that for every qcow2 image you attach to the VM seems a
bit much.

Now it gets a bit complicated.  This series makes cache-clean-interval
default to 10 minutes, so it shouldn't be an issue in practice.  But one
thing to note is that this is a Linux-specific feature, so on every
other system, this really means 32 MB per image.  (Also, 10 minutes
means that whenever I boot up my VM with a couple of disks with random
accesses all over the images during boot, I might end up using 32 MB per
image again (for 10 min), even though I don't really need that performance.)

Now if we really rely on that cache-clean-interval, why not make it
always cover the whole image by default?  I don't really see why we
should now say "256 GB seems reasonable, and 32 MB doesn't sound like
too much, let's go there".  (Well, OK, I do see how you end up using 32
MB as basically a safety margin, where you'd say that anything above it
is just unreasonable.)

Do we update the limit in a couple of years again because people have
more RAM and larger disks then?  (Maybe we do?)

My personal opinion is this: Most users should be fine with 8 GB of
randomly accessible image space (this may be wrong).  Whenever a user
does have an application that uses more than 8 GB, they are probably in
an area where they want to do some performance tuning anyway.  Requiring
them to set l2-cache-full in that case seems reasonable to me.  Pushing
the default to 256 GB to me looks a bit like just letting them run into
the problem later.  It doesn't solve the issue that you need to do some
performance tuning if you have a bit of a special use case (but maybe
I'm wrong and accessing more than 8 GB randomly is what everybody does
with their VMs).

(Maybe it's even a good thing to limit it to a smaller number so users
run into the issue sooner than later...)

OTOH, this change means that everyone on a non-Linux system will have to
use 32 MB of their RAM per qcow2 image, and everyone on a Linux system
will potentially use it e.g. during boot when you do access a lot
randomly (even though the performance usually is not of utmost
importance then (important, but not extremely so)).  But then again,
this will probably only affect a single disk (the one with the OS on
it), so it won't be too bad.

So my stance is:

(1) Is it really worth pushing the default to 256 GB if you probably
have to do a bit of performance tuning anyway when you get past 8 GB
random IOPS?  I think it's reasonable to ask users to use l2-cache-full
or adjust the cache to their needs.

(2) For non-Linux systems, this seems to really mean 32 MB of RAM per
qcow2 image.  That's 1/4th of default VM RAM.  Is that worth it?

(3) For Linux, I don't like it much either, but that's because I'm
stupid.  The fact that if you don't need this much random I/O only your
boot disk may cause a RAM usage spike, and even then it's going to go
down after 10 minutes, is probably enough to justify this change.


I suppose my moaning would subside if we only increased the default on
systems that actually support cache-clean-interval...?

Max


PS: I also don't quite like how you got to the default of 10 minutes of
the cache-clean-interval.  You can't justify using 32 MB as the default
cache size by virtue of "We have a cache-clean-interval now", and then
justify a CCI of 10 min by "It's just for VMs which sit idle".

No.  If you rely on CCI to be there to make the cache size reasonable by
default for whatever the user is doing with their images, you have to
consider that fact when choosing a CCI.

Ideally we'd probably want a soft and a 

[Qemu-devel] [PATCH v8 1/8] qcow2: Options' documentation fixes

2018-08-12 Thread Leonid Bloch
Signed-off-by: Leonid Bloch 
---
 docs/qcow2-cache.txt | 20 +---
 qemu-options.hx  |  9 ++---
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index 8a09a5cc5f..2326db01b9 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -77,7 +77,7 @@ aforementioned L2 cache, and its size can also be configured.
 Choosing the right cache sizes
 --
 In order to choose the cache sizes we need to know how they relate to
-the amount of allocated space.
+the amount of the allocated space.
 
 The amount of virtual disk that can be mapped by the L2 and refcount
 caches (in bytes) is:
@@ -86,7 +86,7 @@ caches (in bytes) is:
disk_size = refcount_cache_size * cluster_size * 8 / refcount_bits
 
 With the default values for cluster_size (64KB) and refcount_bits
-(16), that is
+(16), this becomes:
 
disk_size = l2_cache_size * 8192
disk_size = refcount_cache_size * 32768
@@ -97,12 +97,15 @@ need:
l2_cache_size = disk_size_GB * 131072
refcount_cache_size = disk_size_GB * 32768
 
-QEMU has a default L2 cache of 1MB (1048576 bytes) and a refcount
-cache of 256KB (262144 bytes), so using the formulas we've just seen
-we have
+For example, 1MB of L2 cache is needed to cover every 8 GB of the virtual
+image size (given that the default cluster size is used):
 
-   1048576 / 131072 = 8 GB of virtual disk covered by that cache
-262144 /  32768 = 8 GB
+   8 * 131072 = 1 MB
+
+A default refcount cache is 4 times the cluster size, which defaults to
+256 KB (262144 bytes). This is sufficient for 8 GB of image size:
+
+   262144 / 32768 = 8 GB
 
 
 How to configure the cache sizes
@@ -130,6 +133,9 @@ There are a few things that need to be taken into account:
memory as possible to the L2 cache before increasing the refcount
cache size.
 
+ - At most two of "l2-cache-size", "refcount-cache-size", and "cache-size"
+   can be set simultaneously.
+
 Unlike L2 tables, refcount blocks are not used during normal I/O but
 only during allocations and internal snapshots. In most cases they are
 accessed sequentially (even during random guest I/O) so increasing the
diff --git a/qemu-options.hx b/qemu-options.hx
index b1bf0f485f..f6804758d3 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -752,15 +752,18 @@ image file)
 
 @item cache-size
 The maximum total size of the L2 table and refcount block caches in bytes
-(default: 1048576 bytes or 8 clusters, whichever is larger)
+(default: the sum of l2-cache-size and refcount-cache-size)
 
 @item l2-cache-size
 The maximum size of the L2 table cache in bytes
-(default: 4/5 of the total cache size)
+(default: if cache-size is not defined - 1048576 bytes or 8 clusters, whichever
+is larger; otherwise, as large as possible or needed within the cache-size,
+while permitting the requested or the minimal refcount cache size)
 
 @item refcount-cache-size
 The maximum size of the refcount block cache in bytes
-(default: 1/5 of the total cache size)
+(default: 4 times the cluster size; or if cache-size is specified, the part of
+it which is not used for the L2 cache)
 
 @item cache-clean-interval
 Clean unused entries in the L2 and refcount caches. The interval is in seconds.
-- 
2.17.1




[Qemu-devel] [PATCH v8 5/8] qcow2: Increase the default upper limit on the L2 cache size

2018-08-12 Thread Leonid Bloch
The upper limit on the L2 cache size is increased from 1 MB to 32 MB.
This is done in order to allow default full coverage with the L2 cache
for images of up to 256 GB in size (was 8 GB). Note, that only the
needed amount to cover the full image is allocated. The value which is
changed here is just the upper limit on the L2 cache size, beyond which
it will not grow, even if the size of the image will require it to.

Signed-off-by: Leonid Bloch 
Reviewed-by: Alberto Garcia 
---
 block/qcow2.h| 2 +-
 docs/qcow2-cache.txt | 4 ++--
 qemu-options.hx  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index d917b5f577..e699a55d02 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -74,7 +74,7 @@
 /* Must be at least 4 to cover all cases of refcount table growth */
 #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
 
-#define DEFAULT_L2_CACHE_MAX_SIZE (1 * MiB)
+#define DEFAULT_L2_CACHE_MAX_SIZE (32 * MiB)
 
 #define DEFAULT_CLUSTER_SIZE (64 * KiB)
 
diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index e89e74b372..0fc438f397 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -124,8 +124,8 @@ There are a few things that need to be taken into account:
  - Both caches must have a size that is a multiple of the cluster size
(or the cache entry size: see "Using smaller cache sizes" below).
 
- - The maximum L2 cache size is 1 MB by default (enough for full coverage
-   of 8 GB images, with the default cluster size). This value can be
+ - The maximum L2 cache size is 32 MB by default (enough for full coverage
+   of 256 GB images, with the default cluster size). This value can be
modified using the "l2-cache-size" option. QEMU will not use more memory
than needed to hold all of the image's L2 tables, regardless of this max.
value. The minimal L2 cache size is 2 clusters (or 2 cache entries, see
diff --git a/qemu-options.hx b/qemu-options.hx
index 22e8e2d113..4c44cdbc23 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -756,7 +756,7 @@ The maximum total size of the L2 table and refcount block 
caches in bytes
 
 @item l2-cache-size
 The maximum size of the L2 table cache in bytes
-(default: if cache-size is not specified - 1M; otherwise, as large as possible
+(default: if cache-size is not specified - 32M; otherwise, as large as possible
 within the cache-size, while permitting the requested or the minimal refcount
 cache size)
 
-- 
2.17.1




[Qemu-devel] [PATCH v8 7/8] qcow2: Set the default cache-clean-interval to 10 minutes

2018-08-12 Thread Leonid Bloch
The default cache-clean-interval is set to 10 minutes, in order to lower
the overhead of the qcow2 caches (before the default was 0, i.e.
disabled).

Signed-off-by: Leonid Bloch 
Reviewed-by: Alberto Garcia 
---
 block/qcow2.c| 2 +-
 block/qcow2.h| 1 +
 docs/qcow2-cache.txt | 4 ++--
 qapi/block-core.json | 3 ++-
 qemu-options.hx  | 2 +-
 5 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 1445cd5360..f885afa0ed 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -944,7 +944,7 @@ static int qcow2_update_options_prepare(BlockDriverState 
*bs,
 /* New interval for cache cleanup timer */
 r->cache_clean_interval =
 qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
-s->cache_clean_interval);
+DEFAULT_CACHE_CLEAN_INTERVAL);
 #ifndef CONFIG_LINUX
 if (r->cache_clean_interval != 0) {
 error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
diff --git a/block/qcow2.h b/block/qcow2.h
index e699a55d02..5e94f7ffc4 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -78,6 +78,7 @@
 
 #define DEFAULT_CLUSTER_SIZE (64 * KiB)
 
+#define DEFAULT_CACHE_CLEAN_INTERVAL 600  /* seconds */
 
 #define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts"
 #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request"
diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index 0fc438f397..0530d79f5b 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -206,8 +206,8 @@ This example removes all unused cache entries every 15 
minutes:
 
-drive file=hd.qcow2,cache-clean-interval=900
 
-If unset, the default value for this parameter is 0 and it disables
-this feature.
+If unset, the default value for this parameter is 600. Setting it to 0
+disables this feature.
 
 Note that this functionality currently relies on the MADV_DONTNEED
 argument for madvise() to actually free the memory. This is a
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 5b9084a394..9a6a708a37 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2830,7 +2830,8 @@
 #
 # @cache-clean-interval:  clean unused entries in the L2 and refcount
 # caches. The interval is in seconds. The default value
-# is 0 and it disables this feature (since 2.5)
+# is 600, and 0 disables this feature. (since 2.5)
+#
 # @encrypt:   Image decryption options. Mandatory for
 # encrypted images, except when doing a metadata-only
 # probe of the image. (since 2.10)
diff --git a/qemu-options.hx b/qemu-options.hx
index 4c44cdbc23..6abf3631ec 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -767,7 +767,7 @@ it which is not used for the L2 cache)
 
 @item cache-clean-interval
 Clean unused entries in the L2 and refcount caches. The interval is in seconds.
-The default value is 0 and it disables this feature.
+The default value is 600. Setting it to 0 disables this feature.
 
 @item pass-discard-request
 Whether discard requests to the qcow2 device should be forwarded to the data
-- 
2.17.1




[Qemu-devel] [PATCH v8 2/8] qcow2: Make sizes more humanly readable

2018-08-12 Thread Leonid Bloch
Signed-off-by: Leonid Bloch 
Reviewed-by: Alberto Garcia 
---
 block/qcow2.c | 2 +-
 block/qcow2.h | 9 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index ec9e6238a0..67cc82f0b9 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -830,7 +830,7 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts 
*opts,
 }
 } else {
 if (!l2_cache_size_set) {
-*l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
+*l2_cache_size = MAX(DEFAULT_L2_CACHE_SIZE,
  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
  * s->cluster_size);
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index 81b844e936..39e1b279f8 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -27,6 +27,7 @@
 
 #include "crypto/block.h"
 #include "qemu/coroutine.h"
+#include "qemu/units.h"
 
 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@@ -43,11 +44,11 @@
 
 /* 8 MB refcount table is enough for 2 PB images at 64k cluster size
  * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
-#define QCOW_MAX_REFTABLE_SIZE 0x80
+#define QCOW_MAX_REFTABLE_SIZE (8 * MiB)
 
 /* 32 MB L1 table is enough for 2 PB images at 64k cluster size
  * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */
-#define QCOW_MAX_L1_SIZE 0x200
+#define QCOW_MAX_L1_SIZE (32 * MiB)
 
 /* Allow for an average of 1k per snapshot table entry, should be plenty of
  * space for snapshot names and IDs */
@@ -75,9 +76,9 @@
 
 /* Whichever is more */
 #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
-#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
+#define DEFAULT_L2_CACHE_SIZE (1 * MiB)
 
-#define DEFAULT_CLUSTER_SIZE 65536
+#define DEFAULT_CLUSTER_SIZE (64 * KiB)
 
 
 #define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts"
-- 
2.17.1




[Qemu-devel] [PATCH v8 6/8] qcow2: Resize the cache upon image resizing

2018-08-12 Thread Leonid Bloch
The caches are now recalculated upon image resizing. This is done
because the new default behavior of assigning L2 cache relatively to
the image size, implies that the cache will be adapted accordingly
after an image resize.

Signed-off-by: Leonid Bloch 
Reviewed-by: Alberto Garcia 
---
 block/qcow2.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 01c39c56c0..1445cd5360 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3418,6 +3418,7 @@ static int coroutine_fn 
qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
 uint64_t old_length;
 int64_t new_l1_size;
 int ret;
+QDict *options;
 
 if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
 prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
@@ -3642,6 +3643,8 @@ static int coroutine_fn 
qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
 }
 }
 
+bs->total_sectors = offset / BDRV_SECTOR_SIZE;
+
 /* write updated header.size */
 offset = cpu_to_be64(offset);
 ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size),
@@ -3652,6 +3655,13 @@ static int coroutine_fn 
qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
 }
 
 s->l1_vm_state_index = new_l1_size;
+
+/* Update cache sizes */
+options = qdict_clone_shallow(bs->options);
+ret = qcow2_update_options(bs, options, s->flags, errp);
+if (ret < 0) {
+goto fail;
+}
 ret = 0;
 fail:
qemu_co_mutex_unlock(&s->lock);
-- 
2.17.1




[Qemu-devel] [PATCH v8 4/8] qcow2: Assign the L2 cache relatively to the image size

2018-08-12 Thread Leonid Bloch
Sufficient L2 cache can noticeably improve the performance when using
large images with frequent I/O.

Previously, unless 'cache-size' was specified and was large enough, the
L2 cache was set to a certain size without taking the virtual image size
into account.

Now, the L2 cache assignment is aware of the virtual size of the image,
and will cover the entire image, unless the cache size needed for that is
larger than a certain maximum. This maximum is set to 1 MB by default
(enough to cover an 8 GB image with the default cluster size) but can
be increased or decreased using the 'l2-cache-size' option. This option
was previously documented as the *maximum* L2 cache size, and this patch
makes it behave as such, instead of as a constant size. Also, the
existing option 'cache-size' can limit the sum of both L2 and refcount
caches, as previously.

Signed-off-by: Leonid Bloch 
---
 block/qcow2.c  | 21 +
 block/qcow2.h  |  4 +---
 docs/qcow2-cache.txt   | 15 ++-
 qemu-options.hx|  6 +++---
 tests/qemu-iotests/137 |  1 -
 tests/qemu-iotests/137.out |  1 -
 6 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 7949d15fc6..01c39c56c0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -777,29 +777,35 @@ static void read_cache_sizes(BlockDriverState *bs, 
QemuOpts *opts,
  uint64_t *refcount_cache_size, Error **errp)
 {
 BDRVQcow2State *s = bs->opaque;
-uint64_t combined_cache_size;
+uint64_t combined_cache_size, l2_cache_max_setting;
 bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
 int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
+uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
 
 combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
 l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
 refcount_cache_size_set = qemu_opt_get(opts, 
QCOW2_OPT_REFCOUNT_CACHE_SIZE);
 
 combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
-*l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0);
+l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
+ DEFAULT_L2_CACHE_MAX_SIZE);
 *refcount_cache_size = qemu_opt_get_size(opts,
  QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
 
 *l2_cache_entry_size = qemu_opt_get_size(
 opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
 
+*l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting);
+
 if (combined_cache_size_set) {
 if (l2_cache_size_set && refcount_cache_size_set) {
 error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
" and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
"at the same time");
 return;
-} else if (*l2_cache_size > combined_cache_size) {
+} else if (l2_cache_size_set &&
+   (l2_cache_max_setting > combined_cache_size)) {
 error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
QCOW2_OPT_CACHE_SIZE);
 return;
@@ -814,9 +820,6 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts 
*opts,
 } else if (refcount_cache_size_set) {
 *l2_cache_size = combined_cache_size - *refcount_cache_size;
 } else {
-uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
-uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
-
 /* Assign as much memory as possible to the L2 cache, and
  * use the remainder for the refcount cache */
 if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
@@ -828,12 +831,6 @@ static void read_cache_sizes(BlockDriverState *bs, 
QemuOpts *opts,
 *l2_cache_size = combined_cache_size - *refcount_cache_size;
 }
 }
-} else {
-if (!l2_cache_size_set) {
-*l2_cache_size = MAX(DEFAULT_L2_CACHE_SIZE,
- (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
- * s->cluster_size);
-}
 }
 /* l2_cache_size and refcount_cache_size are ensured to have at least
  * their minimum values in qcow2_update_options_prepare() */
diff --git a/block/qcow2.h b/block/qcow2.h
index 39e1b279f8..d917b5f577 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -74,9 +74,7 @@
 /* Must be at least 4 to cover all cases of refcount table growth */
 #define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */
 
-/* Whichever is more */
-#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
-#define DEFAULT_L2_CACHE_SIZE (1 * MiB)
+#define DEFAULT_L2_CACHE_MAX_SIZE (1 * MiB)
 
 #define 

[Qemu-devel] [PATCH v8 0/8] Take the image size into account when allocating the L2 cache

2018-08-12 Thread Leonid Bloch
This series makes the qcow2 L2 cache assignment aware of the image size,
with the intention for it to cover the entire image. The importance of
this change is in noticeable performance improvement, especially with
heavy random I/O. The memory overhead is not big in most cases, as only
1 MB of cache for every 8 GB of image size is used. For cases with very
large images and/or small cluster sizes, or systems with limited RAM
resources, there is an upper limit on the default L2 cache: 32 MB. To
modify this limit one can use the already existing 'l2-cache-size' and
'cache-size' options. Moreover, this fixes the behavior of 'l2-cache-size',
as it was documented as the *maximum* L2 cache size, but in practice
behaved as the absolute size.

To compensate the memory overhead which may be increased following this
behavior, the default cache-clean-interval is set to 10 minutes by default
(was disabled by default before).

The L2 cache is also resized accordingly, by default, if the image is
resized.

Additionally, few minor changes are made (refactoring and documentation
fixes).

Differences from v1:
* .gitignore modification patch removed (unneeded).
* The grammar fix in conflicting cache sizing patch removed (merged).
* The update total_sectors when resizing patch squashed with the
  resizing patch.
* L2 cache is now capped at 32 MB.
* The default cache-clean-interval is set to 30 seconds.

Differences from v2:
* Made it clear in the documentation that setting cache-clean-interval
  to 0 disables this feature.

Differences from v3:
* The default cache-clean-interval is set to 10 minutes instead of 30
  seconds before.
* Commit message changes to better explain the patches.
* Some refactoring.

Differences from v4:
* Refactoring.
* Documentation and commit message fixes.

Differences from v5:
* A patch with cosmetic changes is split from the main patch
* A patch for avoiding duplication in refcount cache size assignment is
  split from the main patch.
* In the main patch the cap on the L2 cache size is set to only 1 MB (in
  order to be close to the previous behavior) and a separate patch sets
  it to 32 MB.
* Changes in the documentation fixes patch [1/8].

Differences from v6:
* A patch is added to make the defined sizes more humanly readable.

Differences from v7:
* Documentation and commit message changes.
* Fix: when cache-size is specified and is large enough, with the L2 and
  refcount caches not set, the L2 cache will be as large as needed to
  cover the entire image, and not as the default max. size. (Return to
  the previous behavior in this part, which was correct).
* Cosmetic changes patch not used.


Leonid Bloch (8):
  qcow2: Options' documentation fixes
  qcow2: Make sizes more humanly readable
  qcow2: Avoid duplication in setting the refcount cache size
  qcow2: Assign the L2 cache relatively to the image size
  qcow2: Increase the default upper limit on the L2 cache size
  qcow2: Resize the cache upon image resizing
  qcow2: Set the default cache-clean-interval to 10 minutes
  qcow2: Explicit number replaced by a constant

 block/qcow2.c  | 42 ++
 block/qcow2.h  | 12 +--
 docs/qcow2-cache.txt   | 39 ++-
 qapi/block-core.json   |  3 ++-
 qemu-options.hx| 11 ++
 tests/qemu-iotests/137 |  1 -
 tests/qemu-iotests/137.out |  1 -
 7 files changed, 64 insertions(+), 45 deletions(-)

-- 
2.17.1




[Qemu-devel] [PATCH v8 8/8] qcow2: Explicit number replaced by a constant

2018-08-12 Thread Leonid Bloch
Signed-off-by: Leonid Bloch 
Reviewed-by: Alberto Garcia 
---
 block/qcow2.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index f885afa0ed..ffb4a9e4a1 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1324,7 +1324,7 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
*bs, QDict *options,
 /* 2^(s->refcount_order - 3) is the refcount width in bytes */
 s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
 s->refcount_block_size = 1 << s->refcount_block_bits;
-bs->total_sectors = header.size / 512;
+bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
 s->csize_shift = (62 - (s->cluster_bits - 8));
 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
 s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
@@ -3450,7 +3450,7 @@ static int coroutine_fn 
qcow2_co_truncate(BlockDriverState *bs, int64_t offset,
 goto fail;
 }
 
-old_length = bs->total_sectors * 512;
+old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
 new_l1_size = size_to_l1(s, offset);
 
 if (offset < old_length) {
-- 
2.17.1




[Qemu-devel] [PATCH v8 3/8] qcow2: Avoid duplication in setting the refcount cache size

2018-08-12 Thread Leonid Bloch
The refcount cache size does not need to be set to its minimum value in
read_cache_sizes(), as it is set to at least its minimum value in
qcow2_update_options_prepare().

Signed-off-by: Leonid Bloch 
---
 block/qcow2.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 67cc82f0b9..7949d15fc6 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -834,10 +834,9 @@ static void read_cache_sizes(BlockDriverState *bs, 
QemuOpts *opts,
  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
  * s->cluster_size);
 }
-if (!refcount_cache_size_set) {
-*refcount_cache_size = min_refcount_cache;
-}
 }
+/* l2_cache_size and refcount_cache_size are ensured to have at least
+ * their minimum values in qcow2_update_options_prepare() */
 
 if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
 *l2_cache_entry_size > s->cluster_size ||
-- 
2.17.1




Re: [Qemu-devel] [PATCH v3 2/2] Add Nios II semihosting support.

2018-08-12 Thread Julian Brown
On Thu, 2 Aug 2018 15:56:47 -0600
Sandra Loosemore  wrote:

> On 05/18/2018 03:35 PM, Sandra Loosemore wrote:
> > On 05/18/2018 02:19 PM, Julian Brown wrote:  
> >> On Fri, 18 May 2018 21:52:04 +0200
> >> Marek Vasut  wrote:
> >>  
> >>> On 05/18/2018 09:23 PM, Julian Brown wrote:  
>  This patch (by Sandra Loosemore, mildly rebased) adds support for
>  semihosting for Nios II bare-metal emulation.
> 
>  Signed-off-by: Julian Brown 
>  Signed-off-by: Sandra Loosemore   
> >>>
> >>> Is there some documentation for this stuff ? It looks
> >>> interesting, but how can I try it here ?  
> >>
> >> There's no documentation AFAIK apart from that the entry points are
> >> the same as m68k, semihosting is invoked with "break 1", and r4/r5
> >> are used for passing arguments. I'm not actually sure how you can
> >> try this stuff without our startup code or other infrastructure
> >> (that I'm pretty sure we can't divulge). Sandra, any ideas?  
> > 
> > I don't see any reason why we couldn't contribute libgloss support, 
> > except that I don't have time to write such a BSP right now.  :-(
> > I recently did this for C-SKY, though, and the semihosting parts
> > were just a straightforward copy from the m68k port.  
> 
> I've posted a patch with libgloss semihosting support for nios2 here:
> 
> https://sourceware.org/ml/newlib/2018/msg00610.html
> 
> I hope this is enough to unblock consideration of the corresponding
> QEMU patch set now.  Here's a link to the original patch posting:
> 
> http://lists.nongnu.org/archive/html/qemu-devel/2018-05/msg04571.html

For completeness I've re-tested with a rebased QEMU and posted
again (with a couple of checkpatch.pl nits fixed, but otherwise
identical):

http://lists.nongnu.org/archive/html/qemu-devel/2018-08/msg01987.html

Thank you,

Julian



[Qemu-devel] [PATCH v4 2/2] Add Nios II semihosting support.

2018-08-12 Thread Julian Brown
This patch (by Sandra Loosemore, mildly rebased) adds support for
semihosting for Nios II bare-metal emulation.

Signed-off-by: Julian Brown 
Signed-off-by: Sandra Loosemore 
---
 qemu-options.hx|   8 +-
 target/nios2/Makefile.objs |   2 +-
 target/nios2/cpu.h |   4 +-
 target/nios2/helper.c  |  11 ++
 target/nios2/nios2-semi.c  | 446 +
 5 files changed, 465 insertions(+), 6 deletions(-)
 create mode 100644 target/nios2/nios2-semi.c

diff --git a/qemu-options.hx b/qemu-options.hx
index b1bf0f4..b4e9ea9 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3821,21 +3821,21 @@ ETEXI
 DEF("semihosting", 0, QEMU_OPTION_semihosting,
    "-semihosting    semihosting mode\n",
 QEMU_ARCH_ARM | QEMU_ARCH_M68K | QEMU_ARCH_XTENSA | QEMU_ARCH_LM32 |
-QEMU_ARCH_MIPS)
+QEMU_ARCH_MIPS | QEMU_ARCH_NIOS2)
 STEXI
 @item -semihosting
 @findex -semihosting
-Enable semihosting mode (ARM, M68K, Xtensa, MIPS only).
+Enable semihosting mode (ARM, M68K, Xtensa, MIPS, Nios II only).
 ETEXI
 DEF("semihosting-config", HAS_ARG, QEMU_OPTION_semihosting_config,
 "-semihosting-config 
[enable=on|off][,target=native|gdb|auto][,arg=str[,...]]\n" \
 "semihosting configuration\n",
 QEMU_ARCH_ARM | QEMU_ARCH_M68K | QEMU_ARCH_XTENSA | QEMU_ARCH_LM32 |
-QEMU_ARCH_MIPS)
+QEMU_ARCH_MIPS | QEMU_ARCH_NIOS2)
 STEXI
 @item -semihosting-config 
[enable=on|off][,target=native|gdb|auto][,arg=str[,...]]
 @findex -semihosting-config
-Enable and configure semihosting (ARM, M68K, Xtensa, MIPS only).
+Enable and configure semihosting (ARM, M68K, Xtensa, MIPS, Nios II only).
 @table @option
 @item target=@code{native|gdb|auto}
 Defines where the semihosting calls will be addressed, to QEMU (@code{native})
diff --git a/target/nios2/Makefile.objs b/target/nios2/Makefile.objs
index 2a11c5c..010de0e 100644
--- a/target/nios2/Makefile.objs
+++ b/target/nios2/Makefile.objs
@@ -1,4 +1,4 @@
-obj-y += translate.o op_helper.o helper.o cpu.o mmu.o
+obj-y += translate.o op_helper.o helper.o cpu.o mmu.o nios2-semi.o
 obj-$(CONFIG_SOFTMMU) += monitor.o
 
 $(obj)/op_helper.o: QEMU_CFLAGS += $(HELPER_CFLAGS)
diff --git a/target/nios2/cpu.h b/target/nios2/cpu.h
index 047f376..afd30d5 100644
--- a/target/nios2/cpu.h
+++ b/target/nios2/cpu.h
@@ -141,7 +141,7 @@ typedef struct Nios2CPUClass {
 #define R_PC 64
 
 /* Exceptions */
-#define EXCP_BREAK    -1
+#define EXCP_BREAK    0x1000
 #define EXCP_RESET    0
 #define EXCP_PRESET   1
 #define EXCP_IRQ  2
@@ -223,6 +223,8 @@ void nios2_cpu_do_unaligned_access(CPUState *cpu, vaddr 
addr,
 qemu_irq *nios2_cpu_pic_init(Nios2CPU *cpu);
 void nios2_check_interrupts(CPUNios2State *env);
 
+void do_nios2_semihosting(CPUNios2State *env);
+
 #define TARGET_PHYS_ADDR_SPACE_BITS 32
 #ifdef CONFIG_USER_ONLY
 # define TARGET_VIRT_ADDR_SPACE_BITS 31
diff --git a/target/nios2/helper.c b/target/nios2/helper.c
index a8b8ec6..ca3b087 100644
--- a/target/nios2/helper.c
+++ b/target/nios2/helper.c
@@ -25,6 +25,7 @@
 #include "exec/exec-all.h"
 #include "exec/log.h"
 #include "exec/helper-proto.h"
+#include "exec/semihost.h"
 
 #if defined(CONFIG_USER_ONLY)
 
@@ -169,6 +170,16 @@ void nios2_cpu_do_interrupt(CPUState *cs)
 break;
 
 case EXCP_BREAK:
+qemu_log_mask(CPU_LOG_INT, "BREAK exception at pc=%x\n",
+  env->regs[R_PC]);
+
+if (semihosting_enabled()) {
+qemu_log_mask(CPU_LOG_INT, "Entering semihosting\n");
+env->regs[R_PC] += 4;
+do_nios2_semihosting(env);
+break;
+}
+
 if ((env->regs[CR_STATUS] & CR_STATUS_EH) == 0) {
 env->regs[CR_BSTATUS] = env->regs[CR_STATUS];
 env->regs[R_BA] = env->regs[R_PC] + 4;
diff --git a/target/nios2/nios2-semi.c b/target/nios2/nios2-semi.c
new file mode 100644
index 000..df6d6fd
--- /dev/null
+++ b/target/nios2/nios2-semi.c
@@ -0,0 +1,446 @@
+/*
+ *  Nios II Semihosting syscall interface.
+ *  This code is derived from m68k-semi.c.
+ *
+ *  Copyright (c) 2017 Mentor Graphics
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#if defined(CONFIG_USER_ONLY)
+#include "qemu.h"
+#else
+#include "qemu-common.h"
+#include "exec/gdbstub.h"
+#include "exec/softmmu-semi.h"
+#endif
+#include "qemu/log.h"

[Qemu-devel] [PATCH v4 1/2] Add generic Nios II board.

2018-08-12 Thread Julian Brown
This patch adds support for a generic MMU-less Nios II board that can
be used e.g. for bare-metal compiler testing.  Nios II booting is also
tweaked so that bare-metal binaries start executing in RAM starting at
0x00000000, rather than an alias at 0xc0000000, which allows features
such as unwinding to work when binaries are linked to start at the
beginning of the address space.

The generic_nommu.c parts are by Andrew Jenner, based on code by Marek
Vasut.

Originally by Marek Vasut and Andrew Jenner.

Signed-off-by: Julian Brown 
Signed-off-by: Andrew Jenner 
Signed-off-by: Marek Vasut 
---
 hw/nios2/Makefile.objs   |   2 +-
 hw/nios2/boot.c  |   5 +-
 hw/nios2/generic_nommu.c | 128 +++
 3 files changed, 133 insertions(+), 2 deletions(-)
 create mode 100644 hw/nios2/generic_nommu.c

diff --git a/hw/nios2/Makefile.objs b/hw/nios2/Makefile.objs
index 6b5c421..680caaa 100644
--- a/hw/nios2/Makefile.objs
+++ b/hw/nios2/Makefile.objs
@@ -1 +1 @@
-obj-y = boot.o cpu_pic.o 10m50_devboard.o
+obj-y = boot.o cpu_pic.o 10m50_devboard.o generic_nommu.o
diff --git a/hw/nios2/boot.c b/hw/nios2/boot.c
index 4bb5b60..a027c11 100644
--- a/hw/nios2/boot.c
+++ b/hw/nios2/boot.c
@@ -140,6 +140,7 @@ void nios2_load_kernel(Nios2CPU *cpu, hwaddr ddr_base,
 uint64_t entry, low, high;
 uint32_t base32;
 int big_endian = 0;
+int kernel_space = 0;
 
 #ifdef TARGET_WORDS_BIGENDIAN
 big_endian = 1;
@@ -154,10 +155,12 @@ void nios2_load_kernel(Nios2CPU *cpu, hwaddr ddr_base,
 kernel_size = load_elf(kernel_filename, translate_kernel_address,
NULL, , NULL, NULL,
big_endian, EM_ALTERA_NIOS2, 0, 0);
+kernel_space = 1;
 }
 
 /* Always boot into physical ram. */
-boot_info.bootstrap_pc = ddr_base + 0xc0000000 + (entry & 0x07ffffff);
+boot_info.bootstrap_pc = ddr_base + (kernel_space ? 0xc0000000 : 0)
+ + (entry & 0x07ffffff);
 
 /* If it wasn't an ELF image, try an u-boot image. */
 if (kernel_size < 0) {
diff --git a/hw/nios2/generic_nommu.c b/hw/nios2/generic_nommu.c
new file mode 100644
index 000..f4bbc6e
--- /dev/null
+++ b/hw/nios2/generic_nommu.c
@@ -0,0 +1,128 @@
+/*
+ * Generic simulator target with no MMU
+ *
+ * Copyright (c) 2016 Marek Vasut 
+ *
+ * Based on LabX device code
+ *
+ * Copyright (c) 2012 Chris Wulff 
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "cpu.h"
+
+#include "hw/sysbus.h"
+#include "hw/hw.h"
+#include "hw/char/serial.h"
+#include "sysemu/sysemu.h"
+#include "hw/boards.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
+#include "qemu/config-file.h"
+
+#include "boot.h"
+
+#define BINARY_DEVICE_TREE_FILE    "generic-nommu.dtb"
+
+static void nios2_generic_nommu_init(MachineState *machine)
+{
+Nios2CPU *cpu;
+DeviceState *dev;
+MemoryRegion *address_space_mem = get_system_memory();
+MemoryRegion *phys_tcm = g_new(MemoryRegion, 1);
+MemoryRegion *phys_tcm_alias = g_new(MemoryRegion, 1);
+MemoryRegion *phys_ram = g_new(MemoryRegion, 1);
+MemoryRegion *phys_ram_alias = g_new(MemoryRegion, 1);
+ram_addr_t tcm_base = 0x0;
+ram_addr_t tcm_size = 0x1000;/* 1 kiB, but QEMU limit is 4 kiB */
+ram_addr_t ram_base = 0x10000000;
+ram_addr_t ram_size = 0x08000000;
+qemu_irq *cpu_irq, irq[32];
+int i;
+
+/* Physical TCM (tb_ram_1k) with alias at 0xc0000000 */
+memory_region_init_ram(phys_tcm, NULL, "nios2.tcm", tcm_size,
+   _abort);
+memory_region_init_alias(phys_tcm_alias, NULL, "nios2.tcm.alias",
+ phys_tcm, 0, tcm_size);
+memory_region_add_subregion(address_space_mem, tcm_base, phys_tcm);
+memory_region_add_subregion(address_space_mem, 0xc0000000 + tcm_base,
+phys_tcm_alias);
+
+/* Physical DRAM with alias at 0xc0000000 */
+memory_region_init_ram(phys_ram, NULL, "nios2.ram", ram_size,
+   _abort);
+memory_region_init_alias(phys_ram_alias, NULL, "nios2.ram.alias",
+ phys_ram, 0, ram_size);

[Qemu-devel] [PATCH v4 0/2] Nios II generic board config and semihosting

2018-08-12 Thread Julian Brown
This is the fourth version of the patch series previously posted here:

http://lists.nongnu.org/archive/html/qemu-devel/2018-05/msg04571.html

The contents of the patches are largely the same apart from some linting
from the checkpatch.pl script. I've also rebased and re-tested with
Sandra's new libgloss bits as mentioned in:

http://lists.nongnu.org/archive/html/qemu-devel/2018-08/msg00358.html

Thanks,

Julian

Julian Brown (2):
  Add generic Nios II board.
  Add Nios II semihosting support.

 hw/nios2/Makefile.objs |   2 +-
 hw/nios2/boot.c|   5 +-
 hw/nios2/generic_nommu.c   | 128 +
 qemu-options.hx|   8 +-
 target/nios2/Makefile.objs |   2 +-
 target/nios2/cpu.h |   4 +-
 target/nios2/helper.c  |  11 ++
 target/nios2/nios2-semi.c  | 446 +
 8 files changed, 598 insertions(+), 8 deletions(-)
 create mode 100644 hw/nios2/generic_nommu.c
 create mode 100644 target/nios2/nios2-semi.c

-- 
2.8.1




Re: [Qemu-devel] [PATCH v3 12/19] linux-user: Setup split syscall infrastructure

2018-08-12 Thread Richard Henderson
On 08/12/2018 01:45 PM, Laurent Vivier wrote:
> Le 12/06/2018 à 02:51, Richard Henderson a écrit :
>> Defines a unified structure for implementation and strace.
>> Supplies a generator script to build the declarations and
>> the lookup function.
>>
>> Signed-off-by: Richard Henderson 
>> ---
>>  linux-user/syscall.h   | 178 +++
>>  linux-user/strace.c| 386 -
>>  linux-user/syscall.c   | 113 --
>>  linux-user/Makefile.objs   |  10 +
>>  linux-user/gen_syscall_list.py |  82 +++
>>  5 files changed, 595 insertions(+), 174 deletions(-)
>>  create mode 100644 linux-user/syscall.h
>>  create mode 100644 linux-user/gen_syscall_list.py
> ...
>> diff --git a/linux-user/Makefile.objs b/linux-user/Makefile.objs
>> index 59a5c17354..afa69ed6d2 100644
>> --- a/linux-user/Makefile.objs
>> +++ b/linux-user/Makefile.objs
>> @@ -7,3 +7,13 @@ obj-$(TARGET_HAS_BFLT) += flatload.o
>>  obj-$(TARGET_I386) += vm86.o
>>  obj-$(TARGET_ARM) += arm/nwfpe/
>>  obj-$(TARGET_M68K) += m68k-sim.o
>> +
>> +GEN_SYSCALL_LIST = $(SRC_PATH)/linux-user/gen_syscall_list.py
>> +SYSCALL_LIST = linux-user/syscall_list.h linux-user/syscall_list.inc.c
>> +
>> +$(SYSCALL_LIST): $(GEN_SYSCALL_LIST)
>> +$(call quiet-command,\
>> +  $(PYTHON) $(GEN_SYSCALL_LIST) $@, "GEN", $(TARGET_DIR)$@)
>> +
>> +linux-user/syscall.o \
>> +linux-user/strace.o: $(SYSCALL_LIST)
>> diff --git a/linux-user/gen_syscall_list.py b/linux-user/gen_syscall_list.py
>> new file mode 100644
>> index 00..2e0fc39100
>> --- /dev/null
>> +++ b/linux-user/gen_syscall_list.py
>> @@ -0,0 +1,82 @@
>> +#
>> +# Linux syscall table generator
>> +# Copyright (c) 2018 Linaro, Limited.
>> +#
>> +# This program is free software; you can redistribute it and/or modify
>> +# it under the terms of the GNU General Public License as published by
>> +# the Free Software Foundation; either version 2 of the License, or
>> +# (at your option) any later version.
>> +#
>> +# This program is distributed in the hope that it will be useful,
>> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +# GNU General Public License for more details.
>> +#
>> +# You should have received a copy of the GNU General Public License
>> +# along with this program; if not, see <http://www.gnu.org/licenses/>.
>> +#
>> +
>> +from __future__ import print_function
>> +import sys
>> +
>> +# These are sets of all syscalls that have been converted.
>> +# The lists are in collation order with '_' ignored.
>> +
>> +# These syscalls are supported by all targets.
>> +# Avoiding ifdefs for these can diagnose typos in $cpu/syscall_nr.h
>> +unconditional_syscalls = [
>> +]
>> +
>> +# These syscalls are only supported by some target or abis.
>> +conditional_syscalls = [
>> +]
>> +
>> +
>> +def header(f):
>> +# Do not ifdef the declarations -- their use may be complicated.
>> +all = unconditional_syscalls + conditional_syscalls
>> +all.sort()
>> +for s in all:
>> +print("extern const SyscallDef def_", s, ";", sep = '', file = f)
>> +
>> +
>> +def def_syscall(s, f):
>> +print("case TARGET_NR_", s, ": return _", s, ";",
>> +  sep = '', file = f);
>> +
>> +
>> +def source(f):
>> +print("static const SyscallDef *syscall_table(int num)",
>> +  "{",
>> +  "switch (num) {",
>> +  sep = '\n', file = f)
>> +
>> +for s in unconditional_syscalls:
>> +def_syscall(s, f)
>> +for s in conditional_syscalls:
>> +print("#ifdef TARGET_NR_", s, sep = '', file = f)
>> +def_syscall(s, f)
>> +print("#endif", file = f)
>> +
>> +print("}",
>> +  "return NULL;",
>> +  "}",
>> +  sep = '\n', file = f);
>> +
>> +
>> +def main():
>> +p = sys.argv[1]
>> +f = open(p, "w")
>> +
>> +print("/* This file is autogenerated by gensyscall.py.  */\n\n",
>> +  file = f)
>> +
>> +if p[len(p) - 1] == 'h':
>> +header(f)
>> +else:
>> +source(f)
>> +
>> +f.close();
>> +
>> +
>> +main()
>>
> 
> As we can see in patch 19/19 it's easy to forget to update the syscalls
> lists.
> 
> Should it be possible to generate syscalls switch from the macro we
> already have?

I didn't see how, right off.

It was possible when all of the syscalls were still in the same file, as we
could get defined-but-unused warnings.  With them in different files, we don't
get that.

Perhaps there's a way that

> static const SyscallDef *syscall_table(int num)
> {
> switch (num) {
> #include "syscall_file.def"
> #include "syscall_ipc.def"
> #include "syscall_mem.def"
> #include "syscall_proc.def"
> }
> return NULL;
> }
> 
> and in syscall_proc.def:

something like this achieves that, via missing-prototype warnings.

But if we have a structure like this, I wonder if we're better off with

#include "syscall_file.inc.c"
#include 

Re: [Qemu-devel] [PATCH v3 12/19] linux-user: Setup split syscall infrastructure

2018-08-12 Thread Laurent Vivier
Le 12/06/2018 à 02:51, Richard Henderson a écrit :
> Defines a unified structure for implementation and strace.
> Supplies a generator script to build the declarations and
> the lookup function.
> 
> Signed-off-by: Richard Henderson 
> ---
>  linux-user/syscall.h   | 178 +++
>  linux-user/strace.c| 386 -
>  linux-user/syscall.c   | 113 --
>  linux-user/Makefile.objs   |  10 +
>  linux-user/gen_syscall_list.py |  82 +++
>  5 files changed, 595 insertions(+), 174 deletions(-)
>  create mode 100644 linux-user/syscall.h
>  create mode 100644 linux-user/gen_syscall_list.py
...
> diff --git a/linux-user/Makefile.objs b/linux-user/Makefile.objs
> index 59a5c17354..afa69ed6d2 100644
> --- a/linux-user/Makefile.objs
> +++ b/linux-user/Makefile.objs
> @@ -7,3 +7,13 @@ obj-$(TARGET_HAS_BFLT) += flatload.o
>  obj-$(TARGET_I386) += vm86.o
>  obj-$(TARGET_ARM) += arm/nwfpe/
>  obj-$(TARGET_M68K) += m68k-sim.o
> +
> +GEN_SYSCALL_LIST = $(SRC_PATH)/linux-user/gen_syscall_list.py
> +SYSCALL_LIST = linux-user/syscall_list.h linux-user/syscall_list.inc.c
> +
> +$(SYSCALL_LIST): $(GEN_SYSCALL_LIST)
> + $(call quiet-command,\
> +   $(PYTHON) $(GEN_SYSCALL_LIST) $@, "GEN", $(TARGET_DIR)$@)
> +
> +linux-user/syscall.o \
> +linux-user/strace.o: $(SYSCALL_LIST)
> diff --git a/linux-user/gen_syscall_list.py b/linux-user/gen_syscall_list.py
> new file mode 100644
> index 00..2e0fc39100
> --- /dev/null
> +++ b/linux-user/gen_syscall_list.py
> @@ -0,0 +1,82 @@
> +#
> +# Linux syscall table generator
> +# Copyright (c) 2018 Linaro, Limited.
> +#
> +# This program is free software; you can redistribute it and/or modify
> +# it under the terms of the GNU General Public License as published by
> +# the Free Software Foundation; either version 2 of the License, or
> +# (at your option) any later version.
> +#
> +# This program is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, see .
> +#
> +
> +from __future__ import print_function
> +import sys
> +
> +# These are sets of all syscalls that have been converted.
> +# The lists are in collation order with '_' ignored.
> +
> +# These syscalls are supported by all targets.
> +# Avoiding ifdefs for these can diagnose typos in $cpu/syscall_nr.h
> +unconditional_syscalls = [
> +]
> +
> +# These syscalls are only supported by some target or abis.
> +conditional_syscalls = [
> +]
> +
> +
> +def header(f):
> +# Do not ifdef the declarations -- their use may be complicated.
> +all = unconditional_syscalls + conditional_syscalls
> +all.sort()
> +for s in all:
> +print("extern const SyscallDef def_", s, ";", sep = '', file = f)
> +
> +
> +def def_syscall(s, f):
> +print("case TARGET_NR_", s, ": return &def_", s, ";",
> +  sep = '', file = f);
> +
> +
> +def source(f):
> +print("static const SyscallDef *syscall_table(int num)",
> +  "{",
> +  "switch (num) {",
> +  sep = '\n', file = f)
> +
> +for s in unconditional_syscalls:
> +def_syscall(s, f)
> +for s in conditional_syscalls:
> +print("#ifdef TARGET_NR_", s, sep = '', file = f)
> +def_syscall(s, f)
> +print("#endif", file = f)
> +
> +print("}",
> +  "return NULL;",
> +  "}",
> +  sep = '\n', file = f);
> +
> +
> +def main():
> +p = sys.argv[1]
> +f = open(p, "w")
> +
> +print("/* This file is autogenerated by gensyscall.py.  */\n\n",
> +  file = f)
> +
> +if p[len(p) - 1] == 'h':
> +header(f)
> +else:
> +source(f)
> +
> +f.close();
> +
> +
> +main()
> 

As we can see in patch 19/19 it's easy to forget to update the syscalls
lists.

Should it be possible to generate syscalls switch from the macro we
already have?

Something like (it should not be as simple as this but...):

#define SYSCALL_DEF(NAME, ...) \
case TARGET_NR_##NAME: return @def_##NAME

static const SyscallDef *syscall_table(int num)
{
switch (num) {
#include "syscall_file.def"
#include "syscall_ipc.def"
#include "syscall_mem.def"
#include "syscall_proc.def"
}
return NULL;
}

and in syscall_proc.def:

...
SYSCALL_DEF(clone)
#ifdef TARGET_NR_fork
SYSCALL_DEF(fork);
#endif
#ifdef TARGET_NR_vfork
SYSCALL_DEF(vfork);
#endif
...
SYSCALL_DEF(set_tid_address, ARG_PTR);

and in syscall_proc.c:
...
SYSCALL_IMPL(set_tid_address)
{
return get_errno(set_tid_address((int *)g2h(arg1)));
}
#include "syscall_proc.def".

So compiler will detect any unused references (if SYSCALL_DEF doesn't
match SYSCALL_IMPL), the syscall_table has automatically all implemented
syscalls, 

Re: [Qemu-devel] [PATCH] target-i386: Fix lcall to call gate in IA-32e mode

2018-08-12 Thread Andrew Oates via Qemu-devel
On Sun, Aug 12, 2018 at 6:17 AM Paolo Bonzini  wrote:

> On 12/08/2018 05:07, Andrew Oates via Qemu-devel wrote:
> > Currently call gates are always treated as 32-bit gates.  In IA-32e mode
> > (either compatibility or 64-bit submode), system segment descriptors are
> > always 64-bit.  Treating them as 32-bit has the expected unfortunate
> > effect: only the lower 32 bits of the offset are loaded, the stack
> > pointer is truncated, a bad new stack pointer is loaded from the TSS (if
> > switching privilege levels), etc.
> >
> > This change adds support for 64-bit call gate to the lcall instruction.
> > Additionally, there should be a check for non-canonical stack pointers,
> > but I've omitted that since there doesn't seem to be checks for
> > non-canonical addresses in this code elsewhere.
>
> Thanks.  It's also missing a check for task gates in 64-bit mode and,
> since we are at it, we should also implement ljmp to call gates.
> Something like this (untested):
>

Yeah, I was going to do ljmp in a followup.  I'll go ahead and add it to
this patch.


>
> diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
> index d3b31f3c1b..0b2efe2cae 100644
> --- a/target/i386/seg_helper.c
> +++ b/target/i386/seg_helper.c
> @@ -1645,6 +1645,13 @@ void helper_ljmp_protected(CPUX86State *env, int
> new_cs, target_ulong new_eip,
>  rpl = new_cs & 3;
>  cpl = env->hflags & HF_CPL_MASK;
>  type = (e2 >> DESC_TYPE_SHIFT) & 0xf;
> +#ifdef TARGET_X86_64
> +if (env->efer & MSR_EFER_LMA) {
> +if (type != 12) {
> +raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 0xfffc,
> GETPC());
> +}
> +}
> +#endif
>  switch (type) {
>  case 1: /* 286 TSS */
>  case 9: /* 386 TSS */
> @@ -1666,6 +1673,21 @@ void helper_ljmp_protected(CPUX86State *env, int
> new_cs, target_ulong new_eip,
>  new_eip = (e1 & 0xffff);
>  if (type == 12) {
>  new_eip |= (e2 & 0xffff0000);
> +#ifdef TARGET_X86_64
> +if (env->efer & MSR_EFER_LMA) {
> +/* load the upper 8 bytes of the 64-bit call gate */
> +if (load_segment_ra(env, &e1, &e2, new_cs + 8,
> GETPC())) {
> +raise_exception_err_ra(env, EXCP0D_GPF, new_cs &
> 0xfffc,
> +   GETPC());
> +}
> +type = (e2 >> DESC_TYPE_SHIFT) & 0x1f;
> +if (type != 0) {
> +raise_exception_err_ra(env, EXCP0D_GPF, gate_cs &
> 0xfffc,
> +   GETPC());
> +}
> +new_eip |= ((target_ulong)e1) << 32;
> +}
> +#endif
>  }
>  if (load_segment_ra(env, &e1, &e2, gate_cs, GETPC()) != 0) {
>  raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 0xfffc,
> GETPC());
> @@ -1812,6 +1834,13 @@ void helper_lcall_protected(CPUX86State *env, int
> new_cs, target_ulong new_eip,
>  type = (e2 >> DESC_TYPE_SHIFT) & 0x1f;
>  dpl = (e2 >> DESC_DPL_SHIFT) & 3;
>  rpl = new_cs & 3;
> +#ifdef TARGET_X86_64
> +if (env->efer & MSR_EFER_LMA) {
> +if (type != 12) {
> +raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 0xfffc,
> GETPC());
> +}
> +}
> +#endif
>  switch (type) {
>  case 1: /* available 286 TSS */
>  case 9: /* available 386 TSS */
>
>
> Out of curiosity, what OS is using 64-bit call gates? :)
>

As far as I know, no important ones :)  Just something I've been hacking
around with in my spare time.  The rational thing to do would have been to
just stop using 64-bit call gates, but I figured it would be more fun to
fix it for everyone.


>
> Paolo
>
> > I've left the raise_exception_err_ra lines unwrapped at 80 columns to
> > match the style in the rest of the file.
> >
> > Signed-off-by: Andrew Oates 
> > ---
> >  target/i386/seg_helper.c | 145 ---
> >  1 file changed, 106 insertions(+), 39 deletions(-)
>
>


[Qemu-devel] [Bug 1777235] Re: NVME is missing support for Get Log Page command

2018-08-12 Thread Lorenz Brun via Qemu-devel
I have a patch for that, it was designed to run NVMe for MacOS guests
and implements at least the bare minimum of the spec. I'll try to polish
it up and upstream it as soon as I have time.

** Patch added: "0001-Enough-GetLogCmd-for-macOS.patch"
   
https://bugs.launchpad.net/qemu/+bug/1777235/+attachment/5174516/+files/0001-Enough-GetLogCmd-for-macOS.patch

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1777235

Title:
  NVME is missing support for Get Log Page command

Status in QEMU:
  New

Bug description:
  "Get Log Page" is a mandatory admin command by the specification (NVMe
  1.2, Section 5, Figure 40) currently not implemented by device.

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1777235/+subscriptions



[Qemu-devel] QEMU on Solaris

2018-08-12 Thread Michele Denber

After configuring QEMU on my Sun I got this message:

"Host OS SunOS support is not currently maintained.
The QEMU project intends to remove support for this host OS in
a future release if nobody volunteers to maintain it and to
provide a build host for our continuous integration setup.
configure has succeeded and you can continue to build, but
if you care about QEMU on this platform you should contact
us upstream at qemu-devel@nongnu.org."

Well I do care about QEMU on my platform so I'd like to volunteer to 
provide a host for support.  I can provide you with a free account on my 
Sun Oracle Enterprise M3000 quad-core 2.75 GHz. SPARC64 VII running 
Solaris 10 Update 11.  I've got plenty of spare CPU cycles and lots of 
free disk.  Please let me know.


- Michele



Re: [Qemu-devel] [PATCH] secondary-vga: unregister vram on unplug.

2018-08-12 Thread Paolo Bonzini
On 11/08/2018 21:07, Remy NOEL wrote:
> On 08/07/2018 05:09 PM, Dr. David Alan Gilbert wrote:
> 
>> * Peter Maydell (peter.mayd...@linaro.org) wrote:
>>> On 7 August 2018 at 15:57, Dr. David Alan Gilbert
>>>  wrote:
 * Gerd Hoffmann (kra...@redhat.com) wrote:
> On Fri, Jul 20, 2018 at 10:19:48AM +0200, remy.n...@blade-group.com
> wrote:
>> From: "Remy Noel" 
>>
>> When removing a secondary-vga device and then adding it back (or
>> adding
>> an other one), qemu aborts with:
>>  "RAMBlock "0000:00:02.0/vga.vram" already registered, abort!".
>>
>> It is caused by the vram staying registered, preventing vga
>> replugging.
> David?  Does that look ok?
>
> This balances the
>
>   vmstate_register_ram(&s->vram, s->global_vmstate ?  NULL :
> DEVICE(obj));
>
> call in vga_common_init().  I'm wondering whenever the manual
> cleanup is
> actually needed in case owner is not NULL?
 I can't see anyone who is calling unregister_ram or the functions it
 calls as part of generic device cleanup, so I think it IS needed
 to manually do it.

 Which is a bit worrying since we have vastly more register's than
 unregister's.
>>> Paolo suggested in an email last month that vmstate_unregister_ram()
>>> should simply not exist, because it doesn't actually do anything useful:
>>> https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg01125.html
>>>
>>> (ie it was added in the first place because we'd ended up with
>>> two identically named ramblocks, but that only happened because
>>> a reference-counting bug meant we hadn't deleted the first one
>>> properly before creating the second.)
>>>
>>> So I think that the bug reported in this thread is similar:
>>> the problem is not that we're not calling vmstate_unregister_ram(),
>>> but that when the first instance of secondary-vga is removed
>>> it is not correctly destroying the ramblock.
>> Ah yes that makes more sense; I remember there was another similar bug
>> where a device screwed up and didn't delete it's RAM causing similar
>> problems.
>>
>> Dave
> Thanks for the feedback, after closer inspection, the secondary-vga
> refcount does, indeed, never reach 0.
> 
> I noticed the bug was not present in v2.12.0 and had been visible since
> 93abfc88bd649de1933588bfc7175605331b3ea9
> (https://lists.gnu.org/archive/html/qemu-devel/2018-06/msg07547.html).
> 
> This patch causes the secondary-vga object to be referenced by its
> subregions (mrs) which are themselves referenced by its mmio region
> which is referenced by the device causing a reference loop.
> We should probably break this loop upon exit, however, i am not sure
> whether we should deletes the subregions or delete the mmio properly.

I'll take a look...

Paolo




Re: [Qemu-devel] [PATCH 28/56] json: Fix \uXXXX for surrogate pairs

2018-08-12 Thread Paolo Bonzini
On 08/08/2018 14:03, Markus Armbruster wrote:
> +if (cp >= 0xD800 && cp <= 0xDBFF && !leading_surrogate
> +&& ptr[1] == '\\' && ptr[2] == 'u') {
> +ptr += 2;
> +leading_surrogate = cp;
> +goto hex;
> +}
> +if (cp >= 0xDC00 && cp <= 0xDFFF && leading_surrogate) {
> +cp &= 0x3FF;
> +cp |= (leading_surrogate & 0x3FF) << 10;
> +cp += 0x010000;
> +}
> +

The leading surrogate is discarded for \uD800\uCAFE, I think.  Is this
desired?

Paolo



Re: [Qemu-devel] [PATCH v3 23/23] libqtest: Rename qtest_FOOv() to qtest_vFOO() for consistency

2018-08-12 Thread Paolo Bonzini
On 06/08/2018 08:53, Markus Armbruster wrote:
> 13 of 13 C99 library function pairs taking ... or a va_list parameter
> are called FOO() and vFOO().  In QEMU, we sometimes call the one
> taking a va_list FOOv() instead.  Bad taste.  libqtest.h uses both
> spellings.  Normalize it to the standard spelling.

Probably the bad example here was g_strdupv and g_strjoinv.  Those
however are taking a NULL-terminated char** [1] so the example was
copied incorrectly.

Paolo

[1] In this case, it's NULL not NUL. :)



Re: [Qemu-devel] [PATCH] hw/pci: add pci capability to pci-pci bridge

2018-08-12 Thread Laszlo Ersek
On 08/12/18 09:11, Marcel Apfelbaum wrote:
> On 08/07/2018 06:59 PM, Laszlo Ersek wrote:

>> First, under the label "shpc_error", we call pci_bridge_exitfn(), which
>> seems to clean up everything (checking individually for each thing to
>> clean up). Given this, I wonder why we introduced the "slotid_error" and
>> "msi_error" labels at all. Cascading teardown on the error path, and
>> invkoing a function that checks everything individually and then tears
>> it all down, are usually mutually exclusive.
> 
> I think is possible you miss-interpreted pci_bridge_dev_exitfn
> with pci_bridge_exitfn. The first one is the "catch all", the second
> one that is used in the error path is for the bridge specific cleanup.

Ah, you are correct, I totally mistook the call to pci_bridge_exitfn()
for pci_bridge_dev_exitfn(). I do see the difference now, so it's clear
that the error path in pci_bridge_dev_realize() is of the "cascading"
kind, and not of the "call a catch-all function" kind.

That gives this patch a clear pattern to follow.

Thank you!
Laszlo



Re: [Qemu-devel] RDMA wrongly detected as being supported on FreeBSD

2018-08-12 Thread Marcel Apfelbaum




On 08/12/2018 11:39 AM, Yuval Shaia wrote:

On Sat, Aug 11, 2018 at 07:47:49PM +0300, Marcel Apfelbaum wrote:

Hi,

On 08/06/2018 11:51 AM, Thomas Huth wrote:

On 07/28/2018 05:50 AM, Rebecca Cran wrote:

On 7/25/18 5:14 AM, Thomas Huth wrote:


Note that the error has been reported to happen on FreeBSD - so I doubt
that this  header should be here.

Anyway, our include/standard-headers/linux/types.h is also empty ... so
could you try whether it compiles if you simply remove this #include
line, Rebecca?

Sorry for the delay, I'm just getting back to this. Removing the include
causes it to fail later on, with:

/home/bcran/workspace/qemu/hw/rdma/vmw/pvrdma_cmd.c:60:17: warning:
implicit declaration of function 'mremap' is invalid in C99
[-Wimplicit-function-declaration]
      host_virt = mremap(curr_page, 0, length, MREMAP_MAYMOVE);

OK, thanks for checking. According to

   https://www.freebsd.org/cgi/man.cgi?query=mremap&manpath=NetBSD+5.0

that syscall should also be available on FreeBSD. So could you please do
one more test and see whether it works when you add the following line
somewhere at the beginning of the file:

#include <sys/mman.h>

It took some time, but I finally have a FreeBSD VM with RDMA enabled.

My findings:
- The RDMA libraries are there, so the flag  "RDMA"=yes is set correctly.
- include of  is not needed, I will remove it.
- In FreeBSD mremap is not implemented in all configurations, it depends
   on some SANITIZER_LINUX flag (Linux emulation?), I think;
   so pvrdma cannot be compiled.  I think we should have a different QEMU
configuration
   flag --enable-pvrdma that will specifically check it is not a "bsd"
platform.

A good idea.

So, do i get it right that you will "revert back" the things we did
to--enable-rdma and do all our stuff in new --enable-pvrdma section?


Yes, but I plan to do it as a patch "on top" rather than reverting the 
actual commit.


Thanks,
Marcel


   I'll send a patch for that too.

Thanks,
Marcel



   Thanks,
Thomas





Re: [Qemu-devel] [PATCH] target-i386: Fix lcall to call gate in IA-32e mode

2018-08-12 Thread Paolo Bonzini
On 12/08/2018 05:07, Andrew Oates via Qemu-devel wrote:
> Currently call gates are always treated as 32-bit gates.  In IA-32e mode
> (either compatibility or 64-bit submode), system segment descriptors are
> always 64-bit.  Treating them as 32-bit has the expected unfortunate
> effect: only the lower 32 bits of the offset are loaded, the stack
> pointer is truncated, a bad new stack pointer is loaded from the TSS (if
> switching privilege levels), etc.
> 
> This change adds support for 64-bit call gate to the lcall instruction.
> Additionally, there should be a check for non-canonical stack pointers,
> but I've omitted that since there doesn't seem to be checks for
> non-canonical addresses in this code elsewhere.

Thanks.  It's also missing a check for task gates in 64-bit mode and,
since we are at it, we should also implement ljmp to call gates.
Something like this (untested):

diff --git a/target/i386/seg_helper.c b/target/i386/seg_helper.c
index d3b31f3c1b..0b2efe2cae 100644
--- a/target/i386/seg_helper.c
+++ b/target/i386/seg_helper.c
@@ -1645,6 +1645,13 @@ void helper_ljmp_protected(CPUX86State *env, int new_cs, 
target_ulong new_eip,
 rpl = new_cs & 3;
 cpl = env->hflags & HF_CPL_MASK;
 type = (e2 >> DESC_TYPE_SHIFT) & 0xf;
+#ifdef TARGET_X86_64
+if (env->efer & MSR_EFER_LMA) {
+if (type != 12) {
+raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 0xfffc, 
GETPC());
+}
+}
+#endif
 switch (type) {
 case 1: /* 286 TSS */
 case 9: /* 386 TSS */
@@ -1666,6 +1673,21 @@ void helper_ljmp_protected(CPUX86State *env, int new_cs, 
target_ulong new_eip,
 new_eip = (e1 & 0xffff);
 if (type == 12) {
 new_eip |= (e2 & 0xffff0000);
+#ifdef TARGET_X86_64
+if (env->efer & MSR_EFER_LMA) {
+/* load the upper 8 bytes of the 64-bit call gate */
+if (load_segment_ra(env, &e1, &e2, new_cs + 8, GETPC())) {
+raise_exception_err_ra(env, EXCP0D_GPF, new_cs & 
0xfffc,
+   GETPC());
+}
+type = (e2 >> DESC_TYPE_SHIFT) & 0x1f;
+if (type != 0) {
+raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 
0xfffc,
+   GETPC());
+}
+new_eip |= ((target_ulong)e1) << 32;
+}
+#endif
 }
 if (load_segment_ra(env, &e1, &e2, gate_cs, GETPC()) != 0) {
 raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 0xfffc, 
GETPC());
@@ -1812,6 +1834,13 @@ void helper_lcall_protected(CPUX86State *env, int 
new_cs, target_ulong new_eip,
 type = (e2 >> DESC_TYPE_SHIFT) & 0x1f;
 dpl = (e2 >> DESC_DPL_SHIFT) & 3;
 rpl = new_cs & 3;
+#ifdef TARGET_X86_64
+if (env->efer & MSR_EFER_LMA) {
+if (type != 12) {
+raise_exception_err_ra(env, EXCP0D_GPF, gate_cs & 0xfffc, 
GETPC());
+}
+}
+#endif
 switch (type) {
 case 1: /* available 286 TSS */
 case 9: /* available 386 TSS */


Out of curiosity, what OS is using 64-bit call gates? :)

Paolo

> I've left the raise_exception_err_ra lines unwrapped at 80 columns to
> match the style in the rest of the file.
> 
> Signed-off-by: Andrew Oates 
> ---
>  target/i386/seg_helper.c | 145 ---
>  1 file changed, 106 insertions(+), 39 deletions(-)




Re: [Qemu-devel] [PATCH v7 62/80] linux-user: Add syscall numbers for nanoMIPS

2018-08-12 Thread Laurent Vivier
Le 12/08/2018 à 17:44, Aleksandar Markovic a écrit :
> ping

Where can we find the kernel source for nanoMIPS to be able to compare
the QEMU syscall numbers definition?

Thanks,
Laurent



[Qemu-devel] Booting linux with ppc 40p machine

2018-08-12 Thread Guenter Roeck

Hi all,

I have been trying to boot linux on the 40p machine with qemu-system-ppc
for quite some time, with no success. I have managed to get into the BIOS,
but not further than that.

Assuming it is in possible to boot Linux on the 40p machine, is there a
qemu command line (and Linux kernel configuration) that is known to work ?

Thanks,
Guenter



Re: [Qemu-devel] [PATCH v7 62/80] linux-user: Add syscall numbers for nanoMIPS

2018-08-12 Thread Aleksandar Markovic
ping


From: Aleksandar Markovic 
Sent: Monday, August 6, 2018 7:00:29 PM
To: qemu-devel@nongnu.org
Cc: peter.mayd...@linaro.org; laur...@vivier.eu; riku.voi...@iki.fi; 
philippe.mathieu.da...@gmail.com; aurel...@aurel32.net; 
richard.hender...@linaro.org; Aleksandar Markovic; Stefan Markovic; Petar 
Jovanovic; Paul Burton; Aleksandar Rikalo; th...@redhat.com; arm...@redhat.com
Subject: [PATCH v7 62/80] linux-user: Add syscall numbers for nanoMIPS

From: Aleksandar Rikalo 

Add syscall numbers for nanoMIPS. nanoMIPS redefines its ABI
compared to preceding MIPS architectures, and its set of
supported system calls is significantly different.

Signed-off-by: Aleksandar Rikalo 
Signed-off-by: Aleksandar Markovic 
Signed-off-by: Stefan Markovic 
---
 linux-user/nanomips/syscall_nr.h | 275 +++
 1 file changed, 275 insertions(+)
 create mode 100644 linux-user/nanomips/syscall_nr.h

diff --git a/linux-user/nanomips/syscall_nr.h b/linux-user/nanomips/syscall_nr.h
new file mode 100644
index 000..b826e5c
--- /dev/null
+++ b/linux-user/nanomips/syscall_nr.h
@@ -0,0 +1,275 @@
+/*
+ * Linux mipsp32 style syscalls.
+ */
+#define TARGET_NR_io_setup   0
+#define TARGET_NR_io_destroy 1
+#define TARGET_NR_io_submit  2
+#define TARGET_NR_io_cancel  3
+#define TARGET_NR_io_getevents   4
+#define TARGET_NR_setxattr   5
+#define TARGET_NR_lsetxattr  6
+#define TARGET_NR_fsetxattr  7
+#define TARGET_NR_getxattr   8
+#define TARGET_NR_lgetxattr  9
+#define TARGET_NR_fgetxattr  10
+#define TARGET_NR_listxattr  11
+#define TARGET_NR_llistxattr 12
+#define TARGET_NR_flistxattr 13
+#define TARGET_NR_removexattr    14
+#define TARGET_NR_lremovexattr   15
+#define TARGET_NR_fremovexattr   16
+#define TARGET_NR_getcwd 17
+#define TARGET_NR_lookup_dcookie 18
+#define TARGET_NR_eventfd2   19
+#define TARGET_NR_epoll_create1  20
+#define TARGET_NR_epoll_ctl  21
+#define TARGET_NR_epoll_pwait    22
+#define TARGET_NR_dup            23
+#define TARGET_NR_dup3   24
+#define TARGET_NR_fcntl64        25
+#define TARGET_NR_inotify_init1  26
+#define TARGET_NR_inotify_add_watch  27
+#define TARGET_NR_inotify_rm_watch   28
+#define TARGET_NR_ioctl  29
+#define TARGET_NR_ioprio_set 30
+#define TARGET_NR_ioprio_get 31
+#define TARGET_NR_flock  32
+#define TARGET_NR_mknodat        33
+#define TARGET_NR_mkdirat        34
+#define TARGET_NR_unlinkat   35
+#define TARGET_NR_symlinkat  36
+#define TARGET_NR_linkat 37
+#define TARGET_NR_umount2        39
+#define TARGET_NR_mount  40
+#define TARGET_NR_pivot_root 41
+#define TARGET_NR_nfsservctl 42
+#define TARGET_NR_statfs64   43
+#define TARGET_NR_fstatfs64  44
+#define TARGET_NR_truncate64 45
+#define TARGET_NR_ftruncate64    46
+#define TARGET_NR_fallocate  47
+#define TARGET_NR_faccessat  48
+#define TARGET_NR_chdir  49
+#define TARGET_NR_fchdir 50
+#define TARGET_NR_chroot 51
+#define TARGET_NR_fchmod 52
+#define TARGET_NR_fchmodat   53
+#define TARGET_NR_fchownat   54
+#define TARGET_NR_fchown 55
+#define TARGET_NR_openat 56
+#define TARGET_NR_close  57
+#define TARGET_NR_vhangup        58
+#define TARGET_NR_pipe2  59
+#define TARGET_NR_quotactl   60
+#define TARGET_NR_getdents64 61
+#define TARGET_NR__llseek        62
+#define TARGET_NR_read   63
+#define TARGET_NR_write  64
+#define TARGET_NR_readv  65
+#define TARGET_NR_writev 66
+#define TARGET_NR_pread64        67
+#define TARGET_NR_pwrite64   68
+#define TARGET_NR_preadv 69
+#define TARGET_NR_pwritev        70
+#define TARGET_NR_sendfile64 71
+#define TARGET_NR_pselect6   72
+#define TARGET_NR_ppoll  73
+#define TARGET_NR_signalfd4  74
+#define TARGET_NR_vmsplice   75
+#define TARGET_NR_splice 76
+#define TARGET_NR_tee            77
+#define TARGET_NR_readlinkat     78

Re: [Qemu-devel] [PATCH v7 62/80] linux-user: Add syscall numbers for nanoMIPS

2018-08-12 Thread Peter Maydell
On 12 August 2018 at 17:23, Laurent Vivier  wrote:
> Le 12/08/2018 à 17:44, Aleksandar Markovic a écrit :
>> ping
>
> Where can we find the kernel source for nanoMIPS to be able to compare
> the QEMU syscall numbers definition?

I think we shouldn't take the linux-user code into QEMU until
the kernel port is in mainline, so that we don't accidentally
implement a version of the ABI which isn't the final one
accepted into the kernel. (This has happened to us before,
and it's an awkward situation, so I'd prefer to avoid it.)

thanks
-- PMM



Re: [Qemu-devel] RDMA wrongly detected as being supported on FreeBSD

2018-08-12 Thread Yuval Shaia
On Sat, Aug 11, 2018 at 07:47:49PM +0300, Marcel Apfelbaum wrote:
> Hi,
> 
> On 08/06/2018 11:51 AM, Thomas Huth wrote:
> > On 07/28/2018 05:50 AM, Rebecca Cran wrote:
> > > On 7/25/18 5:14 AM, Thomas Huth wrote:
> > > 
> > > > Note that the error has been reported to happen on FreeBSD - so I doubt
> > > > that this  header should be here.
> > > > 
> > > > Anyway, our include/standard-headers/linux/types.h is also empty ... so
> > > > could you try whether it compiles if you simply remove this #include
> > > > line, Rebecca?
> > > 
> > > Sorry for the delay, I'm just getting back to this. Removing the include
> > > causes it to fail later on, with:
> > > 
> > > /home/bcran/workspace/qemu/hw/rdma/vmw/pvrdma_cmd.c:60:17: warning:
> > > implicit declaration of function 'mremap' is invalid in C99
> > > [-Wimplicit-function-declaration]
> > >      host_virt = mremap(curr_page, 0, length, MREMAP_MAYMOVE);
> > OK, thanks for checking. According to
> > 
> >   https://www.freebsd.org/cgi/man.cgi?query=mremap&manpath=NetBSD+5.0
> > 
> > that syscall should also be available on FreeBSD. So could you please do
> > one more test and see whether it works when you add the following line
> > somewhere at the beginning of the file:
> > 
> >#include <sys/mman.h>
> 
> It took some time, but I finally have a FreeBSD VM with RDMA enabled.
> 
> My findings:
> - The RDMA libraries are there, so the flag  "RDMA"=yes is set correctly.
> - include of  is not needed, I will remove it.
> - In FreeBSD mremap is not implemented in all configurations, it depends
>   on some SANITIZER_LINUX flag (Linux emulation?), I think;
>   so pvrdma cannot be compiled.  I think we should have a different QEMU
> configuration
>   flag --enable-pvrdma that will specifically check it is not a "bsd"
> platform.

A good idea.

So, do i get it right that you will "revert back" the things we did
to--enable-rdma and do all our stuff in new --enable-pvrdma section?

>   I'll send a patch for that too.
> 
> Thanks,
> Marcel
> 
> 
> > 
> >   Thanks,
> >Thomas
> 



Re: [Qemu-devel] [PATCH] hw/pvrdma: remove not needed include

2018-08-12 Thread Yuval Shaia
On Sat, Aug 11, 2018 at 08:15:34PM +0300, Marcel Apfelbaum wrote:
> No need to include linux/types.h, is empty anyway.
> 
> Suggested-by: Thomas Huth 
> Signed-off-by: Marcel Apfelbaum 
> ---
>  hw/rdma/vmw/pvrdma_cmd.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c
> index 14255d609f..3f697c8db7 100644
> --- a/hw/rdma/vmw/pvrdma_cmd.c
> +++ b/hw/rdma/vmw/pvrdma_cmd.c
> @@ -16,7 +16,6 @@
>  #include "qemu/osdep.h"
>  #include "qemu/error-report.h"
>  #include "cpu.h"
> -#include <linux/types.h>
>  #include "hw/hw.h"
>  #include "hw/pci/pci.h"
>  #include "hw/pci/pci_ids.h"

Reviewed-by: Yuval Shaia 

> -- 
> 2.17.1
> 



Re: [Qemu-devel] [PATCH] hw/pci: add pci capability to pci-pci bridge

2018-08-12 Thread Marcel Apfelbaum

Hi Laszlo,

On 08/07/2018 06:59 PM, Laszlo Ersek wrote:

On 08/07/18 14:19, Laszlo Ersek wrote:

On 08/07/18 09:04, Jing Liu wrote:

Add hint to firmware (e.g. SeaBIOS) to reserve additional
IO/MEM/PREF spaces for legacy pci-pci bridge, to enable
some pci devices hotplugging whose IO/MEM/PREF spaces
requests are larger than the ones in pci-pci bridge set
by firmware.

Signed-off-by: Jing Liu 
---
  hw/pci-bridge/pci_bridge_dev.c | 20 
  1 file changed, 20 insertions(+)
+cap_error:
+msi_uninit(dev);

(4) This error handler doesn't look entirely correct; we can reach it
without having initialized MSI. (MSI setup is conditional; and even if
we attempt it, it is permitted to fail with "msi=auto".) Therefore,
msi_uninit() should be wrapped into "if (msi_present())", same as you
see in pci_bridge_dev_exitfn().


You are right.  msi_present should be checked.




  msi_error:
  slotid_cap_cleanup(dev);
  slotid_error:

I tried to understand the error handling a bit better. I'm confused.

First, under the label "shpc_error", we call pci_bridge_exitfn(), which
seems to clean up everything (checking individually for each thing to
clean up). Given this, I wonder why we introduced the "slotid_error" and
"msi_error" labels at all. Cascading teardown on the error path, and
invkoing a function that checks everything individually and then tears
it all down, are usually mutually exclusive.


I think is possible you miss-interpreted pci_bridge_dev_exitfn
with pci_bridge_exitfn. The first one is the "catch all", the second
one that is used in the error path is for the bridge specific cleanup.


Second, msi_uninit() and shpc_cleanup() are internally inconsistent
between each other. The former removes the respective capability from
config space -- with pci_del_capability() --,


Right


  but the latter only has a
comment, "/* TODO: cleanup config space changes? */".


But also disables the QEMU_PCI_CAP_SLOTID (but no code checks it anyway)
I agree it should call pci_del_capability to delete the SHPC capability,
maybe is some "debt" from early development stages.


  The same comment
is present in the slotid_cap_cleanup() function. Given this
inconsistency,


Here we also miss a call to pci_del_capability.


I don't know what to suggest for the new capability's
teardown, in pci_bridge_dev_exitfn()  -- should we just ignore it (as
suggested by this patch)?


No, we should remove it properly.

I think it is not considered a "big" issue since adding/removing PCI 
capabilities
is not an operation that deals with resources, we only edit an array  
(the config space)

that will not be used anyway if the device init sequence failed.

That does not mean the code should not be clean.

Thanks,
Marcel



Thanks
Laszlo