[PATCH v8 06/74] cpu: introduce process_queued_cpu_work_locked
From: "Emilio G. Cota" This completes the conversion to cpu_mutex_lock/unlock in the file. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus-common.c | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index bf5794cc17..f4a0f9ba3b 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -333,20 +333,19 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, queue_work_on_cpu(cpu, wi); } -void process_queued_cpu_work(CPUState *cpu) +/* Called with the CPU's lock held */ +static void process_queued_cpu_work_locked(CPUState *cpu) { struct qemu_work_item *wi; bool has_bql = qemu_mutex_iothread_locked(); -qemu_mutex_lock(>lock); if (QSIMPLEQ_EMPTY(>work_list)) { -qemu_mutex_unlock(>lock); return; } while (!QSIMPLEQ_EMPTY(>work_list)) { wi = QSIMPLEQ_FIRST(>work_list); QSIMPLEQ_REMOVE_HEAD(>work_list, node); -qemu_mutex_unlock(>lock); +cpu_mutex_unlock(cpu); if (wi->exclusive) { /* Running work items outside the BQL avoids the following deadlock: * 1) start_exclusive() is called with the BQL taken while another @@ -372,13 +371,19 @@ void process_queued_cpu_work(CPUState *cpu) qemu_mutex_unlock_iothread(); } } -qemu_mutex_lock(>lock); +cpu_mutex_lock(cpu); if (wi->free) { g_free(wi); } else { atomic_mb_set(>done, true); } } -qemu_mutex_unlock(>lock); qemu_cond_broadcast(>cond); } + +void process_queued_cpu_work(CPUState *cpu) +{ +cpu_mutex_lock(cpu); +process_queued_cpu_work_locked(cpu); +cpu_mutex_unlock(cpu); +} -- 2.17.1
[PATCH v8 04/74] cpu: make qemu_work_cond per-cpu
From: "Emilio G. Cota" This eliminates the need to use the BQL to queue CPU work. While at it, give the per-cpu field a generic name ("cond") since it will soon be used for more than just queueing CPU work. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus-common.c | 72 ++- cpus.c| 2 +- hw/core/cpu.c | 1 + include/hw/core/cpu.h | 6 ++-- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index f75cae23d9..9031c31005 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -26,7 +26,6 @@ static QemuMutex qemu_cpu_list_lock; static QemuCond exclusive_cond; static QemuCond exclusive_resume; -static QemuCond qemu_work_cond; /* >= 1 if a thread is inside start_exclusive/end_exclusive. Written * under qemu_cpu_list_lock, read with atomic operations. @@ -42,7 +41,6 @@ void qemu_init_cpu_list(void) qemu_mutex_init(_cpu_list_lock); qemu_cond_init(_cond); qemu_cond_init(_resume); -qemu_cond_init(_work_cond); } void cpu_list_lock(void) @@ -105,23 +103,37 @@ struct qemu_work_item { bool free, exclusive, done; }; -static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +/* Called with the CPU's lock held */ +static void queue_work_on_cpu_locked(CPUState *cpu, struct qemu_work_item *wi) { -qemu_mutex_lock(>lock); QSIMPLEQ_INSERT_TAIL(>work_list, wi, node); wi->done = false; -qemu_mutex_unlock(>lock); qemu_cpu_kick(cpu); } -void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, - QemuMutex *mutex) +static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +{ +cpu_mutex_lock(cpu); +queue_work_on_cpu_locked(cpu, wi); +cpu_mutex_unlock(cpu); +} + +void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) { struct qemu_work_item wi; +bool has_bql = qemu_mutex_iothread_locked(); + +g_assert(no_cpu_mutex_locked()); if (qemu_cpu_is_self(cpu)) { -func(cpu, data); +if (has_bql) { +func(cpu, data); +} else { 
+qemu_mutex_lock_iothread(); +func(cpu, data); +qemu_mutex_unlock_iothread(); +} return; } @@ -131,13 +143,34 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, wi.free = false; wi.exclusive = false; -queue_work_on_cpu(cpu, ); +cpu_mutex_lock(cpu); +queue_work_on_cpu_locked(cpu, ); + +/* + * We are going to sleep on the CPU lock, so release the BQL. + * + * During the transition to per-CPU locks, we release the BQL _after_ + * having kicked the destination CPU (from queue_work_on_cpu_locked above). + * This makes sure that the enqueued work will be seen by the CPU + * after being woken up from the kick, since the CPU sleeps on the BQL. + * Once we complete the transition to per-CPU locks, we will release + * the BQL earlier in this function. + */ +if (has_bql) { +qemu_mutex_unlock_iothread(); +} + while (!atomic_mb_read()) { CPUState *self_cpu = current_cpu; -qemu_cond_wait(_work_cond, mutex); +qemu_cond_wait(>cond, >lock); current_cpu = self_cpu; } +cpu_mutex_unlock(cpu); + +if (has_bql) { +qemu_mutex_lock_iothread(); +} } void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) @@ -303,6 +336,7 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void process_queued_cpu_work(CPUState *cpu) { struct qemu_work_item *wi; +bool has_bql = qemu_mutex_iothread_locked(); qemu_mutex_lock(>lock); if (QSIMPLEQ_EMPTY(>work_list)) { @@ -320,13 +354,23 @@ void process_queued_cpu_work(CPUState *cpu) * BQL, so it goes to sleep; start_exclusive() is sleeping too, so * neither CPU can proceed. 
*/ -qemu_mutex_unlock_iothread(); +if (has_bql) { +qemu_mutex_unlock_iothread(); +} start_exclusive(); wi->func(cpu, wi->data); end_exclusive(); -qemu_mutex_lock_iothread(); +if (has_bql) { +qemu_mutex_lock_iothread(); +} } else { -wi->func(cpu, wi->data); +if (has_bql) { +wi->func(cpu, wi->data); +} else { +qemu_mutex_lock_iothread(); +wi->func(cpu, wi->data); +qemu_mutex_unlock_iothread(); +} } qemu_mutex_lock(>lock); if (wi->free) { @@ -336,5 +380,5 @@ void process_queued_cpu_work(CPUState *cpu) } } qemu_mutex_unlock(>lock); -qemu_cond_broadcast(_work_cond); +
[PATCH v8 01/74] cpu: convert queued work to a QSIMPLEQ
From: "Emilio G. Cota" Instead of open-coding it. While at it, make sure that all accesses to the list are performed while holding the list's lock. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus-common.c | 25 - cpus.c| 14 -- hw/core/cpu.c | 1 + include/hw/core/cpu.h | 6 +++--- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index eaf590cb38..3d659df464 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -99,7 +99,7 @@ void cpu_list_remove(CPUState *cpu) } struct qemu_work_item { -struct qemu_work_item *next; +QSIMPLEQ_ENTRY(qemu_work_item) node; run_on_cpu_func func; run_on_cpu_data data; bool free, exclusive, done; @@ -108,13 +108,7 @@ struct qemu_work_item { static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) { qemu_mutex_lock(>work_mutex); -if (cpu->queued_work_first == NULL) { -cpu->queued_work_first = wi; -} else { -cpu->queued_work_last->next = wi; -} -cpu->queued_work_last = wi; -wi->next = NULL; +QSIMPLEQ_INSERT_TAIL(>work_list, wi, node); wi->done = false; qemu_mutex_unlock(>work_mutex); @@ -310,17 +304,14 @@ void process_queued_cpu_work(CPUState *cpu) { struct qemu_work_item *wi; -if (cpu->queued_work_first == NULL) { +qemu_mutex_lock(>work_mutex); +if (QSIMPLEQ_EMPTY(>work_list)) { +qemu_mutex_unlock(>work_mutex); return; } - -qemu_mutex_lock(>work_mutex); -while (cpu->queued_work_first != NULL) { -wi = cpu->queued_work_first; -cpu->queued_work_first = wi->next; -if (!cpu->queued_work_first) { -cpu->queued_work_last = NULL; -} +while (!QSIMPLEQ_EMPTY(>work_list)) { +wi = QSIMPLEQ_FIRST(>work_list); +QSIMPLEQ_REMOVE_HEAD(>work_list, node); qemu_mutex_unlock(>work_mutex); if (wi->exclusive) { /* Running work items outside the BQL avoids the following deadlock: diff --git a/cpus.c b/cpus.c index ef441bdf62..151abbc04c 100644 --- a/cpus.c +++ b/cpus.c @@ -96,9 +96,19 @@ bool cpu_is_stopped(CPUState *cpu) return 
cpu->stopped || !runstate_is_running(); } +static inline bool cpu_work_list_empty(CPUState *cpu) +{ +bool ret; + +qemu_mutex_lock(>work_mutex); +ret = QSIMPLEQ_EMPTY(>work_list); +qemu_mutex_unlock(>work_mutex); +return ret; +} + static bool cpu_thread_is_idle(CPUState *cpu) { -if (cpu->stop || cpu->queued_work_first) { +if (cpu->stop || !cpu_work_list_empty(cpu)) { return false; } if (cpu_is_stopped(cpu)) { @@ -1490,7 +1500,7 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg) cpu = first_cpu; } -while (cpu && !cpu->queued_work_first && !cpu->exit_request) { +while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) { atomic_mb_set(_current_rr_cpu, cpu); current_cpu = cpu; diff --git a/hw/core/cpu.c b/hw/core/cpu.c index 786a1bec8a..2fc6aa2159 100644 --- a/hw/core/cpu.c +++ b/hw/core/cpu.c @@ -368,6 +368,7 @@ static void cpu_common_initfn(Object *obj) cpu->nr_threads = 1; qemu_mutex_init(>work_mutex); +QSIMPLEQ_INIT(>work_list); QTAILQ_INIT(>breakpoints); QTAILQ_INIT(>watchpoints); diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 5bf94d28cf..398b65159e 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -331,8 +331,8 @@ struct qemu_work_item; * @opaque: User data. * @mem_io_pc: Host Program Counter at which the memory was accessed. * @kvm_fd: vCPU file descriptor for KVM. - * @work_mutex: Lock to prevent multiple access to queued_work_*. - * @queued_work_first: First asynchronous work pending. + * @work_mutex: Lock to prevent multiple access to @work_list. + * @work_list: List of pending asynchronous work. * @trace_dstate_delayed: Delayed changes to trace_dstate (includes all changes *to @trace_dstate). * @trace_dstate: Dynamic tracing state of events for this vCPU (bitmask). @@ -376,7 +376,7 @@ struct CPUState { sigjmp_buf jmp_env; QemuMutex work_mutex; -struct qemu_work_item *queued_work_first, *queued_work_last; +QSIMPLEQ_HEAD(, qemu_work_item) work_list; CPUAddressSpace *cpu_ases; int num_ases; -- 2.17.1
[PATCH v8 03/74] cpu: introduce cpu_mutex_lock/unlock
From: "Emilio G. Cota" The few direct users of >lock will be converted soon. The per-thread bitmap introduced here might seem unnecessary, since a bool could just do. However, once we complete the conversion to per-vCPU locks, we will need to cover the use case where all vCPUs are locked by the same thread, which explains why the bitmap is introduced here. Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus.c| 48 +-- include/hw/core/cpu.h | 33 + stubs/Makefile.objs | 1 + stubs/cpu-lock.c | 28 + 4 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 stubs/cpu-lock.c diff --git a/cpus.c b/cpus.c index 71bd2f5d55..633734fb5c 100644 --- a/cpus.c +++ b/cpus.c @@ -91,6 +91,47 @@ static unsigned int throttle_percentage; #define CPU_THROTTLE_PCT_MAX 99 #define CPU_THROTTLE_TIMESLICE_NS 1000 +/* XXX: is this really the max number of CPUs? */ +#define CPU_LOCK_BITMAP_SIZE 2048 + +/* + * Note: we index the bitmap with cpu->cpu_index + 1 so that the logic + * also works during early CPU initialization, when cpu->cpu_index is set to + * UNASSIGNED_CPU_INDEX == -1. 
+ */ +static __thread DECLARE_BITMAP(cpu_lock_bitmap, CPU_LOCK_BITMAP_SIZE); + +bool no_cpu_mutex_locked(void) +{ +return bitmap_empty(cpu_lock_bitmap, CPU_LOCK_BITMAP_SIZE); +} + +void cpu_mutex_lock_impl(CPUState *cpu, const char *file, int line) +{ +/* coverity gets confused by the indirect function call */ +#ifdef __COVERITY__ +qemu_mutex_lock_impl(>lock, file, line); +#else +QemuMutexLockFunc f = atomic_read(_mutex_lock_func); + +g_assert(!cpu_mutex_locked(cpu)); +set_bit(cpu->cpu_index + 1, cpu_lock_bitmap); +f(>lock, file, line); +#endif +} + +void cpu_mutex_unlock_impl(CPUState *cpu, const char *file, int line) +{ +g_assert(cpu_mutex_locked(cpu)); +qemu_mutex_unlock_impl(>lock, file, line); +clear_bit(cpu->cpu_index + 1, cpu_lock_bitmap); +} + +bool cpu_mutex_locked(const CPUState *cpu) +{ +return test_bit(cpu->cpu_index + 1, cpu_lock_bitmap); +} + bool cpu_is_stopped(CPUState *cpu) { return cpu->stopped || !runstate_is_running(); @@ -100,9 +141,9 @@ static inline bool cpu_work_list_empty(CPUState *cpu) { bool ret; -qemu_mutex_lock(>lock); +cpu_mutex_lock(cpu); ret = QSIMPLEQ_EMPTY(>work_list); -qemu_mutex_unlock(>lock); +cpu_mutex_unlock(cpu); return ret; } @@ -1837,6 +1878,9 @@ void qemu_mutex_lock_iothread_impl(const char *file, int line) { QemuMutexLockFunc bql_lock = atomic_read(_bql_mutex_lock_func); +/* enforce locking order */ +g_assert(no_cpu_mutex_locked()); + g_assert(!qemu_mutex_iothread_locked()); bql_lock(_global_mutex, file, line); iothread_locked = true; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 0b75fdb093..4521bba7a5 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -457,6 +457,39 @@ extern CPUTailQ cpus; extern __thread CPUState *current_cpu; +/** + * cpu_mutex_lock - lock a CPU's mutex + * @cpu: the CPU whose mutex is to be locked + * + * To avoid deadlock, a CPU's mutex must be acquired after the BQL. 
+ */ +#define cpu_mutex_lock(cpu) \ +cpu_mutex_lock_impl(cpu, __FILE__, __LINE__) +void cpu_mutex_lock_impl(CPUState *cpu, const char *file, int line); + +/** + * cpu_mutex_unlock - unlock a CPU's mutex + * @cpu: the CPU whose mutex is to be unlocked + */ +#define cpu_mutex_unlock(cpu) \ +cpu_mutex_unlock_impl(cpu, __FILE__, __LINE__) +void cpu_mutex_unlock_impl(CPUState *cpu, const char *file, int line); + +/** + * cpu_mutex_locked - check whether a CPU's mutex is locked + * @cpu: the CPU of interest + * + * Returns true if the calling thread is currently holding the CPU's mutex. + */ +bool cpu_mutex_locked(const CPUState *cpu); + +/** + * no_cpu_mutex_locked - check whether any CPU mutex is held + * + * Returns true if the calling thread is not holding any CPU mutex. + */ +bool no_cpu_mutex_locked(void); + static inline void cpu_tb_jmp_cache_clear(CPUState *cpu) { unsigned int i; diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 45be5dc0ed..d2dd6c94cc 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -5,6 +5,7 @@ stub-obj-y += blockdev-close-all-bdrv-states.o stub-obj-y += clock-warp.o stub-obj-y += cpu-get-clock.o stub-obj-y += cpu-get-icount.o +stub-obj-y += cpu-lock.o stub-obj-y += dump.o stub-obj-y += error-printf.o stub-obj-y += fdset.o diff --git a/stubs/cpu-lock.c b/stubs/cpu-lock.c new file mode 100644 index 00..ca2ea8a9c2 --- /dev/null +++ b/stubs/cpu-lock.c @@ -0,0 +1,28 @@ +#include "qemu/osdep.h" +#include "hw/core/cpu.h" + +void cpu_mutex_lock_impl(CPUState *cpu, const char *file, int line) +{ +/* coverity gets confused by the indirect function call */ +#ifdef
[PATCH v8 00/74] per-CPU locks
V7: https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg00786.html This is a continuation of the series created by Emilio Cota. We are picking up this patch set with the goal to apply any fixes or updates needed to get this accepted. Quoting an earlier patch in the series: "For context, the goal of this series is to substitute the BQL for the per-CPU locks in many places, notably the execution loop in cpus.c. This leads to better scalability for MTTCG, since CPUs don't have to acquire a contended global lock (the BQL) every time they stop executing code. See the last commit for some performance numbers." Listed below are the changes for this version of the patch, aside from the merge related changes. Changes for V8: - Fixed issue where in rr mode we could destroy the BQL twice. Added new function cpu_mutex_destroy(). - Removed g_assert(qemu_mutex_iothread_locked()) from qemu_tcg_rr_all_cpu_threads_idle(). There is an existing case where we call qemu_tcg_rr_all_cpu_threads_idle() without the BQL held, so we cannot assert on the lock here. - Found/fixed bug that had been hit in testing previously during the last consideration of this patch. We reproduced the issue hit in the qtest: bios-tables-test. The issue was introduced by dropping the BQL, and found us (very rarely) missing the condition variable wakeup in qemu_tcg_rr_cpu_thread_fn(). - ppc: convert to cpu_halted - Converted new code for cpu_halted and cpu_halted_set. - hw/semihosting: convert to cpu_halted_set - Added this patch as this code was new and needed converting. - ppc/translate_init.inc.c - Translated some new code here to use cpu_has_work_with_iothread_lock. - ppc/sapr_hcall.c Translated new code to cpu_halted - i386/hax-all.c - converted new code to cpu_interrupt_request and cpu_halted - mips/kvm.c - converted new code to cpu_halted - Some changes were related to files that moved, cpu.c and cpu.h moved to hw/core/, and some changes needed to be put there manually during the merge. Emilio G. 
Cota (69): cpu: convert queued work to a QSIMPLEQ cpu: rename cpu->work_mutex to cpu->lock cpu: introduce cpu_mutex_lock/unlock cpu: make qemu_work_cond per-cpu cpu: move run_on_cpu to cpus-common cpu: introduce process_queued_cpu_work_locked cpu: make per-CPU locks an alias of the BQL in TCG rr mode tcg-runtime: define helper_cpu_halted_set ppc: convert to helper_cpu_halted_set cris: convert to helper_cpu_halted_set hppa: convert to helper_cpu_halted_set m68k: convert to helper_cpu_halted_set alpha: convert to helper_cpu_halted_set microblaze: convert to helper_cpu_halted_set cpu: define cpu_halted helpers tcg-runtime: convert to cpu_halted_set arm: convert to cpu_halted ppc: convert to cpu_halted sh4: convert to cpu_halted i386: convert to cpu_halted lm32: convert to cpu_halted m68k: convert to cpu_halted mips: convert to cpu_halted riscv: convert to cpu_halted s390x: convert to cpu_halted sparc: convert to cpu_halted xtensa: convert to cpu_halted gdbstub: convert to cpu_halted openrisc: convert to cpu_halted cpu-exec: convert to cpu_halted cpu: convert to cpu_halted cpu: define cpu_interrupt_request helpers exec: use cpu_reset_interrupt arm: convert to cpu_interrupt_request i386: convert to cpu_interrupt_request i386/kvm: convert to cpu_interrupt_request i386/hax-all: convert to cpu_interrupt_request i386/whpx-all: convert to cpu_interrupt_request i386/hvf: convert to cpu_request_interrupt ppc: convert to cpu_interrupt_request sh4: convert to cpu_interrupt_request cris: convert to cpu_interrupt_request hppa: convert to cpu_interrupt_request lm32: convert to cpu_interrupt_request m68k: convert to cpu_interrupt_request mips: convert to cpu_interrupt_request nios: convert to cpu_interrupt_request s390x: convert to cpu_interrupt_request alpha: convert to cpu_interrupt_request moxie: convert to cpu_interrupt_request sparc: convert to cpu_interrupt_request openrisc: convert to cpu_interrupt_request unicore32: convert to cpu_interrupt_request microblaze: convert to 
cpu_interrupt_request accel/tcg: convert to cpu_interrupt_request cpu: convert to interrupt_request cpu: call .cpu_has_work with the CPU lock held cpu: introduce cpu_has_work_with_iothread_lock ppc: convert to cpu_has_work_with_iothread_lock mips: convert to cpu_has_work_with_iothread_lock s390x: convert to cpu_has_work_with_iothread_lock riscv: convert to cpu_has_work_with_iothread_lock sparc: convert to cpu_has_work_with_iothread_lock xtensa: convert to cpu_has_work_with_iothread_lock cpu: rename all_cpu_threads_idle to qemu_tcg_rr_all_cpu_threads_idle cpu: protect CPU state with cpu->lock instead of the BQL cpus-common: release BQL earlier in run_on_cpu cpu: add async_run_on_cpu_no_bql cputlb: queue async flush jobs without the BQL Paolo Bonzini (4): ppc: use cpu_reset_interrupt i386: use cpu_reset_interrupt
Re: [PATCH v16 QEMU 13/16] vfio: Add function to start and stop dirty pages tracking
On Wed, 25 Mar 2020 02:39:11 +0530 Kirti Wankhede wrote: > Call VFIO_IOMMU_DIRTY_PAGES ioctl to start and stop dirty pages tracking > for VFIO devices. > > Signed-off-by: Kirti Wankhede > --- > hw/vfio/migration.c | 36 > 1 file changed, 36 insertions(+) > > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > index ab295d25620e..1827b7cfb316 100644 > --- a/hw/vfio/migration.c > +++ b/hw/vfio/migration.c > @@ -9,6 +9,7 @@ > > #include "qemu/osdep.h" > #include "qemu/main-loop.h" > +#include > #include > > #include "sysemu/runstate.h" > @@ -296,6 +297,32 @@ static int vfio_load_device_config_state(QEMUFile *f, > void *opaque) > return qemu_file_get_error(f); > } > > +static int vfio_start_dirty_page_tracking(VFIODevice *vbasedev, bool start) > +{ > +int ret; > +VFIOContainer *container = vbasedev->group->container; > +struct vfio_iommu_type1_dirty_bitmap dirty = { > +.argsz = sizeof(dirty), > +}; > + > +if (start) { > +if (vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) { > +dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; > +} else { > +return 0; > +} > +} else { > +dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; > +} Dirty logging and device saving are logically separate, why do we link them here? Why do we return success when we want to start logging if we haven't started logging? > + > +ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); > +if (ret) { > +error_report("Failed to set dirty tracking flag 0x%x errno: %d", > + dirty.flags, errno); > +} > +return ret; > +} > + > /* -- */ > > static int vfio_save_setup(QEMUFile *f, void *opaque) > @@ -330,6 +357,11 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) > */ > qemu_put_be64(f, migration->region.size); > > +ret = vfio_start_dirty_page_tracking(vbasedev, true); > +if (ret) { > +return ret; > +} > + Haven't we corrupted the migration stream by exiting here? Maybe this implies the entire migration fails, therefore we don't need to add the end marker? 
Thanks, Alex > qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > > ret = qemu_file_get_error(f); > @@ -346,6 +378,8 @@ static void vfio_save_cleanup(void *opaque) > VFIODevice *vbasedev = opaque; > VFIOMigration *migration = vbasedev->migration; > > +vfio_start_dirty_page_tracking(vbasedev, false); > + > if (migration->region.mmaps) { > vfio_region_unmap(&migration->region); > } > @@ -669,6 +703,8 @@ static void vfio_migration_state_notifier(Notifier > *notifier, void *data) > if (ret) { > error_report("%s: Failed to set state RUNNING", vbasedev->name); > } > + > +vfio_start_dirty_page_tracking(vbasedev, false); > } > } >
[Bug 1693050] Re: xhci HCIVERSION register read emulation incorrectly handled
Odd, this should be fixed by commit 6ee021d41 and 36960b4d66..98f52cdbb5c. -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1693050 Title: xhci HCIVERSION register read emulation incorrectly handled Status in QEMU: New Bug description: We had an illumos user trying to run illumos in QEMU 2.9.0 with the qemu-xhci device enabled. Note, that while this was discovered against QEMU 2.9.0, from my current read of the HEAD, it is still present. The illumos bug at https://www.illumos.org/issues/8173 has additional information on how the user invoked qemu. While investigating the problem we found that the illumos driver was reading a version of 0x when reading the HCIVERSION register from qemu. In the illumos driver we're performing a 16-bit read of the version register at offset 0x2. From looking around at other OSes, while Linux performs a 4 byte read at offset 0x0 and masks out the version, others that care about the version are doing a two byte read, though not all actually act on the version and some just discard the information. The user who hit this was able to enable tracing (note the tracing file is attached to the illumos bug linked previously) and we hit the unimplemented register read with offset 0x2 at http://git.qemu.org/?p=qemu.git;a=blob;f=hw/usb/hcd- xhci.c;hb=HEAD#l2960. The xhci register specifies today that its allowed for users to do 1-4 byte reads; however, that it implements only four byte reads in its implementation (http://git.qemu.org/?p=qemu.git;a=blob;f=hw/usb/hcd- xhci.c;hb=HEAD#l). Hence why when we read the HCIVERSION register at offset 0x2, it isn't handled in xhci_cap_read() which then returns zeros. From digging into this, I think that we're coming into memory_region_dispatch_read() and then memory_region_dispatch_read1(). 
However, I don't see anything in either the general memory region logic or in the xhci_cap_read() function that would deal with adjusting the offset that we're reading at and then masking it off again. While the access_with_adjusted_size() attempts to deal with this, it never adjusts the hardware address that's passed in to be a multiple of the implementation defined offset that I can see. I suspect that the FIXME at line 582 (http://git.qemu.org/?p=qemu.git;a=blob;f=memory.c;hb=HEAD#l582) and the implementation in the xhci code is the crux of the problem. For the time being we're working around this in the illumos driver, but I wanted to point this out such that it might be helpful for other systems which are assuming that they can do the two byte read like on hardware. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1693050/+subscriptions
Re: [PATCH] i386/cpu: Expand MAX_FIXED_COUNTERS from 3 to 4 to for Icelake
On 17/03/20 06:54, Like Xu wrote: > In the Intel SDM, "Table 18-2. Association of Fixed-Function > Performance Counters with Architectural Performance Events", > we may have a new fixed counter 'TOPDOWN.SLOTS' (since Icelake), > which counts the number of available slots for an unhalted > logical processor. Check commit 6017608936 in the kernel tree. > > Signed-off-by: Like Xu > --- > target/i386/cpu.h | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index 576f309bbf..ec2b67d425 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -1185,7 +1185,7 @@ typedef struct { > #define CPU_NB_REGS CPU_NB_REGS32 > #endif > > -#define MAX_FIXED_COUNTERS 3 > +#define MAX_FIXED_COUNTERS 4 > #define MAX_GP_COUNTERS(MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0) > > #define TARGET_INSN_START_EXTRA_WORDS 1 > Hi Like, the problem with this patch is that it breaks live migration; the vmstate_msr_architectural_pmu record hardcodes MAX_FIXED_COUNTERS as the number of registers. So it's more complicated, you need to add a new subsection (following vmstate_msr_architectural_pmu) and transmit it only if the 4th counter is nonzero (instead of the more complicated check in pmu_enable_needed). Just to be safe, I'd make the new subsection hold 16 counters and bump MAX_FIXED_COUNTERS to 16. Thanks, Paolo
Re: [PATCH] qcow2: Remove unused fields from BDRVQcow2State
On 3/26/20 6:07 PM, Kevin Wolf wrote: These fields were already removed in commit c3c10f72, but then commit b58deb34 revived them probably due to bad merge conflict resolution. They are still unused, so remove them again. Fixes: b58deb344ddff3b9d8b265bf73a65274767ee5f4 Signed-off-by: Kevin Wolf --- block/qcow2.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/qcow2.h b/block/qcow2.h index 0942126232..f4de0a27d5 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,9 +301,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; -uint8_t *cluster_cache; -uint8_t *cluster_data; -uint64_t cluster_cache_offset; QLIST_HEAD(, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v2 2/2] mirror: Wait only for in-flight operations
On 26.03.20 16:36, Kevin Wolf wrote: > mirror_wait_for_free_in_flight_slot() just picks a random operation to > wait for. However, a MirrorOp is already in s->ops_in_flight when > mirror_co_read() waits for free slots, so if not enough slots are > immediately available, an operation can end up waiting for itself, or > two or more operations can wait for each other to complete, which > results in a hang. > > Fix this by adding a flag to MirrorOp that tells us if the request is > already in flight (and therefore occupies slots that it will later > free), and picking only such operations for waiting. > > Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 > Signed-off-by: Kevin Wolf > --- > block/mirror.c | 9 - > 1 file changed, 8 insertions(+), 1 deletion(-) > > diff --git a/block/mirror.c b/block/mirror.c > index 393131b135..88414d1653 100644 > --- a/block/mirror.c > +++ b/block/mirror.c [...] > @@ -1318,6 +1324,7 @@ static MirrorOp *coroutine_fn > active_write_prepare(MirrorBlockJob *s, > .offset = offset, > .bytes = bytes, > .is_active_write= true, > +.is_in_flight = true, > }; > qemu_co_queue_init(>waiting_requests); > QTAILQ_INSERT_TAIL(>ops_in_flight, op, next); > There is a mirror_wait_on_conflicts() call after this. I was a bit worried about dependencies there. But I don’t think there’s any problem, because mirror_wait_for_any_operation() is only called by: (1) mirror_wait_for_free_in_flight_slot(), which makes it look for non-active operations only, and (2) mirror_run(), which specifically waits for all active operations to settle, so it makes sense to wait for all of them, even when they are still doing their own dependency-waiting. But still, I’m not sure whether this is conceptually the best thing to do. 
I think what we actually want is for mirror_wait_for_free_in_flight_slot() to only wait for in-flight operations; but the call in mirror_run() that waits for active-mirror operations wants to wait for all active-mirror operations, not just the ones that are in flight. So I think conceptually it would make more sense to set is_in_flight only after mirror_wait_on_conflicts(), and ensure that the mirror_wait_for_any_operation() call from mirror_run() ignores is_in_flight. E.g. by having another parameter “bool in_flight” for mirror_wait_for_any_operation() that chooses whether to check is_in_flight or whether to ignore it. In practice, @in_flight would always be the same as @active, but they are different things. But that would mean we would always ignore is_in_flight for active-mirror operations. In practice, there’s no difference to what this patch does, i.e. to just let active-mirror operations have is_in_flight to be always true and let mirror_wait_for_any_operation() check is_in_flight unconditionally. So I don’t know. Maybe this is a start: Functionally-reviewed-by: Max Reitz Max signature.asc Description: OpenPGP digital signature
Re: [PATCH v6 23/61] target/riscv: vector integer merge and move instructions
On Tue, Mar 17, 2020 at 8:53 AM LIU Zhiwei wrote: > > Signed-off-by: LIU Zhiwei Reviewed-by: Alistair Francis Alistair > --- > target/riscv/helper.h | 17 > target/riscv/insn32.decode | 7 ++ > target/riscv/insn_trans/trans_rvv.inc.c | 121 > target/riscv/vector_helper.c| 100 > 4 files changed, 245 insertions(+) > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h > index 1f0d3d60e3..f378db9cbf 100644 > --- a/target/riscv/helper.h > +++ b/target/riscv/helper.h > @@ -665,3 +665,20 @@ DEF_HELPER_6(vwmaccsu_vx_w, void, ptr, ptr, tl, ptr, > env, i32) > DEF_HELPER_6(vwmaccus_vx_b, void, ptr, ptr, tl, ptr, env, i32) > DEF_HELPER_6(vwmaccus_vx_h, void, ptr, ptr, tl, ptr, env, i32) > DEF_HELPER_6(vwmaccus_vx_w, void, ptr, ptr, tl, ptr, env, i32) > + > +DEF_HELPER_6(vmerge_vvm_b, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vvm_h, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vvm_w, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vvm_d, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_b, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_h, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_w, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_d, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_b, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_h, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_w, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_d, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_x_b, void, ptr, i64, env, i32) > +DEF_HELPER_4(vmv_v_x_h, void, ptr, i64, env, i32) > +DEF_HELPER_4(vmv_v_x_w, void, ptr, i64, env, i32) > +DEF_HELPER_4(vmv_v_x_d, void, ptr, i64, env, i32) > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode > index 9735ac3565..adb76956c9 100644 > --- a/target/riscv/insn32.decode > +++ b/target/riscv/insn32.decode > @@ -71,6 +71,7 @@ > @r_nfvm ... ... vm:1 . . ... . ... %nf %rs2 %rs1 %rd > @r_vm.. vm:1 . . ... . ... 
%rs2 %rs1 %rd > @r_vm_1 .. . . . ... . ... vm=1 %rs2 %rs1 %rd > +@r_vm_0 .. . . . ... . ... vm=0 %rs2 %rs1 %rd > @r_wdvm . wd:1 vm:1 . . ... . ... %rs2 %rs1 %rd > @r2_zimm . zimm:11 . ... . ... %rs1 %rd > > @@ -400,6 +401,12 @@ vwmacc_vx 01 . . . 110 . 1010111 > @r_vm > vwmaccsu_vv 10 . . . 010 . 1010111 @r_vm > vwmaccsu_vx 10 . . . 110 . 1010111 @r_vm > vwmaccus_vx 11 . . . 110 . 1010111 @r_vm > +vmv_v_v 010111 1 0 . 000 . 1010111 @r2 > +vmv_v_x 010111 1 0 . 100 . 1010111 @r2 > +vmv_v_i 010111 1 0 . 011 . 1010111 @r2 > +vmerge_vvm 010111 0 . . 000 . 1010111 @r_vm_0 > +vmerge_vxm 010111 0 . . 100 . 1010111 @r_vm_0 > +vmerge_vim 010111 0 . . 011 . 1010111 @r_vm_0 > > vsetvli 0 ... . 111 . 1010111 @r2_zimm > vsetvl 100 . . 111 . 1010111 @r > diff --git a/target/riscv/insn_trans/trans_rvv.inc.c > b/target/riscv/insn_trans/trans_rvv.inc.c > index 269d04c7fb..42ef59364f 100644 > --- a/target/riscv/insn_trans/trans_rvv.inc.c > +++ b/target/riscv/insn_trans/trans_rvv.inc.c > @@ -1499,3 +1499,124 @@ GEN_OPIVX_WIDEN_TRANS(vwmaccu_vx) > GEN_OPIVX_WIDEN_TRANS(vwmacc_vx) > GEN_OPIVX_WIDEN_TRANS(vwmaccsu_vx) > GEN_OPIVX_WIDEN_TRANS(vwmaccus_vx) > + > +/* Vector Integer Merge and Move Instructions */ > +static bool trans_vmv_v_v(DisasContext *s, arg_vmv_v_v *a) > +{ > +if (vext_check_isa_ill(s) && > +vext_check_reg(s, a->rd, false) && > +vext_check_reg(s, a->rs1, false)) { > + > +if (s->vl_eq_vlmax) { > +tcg_gen_gvec_mov(s->sew, vreg_ofs(s, a->rd), > + vreg_ofs(s, a->rs1), > + MAXSZ(s), MAXSZ(s)); > +} else { > +uint32_t data = FIELD_DP32(0, VDATA, LMUL, s->lmul); > +static gen_helper_gvec_2_ptr * const fns[4] = { > +gen_helper_vmv_v_v_b, gen_helper_vmv_v_v_h, > +gen_helper_vmv_v_v_w, gen_helper_vmv_v_v_d, > +}; > + > +tcg_gen_gvec_2_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, a->rs1), > + cpu_env, 0, s->vlen / 8, data, fns[s->sew]); > +} > +return true; > +} > +return false; > +} > + > +typedef void gen_helper_vmv_vx(TCGv_ptr, TCGv_i64, TCGv_env, TCGv_i32); > +static bool 
trans_vmv_v_x(DisasContext *s, arg_vmv_v_x *a) > +{ > +if (vext_check_isa_ill(s) && > +vext_check_reg(s, a->rd, false)) { > + > +TCGv s1 = tcg_temp_new(); > +gen_get_gpr(s1, a->rs1); > + > +if (s->vl_eq_vlmax) { > +#ifdef TARGET_RISCV64 > +
[Bug 1869241] [NEW] svn checkout fails with E000075 "Value too large for defined data type"
Public bug reported: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: svn: E000075: Can't read directory '...': Value too large for defined data type This is the repository I'm trying to check out: https://svn.baycom.de/repos/vdr-mcli-plugin/ qemu-arm-static version is 4.2.0 I've also tried older versions without change. Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 ** Affects: qemu Importance: Undecided Status: New ** Description changed: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: - svn: E000075: Can't read directory '...': Value too large for + svn: E000075: Can't read directory '...': Value too large for defined data type qemu-arm-static version is 4.2.0 I've also tried older versions without change. Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: - https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L7228 + https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 ** Description changed: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: svn: E000075: Can't read directory '...': Value too large for defined data type + + This is the repository I'm trying to check out: + https://svn.baycom.de/repos/vdr-mcli-plugin/ qemu-arm-static version is 4.2.0 I've also tried older versions without change. 
Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1869241 Title: svn checkout fails with E000075 "Value too large for defined data type" Status in QEMU: New Bug description: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: svn: E000075: Can't read directory '...': Value too large for defined data type This is the repository I'm trying to check out: https://svn.baycom.de/repos/vdr-mcli-plugin/ qemu-arm-static version is 4.2.0 I've also tried older versions without change. Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1869241/+subscriptions
Re: [PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
On 26/03/20 18:14, Peter Maydell wrote: >> +#ifndef atomic_fetch_add >> #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) >> #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, >> __ATOMIC_SEQ_CST) >> +#endif > > This will work around FreeBSD's current implementation in particular, > but I don't think there's anything in the C11 spec that mandates that > atomic_fetch_add() and friends have to be macros and not simply > functions... That's not a problem as long as they are all functions, the macros would simply override the function-based implementation. Paolo
Re: [PATCH v4 0/2] Replaced locks with lock guard macros
On Wed, 25 Mar 2020 at 04:35, Daniel Brodsky wrote: > > There may be ways to rewrite that expression to avoid triggering the > > warning on a 32-bit platform. Untested, but does this help: > > > > if (sizeof(mask) > 4 && mask <= 0xu) { > > > > -- > > Eric Blake, Principal Software Engineer > > Red Hat, Inc. +1-919-301-3226 > > Virtualization: qemu.org | libvirt.org > > > Unfortunately, the compiler still catches that the statement is always > true for 32-bit. > An alternative might be to convert cases like this to a macro instead, > which doesn't > cause any errors. My preference is to add -Wno-tautological-type-limit-compare in configure, so we don't have to work around this same issue elsewhere in the code base. r~
[Bug 1869073] Re: qemu-arm-static crashes "segmentation fault" when running "git clone -s"
Actually this one magically went good. I'm testing in Virtual Box as my ARM64 host. Maybe something went wrong there. After rebooting the whole machine today "git" works well. Will reopen if it happens again... ** Changed in: qemu Status: New => Incomplete -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1869073 Title: qemu-arm-static crashes "segmentation fault" when running "git clone -s" Status in QEMU: Incomplete Bug description: I want to use qemu-arm-static to cross-compile software. The compiler itself is a native cross-compiler connected via "distcc". The problem is that a script tries to do some stuff with "git" and with a "git clone -s" command the whole story reproducibly stops with a "segmentation fault". I don't know how to properly debug the issue but it happens 100% of the time that I get the "crash" or git just hangs forever with 100% CPU usage. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1869073/+subscriptions
Re: [PATCH v9 1/9] hw/vfio/common: Remove error print on mmio region translation by viommu
On Thu, 26 Mar 2020 18:35:48 +0100 Auger Eric wrote: > Hi Alex, > > On 3/24/20 12:08 AM, Alex Williamson wrote: > > [Cc +dwg who originated this warning] > > > > On Mon, 23 Mar 2020 14:16:09 +0530 > > Bharat Bhushan wrote: > > > >> On ARM, the MSI doorbell is translated by the virtual IOMMU. > >> As such address_space_translate() returns the MSI controller > >> MMIO region and we get an "iommu map to non memory area" > >> message. Let's remove this latter. > >> > >> Signed-off-by: Eric Auger > >> Signed-off-by: Bharat Bhushan > >> --- > >> hw/vfio/common.c | 2 -- > >> 1 file changed, 2 deletions(-) > >> > >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c > >> index 5ca11488d6..c586edf47a 100644 > >> --- a/hw/vfio/common.c > >> +++ b/hw/vfio/common.c > >> @@ -426,8 +426,6 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void > >> **vaddr, > >> , , writable, > >> MEMTXATTRS_UNSPECIFIED); > >> if (!memory_region_is_ram(mr)) { > >> -error_report("iommu map to non memory area %"HWADDR_PRIx"", > >> - xlat); > >> return false; > >> } > >> > > > > I'm a bit confused here, I think we need more justification beyond "we > > hit this warning and we don't want to because it's ok in this one > > special case, therefore remove it". I assume the special case is that > > the device MSI address is managed via the SET_IRQS ioctl and therefore > > we won't actually get DMAs to this range. > Yes exactly. The guest creates a mapping between one giova and this gpa > (corresponding to the MSI controller doorbell) because MSIs are mapped > on ARM. But practically the physical device is programmed with an host > chosen iova that maps onto the physical MSI controller's doorbell. so > the device never performs DMA accesses to this range. > > But I imagine the case that > > was in mind when adding this warning was general peer-to-peer between > > and assigned and emulated device. > yes makes sense. 
> > Maybe there's an argument to be made > > that such a p2p mapping might also be used in a non-vIOMMU case. We > > skip creating those mappings and drivers continue to work, maybe > > because nobody attempts to do p2p DMA with the types of devices we > > emulate, maybe because p2p DMA is not absolutely reliable on bare metal > > and drivers test it before using it. > MSI doorbells are mapped using the IOMMU_MMIO flag (dma-iommu.c > iommu_dma_get_msi_page). > One idea could be to pass that flag through the IOMMU Notifier mechanism > into the iotlb->perm. Eventually when we get this in vfio_get_vaddr() we > would not print the warning. Could that make sense? Yeah, if we can identify a valid case that doesn't need a warning, that's fine by me. Thanks, Alex
Re: [PATCH v16 QEMU 05/16] vfio: Add migration region initialization and finalize function
* Kirti Wankhede (kwankh...@nvidia.com) wrote: > - Migration functions are implemented for VFIO_DEVICE_TYPE_PCI device in this > patch series. > - VFIO device supports migration or not is decided based of migration region > query. If migration region query is successful and migration region > initialization is successful then migration is supported else migration is > blocked. > > Signed-off-by: Kirti Wankhede > Reviewed-by: Neo Jia > --- > hw/vfio/Makefile.objs | 2 +- > hw/vfio/migration.c | 138 > ++ > hw/vfio/trace-events | 3 + > include/hw/vfio/vfio-common.h | 9 +++ > 4 files changed, 151 insertions(+), 1 deletion(-) > create mode 100644 hw/vfio/migration.c > > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs > index 9bb1c09e8477..8b296c889ed9 100644 > --- a/hw/vfio/Makefile.objs > +++ b/hw/vfio/Makefile.objs > @@ -1,4 +1,4 @@ > -obj-y += common.o spapr.o > +obj-y += common.o spapr.o migration.o > obj-$(CONFIG_VFIO_PCI) += pci.o pci-quirks.o display.o > obj-$(CONFIG_VFIO_CCW) += ccw.o > obj-$(CONFIG_VFIO_PLATFORM) += platform.o > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > new file mode 100644 > index ..a078dcf1dd8f > --- /dev/null > +++ b/hw/vfio/migration.c > @@ -0,0 +1,138 @@ > +/* > + * Migration support for VFIO devices > + * > + * Copyright NVIDIA, Inc. 2019 Time flies by... > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. Are you sure you want this to be V2 only? Most code added to qemu now is v2 or later. 
> + */ > + > +#include "qemu/osdep.h" > +#include > + > +#include "hw/vfio/vfio-common.h" > +#include "cpu.h" > +#include "migration/migration.h" > +#include "migration/qemu-file.h" > +#include "migration/register.h" > +#include "migration/blocker.h" > +#include "migration/misc.h" > +#include "qapi/error.h" > +#include "exec/ramlist.h" > +#include "exec/ram_addr.h" > +#include "pci.h" > +#include "trace.h" > + > +static void vfio_migration_region_exit(VFIODevice *vbasedev) > +{ > +VFIOMigration *migration = vbasedev->migration; > + > +if (!migration) { > +return; > +} > + > +if (migration->region.size) { > +vfio_region_exit(>region); > +vfio_region_finalize(>region); > +} > +} > + > +static int vfio_migration_region_init(VFIODevice *vbasedev, int index) > +{ > +VFIOMigration *migration = vbasedev->migration; > +Object *obj = NULL; > +int ret = -EINVAL; > + > +if (!vbasedev->ops->vfio_get_object) { > +return ret; > +} > + > +obj = vbasedev->ops->vfio_get_object(vbasedev); > +if (!obj) { > +return ret; > +} > + > +ret = vfio_region_setup(obj, vbasedev, >region, index, > +"migration"); > +if (ret) { > +error_report("%s: Failed to setup VFIO migration region %d: %s", > + vbasedev->name, index, strerror(-ret)); > +goto err; > +} > + > +if (!migration->region.size) { > +ret = -EINVAL; > +error_report("%s: Invalid region size of VFIO migration region %d: > %s", > + vbasedev->name, index, strerror(-ret)); > +goto err; > +} > + > +return 0; > + > +err: > +vfio_migration_region_exit(vbasedev); > +return ret; > +} > + > +static int vfio_migration_init(VFIODevice *vbasedev, > + struct vfio_region_info *info) > +{ > +int ret; > + > +vbasedev->migration = g_new0(VFIOMigration, 1); > + > +ret = vfio_migration_region_init(vbasedev, info->index); > +if (ret) { > +error_report("%s: Failed to initialise migration region", > + vbasedev->name); > +g_free(vbasedev->migration); > +vbasedev->migration = NULL; > +return ret; > +} > + > +return 0; > +} > + > +/* -- */ > + > +int 
vfio_migration_probe(VFIODevice *vbasedev, Error **errp) > +{ > +struct vfio_region_info *info; > +Error *local_err = NULL; > +int ret; > + > +ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, > + VFIO_REGION_SUBTYPE_MIGRATION, ); > +if (ret) { > +goto add_blocker; > +} > + > +ret = vfio_migration_init(vbasedev, info); > +if (ret) { > +goto add_blocker; > +} > + > +trace_vfio_migration_probe(vbasedev->name, info->index); > +return 0; > + > +add_blocker: > +error_setg(>migration_blocker, > + "VFIO device doesn't support migration"); > +ret = migrate_add_blocker(vbasedev->migration_blocker, _err); > +if (local_err) { > +error_propagate(errp, local_err); > +error_free(vbasedev->migration_blocker); > +} > +return ret; > +} > + > +void vfio_migration_finalize(VFIODevice
Re: [RFC v6 01/24] update-linux-headers: Import iommu.h
Hi Yi, On 3/26/20 1:58 PM, Liu, Yi L wrote: >> From: Eric Auger >> Sent: Saturday, March 21, 2020 12:58 AM >> To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-devel@nongnu.org; >> Subject: [RFC v6 01/24] update-linux-headers: Import iommu.h >> >> Update the script to import the new iommu.h uapi header. >> >> Signed-off-by: Eric Auger >> --- >> scripts/update-linux-headers.sh | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/scripts/update-linux-headers.sh >> b/scripts/update-linux-headers.sh index >> 29c27f4681..5b64ee3912 100755 >> --- a/scripts/update-linux-headers.sh >> +++ b/scripts/update-linux-headers.sh >> @@ -141,7 +141,7 @@ done >> >> rm -rf "$output/linux-headers/linux" >> mkdir -p "$output/linux-headers/linux" >> -for header in kvm.h vfio.h vfio_ccw.h vhost.h \ >> +for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \ >>psci.h psp-sev.h userfaultfd.h mman.h; do >> cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" >> done > > Hi Eric, > > This patch already got acked by from Cornelia. :-) > > https://patchwork.ozlabs.org/patch/1259643/ thanks for the heads-up! Every little step ... ;-) Thanks Eric > > Regards, > Yi Liu >
Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices
* Kirti Wankhede (kwankh...@nvidia.com) wrote: > These functions save and restore PCI device specific data - config > space of PCI device. > Tested save and restore with MSI and MSIX type. > > Signed-off-by: Kirti Wankhede > Reviewed-by: Neo Jia > --- > hw/vfio/pci.c | 163 > ++ > include/hw/vfio/vfio-common.h | 2 + > 2 files changed, 165 insertions(+) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 6c77c12e44b9..8deb11e87ef7 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -41,6 +41,7 @@ > #include "trace.h" > #include "qapi/error.h" > #include "migration/blocker.h" > +#include "migration/qemu-file.h" > > #define TYPE_VFIO_PCI "vfio-pci" > #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) > } > } > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr) > +{ > +PCIDevice *pdev = >pdev; > +VFIOBAR *bar = >bars[nr]; > +uint64_t addr; > +uint32_t addr_lo, addr_hi = 0; > + > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */ > +if (!bar->size) { > +return 0; > +} > + > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4); > + > +addr_lo = addr_lo & (bar->ioport ? 
PCI_BASE_ADDRESS_IO_MASK : > + PCI_BASE_ADDRESS_MEM_MASK); > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) { > +addr_hi = pci_default_read_config(pdev, > + PCI_BASE_ADDRESS_0 + (nr + 1) * 4, > 4); > +} > + > +addr = ((uint64_t)addr_hi << 32) | addr_lo; > + > +if (!QEMU_IS_ALIGNED(addr, bar->size)) { > +return -EINVAL; > +} > + > +return 0; > +} > + > +static int vfio_bars_validate(VFIOPCIDevice *vdev) > +{ > +int i, ret; > + > +for (i = 0; i < PCI_ROM_SLOT; i++) { > +ret = vfio_bar_validate(vdev, i); > +if (ret) { > +error_report("vfio: BAR address %d validation failed", i); > +return ret; > +} > +} > +return 0; > +} > + > static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) > { > VFIOBAR *bar = >bars[nr]; > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice > *vbasedev) > return OBJECT(vdev); > } > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > +{ > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > +PCIDevice *pdev = >pdev; > +uint16_t pci_cmd; > +int i; > + > +for (i = 0; i < PCI_ROM_SLOT; i++) { > +uint32_t bar; > + > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); > +qemu_put_be32(f, bar); > +} > + > +qemu_put_be32(f, vdev->interrupt); > +if (vdev->interrupt == VFIO_INT_MSI) { > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > +bool msi_64bit; > + > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + > PCI_MSI_FLAGS, > +2); > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT); > + > +msi_addr_lo = pci_default_read_config(pdev, > + pdev->msi_cap + PCI_MSI_ADDRESS_LO, > 4); > +qemu_put_be32(f, msi_addr_lo); > + > +if (msi_64bit) { > +msi_addr_hi = pci_default_read_config(pdev, > + pdev->msi_cap + > PCI_MSI_ADDRESS_HI, > + 4); > +} > +qemu_put_be32(f, msi_addr_hi); > + > +msi_data = pci_default_read_config(pdev, > +pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : > PCI_MSI_DATA_32), > +2); > +qemu_put_be32(f, msi_data); > +} else if (vdev->interrupt == VFIO_INT_MSIX) { > +uint16_t offset; > + > +/* save enable bit and maskall bit */ > +offset = pci_default_read_config(pdev, > + pdev->msix_cap + PCI_MSIX_FLAGS + 1, > 2); > +qemu_put_be16(f, offset); > +msix_save(pdev, f); > +} > +pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > +qemu_put_be16(f, pci_cmd); > +} > + > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > +{ > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > +PCIDevice *pdev = >pdev; > +uint32_t interrupt_type; > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > +uint16_t pci_cmd; > +bool msi_64bit; > +int i, ret; > + > +/* retore pci bar configuration */ > +pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > +vfio_pci_write_config(pdev, PCI_COMMAND, > +pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), > 2); > +for (i = 0; i < PCI_ROM_SLOT; i++) { > +uint32_t
Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices
On Thu, 26 Mar 2020 17:29:26 + "Dr. David Alan Gilbert" wrote: > * Alex Williamson (alex.william...@redhat.com) wrote: > > On Wed, 25 Mar 2020 02:39:02 +0530 > > Kirti Wankhede wrote: > > > > > These functions save and restore PCI device specific data - config > > > space of PCI device. > > > Tested save and restore with MSI and MSIX type. > > > > > > Signed-off-by: Kirti Wankhede > > > Reviewed-by: Neo Jia > > > --- > > > hw/vfio/pci.c | 163 > > > ++ > > > include/hw/vfio/vfio-common.h | 2 + > > > 2 files changed, 165 insertions(+) > > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > index 6c77c12e44b9..8deb11e87ef7 100644 > > > --- a/hw/vfio/pci.c > > > +++ b/hw/vfio/pci.c > > > @@ -41,6 +41,7 @@ > > > #include "trace.h" > > > #include "qapi/error.h" > > > #include "migration/blocker.h" > > > +#include "migration/qemu-file.h" > > > > > > #define TYPE_VFIO_PCI "vfio-pci" > > > #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > > > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) > > > } > > > } > > > > > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr) > > > +{ > > > +PCIDevice *pdev = >pdev; > > > +VFIOBAR *bar = >bars[nr]; > > > +uint64_t addr; > > > +uint32_t addr_lo, addr_hi = 0; > > > + > > > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */ > > > +if (!bar->size) { > > > +return 0; > > > +} > > > + > > > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, > > > 4); > > > + > > > +addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK : > > > + PCI_BASE_ADDRESS_MEM_MASK); > > > > Nit, &= or combine with previous set. > > > > > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) { > > > +addr_hi = pci_default_read_config(pdev, > > > + PCI_BASE_ADDRESS_0 + (nr + 1) * > > > 4, 4); > > > +} > > > + > > > +addr = ((uint64_t)addr_hi << 32) | addr_lo; > > > > Could we use a union? 
> > > > > + > > > +if (!QEMU_IS_ALIGNED(addr, bar->size)) { > > > +return -EINVAL; > > > +} > > > > What specifically are we validating here? This should be true no > > matter what we wrote to the BAR or else BAR emulation is broken. The > > bits that could make this unaligned are not implemented in the BAR. > > That I think is based on a comment I asked a few versions back. > Remember the value being checked here is a value loaded from the > migration stream; it could be garbage, so it's good to do whatever > checks you can. It's not the migration stream though, we're reading it from config space emulation. The migration stream could have written absolutely anything to the device BAR and this test should still be ok. PCI BARs are naturally aligned by definition. The address bits that could make the value unaligned are not implemented. This is why we can determine the size of the BAR by writing -1 to it. Thanks, Alex > > > + > > > +return 0; > > > +} > > > + > > > +static int vfio_bars_validate(VFIOPCIDevice *vdev) > > > +{ > > > +int i, ret; > > > + > > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > > +ret = vfio_bar_validate(vdev, i); > > > +if (ret) { > > > +error_report("vfio: BAR address %d validation failed", i); > > > +return ret; > > > +} > > > +} > > > +return 0; > > > +} > > > + > > > static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) > > > { > > > VFIOBAR *bar = >bars[nr]; > > > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice > > > *vbasedev) > > > return OBJECT(vdev); > > > } > > > > > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > > +{ > > > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, > > > vbasedev); > > > +PCIDevice *pdev = >pdev; > > > +uint16_t pci_cmd; > > > +int i; > > > + > > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > > +uint32_t bar; > > > + > > > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, > > > 4); > > > +qemu_put_be32(f, bar); > > > +} > > > + > > 
> +qemu_put_be32(f, vdev->interrupt); > > > +if (vdev->interrupt == VFIO_INT_MSI) { > > > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > > > +bool msi_64bit; > > > + > > > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + > > > PCI_MSI_FLAGS, > > > +2); > > > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT); > > > + > > > +msi_addr_lo = pci_default_read_config(pdev, > > > + pdev->msi_cap + > > > PCI_MSI_ADDRESS_LO, 4); > > > +qemu_put_be32(f, msi_addr_lo); > > > + > > > +if (msi_64bit) { > > >
Re: [PATCH v9 1/9] hw/vfio/common: Remove error print on mmio region translation by viommu
Hi Alex, On 3/24/20 12:08 AM, Alex Williamson wrote: > [Cc +dwg who originated this warning] > > On Mon, 23 Mar 2020 14:16:09 +0530 > Bharat Bhushan wrote: > >> On ARM, the MSI doorbell is translated by the virtual IOMMU. >> As such address_space_translate() returns the MSI controller >> MMIO region and we get an "iommu map to non memory area" >> message. Let's remove this latter. >> >> Signed-off-by: Eric Auger >> Signed-off-by: Bharat Bhushan >> --- >> hw/vfio/common.c | 2 -- >> 1 file changed, 2 deletions(-) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 5ca11488d6..c586edf47a 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -426,8 +426,6 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void >> **vaddr, >> , , writable, >> MEMTXATTRS_UNSPECIFIED); >> if (!memory_region_is_ram(mr)) { >> -error_report("iommu map to non memory area %"HWADDR_PRIx"", >> - xlat); >> return false; >> } >> > > I'm a bit confused here, I think we need more justification beyond "we > hit this warning and we don't want to because it's ok in this one > special case, therefore remove it". I assume the special case is that > the device MSI address is managed via the SET_IRQS ioctl and therefore > we won't actually get DMAs to this range. Yes exactly. The guest creates a mapping between one giova and this gpa (corresponding to the MSI controller doorbell) because MSIs are mapped on ARM. But practically the physical device is programmed with an host chosen iova that maps onto the physical MSI controller's doorbell. so the device never performs DMA accesses to this range. But I imagine the case that > was in mind when adding this warning was general peer-to-peer between > and assigned and emulated device. yes makes sense. Maybe there's an argument to be made > that such a p2p mapping might also be used in a non-vIOMMU case. 
We > skip creating those mappings and drivers continue to work, maybe > because nobody attempts to do p2p DMA with the types of devices we > emulate, maybe because p2p DMA is not absolutely reliable on bare metal > and drivers test it before using it. MSI doorbells are mapped using the IOMMU_MMIO flag (dma-iommu.c iommu_dma_get_msi_page). One idea could be to pass that flag through the IOMMU Notifier mechanism into the iotlb->perm. Eventually when we get this in vfio_get_vaddr() we would not print the warning. Could that make sense? Thanks Eric Anyway, I need a better argument > why this is ok. Thanks, > > Alex >
Re: [PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
Peter Maydell writes: > On Thu, 26 Mar 2020 at 17:01, Alex Bennée wrote: >> >> Deep inside the FreeBSD netmap headers we end up including stdatomic.h >> which clashes with qemu's atomic functions which are modelled along >> the C11 standard. To avoid a massive rename lets just ifdef around the >> problem. >> >> Signed-off-by: Alex Bennée >> --- >> include/qemu/atomic.h | 6 ++ >> 1 file changed, 6 insertions(+) >> >> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h >> index f9cd24c8994..ff72db51154 100644 >> --- a/include/qemu/atomic.h >> +++ b/include/qemu/atomic.h >> @@ -208,11 +208,14 @@ >> /* Provide shorter names for GCC atomic builtins, return old value */ >> #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) >> #define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST) >> + >> +#ifndef atomic_fetch_add >> #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) >> #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, >> __ATOMIC_SEQ_CST) >> +#endif > > This will work around FreeBSD's current implementation in particular, > but I don't think there's anything in the C11 spec that mandates that > atomic_fetch_add() and friends have to be macros and not simply > functions... Sure there are two alternative options: - Move to using stdatomic headers - on Linux they seem to be C++ only - Rename all out atomic functions - seems a bit of a big patch for rc releases I suspect we should look at option two for 5.1 -- Alex Bennée
Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices
* Alex Williamson (alex.william...@redhat.com) wrote: > On Wed, 25 Mar 2020 02:39:02 +0530 > Kirti Wankhede wrote: > > > These functions save and restore PCI device specific data - config > > space of PCI device. > > Tested save and restore with MSI and MSIX type. > > > > Signed-off-by: Kirti Wankhede > > Reviewed-by: Neo Jia > > --- > > hw/vfio/pci.c | 163 > > ++ > > include/hw/vfio/vfio-common.h | 2 + > > 2 files changed, 165 insertions(+) > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > index 6c77c12e44b9..8deb11e87ef7 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -41,6 +41,7 @@ > > #include "trace.h" > > #include "qapi/error.h" > > #include "migration/blocker.h" > > +#include "migration/qemu-file.h" > > > > #define TYPE_VFIO_PCI "vfio-pci" > > #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) > > } > > } > > > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr) > > +{ > > +PCIDevice *pdev = >pdev; > > +VFIOBAR *bar = >bars[nr]; > > +uint64_t addr; > > +uint32_t addr_lo, addr_hi = 0; > > + > > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */ > > +if (!bar->size) { > > +return 0; > > +} > > + > > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, > > 4); > > + > > +addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK : > > + PCI_BASE_ADDRESS_MEM_MASK); > > Nit, &= or combine with previous set. > > > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) { > > +addr_hi = pci_default_read_config(pdev, > > + PCI_BASE_ADDRESS_0 + (nr + 1) * > > 4, 4); > > +} > > + > > +addr = ((uint64_t)addr_hi << 32) | addr_lo; > > Could we use a union? > > > + > > +if (!QEMU_IS_ALIGNED(addr, bar->size)) { > > +return -EINVAL; > > +} > > What specifically are we validating here? This should be true no > matter what we wrote to the BAR or else BAR emulation is broken. 
The > bits that could make this unaligned are not implemented in the BAR. That I think is based on a comment I asked a few versions back. Remember the value being checked here is a value loaded from the migration stream; it could be garbage, so it's good to do whatever checks you can. > > + > > +return 0; > > +} > > + > > +static int vfio_bars_validate(VFIOPCIDevice *vdev) > > +{ > > +int i, ret; > > + > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > +ret = vfio_bar_validate(vdev, i); > > +if (ret) { > > +error_report("vfio: BAR address %d validation failed", i); > > +return ret; > > +} > > +} > > +return 0; > > +} > > + > > static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) > > { > > VFIOBAR *bar = >bars[nr]; > > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice > > *vbasedev) > > return OBJECT(vdev); > > } > > > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > +{ > > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > +PCIDevice *pdev = >pdev; > > +uint16_t pci_cmd; > > +int i; > > + > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > +uint32_t bar; > > + > > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); > > +qemu_put_be32(f, bar); > > +} > > + > > +qemu_put_be32(f, vdev->interrupt); > > +if (vdev->interrupt == VFIO_INT_MSI) { > > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > > +bool msi_64bit; > > + > > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + > > PCI_MSI_FLAGS, > > +2); > > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT); > > + > > +msi_addr_lo = pci_default_read_config(pdev, > > + pdev->msi_cap + > > PCI_MSI_ADDRESS_LO, 4); > > +qemu_put_be32(f, msi_addr_lo); > > + > > +if (msi_64bit) { > > +msi_addr_hi = pci_default_read_config(pdev, > > + pdev->msi_cap + > > PCI_MSI_ADDRESS_HI, > > + 4); > > +} > > +qemu_put_be32(f, msi_addr_hi); > > + > > +msi_data = pci_default_read_config(pdev, > > +pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : > > PCI_MSI_DATA_32), > > +2); > > +qemu_put_be32(f, msi_data); > > Isn't the data field only a u16? > > > +} else if (vdev->interrupt == VFIO_INT_MSIX) { > > +uint16_t offset; > > + > > +/* save enable bit and maskall bit */ > > +offset = pci_default_read_config(pdev, > > +
Re: [PATCH v2 0/3] Fix some AIO context locking in jobs
Patchew URL: https://patchew.org/QEMU/20200326155628.859862-1-s.rei...@proxmox.com/ Hi, This series failed the asan build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #!/bin/bash export ARCH=x86_64 make docker-image-fedora V=1 NETWORK=1 time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1 === TEST SCRIPT END === PASS 1 fdc-test /x86_64/fdc/cmos PASS 2 fdc-test /x86_64/fdc/no_media_on_start PASS 3 fdc-test /x86_64/fdc/read_without_media ==6158==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 4 fdc-test /x86_64/fdc/media_change PASS 5 fdc-test /x86_64/fdc/sense_interrupt PASS 6 fdc-test /x86_64/fdc/relative_seek --- PASS 32 test-opts-visitor /visitor/opts/range/beyond PASS 33 test-opts-visitor /visitor/opts/dict/unvisited MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} tests/test-coroutine -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-coroutine" ==6219==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! ==6219==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffcb9e2d000; bottom 0x7fc4e542; size: 0x0037d4a0d000 (239790510080) False positive error reports may follow For details see https://github.com/google/sanitizers/issues/189 PASS 1 test-coroutine /basic/no-dangling-access --- PASS 11 test-aio /aio/event/wait PASS 12 test-aio /aio/event/flush PASS 13 test-aio /aio/event/wait/no-flush-cb ==6234==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! 
PASS 14 test-aio /aio/timer/schedule PASS 15 test-aio /aio/coroutine/queue-chaining PASS 16 test-aio /aio-gsource/flush --- PASS 12 fdc-test /x86_64/fdc/read_no_dma_19 PASS 13 fdc-test /x86_64/fdc/fuzz-registers MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/ide-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="ide-test" ==6242==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 28 test-aio /aio-gsource/timer/schedule MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} tests/test-aio-multithread -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-aio-multithread" PASS 1 ide-test /x86_64/ide/identify ==6249==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 1 test-aio-multithread /aio/multi/lifecycle ==6251==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 2 ide-test /x86_64/ide/flush PASS 2 test-aio-multithread /aio/multi/schedule ==6268==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 3 ide-test /x86_64/ide/bmdma/simple_rw ==6279==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 3 test-aio-multithread /aio/multi/mutex/contended PASS 4 ide-test /x86_64/ide/bmdma/trim ==6290==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! 
PASS 4 test-aio-multithread /aio/multi/mutex/handoff PASS 5 test-aio-multithread /aio/multi/mutex/mcs PASS 6 test-aio-multithread /aio/multi/mutex/pthread --- PASS 6 test-throttle /throttle/detach_attach PASS 7 test-throttle /throttle/config_functions PASS 8 test-throttle /throttle/accounting ==6307==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 9 test-throttle /throttle/groups PASS 10 test-throttle /throttle/config/enabled PASS 11 test-throttle /throttle/config/conflicting --- PASS 14 test-throttle /throttle/config/max PASS 15 test-throttle /throttle/config/iops_size MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} tests/test-thread-pool -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-thread-pool" ==6311==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 1 test-thread-pool /thread-pool/submit PASS 2 test-thread-pool /thread-pool/submit-aio PASS 3 test-thread-pool /thread-pool/submit-co PASS 4 test-thread-pool /thread-pool/submit-many ==6378==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 5 test-thread-pool /thread-pool/cancel PASS 6 test-thread-pool /thread-pool/cancel-async MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}
Re: qemu-system-aarch64 windows binary run Arm64 defconfig kernel not working
hi Peter: > Are you using a 32-bit Windows qemu binary or a 64-bit one? > (I'm wondering if this is a 32-bit vs 64-bit bug.) unfortunately, 32-bit Windows qemu binary still fail on booting upstream arm64 defconfig kernel. in below log, I 1st purposely type wrong option, such that you can tell where the binary located. Then as you can see "synchronous external abort" still happens. thanks for ur kind help, $ qemu-system-aarch64 -v C:\Program Files (x86)\qemu\qemu-system-aarch64.exe: -v: invalid option $ qemu-system-aarch64 --version QEMU emulator version 4.2.0 (v4.2.0-11797-g2890edc853-dirty) Copyright (c) 2003-2019 Fabrice Bellard and the QEMU Project developers $ ./run_script/run.sh [ 0.00] Booting Linux on physical CPU 0x00 [0x411fd070] [ 0.00] Linux version 5.5.0-rc7-00017-gb87a2ead99b2 (guest@ubuntu14_04) (gcc version 6.1.1 20160711 (Linaro GCC 6.1-2016.08)) #1 SMP PREEMPT Mon Mar 23 15:05:29 CST 2020 [ 0.00] Machine model: linux,dummy-virt [ 0.00] efi: Getting EFI parameters from FDT: [ 0.00] efi: UEFI not found. [ 0.00] cma: Reserved 32 MiB at 0x7e00 [ 0.00] NUMA: No NUMA configuration found [ 0.00] NUMA: Faking a node at [mem 0x4000-0x7fff] [ 0.00] NUMA: NODE_DATA [mem 0x7ddf3100-0x7ddf4fff] [ 0.00] Zone ranges: [ 0.00] DMA [mem 0x4000-0x7fff] [ 0.00] DMA32 empty [ 0.00] Normal empty [ 0.00] Movable zone start for each node [ 0.00] Early memory node ranges [ 0.00] node 0: [mem 0x4000-0x7fff] [ 0.00] Initmem setup node 0 [mem 0x4000-0x7fff] [ 0.00] psci: probing for conduit method from DT. [ 0.00] psci: PSCIv0.2 detected in firmware. 
[ 0.00] psci: Using standard PSCI v0.2 function IDs [ 0.00] psci: Trusted OS migration not required [ 0.00] percpu: Embedded 22 pages/cpu s53016 r8192 d28904 u90112 [ 0.00] Detected PIPT I-cache on CPU0 [ 0.00] CPU features: detected: ARM erratum 832075 [ 0.00] CPU features: detected: ARM erratum 834220 [ 0.00] CPU features: detected: EL2 vector hardening [ 0.00] ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware [ 0.00] CPU features: detected: ARM erratum 1319367 [ 0.00] Built 1 zonelists, mobility grouping on. Total pages: 258048 [ 0.00] Policy zone: DMA [ 0.00] Kernel command line: console=ttyAMA0 root=/dev/vda [ 0.00] Dentry cache hash table entries: 131072 (order: 8, 1048576 bytes, linear) [ 0.00] Inode-cache hash table entries: 65536 (order: 7, 524288 bytes, linear) [ 0.00] mem auto-init: stack:off, heap alloc:off, heap free:off [ 0.00] Memory: 968848K/1048576K available (12284K kernel code, 1922K rwdata, 6776K rodata, 5184K init, 458K bss, 46960K reserved, 32768K cma-reserved) [ 0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1 [ 0.00] rcu: Preemptible hierarchical RCU implementation. [ 0.00] rcu: RCU restricting CPUs from NR_CPUS=256 to nr_cpu_ids=2. [ 0.00] Tasks RCU enabled. [ 0.00] rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies. [ 0.00] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2 [ 0.00] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0 [ 0.00] GICv2m: range[mem 0x0802-0x08020fff], SPI[80:143] [ 0.00] random: get_random_bytes called from start_kernel+0x2b8/0x454 with crng_init=0 [ 0.00] arch_timer: cp15 timer(s) running at 62.50MHz (virt). [ 0.00] clocksource: arch_sys_counter: mask: 0xff max_cycles: 0x1cd42e208c, max_idle_ns: 881590405314 ns [ 0.000313] sched_clock: 56 bits at 62MHz, resolution 16ns, wraps every 4398046511096ns [ 0.012734] Console: colour dummy device 80x25 [ 0.016333] Calibrating delay loop (skipped), value calculated using timer frequency.. 
125.00 BogoMIPS (lpj=25) [ 0.016608] pid_max: default: 32768 minimum: 301 [ 0.018603] LSM: Security Framework initializing [ 0.020522] Mount-cache hash table entries: 2048 (order: 2, 16384 bytes, linear) [ 0.020596] Mountpoint-cache hash table entries: 2048 (order: 2, 16384 bytes, linear) [ 0.109764] ASID allocator initialised with 32768 entries [ 0.118473] rcu: Hierarchical SRCU implementation. [ 0.132871] EFI services will not be available. [ 0.134747] smp: Bringing up secondary CPUs ... [ 0.178926] Detected PIPT I-cache on CPU1 [ 0.180097] CPU1: Booted secondary processor 0x01 [0x411fd070] [ 0.185193] smp: Brought up 1 node, 2 CPUs [ 0.185252] SMP: Total of 2 processors activated. [ 0.185318] CPU features: detected: 32-bit EL0 Support [ 0.185433] CPU features: detected: CRC32 instructions [ 0.215643] CPU: All CPU(s) started at EL1 [ 0.216146] alternatives: patching kernel code [ 0.248177] devtmpfs: initialized [ 0.268462] KASLR disabled due to lack of seed [ 0.274809] clocksource: jiffies: mask: 0x max_cycles: 0x, max_idle_ns: 764504178510 ns [ 0.275044] futex hash table entries: 512 (order: 3, 32768
Re: [PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
On Thu, 26 Mar 2020 at 17:01, Alex Bennée wrote: > > Deep inside the FreeBSD netmap headers we end up including stdatomic.h > which clashes with qemu's atomic functions which are modelled along > the C11 standard. To avoid a massive rename lets just ifdef around the > problem. > > Signed-off-by: Alex Bennée > --- > include/qemu/atomic.h | 6 ++ > 1 file changed, 6 insertions(+) > > diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h > index f9cd24c8994..ff72db51154 100644 > --- a/include/qemu/atomic.h > +++ b/include/qemu/atomic.h > @@ -208,11 +208,14 @@ > /* Provide shorter names for GCC atomic builtins, return old value */ > #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) > #define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST) > + > +#ifndef atomic_fetch_add > #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) > +#endif This will work around FreeBSD's current implementation in particular, but I don't think there's anything in the C11 spec that mandates that atomic_fetch_add() and friends have to be macros and not simply functions... thanks -- PMM
Re: [PATCH] qcow2: Remove unused fields from BDRVQcow2State
On 3/26/20 12:07 PM, Kevin Wolf wrote: These fields were already removed in commit c3c10f72, but then commit b58deb34 revived them probably due to bad merge conflict resolution. They are still unused, so remove them again. Fixes: b58deb344ddff3b9d8b265bf73a65274767ee5f4 Signed-off-by: Kevin Wolf --- block/qcow2.h | 3 --- 1 file changed, 3 deletions(-) Reviewed-by: Eric Blake diff --git a/block/qcow2.h b/block/qcow2.h index 0942126232..f4de0a27d5 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,9 +301,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; -uint8_t *cluster_cache; -uint8_t *cluster_data; -uint64_t cluster_cache_offset; QLIST_HEAD(, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 0/3] Fix some AIO context locking in jobs
Patchew URL: https://patchew.org/QEMU/20200326155628.859862-1-s.rei...@proxmox.com/ Hi, This series failed the docker-quick@centos7 build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #!/bin/bash make docker-image-centos7 V=1 NETWORK=1 time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1 === TEST SCRIPT END === TESTcheck-unit: tests/test-bdrv-graph-mod TESTcheck-unit: tests/test-blockjob qemu: qemu_mutex_unlock_impl: Operation not permitted ERROR - too few tests run (expected 8, got 7) make: *** [check-unit] Error 1 make: *** Waiting for unfinished jobs TESTiotest-qcow2: 020 TESTiotest-qcow2: 021 --- raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=2724b2af546443c7a60d0f061f1beab7', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-7887uhcy/src/docker-src.2020-03-26-12.56.41.28979:/var/tmp/qemu:z,ro', 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit status 2. filter=--filter=label=com.qemu.instance.uuid=2724b2af546443c7a60d0f061f1beab7 make[1]: *** [docker-run] Error 1 make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-7887uhcy/src' make: *** [docker-run-test-quick@centos7] Error 2 real14m17.758s user0m8.171s The full log is available at http://patchew.org/logs/20200326155628.859862-1-s.rei...@proxmox.com/testing.docker-quick@centos7/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
[PATCH] qcow2: Remove unused fields from BDRVQcow2State
These fields were already removed in commit c3c10f72, but then commit b58deb34 revived them probably due to bad merge conflict resolution. They are still unused, so remove them again. Fixes: b58deb344ddff3b9d8b265bf73a65274767ee5f4 Signed-off-by: Kevin Wolf --- block/qcow2.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/qcow2.h b/block/qcow2.h index 0942126232..f4de0a27d5 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,9 +301,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; -uint8_t *cluster_cache; -uint8_t *cluster_data; -uint64_t cluster_cache_offset; QLIST_HEAD(, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; -- 2.20.1
[PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
Deep inside the FreeBSD netmap headers we end up including stdatomic.h which clashes with qemu's atomic functions which are modelled along the C11 standard. To avoid a massive rename lets just ifdef around the problem. Signed-off-by: Alex Bennée --- include/qemu/atomic.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index f9cd24c8994..ff72db51154 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -208,11 +208,14 @@ /* Provide shorter names for GCC atomic builtins, return old value */ #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) #define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST) + +#ifndef atomic_fetch_add #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) +#endif #define atomic_inc_fetch(ptr)__atomic_add_fetch(ptr, 1, __ATOMIC_SEQ_CST) #define atomic_dec_fetch(ptr)__atomic_sub_fetch(ptr, 1, __ATOMIC_SEQ_CST) @@ -392,11 +395,14 @@ /* Provide shorter names for GCC atomic builtins. */ #define atomic_fetch_inc(ptr) __sync_fetch_and_add(ptr, 1) #define atomic_fetch_dec(ptr) __sync_fetch_and_add(ptr, -1) + +#ifndef atomic_fetch_add #define atomic_fetch_add(ptr, n) __sync_fetch_and_add(ptr, n) #define atomic_fetch_sub(ptr, n) __sync_fetch_and_sub(ptr, n) #define atomic_fetch_and(ptr, n) __sync_fetch_and_and(ptr, n) #define atomic_fetch_or(ptr, n) __sync_fetch_and_or(ptr, n) #define atomic_fetch_xor(ptr, n) __sync_fetch_and_xor(ptr, n) +#endif #define atomic_inc_fetch(ptr) __sync_add_and_fetch(ptr, 1) #define atomic_dec_fetch(ptr) __sync_add_and_fetch(ptr, -1) -- 2.20.1
Re: [PULL 00/10] Block patches for 5.0-rc1
On Thu, 26 Mar 2020 at 14:29, Max Reitz wrote: > > The following changes since commit 01e38186ecb1fc6275720c5425332eed280ea93d: > > Merge remote-tracking branch > 'remotes/dgilbert/tags/pull-migration-20200325b' into staging (2020-03-26 > 09:28:11 +) > > are available in the Git repository at: > > https://github.com/XanClic/qemu.git tags/pull-block-2020-03-26 > > for you to fetch changes up to a507c51790fa955c1fccd4deca3c50476a862b83: > > iotests/138: Test leaks/corruptions fixed report (2020-03-26 14:52:43 +0100) > > > Block patches for 5.0-rc1: > - Fix qemu-img convert with a host device or iscsi target > - Use-after-free fix in mirror > - Some minor qcow2 fixes > - Minor sheepdog fix > - Minor qemu-img check report fix > > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/5.0 for any user-visible changes. -- PMM
Re: [PATCH] s390x: protvirt: Fix stray error_report_err in s390_machine_protect
On Thu, 26 Mar 2020 10:05:05 -0400 Janosch Frank wrote: > In case the protection of the machine fails at s390_pv_vm_enable(), > we'll currently report the local_error variable. Problem is that > there's no migration blocker error that we can report at this point so > the pointer is always NULL which leads to a SEGFAULT. > > Let's remove the error report. Yes, s390_pvm_vm_enable() will already moan on error. > > Signed-off-by: Janosch Frank > Reported-by: Marc Hartmayer > Fixes: 520935eedf941da3 ("s390x: protvirt: Add migration blocker") Note to self: update commit ID before I send a pull request (I will at least need to rebase for a headers update...) > --- > hw/s390x/s390-virtio-ccw.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c > index 3cf19c99f3468b7d..855ecf370d6e82fa 100644 > --- a/hw/s390x/s390-virtio-ccw.c > +++ b/hw/s390x/s390-virtio-ccw.c > @@ -358,7 +358,6 @@ static int s390_machine_protect(S390CcwMachineState *ms) > rc = s390_pv_vm_enable(); > if (rc) { > qemu_balloon_inhibit(false); > -error_report_err(local_err); > migrate_del_blocker(pv_mig_blocker); > error_free_or_abort(_mig_blocker); > return rc; Thanks, applied.
Re: [PATCH] qemu-user: fix build with LLVM lld 10
On 3/26/20 6:43 AM, Roger Pau Monne wrote: > lld 10.0.0 introduced a new linker option --image-base equivalent to > the GNU -Ttext-segment one, hence use it when available. > > This fixes the build of QEMU on systems using lld 10 or greater. > > Signed-off-by: Dimitry Andric > Signed-off-by: Roger Pau Monné > --- > Cc: Laurent Vivier > Cc: Richard Henderson > Cc: "Philippe Mathieu-Daudé" > Cc: Paolo Bonzini > Cc: "Daniel P. Berrangé" > --- The Plan is still to drop this whole section of code. However, it's still blocked on getting the x86_64 vsyscall patches upstream. r~
Re: [PATCH v9 3/4] linux-user: Support futex_time64
On Wed, Mar 25, 2020 at 11:22 PM Laurent Vivier wrote: > > Le 25/03/2020 à 18:41, Alistair Francis a écrit : > > On Wed, Mar 18, 2020 at 3:54 PM Alistair Francis > > wrote: > >> > >> Add support for host and target futex_time64. If futex_time64 exists on > >> the host we try that first before falling back to the standard futex > >> syscall. > >> > >> Signed-off-by: Alistair Francis > > > > @Laurent did you see this? > > > > I guess it's a little late for 5.0 but it would be nice to support. > > Yes, I've seen your patch. > > I think it can go into 5.0 RC because it's a bug fix and it is really > needed because the 32bit futex will be broken if host timespec uses > 64bit field. > > But I need to review and test before. Thanks! Alistair > > Thanks, > Laurent
Re: booke206_tlb_ways() returning 0
Le 26/03/2020 à 15:22, Peter Maydell a écrit : > Hi; Coverity points out a possible issue in booke206_get_tlbm() > (this is CID 1421942): > > > static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int > tlbn, > target_ulong ea, int way) > { > int r; > uint32_t ways = booke206_tlb_ways(env, tlbn); > int ways_bits = ctz32(ways); > int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); > int i; > > way &= ways - 1; > ea >>= MAS2_EPN_SHIFT; > ea &= (1 << (tlb_bits - ways_bits)) - 1; > r = (ea << ways_bits) | way; > > Here if booke206_tlb_ways() returns zero, then ways_bits() > will be 32 and the shift left 'ea << ways_bits' is undefined > behaviour. > > My first assumption was that booke206_tlb_ways() is not supposed > to ever return 0 (it's looking at what I think are read-only > system registers, and it doesn't make much sense to have > a zero-way TLB). So I tried adding an assert: > > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > index 88d94495554..aedb6bcb265 100644 > --- a/target/ppc/cpu.h > +++ b/target/ppc/cpu.h > @@ -2403,6 +2403,7 @@ static inline int booke206_tlb_ways(CPUPPCState > *env, int tlbn) > { > uint32_t tlbncfg = env->spr[SPR_BOOKE_TLB0CFG + tlbn]; > int r = tlbncfg >> TLBnCFG_ASSOC_SHIFT; > +assert(r > 0); > return r; > } > > However, this isn't right, because it causes one of the check-acceptance > tests to fail, with this backtrace: > > #3 0x7074d412 in __GI___assert_fail (assertion=0x560a0d7d > "r > 0", file=0x560a0d40 > "/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h", > line=2406, function=0x560a1720 <__PRETTY_FUNCTION__.20811> > "booke206_tlb_ways") at assert.c:101 > #4 0x55a9939b in booke206_tlb_ways (env=0x56e52a30, > tlbn=2) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h:2406 > #5 0x55a9b3ac in mmubooke206_get_physical_address > (env=0x56e52a30, ctx=0x7fffd77008a0, address=9223380835095282947, > rw=0, access_type=0, mmu_idx=1) at > 
/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1093 > #6 0x55a9c25d in get_physical_address_wtlb > (env=0x56e52a30, ctx=0x7fffd77008a0, eaddr=9223380835095282947, > rw=0, access_type=0, mmu_idx=1) at > /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1455 > #7 0x55a9c82b in cpu_ppc_handle_mmu_fault > (env=0x56e52a30, address=9223380835095282947, rw=0, mmu_idx=1) > at > /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1597 > #8 0x55a9f975 in ppc_cpu_tlb_fill (cs=0x56e49560, > addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, > mmu_idx=1, probe=false, retaddr=140735610345781) at > /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:3053 > #9 0x558e1422 in tlb_fill (cpu=0x56e49560, > addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, > mmu_idx=1, retaddr=140735610345781) at > /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1017 > #10 0x558e279b in load_helper (env=0x56e52a30, > addr=9223380835095282947, oi=1, retaddr=140735610345781, op=MO_8, > code_read=false, full_load=0x558e2b3a ) at > /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1534 > #11 0x558e2b80 in full_ldub_mmu (env=0x56e52a30, > addr=9223380835095282947, oi=1, retaddr=140735610345781) > at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1624 > #12 0x558e2bb4 in helper_ret_ldub_mmu (env=0x56e52a30, > addr=9223380835095282947, oi=1, retaddr=140735610345781) > at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1630 > #13 0x7fff900fd9c6 in code_gen_buffer () > #14 0x558f9915 in cpu_tb_exec (cpu=0x56e49560, > itb=0x7fff900fd780 ) > at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:172 > #15 0x558fa732 in cpu_loop_exec_tb (cpu=0x56e49560, > tb=0x7fff900fd780 , last_tb=0x7fffd7701078, > tb_exit=0x7fffd7701070) at > /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:619 > #16 0x558faa4c in cpu_exec (cpu=0x56e49560) at > 
/home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:732 > #17 0x558bcf29 in tcg_cpu_exec (cpu=0x56e49560) at > /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1405 > #18 0x558bd77f in qemu_tcg_cpu_thread_fn (arg=0x56e49560) > at /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1713 > #19 0x55f2ff3f in qemu_thread_start (args=0x56e8dd10) at > /home/petmay01/linaro/qemu-from-laptop/qemu/util/qemu-thread-posix.c:519 > #20 0x70b156db in start_thread (arg=0x7fffd7704700) at > pthread_create.c:463 > #21 0x7083e88f in clone () at > ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 > > So under what circumstances can booke206_tlb_ways() > validly return 0? Should booke206_get_tlbm() cope with
Re: [PULL 0/6] Linux user for 5.0 patches
Le 26/03/2020 à 16:42, Peter Maydell a écrit : > On Thu, 26 Mar 2020 at 07:24, Laurent Vivier wrote: >> >> The following changes since commit 736cf607e40674776d752acc201f565723e86045: >> >> Update version for v5.0.0-rc0 release (2020-03-24 17:50:00 +) >> >> are available in the Git repository at: >> >> git://github.com/vivier/qemu.git tags/linux-user-for-5.0-pull-request >> >> for you to fetch changes up to a52f5f87bece827a338d6eb3332e3def86fb9c33: >> >> linux-user: Flush out implementation of gettimeofday (2020-03-26 08:08:54 >> +0100) >> >> >> Emulate x86_64 vsyscalls >> Fix syscall_nr.h cleanup >> >> > > Still fails :-( I would say it was expected... I think your build dir is corrupted by a previous build. You should have old .o file without .d file, and thus the .o file is not refreshed (check the date of cpu_loop.o). We cannot cleanup that before starting the build. The purpose of the cleanup in configure was to avoid this kind of problem but I did it badly. If you want to check incremental build, cleanup your build dir, checkout v4.20 or master, build it, and then build the PR branch. it will work: it's tested. If you build 4.2.0, then master and then PR, the build dir will be corrupted by the build of master that removes the .d files without removing the .o files. So when you build the PR, cpu_loop.o file remains and is not rebuild with the new values in cpu.h. We can't do anything to avoid that now, except removing all .o files under *-linux-user... Sorry for the mess... Thanks, Laurent
Re: [PATCH RFC 9/9] KVM: Dirty ring support
On Thu, Mar 26, 2020 at 02:14:36PM +, Dr. David Alan Gilbert wrote: > * Peter Xu (pet...@redhat.com) wrote: > > On Wed, Mar 25, 2020 at 08:41:44PM +, Dr. David Alan Gilbert wrote: > > > > [...] > > > > > > +enum KVMReaperState { > > > > +KVM_REAPER_NONE = 0, > > > > +/* The reaper is sleeping */ > > > > +KVM_REAPER_WAIT, > > > > +/* The reaper is reaping for dirty pages */ > > > > +KVM_REAPER_REAPING, > > > > +}; > > > > > > That probably needs to be KVMDirtyRingReaperState > > > given there are many things that could be reaped. > > > > Sure. > > > > > > > > > +/* > > > > + * KVM reaper instance, responsible for collecting the KVM dirty bits > > > > + * via the dirty ring. > > > > + */ > > > > +struct KVMDirtyRingReaper { > > > > +/* The reaper thread */ > > > > +QemuThread reaper_thr; > > > > +/* > > > > + * Telling the reaper thread to wakeup. This should be used as a > > > > + * generic interface to kick the reaper thread, like, in vcpu > > > > + * threads where it gets a exit due to ring full. > > > > + */ > > > > +EventNotifier reaper_event; > > > > > > I think I'd just used a simple semaphore for this type of thing. > > > > I'm actually uncertain on which is cheaper... > > > > At the meantime, I wanted to poll two handles at the same time below > > (in kvm_dirty_ring_reaper_thread). I don't know how to do that with > > semaphore. Could it? > > If you're OK with EventNotifier stick with it; it's just I'm used > to doing with it with a semaphore; e.g. a flag then the semaphore - but > that's fine. Ah yes flags could work, though we probably need to be careful with flags and use atomic accesses to avoid race conditions of flag lost. Then I'll keep it, thanks. > > > [...] 
> > > > > > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu) > > > > (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; > > > > } > > > > > > > > +if (s->kvm_dirty_gfn_count) { > > > > +cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size, > > > > + PROT_READ | PROT_WRITE, MAP_SHARED, > > > > > > Is the MAP_SHARED required? > > > > Yes it's required. It's the same when we map the per-vcpu kvm_run. > > > > If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace > > writes to the dirty gfns the 1st time, it'll copy the current dirty > > ring page in the kernel and from now on QEMU will never be able to see > > what the kernel writes to the dirty gfn pages. MAP_SHARED means the > > userspace and the kernel shares exactly the same page(s). > > OK, worth a comment. Sure. -- Peter Xu
Re: [PATCH v9 2/9] memory: Add interface to set iommu page size mask
Hi Bharat, On 3/23/20 9:46 AM, Bharat Bhushan wrote: > Allow to set page size mask to be supported by iommu. by iommu memory region. I mean this is not global to the IOMMU. > This is required to expose page size mask compatible with > host with virtio-iommu. > > Signed-off-by: Bharat Bhushan > --- > include/exec/memory.h | 20 > memory.c | 10 ++ > 2 files changed, 30 insertions(+) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index e85b7de99a..063c424854 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -355,6 +355,16 @@ typedef struct IOMMUMemoryRegionClass { > * @iommu: the IOMMUMemoryRegion > */ > int (*num_indexes)(IOMMUMemoryRegion *iommu); > + > +/* > + * Set supported IOMMU page size > + * > + * Optional method: if this is supported then set page size that > + * can be supported by IOMMU. This is called to set supported page > + * size as per host Linux. What about: If supported, allows to restrict the page size mask that can be supported with a given IOMMU memory region. For example, this allows to propagate host physical IOMMU page size mask limitations to the virtual IOMMU (vfio assignment with virtual iommu). > + */ > + void (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu, > + uint64_t page_size_mask); > } IOMMUMemoryRegionClass; > > typedef struct CoalescedMemoryRange CoalescedMemoryRange; > @@ -1363,6 +1373,16 @@ int > memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, > */ > int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); > > +/** > + * memory_region_iommu_set_page_size_mask: set the supported pages > + * size by iommu. 
supported page sizes for a given IOMMU memory region > + * > + * @iommu_mr: the memory region IOMMU memory region > + * @page_size_mask: supported page size mask > + */ > +void memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, > +uint64_t page_size_mask); > + > /** > * memory_region_name: get a memory region's name > * > diff --git a/memory.c b/memory.c > index aeaa8dcc9e..14c8783084 100644 > --- a/memory.c > +++ b/memory.c > @@ -1833,6 +1833,16 @@ static int > memory_region_update_iommu_notify_flags(IOMMUMemoryRegion *iommu_mr, > return ret; > } > > +void memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, > +uint64_t page_size_mask) > +{ > +IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > + > +if (imrc->iommu_set_page_size_mask) { > +imrc->iommu_set_page_size_mask(iommu_mr, page_size_mask); Shouldn't it return an int in case the setting cannot be applied? > +} > +} > + > int memory_region_register_iommu_notifier(MemoryRegion *mr, >IOMMUNotifier *n, Error **errp) > { > Thanks Eric
Re: [PATCH RFC 8/9] KVM: Add dirty-ring-size property
On Thu, Mar 26, 2020 at 01:41:44PM +, Dr. David Alan Gilbert wrote: > * Peter Xu (pet...@redhat.com) wrote: > > On Wed, Mar 25, 2020 at 08:00:31PM +, Dr. David Alan Gilbert wrote: > > > > @@ -2077,6 +2079,33 @@ static int kvm_init(MachineState *ms) > > > > s->memory_listener.listener.coalesced_io_add = > > > > kvm_coalesce_mmio_region; > > > > s->memory_listener.listener.coalesced_io_del = > > > > kvm_uncoalesce_mmio_region; > > > > > > > > +/* > > > > + * Enable KVM dirty ring if supported, otherwise fall back to > > > > + * dirty logging mode > > > > + */ > > > > +if (s->kvm_dirty_ring_size > 0) { > > > > +/* Read the max supported pages */ > > > > +ret = kvm_vm_check_extension(kvm_state, > > > > KVM_CAP_DIRTY_LOG_RING); > > > > +if (ret > 0) { > > > > +if (s->kvm_dirty_ring_size > ret) { > > > > +error_report("KVM dirty ring size %d too big (maximum > > > > is %d). " > > > > + "Please use a smaller value.", > > > > + s->kvm_dirty_ring_size, ret); > > > > +goto err; > > > > +} > > > > + > > > > +ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, > > > > +s->kvm_dirty_ring_size); > > > > +if (ret) { > > > > +error_report("Enabling of KVM dirty ring failed: %d", > > > > ret); > > > > +goto err; > > > > +} > > > > + > > > > +s->kvm_dirty_gfn_count = > > > > +s->kvm_dirty_ring_size / sizeof(struct kvm_dirty_gfn); > > > > > > What happens if I was to pass dirty-ring-size=1 ? > > > Then the count would be 0 and things would get upset somewhere? > > > Do you need to check for a minimum postive value? > > > > The above kvm_vm_enable_cap() should fail directly and QEMU will stop. > > Yes we should check it, but kernel will do that in all cases, so I > > just didn't do that explicitly again in the userspace. > > We probably should have that check since you can give them a more > obvious error message. Yes we can. Or I can enhance the error message when we failed with kvm_vm_enable_cap() so the user will get the important hints. 
I think maybe that's more important than the explicit check itself. > > > I was planning this to be an advanced feature so the user of this > > parameter should know the rules to pass values in. > > Advanced users just make advanced mistakes :-) > > I did wonder if perhaps this option should be a count of entries rather > than a byte size. Sometimes it's easier to know "how many bytes we used", while instead sometimes we want to know "how many dirty addresses we can track". But sure I can switch, considering the users might be more interested in the latter. :) Thanks, -- Peter Xu
[PATCH v2 3/3] replication: acquire aio context before calling job_cancel_sync
job_cancel_sync requires the job's lock to be held, all other callers already do this (replication_stop, drive_backup_abort, blockdev_backup_abort, job_cancel_sync_all, cancel_common). Signed-off-by: Stefan Reiter --- block/replication.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/replication.c b/block/replication.c index 413d95407d..6c809cda4e 100644 --- a/block/replication.c +++ b/block/replication.c @@ -144,12 +144,16 @@ fail: static void replication_close(BlockDriverState *bs) { BDRVReplicationState *s = bs->opaque; +Job *commit_job = >commit_job->job; +AioContext *commit_ctx = commit_job->aio_context; if (s->stage == BLOCK_REPLICATION_RUNNING) { replication_stop(s->rs, false, NULL); } if (s->stage == BLOCK_REPLICATION_FAILOVER) { -job_cancel_sync(>commit_job->job); +aio_context_acquire(commit_ctx); +job_cancel_sync(commit_job); +aio_context_release(commit_ctx); } if (s->mode == REPLICATION_MODE_SECONDARY) { -- 2.26.0
[PATCH v2 2/3] job: take each job's lock individually in job_txn_apply
All callers of job_txn_apply hold a single job's lock, but different jobs within a transaction can have different contexts, thus we need to lock each one individually before applying the callback function. Similar to job_completed_txn_abort this also requires releasing the caller's context before and reacquiring it after to avoid recursive locks which might break AIO_WAIT_WHILE in the callback. Signed-off-by: Stefan Reiter --- job.c | 32 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/job.c b/job.c index 134a07b92e..e0966162fa 100644 --- a/job.c +++ b/job.c @@ -136,17 +136,33 @@ static void job_txn_del_job(Job *job) } } -static int job_txn_apply(JobTxn *txn, int fn(Job *)) +static int job_txn_apply(Job *job, int fn(Job *)) { -Job *job, *next; +AioContext *outer_ctx = job->aio_context; +AioContext *inner_ctx; +Job *other_job, *next; +JobTxn *txn = job->txn; int rc = 0; -QLIST_FOREACH_SAFE(job, >jobs, txn_list, next) { -rc = fn(job); +/* + * Similar to job_completed_txn_abort, we take each job's lock before + * applying fn, but since we assume that outer_ctx is held by the caller, + * we need to release it here to avoid holding the lock twice - which would + * break AIO_WAIT_WHILE from within fn. 
+ */ +aio_context_release(outer_ctx); + +QLIST_FOREACH_SAFE(other_job, >jobs, txn_list, next) { +inner_ctx = other_job->aio_context; +aio_context_acquire(inner_ctx); +rc = fn(other_job); +aio_context_release(inner_ctx); if (rc) { break; } } + +aio_context_acquire(outer_ctx); return rc; } @@ -774,11 +790,11 @@ static void job_do_finalize(Job *job) assert(job && job->txn); /* prepare the transaction to complete */ -rc = job_txn_apply(job->txn, job_prepare); +rc = job_txn_apply(job, job_prepare); if (rc) { job_completed_txn_abort(job); } else { -job_txn_apply(job->txn, job_finalize_single); +job_txn_apply(job, job_finalize_single); } } @@ -824,10 +840,10 @@ static void job_completed_txn_success(Job *job) assert(other_job->ret == 0); } -job_txn_apply(txn, job_transition_to_pending); +job_txn_apply(job, job_transition_to_pending); /* If no jobs need manual finalization, automatically do so */ -if (job_txn_apply(txn, job_needs_finalize) == 0) { +if (job_txn_apply(job, job_needs_finalize) == 0) { job_do_finalize(job); } } -- 2.26.0
[PATCH v2 1/3] backup: don't acquire aio_context in backup_clean
All code-paths leading to backup_clean (via job_clean) have the job's context already acquired. The job's context is guaranteed to be the same as the one used by backup_top via backup_job_create. Since the previous logic effectively acquired the lock twice, this broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock once, thus deadlocking with the IO thread. This is a partial revert of 0abf2581717a19. Signed-off-by: Stefan Reiter --- block/backup.c | 4 1 file changed, 4 deletions(-) diff --git a/block/backup.c b/block/backup.c index 7430ca5883..a7a7dcaf4c 100644 --- a/block/backup.c +++ b/block/backup.c @@ -126,11 +126,7 @@ static void backup_abort(Job *job) static void backup_clean(Job *job) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); -AioContext *aio_context = bdrv_get_aio_context(s->backup_top); - -aio_context_acquire(aio_context); bdrv_backup_top_drop(s->backup_top); -aio_context_release(aio_context); } void backup_do_checkpoint(BlockJob *job, Error **errp) -- 2.26.0
[PATCH v2 0/3] Fix some AIO context locking in jobs
Contains three separate but related patches cleaning up and fixing some issues regarding aio_context_acquire/aio_context_release for jobs. Mostly affects blockjobs running for devices that have IO threads enabled AFAICT. This is based on the discussions here: https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07929.html I *think* the second patch also fixes the hangs on backup abort that I and Dietmar noticed in v1, but I'm not sure, they were somewhat intermittent before too. Changes from v1: * fixed commit message for patch 1 * added patches 2 and 3 Stefan Reiter (3): backup: don't acquire aio_context in backup_clean job: take each job's lock individually in job_txn_apply replication: acquire aio context before calling job_cancel_sync block/backup.c | 4 block/replication.c | 6 +- job.c | 32 3 files changed, 29 insertions(+), 13 deletions(-) -- 2.26.0
Re: [PATCH v1 12/22] intel_iommu: add PASID cache management infrastructure
On Thu, Mar 26, 2020 at 01:57:10PM +, Liu, Yi L wrote: > > From: Liu, Yi L > > Sent: Thursday, March 26, 2020 2:15 PM > > To: 'Peter Xu' > > Subject: RE: [PATCH v1 12/22] intel_iommu: add PASID cache management > > infrastructure > > > > > From: Peter Xu > > > Sent: Wednesday, March 25, 2020 10:52 PM > > > To: Liu, Yi L > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache management > > > infrastructure > > > > > > On Wed, Mar 25, 2020 at 12:20:21PM +, Liu, Yi L wrote: > > > > > From: Peter Xu > > > > > Sent: Wednesday, March 25, 2020 1:32 AM > > > > > To: Liu, Yi L > > > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache > > > > > management infrastructure > > > > > > > > > > On Sun, Mar 22, 2020 at 05:36:09AM -0700, Liu Yi L wrote: > > > > > > This patch adds a PASID cache management infrastructure based on > > > > > > new added structure VTDPASIDAddressSpace, which is used to track > > > > > > the PASID usage and future PASID tagged DMA address translation > > > > > > support in vIOMMU. > [...] > > > > > > > > > > > > > > > > > > > > > +/* > > > > > > + * Step 2: loop all the exisitng vtd_dev_icx instances. > > > > > > + * Ideally, needs to loop all devices to find if there is any > > > > > > new > > > > > > + * PASID binding regards to the PASID cache invalidation > > > > > > request. > > > > > > + * But it is enough to loop the devices which are backed by > > > > > > host > > > > > > + * IOMMU. For devices backed by vIOMMU (a.k.a emulated > > > > > > devices), > > > > > > + * if new PASID happened on them, their vtd_pasid_as instance > > > > > > could > > > > > > + * be created during future vIOMMU DMA translation. 
> > > > > > + */ > > > > > > +QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) { > > > > > > +VTDPASIDAddressSpace *vtd_pasid_as; > > > > > > +VTDPASIDCacheEntry *pc_entry; > > > > > > +VTDPASIDEntry pe; > > > > > > +VTDBus *vtd_bus = vtd_dev_icx->vtd_bus; > > > > > > +uint16_t devfn = vtd_dev_icx->devfn; > > > > > > +int bus_n = pci_bus_num(vtd_bus->bus); > > > > > > + > > > > > > +/* i) fetch vtd_pasid_as and check if it is valid */ > > > > > > +vtd_pasid_as = vtd_add_find_pasid_as(s, vtd_bus, > > > > > > + devfn, pasid); > > > > > > > > > > I don't feel like it's correct here... > > > > > > > > > > Assuming we have two devices assigned D1, D2. D1 uses PASID=1, D2 > > > > > uses > > > PASID=2. > > > > > When invalidating against PASID=1, are you also going to create a > > > > > VTDPASIDAddressSpace also for D2 with PASID=1? > > > > > > > > Answer is no. Before going forward, let's see if the below flow looks > > > > good to you. > > > > > > > > Let me add one more device besides D1 and D2. Say device D3 which > > > > also uses PASID=1. And assume it begins with no PASID usage in guest. > > > > > > > > Then the flow from scratch is: > > > > > > > > a) guest IOMMU driver setup PASID entry for D1 with PASID=1, > > > >then invalidates against PASID #1 > > > > b) trap to QEMU, and comes to this function. Since there is > > > >no previous pasid cache invalidation, so the Step 1 of this > > > >function has nothing to do, then goes to Step 2 which is to > > > >loop all assigned devices and check if the guest pasid entry > > > >is present. In this loop, should find D1's pasid entry for > > > >PASID#1 is present. So create the VTDPASIDAddressSpace and > > > >initialize its pasid_cache_entry and pasid_cache_gen fields. 
> > > > c) guest IOMMU driver setup PASID entry for D2 with PASID=2, > > > >then invalidates against PASID #2 > > > > d) same with b), only difference is the Step 1 of this function > > > >will loop the VTDPASIDAddressSpace created in b), but its > > > >pasid is 1 which is not the target of current pasid cache > > > >invalidation. Same with b), in Step 2, it will create a > > > >VTDPASIDAddressSpace for D2+PASID#2 > > > > e) guest IOMMU driver setup PASID entry for D3 with PASID=1, > > > >then invalidates against PASID #1 > > > > f) trap to QEMU and comes to this function. Step 1 loops two > > > >VTDPASIDAddressSpace created in b) and d), and it finds there > > > >is a VTDPASIDAddressSpace whose pasid is 1. vtd_flush_pasid() > > > >will check if the cached pasid entry is the same with the one > > > >in guest memory. In this flow, it should be the same, so > > > >vtd_flush_pasid() will do nothing for it. Then comes to Step 2, > > > >it loops D1, D2, D3. > > > >- For D1, no need to do more since there is already a > > > > VTDPASIDAddressSpace created for D1+PASID#1. > > > >- For D2, its guest pasid entry for PASID#1 is not present, so > > > > no need to do anything for it. > > > >- For D3, its guest pasid entry for PASID#1 is present and it > > > > is exactly the reason for this
Re: [PATCH v2 0/2] mirror: Fix hang (operation waiting for itself/circular dependency)
On 3/26/20 10:36 AM, Kevin Wolf wrote: The recent fix didn't actually fix the whole problem. Operations can not only wait for themselves, but we can also end up with circular dependencies like two operations waiting for each other to complete. This reverts the first fix and implements another approach. v2: - Mark active mirror operations as in-flight, too Kevin Wolf (2): Revert "mirror: Don't let an operation wait for itself" mirror: Wait only for in-flight operations Series: Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v9 4/9] virtio-iommu: set supported page size mask
Hi Bharat, On 3/23/20 9:46 AM, Bharat Bhushan wrote: > Add optional interface to set page size mask. > Currently this is set global configuration and not > per endpoint. This allows to override the page size mask per end-point? > > Signed-off-by: Bharat Bhushan > --- > include/hw/virtio/virtio-iommu.h | 1 + > hw/virtio/virtio-iommu.c | 9 + > 2 files changed, 10 insertions(+) > > diff --git a/include/hw/virtio/virtio-iommu.h > b/include/hw/virtio/virtio-iommu.h > index 6f67f1020a..4efa09610a 100644 > --- a/include/hw/virtio/virtio-iommu.h > +++ b/include/hw/virtio/virtio-iommu.h > @@ -35,6 +35,7 @@ typedef struct IOMMUDevice { > void *viommu; > PCIBus *bus; > int devfn; > +uint64_t page_size_mask; > IOMMUMemoryRegion iommu_mr; > AddressSpace as; > } IOMMUDevice; > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 4cee8083bc..a28818202c 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -650,6 +650,14 @@ static gint int_cmp(gconstpointer a, gconstpointer b, > gpointer user_data) > return (ua > ub) - (ua < ub); > } > > +static void virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, > +uint64_t page_size_mask) > +{ > +IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); > + > +sdev->page_size_mask = page_size_mask; > +} > + > static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) > { > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > @@ -865,6 +873,7 @@ static void > virtio_iommu_memory_region_class_init(ObjectClass *klass, > IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); > > imrc->translate = virtio_iommu_translate; > +imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; > } > > static const TypeInfo virtio_iommu_info = { > Thanks Eric
Re: [PATCH v9 8/9] virtio-iommu: Implement probe request
Hi Bharat On 3/23/20 9:46 AM, Bharat Bhushan wrote: > This patch implements the PROBE request. Currently supported > page size mask per endpoint is returned. Also append a NONE > property in the end. > > Signed-off-by: Bharat Bhushan > Signed-off-by: Eric Auger > --- > include/standard-headers/linux/virtio_iommu.h | 6 + Changes to virtio_iommu.h should be in a separate patch you should use ./scripts/update-linux-headers.sh See for instance: ddda37483d linux-headers: update until the uapi updates are not upstream you can link to your kernel branch and mention this is a temporary linux header update or partial if you just want to pick up the iommu.h changes. > hw/virtio/virtio-iommu.c | 161 +- > hw/virtio/trace-events| 2 + > 3 files changed, 166 insertions(+), 3 deletions(-) > > diff --git a/include/standard-headers/linux/virtio_iommu.h > b/include/standard-headers/linux/virtio_iommu.h > index b9443b83a1..8a0d47b907 100644 > --- a/include/standard-headers/linux/virtio_iommu.h > +++ b/include/standard-headers/linux/virtio_iommu.h > @@ -111,6 +111,7 @@ struct virtio_iommu_req_unmap { > > #define VIRTIO_IOMMU_PROBE_T_NONE0 > #define VIRTIO_IOMMU_PROBE_T_RESV_MEM1 > +#define VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK 2 > > #define VIRTIO_IOMMU_PROBE_T_MASK0xfff > > @@ -130,6 +131,11 @@ struct virtio_iommu_probe_resv_mem { > uint64_tend; > }; > > +struct virtio_iommu_probe_pgsize_mask { > + struct virtio_iommu_probe_property head; > + uint64_tpgsize_bitmap; > +}; > + > struct virtio_iommu_req_probe { > struct virtio_iommu_req_headhead; > uint32_tendpoint; > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 747e3cf1da..63fbacdcdc 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -38,6 +38,10 @@ > > /* Max size */ > #define VIOMMU_DEFAULT_QUEUE_SIZE 256 > +#define VIOMMU_PROBE_SIZE 512 > + > +#define SUPPORTED_PROBE_PROPERTIES (\ > +1 << VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK) > > typedef struct VirtIOIOMMUDomain { > uint32_t id; > 
@@ -62,6 +66,13 @@ typedef struct VirtIOIOMMUMapping { > uint32_t flags; > } VirtIOIOMMUMapping; > > +typedef struct VirtIOIOMMUPropBuffer { > +VirtIOIOMMUEndpoint *endpoint; > +size_t filled; > +uint8_t *start; > +bool error; > +} VirtIOIOMMUPropBuffer; > + > static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) > { > return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); > @@ -490,6 +501,114 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, > return ret; > } > > +static int virtio_iommu_fill_none_prop(VirtIOIOMMUPropBuffer *bufstate) > +{ > +struct virtio_iommu_probe_property *prop; > + > +prop = (struct virtio_iommu_probe_property *) > +(bufstate->start + bufstate->filled); > +prop->type = 0; > +prop->length = 0; > +bufstate->filled += sizeof(*prop); > +trace_virtio_iommu_fill_none_property(bufstate->endpoint->id); > +return 0; > +} > + > +static int virtio_iommu_fill_page_size_mask(VirtIOIOMMUPropBuffer *bufstate) > +{ > +struct virtio_iommu_probe_pgsize_mask *page_size_mask; > +size_t prop_size = sizeof(*page_size_mask); > +VirtIOIOMMUEndpoint *ep = bufstate->endpoint; > +VirtIOIOMMU *s = ep->viommu; > +IOMMUDevice *sdev; > + > +if (bufstate->filled + prop_size >= VIOMMU_PROBE_SIZE) { > +bufstate->error = true; > +/* get the traversal stopped by returning true */ > +return true; > +} > + > +page_size_mask = (struct virtio_iommu_probe_pgsize_mask *) > + (bufstate->start + bufstate->filled); > + > +page_size_mask->head.type = VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK; > +page_size_mask->head.length = prop_size; > +QLIST_FOREACH(sdev, >notifiers_list, next) { > +if (ep->id == sdev->devfn) { > +page_size_mask->pgsize_bitmap = sdev->page_size_mask; > + } > +} > +bufstate->filled += sizeof(*page_size_mask); > +trace_virtio_iommu_fill_pgsize_mask_property(bufstate->endpoint->id, > + > page_size_mask->pgsize_bitmap, > + bufstate->filled); > +return false; > +} > + > +/* Fill the properties[] buffer with properties of type @type */ > +static int 
virtio_iommu_fill_property(int type, > + VirtIOIOMMUPropBuffer *bufstate) > +{ > +int ret = -ENOSPC; > + > +if (bufstate->filled + sizeof(struct virtio_iommu_probe_property) > +>= VIOMMU_PROBE_SIZE) { > +/* no space left for the header */ > +bufstate->error = true; > +goto out; > +} > + > +switch (type) { > +case
Re: [PULL 0/6] Linux user for 5.0 patches
On Thu, 26 Mar 2020 at 07:24, Laurent Vivier wrote: > > The following changes since commit 736cf607e40674776d752acc201f565723e86045: > > Update version for v5.0.0-rc0 release (2020-03-24 17:50:00 +) > > are available in the Git repository at: > > git://github.com/vivier/qemu.git tags/linux-user-for-5.0-pull-request > > for you to fetch changes up to a52f5f87bece827a338d6eb3332e3def86fb9c33: > > linux-user: Flush out implementation of gettimeofday (2020-03-26 08:08:54 > +0100) > > > Emulate x86_64 vsyscalls > Fix syscall_nr.h cleanup > > Still fails :-( /home/petmay01/linaro/qemu-for-merges/build/all-linux-static/x86_64-linux-user/qemu-x86_64 -L ./gnemul/qemu-x86_64 x86_64/ls -l dummyfile qemu: 0x40008117e9: unhandled CPU exception 0x101 - aborting RAX=003f RBX=6e34 RCX=004000800b18 RDX=004000813180 RSI=0064 RDI=004000800670 RBP=6f40 RSP=004000800668 R8 = R9 =004000800b44 R10=004000801a18 R11=004000801260 R12=0040008008c0 R13=0008 R14=00400040 R15=0040008032d0 RIP=0040008117e9 RFL=0246 [---Z-P-] CPL=3 II=0 A20=1 SMM=0 HLT=0 ES = CS =0033 00effb00 DPL=3 CS64 [-RA] SS =002b 00cff300 DPL=3 DS [-WA] DS = FS = GS = LDT= 8200 DPL=0 LDT TR = 8b00 DPL=0 TSS64-busy GDT= 00400091a000 007f IDT= 004000919000 01ff CR0=80010001 CR2= CR3= CR4=0220 DR0= DR1= DR2= DR3= DR6=0ff0 DR7=0400 EFER=0500 Makefile:6: recipe for target 'test' failed thanks -- PMM
[PATCH v2 2/2] mirror: Wait only for in-flight operations
mirror_wait_for_free_in_flight_slot() just picks a random operation to wait for. However, a MirrorOp is already in s->ops_in_flight when mirror_co_read() waits for free slots, so if not enough slots are immediately available, an operation can end up waiting for itself, or two or more operations can wait for each other to complete, which results in a hang. Fix this by adding a flag to MirrorOp that tells us if the request is already in flight (and therefore occupies slots that it will later free), and picking only such operations for waiting. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 Signed-off-by: Kevin Wolf --- block/mirror.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index 393131b135..88414d1653 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -102,6 +102,7 @@ struct MirrorOp { bool is_pseudo_op; bool is_active_write; +bool is_in_flight; CoQueue waiting_requests; Coroutine *co; @@ -293,7 +294,9 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) * caller of this function. Since there is only one pseudo op * at any given time, we will always find some real operation * to wait on. */ -if (!op->is_pseudo_op && op->is_active_write == active) { +if (!op->is_pseudo_op && op->is_in_flight && +op->is_active_write == active) +{ qemu_co_queue_wait(>waiting_requests, NULL); return; } @@ -367,6 +370,7 @@ static void coroutine_fn mirror_co_read(void *opaque) /* Copy the dirty cluster. */ s->in_flight++; s->bytes_in_flight += op->bytes; +op->is_in_flight = true; trace_mirror_one_iteration(s, op->offset, op->bytes); ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes, @@ -382,6 +386,7 @@ static void coroutine_fn mirror_co_zero(void *opaque) op->s->in_flight++; op->s->bytes_in_flight += op->bytes; *op->bytes_handled = op->bytes; +op->is_in_flight = true; ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, op->s->unmap ? 
BDRV_REQ_MAY_UNMAP : 0); @@ -396,6 +401,7 @@ static void coroutine_fn mirror_co_discard(void *opaque) op->s->in_flight++; op->s->bytes_in_flight += op->bytes; *op->bytes_handled = op->bytes; +op->is_in_flight = true; ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes); mirror_write_complete(op, ret); @@ -1318,6 +1324,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, .offset = offset, .bytes = bytes, .is_active_write= true, +.is_in_flight = true, }; qemu_co_queue_init(>waiting_requests); QTAILQ_INSERT_TAIL(>ops_in_flight, op, next); -- 2.20.1
[PATCH v2 1/2] Revert "mirror: Don't let an operation wait for itself"
This reverts commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca. The fix was incomplete as it only protected against requests waiting for themselves, but not against requests waiting for each other. We need a different solution. Signed-off-by: Kevin Wolf --- block/mirror.c | 21 + 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 447051dbc6..393131b135 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -283,14 +283,11 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, } static inline void coroutine_fn -mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) +mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) { MirrorOp *op; QTAILQ_FOREACH(op, >ops_in_flight, next) { -if (self == op) { -continue; -} /* Do not wait on pseudo ops, because it may in turn wait on * some other operation to start, which may in fact be the * caller of this function. Since there is only one pseudo op @@ -305,10 +302,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) } static inline void coroutine_fn -mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) { /* Only non-active operations use up in-flight slots */ -mirror_wait_for_any_operation(s, self, false); +mirror_wait_for_any_operation(s, false); } /* Perform a mirror copy operation. 
@@ -351,7 +348,7 @@ static void coroutine_fn mirror_co_read(void *opaque) while (s->buf_free_count < nb_chunks) { trace_mirror_yield_in_flight(s, op->offset, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, op); +mirror_wait_for_free_in_flight_slot(s); } /* Now make a QEMUIOVector taking enough granularity-sized chunks @@ -558,7 +555,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) while (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield_in_flight(s, offset, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, pseudo_op); +mirror_wait_for_free_in_flight_slot(s); } if (s->ret < 0) { @@ -612,7 +609,7 @@ static void mirror_free_init(MirrorBlockJob *s) static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) { while (s->in_flight > 0) { -mirror_wait_for_free_in_flight_slot(s, NULL); +mirror_wait_for_free_in_flight_slot(s); } } @@ -809,7 +806,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) if (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, NULL); +mirror_wait_for_free_in_flight_slot(s); continue; } @@ -962,7 +959,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) /* Do not start passive operations while there are active * writes in progress */ while (s->in_active_write_counter) { -mirror_wait_for_any_operation(s, NULL, true); +mirror_wait_for_any_operation(s, true); } if (s->ret < 0) { @@ -988,7 +985,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, NULL); +mirror_wait_for_free_in_flight_slot(s); continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); -- 2.20.1
[PATCH v2 0/2] mirror: Fix hang (operation waiting for itself/circular dependency)
The recent fix didn't actually fix the whole problem. Operations can not only wait for themselves, but we can also end up with circular dependencies like two operations waiting for each other to complete. This reverts the first fix and implements another approach. v2: - Mark active mirror operations as in-flight, too Kevin Wolf (2): Revert "mirror: Don't let an operation wait for itself" mirror: Wait only for in-flight operations block/mirror.c | 30 +- 1 file changed, 17 insertions(+), 13 deletions(-) -- 2.20.1
[PATCH] gdbstub: fix compiler complaining
./gdbstub.c: In function ‘handle_query_thread_extra’: /usr/include/glib-2.0/glib/glib-autocleanups.h:28:10: error: ‘cpu_name’ may be used uninitialized in this function [-Werror=maybe-uninitialized] g_free (*pp); ^ ./gdbstub.c:2063:26: note: ‘cpu_name’ was declared here g_autofree char *cpu_name; ^ cc1: all warnings being treated as errors Signed-off-by: Denis Plotnikov --- gdbstub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gdbstub.c b/gdbstub.c index 013fb1ac0f..171e150950 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2060,8 +2060,8 @@ static void handle_query_thread_extra(GdbCmdContext *gdb_ctx, void *user_ctx) /* Print the CPU model and name in multiprocess mode */ ObjectClass *oc = object_get_class(OBJECT(cpu)); const char *cpu_model = object_class_get_name(oc); -g_autofree char *cpu_name; -cpu_name = object_get_canonical_path_component(OBJECT(cpu)); +g_autofree char *cpu_name = +object_get_canonical_path_component(OBJECT(cpu)); g_string_printf(rs, "%s %s [%s]", cpu_model, cpu_name, cpu->halted ? "halted " : "running"); } else { -- 2.17.0
[Bug 1869006] Re: PCIe cards passthrough to TCG guest works on 2GB of guest memory but fails on 4GB (vfio_dma_map invalid arg)
I wanted to try the same on machine without MSI. But my desktop refuses to boot into sane environment with pci=nomsi kernel option. -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1869006 Title: PCIe cards passthrough to TCG guest works on 2GB of guest memory but fails on 4GB (vfio_dma_map invalid arg) Status in QEMU: New Bug description: During one meeting coworker asked "did someone tried to passthrough PCIe card to other arch guest?" and I decided to check it. Plugged SATA and USB3 controllers into spare slots on mainboard and started playing. On 1GB VM instance it worked (both cold- and hot- plugged). On 4GB one it did not: Błąd podczas uruchamiania domeny: internal error: process exited while connecting to monitor: 2020-03-25T13:43:39.107524Z qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: VFIO_MAP_DMA: -22 2020-03-25T13:43:39.107560Z qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: vfio :29:00.0: failed to setup container for group 28: memory listener initialization failed: Region mach-virt.ram: vfio_dma_map(0x563169753c80, 0x4000, 0x1, 0x7fb2a3e0) = -22 (Invalid argument) Traceback (most recent call last): File "/usr/share/virt-manager/virtManager/asyncjob.py", line 75, in cb_wrapper callback(asyncjob, *args, **kwargs) File "/usr/share/virt-manager/virtManager/asyncjob.py", line 111, in tmpcb callback(*args, **kwargs) File "/usr/share/virt-manager/virtManager/object/libvirtobject.py", line 66, in newfn ret = fn(self, *args, **kwargs) File "/usr/share/virt-manager/virtManager/object/domain.py", line 1279, in startup self._backend.create() File "/usr/lib64/python3.8/site-packages/libvirt.py", line 1234, in create if ret == -1: raise libvirtError ('virDomainCreate() failed', dom=self) libvirt.libvirtError: internal error: process exited while connecting to monitor: 2020-03-25T13:43:39.107524Z 
qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: VFIO_MAP_DMA: -22 2020-03-25T13:43:39.107560Z qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: vfio :29:00.0: failed to setup container for group 28: memory listener initialization failed: Region mach-virt.ram: vfio_dma_map(0x563169753c80, 0x4000, 0x1, 0x7fb2a3e0) = -22 (Invalid argument) I played with memory and 3054 MB is maximum value possible to boot VM with coldplugged host PCIe cards. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1869006/+subscriptions
[PULL 10/10] iotests/138: Test leaks/corruptions fixed report
Test that qemu-img check reports the number of leaks and corruptions fixed in its JSON report (after a successful run). While touching the _unsupported_imgopts line, adjust the note on why data_file does not work with this test: The current comment sounds a bit like it is a mistake for qemu-img check not to check external data files' refcounts. But there are no such refcounts, so it is no mistake. Just say that qemu-img check does not do much for external data files, and this is why this test does not work with them. Signed-off-by: Max Reitz Message-Id: <20200324172757.1173824-4-mre...@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- tests/qemu-iotests/138 | 41 -- tests/qemu-iotests/138.out | 14 + 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/138 b/tests/qemu-iotests/138 index 54b01046ad..1d5b0bed6d 100755 --- a/tests/qemu-iotests/138 +++ b/tests/qemu-iotests/138 @@ -41,8 +41,10 @@ _supported_fmt qcow2 _supported_proto file _supported_os Linux # With an external data file, data clusters are not refcounted -# (and so qemu-img check does not check their refcount) -_unsupported_imgopts data_file +# (so qemu-img check would not do much); +# we want to modify the refcounts, so we need them to have a specific +# format (namely u16) +_unsupported_imgopts data_file 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' echo echo '=== Check on an image with a multiple of 2^32 clusters ===' @@ -65,6 +67,41 @@ poke_file "$TEST_IMG" $((2048 + 8)) "\x00\x80\x00\x00\x00\x00\x00\x00" # allocate memory", we have an error showing that l2 entry is invalid. 
_check_test_img +echo +echo '=== Check leaks-fixed/corruptions-fixed report' +echo + +# After leaks and corruptions were fixed, those numbers should be +# reported by qemu-img check +_make_test_img 64k + +# Allocate data cluster +$QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io + +reftable_ofs=$(peek_file_be "$TEST_IMG" 48 8) +refblock_ofs=$(peek_file_be "$TEST_IMG" $reftable_ofs 8) + +# Introduce a leak: Make the image header's refcount 2 +poke_file_be "$TEST_IMG" "$refblock_ofs" 2 2 + +l1_ofs=$(peek_file_be "$TEST_IMG" 40 8) + +# Introduce a corruption: Drop the COPIED flag from the (first) L1 entry +l1_entry=$(peek_file_be "$TEST_IMG" $l1_ofs 8) +l1_entry=$((l1_entry & ~(1 << 63))) +poke_file_be "$TEST_IMG" $l1_ofs 8 $l1_entry + +echo +# Should print the number of corruptions and leaks fixed +# (Filter out all JSON fields (recognizable by their four-space +# indentation), but keep the "-fixed" fields (by removing two spaces +# from their indentation)) +# (Also filter out the L1 entry, because why not) +_check_test_img -r all --output=json \ +| sed -e 's/^ \(.*\)-fixed"/\1-fixed"/' \ + -e '/^/d' \ + -e "s/\\([^0-9a-f]\\)$(printf %x $l1_entry)\\([^0-9a-f]\\)/\1L1_ENTRY_VALUE\2/" + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/138.out b/tests/qemu-iotests/138.out index aca7d47a80..79681e7cc9 100644 --- a/tests/qemu-iotests/138.out +++ b/tests/qemu-iotests/138.out @@ -9,4 +9,18 @@ ERROR: counting reference for region exceeding the end of the file by one cluste 1 errors were found on the image. Data may be corrupted, or further writes to the image may corrupt it. 
+ +=== Check leaks-fixed/corruptions-fixed report + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Leaked cluster 0 refcount=2 reference=1 +Repairing cluster 0 refcount=2 reference=1 +Repairing OFLAG_COPIED L2 cluster: l1_index=0 l1_entry=L1_ENTRY_VALUE refcount=1 +{ + "corruptions-fixed": 1, + "leaks-fixed": 1, +} *** done -- 2.25.1
[PULL 08/10] qemu-img: Fix check's leak/corruption fix report
There are two problems with qemu-img check's report on how many leaks and/or corruptions have been fixed: (1) ImageCheck.has_leaks_fixed and ImageCheck.has_corruptions_fixed are only true when ImageCheck.leaks or ImageCheck.corruptions (respectively) are non-zero. qcow2's check implementation will set the latter to zero after it has fixed leaks and corruptions, though, so leaks-fixed and corruptions-fixed are actually never reported after successful repairs. We should always report them when they are non-zero, just like all the other fields of ImageCheck. (2) After something has been fixed and we run the check a second time, leaks_fixed and corruptions_fixed are taken from the first run; but has_leaks_fixed and has_corruptions_fixed are not. The second run actually cannot fix anything, so with (1) fixed, has_leaks_fixed and has_corruptions_fixed will always be false here. (With (1) unfixed, they will at least be false on successful runs, because then the number of leaks and corruptions found in the second run should be 0.) We should save has_leaks_fixed and has_corruptions_fixed just like we save leaks_fixed and corruptions_fixed. 
Signed-off-by: Max Reitz Reviewed-by: Eric Blake Message-Id: <20200324172757.1173824-2-mre...@redhat.com> Signed-off-by: Max Reitz --- qemu-img.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index afddf33f08..b167376bd7 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -647,9 +647,9 @@ static int collect_image_check(BlockDriverState *bs, check->leaks= result.leaks; check->has_leaks= result.leaks != 0; check->corruptions_fixed= result.corruptions_fixed; -check->has_corruptions_fixed= result.corruptions != 0; +check->has_corruptions_fixed= result.corruptions_fixed != 0; check->leaks_fixed = result.leaks_fixed; -check->has_leaks_fixed = result.leaks != 0; +check->has_leaks_fixed = result.leaks_fixed != 0; check->image_end_offset = result.image_end_offset; check->has_image_end_offset = result.image_end_offset != 0; check->total_clusters = result.bfi.total_clusters; @@ -803,9 +803,12 @@ static int img_check(int argc, char **argv) if (check->corruptions_fixed || check->leaks_fixed) { int corruptions_fixed, leaks_fixed; +bool has_leaks_fixed, has_corruptions_fixed; leaks_fixed = check->leaks_fixed; +has_leaks_fixed = check->has_leaks_fixed; corruptions_fixed = check->corruptions_fixed; +has_corruptions_fixed = check->has_corruptions_fixed; if (output_format == OFORMAT_HUMAN) { qprintf(quiet, @@ -822,7 +825,9 @@ static int img_check(int argc, char **argv) ret = collect_image_check(bs, check, filename, fmt, 0); check->leaks_fixed = leaks_fixed; +check->has_leaks_fixed = has_leaks_fixed; check->corruptions_fixed= corruptions_fixed; +check->has_corruptions_fixed = has_corruptions_fixed; } if (!ret) { -- 2.25.1
[PULL 06/10] qcow2: Avoid feature name extension on small cluster size
From: Eric Blake As the feature name table can be quite large (over 9k if all 64 bits of all three feature fields have names; a mere 8 features leaves only 8 bytes for a backing file name in a 512-byte cluster), it is unwise to emit this optional header in images with small cluster sizes. Update iotest 036 to skip running on small cluster sizes; meanwhile, note that iotest 061 never passed on alternative cluster sizes (however, I limited this patch to tests with output affected by adding feature names, rather than auditing for other tests that are not robust to alternative cluster sizes). Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Message-Id: <20200324174233.1622067-4-ebl...@redhat.com> Signed-off-by: Max Reitz --- block/qcow2.c | 11 +-- tests/qemu-iotests/036 | 6 -- tests/qemu-iotests/061 | 6 -- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index b74cbeb047..2bb536b014 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2823,8 +2823,15 @@ int qcow2_update_header(BlockDriverState *bs) buflen -= ret; } -/* Feature table */ -if (s->qcow_version >= 3) { +/* + * Feature table. A mere 8 feature names occupies 392 bytes, and + * when coupled with the v3 minimum header of 104 bytes plus the + * 8-byte end-of-extension marker, that would leave only 8 bytes + * for a backing file name in an image with 512-byte clusters. + * Thus, we choose to omit this header for cluster sizes 4k and + * smaller. 
+ */ +if (s->qcow_version >= 3 && s->cluster_size > 4096) { static const Qcow2Feature features[] = { { .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, diff --git a/tests/qemu-iotests/036 b/tests/qemu-iotests/036 index 512598421c..cf522de7a1 100755 --- a/tests/qemu-iotests/036 +++ b/tests/qemu-iotests/036 @@ -44,8 +44,10 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow2 _supported_proto file # Only qcow2v3 and later supports feature bits; -# qcow2.py does not support external data files -_unsupported_imgopts 'compat=0.10' data_file +# qcow2.py does not support external data files; +# this test requires a cluster size large enough for the feature table +_unsupported_imgopts 'compat=0.10' data_file \ +'cluster_size=\(512\|1024\|2048\|4096\)' echo echo === Image with unknown incompatible feature bit === diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index 36b040491f..ce285d3084 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -44,8 +44,10 @@ _supported_os Linux # Conversion between different compat versions can only really work # with refcount_bits=16; # we have explicit tests for data_file here, but the whole test does -# not work with it -_unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file +# not work with it; +# we have explicit tests for various cluster sizes, the remaining tests +# require the default 64k cluster +_unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file cluster_size echo echo "=== Testing version downgrade with zero expansion ===" -- 2.25.1
[PULL 09/10] iotests: Add poke_file_[bl]e functions
Similarly to peek_file_[bl]e, we may want to write binary integers into a file. Currently, this often means messing around with poke_file and raw binary strings. I hope these functions make it a bit more comfortable. Signed-off-by: Max Reitz Code-suggested-by: Eric Blake Message-Id: <20200324172757.1173824-3-mre...@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- tests/qemu-iotests/common.rc | 24 1 file changed, 24 insertions(+) diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 4c246c0450..bf3b9fdea0 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -53,6 +53,30 @@ poke_file() printf "$3" | dd "of=$1" bs=1 "seek=$2" conv=notrunc &>/dev/null } +# poke_file_le $img_filename $offset $byte_width $value +# Example: poke_file_le "$TEST_IMG" 512 2 65534 +poke_file_le() +{ +local img=$1 ofs=$2 len=$3 val=$4 str='' + +while ((len--)); do +str+=$(printf '\\x%02x' $((val & 0xff))) +val=$((val >> 8)) +done + +poke_file "$img" "$ofs" "$str" +} + +# poke_file_be $img_filename $offset $byte_width $value +# Example: poke_file_be "$TEST_IMG" 512 2 65279 +poke_file_be() +{ +local img=$1 ofs=$2 len=$3 val=$4 +local str=$(printf "%0$((len * 2))x\n" $val | sed 's/\(..\)/\\x\1/g') + +poke_file "$img" "$ofs" "$str" +} + # peek_file_le 'test.img' 512 2 => 65534 peek_file_le() { -- 2.25.1
[PULL 05/10] qcow2: List autoclear bit names in header
From: Eric Blake The feature table is supposed to advertise the name of all feature bits that we support; however, we forgot to update the table for autoclear bits. While at it, move the table to read-only memory in code, and tweak the qcow2 spec to name the second autoclear bit. Update iotests that are affected by the longer header length. Fixes: 88ddffae Fixes: 93c24936 Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200324174233.1622067-3-ebl...@redhat.com> Signed-off-by: Max Reitz --- docs/interop/qcow2.txt | 3 ++- block/qcow2.c | 12 +++- tests/qemu-iotests/031.out | 8 tests/qemu-iotests/036.out | 4 ++-- tests/qemu-iotests/061.out | 14 +++--- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index 5597e24474..640e0eca40 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -143,7 +143,8 @@ the next fields through header_length. bit is unset, the bitmaps extension data must be considered inconsistent. -Bit 1: If this bit is set, the external data file can +Bit 1: Raw external data bit +If this bit is set, the external data file can be read as a consistent standalone raw image without looking at the qcow2 metadata. 
diff --git a/block/qcow2.c b/block/qcow2.c index b565cf912e..b74cbeb047 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2825,7 +2825,7 @@ int qcow2_update_header(BlockDriverState *bs) /* Feature table */ if (s->qcow_version >= 3) { -Qcow2Feature features[] = { +static const Qcow2Feature features[] = { { .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, .bit = QCOW2_INCOMPAT_DIRTY_BITNR, @@ -2846,6 +2846,16 @@ int qcow2_update_header(BlockDriverState *bs) .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, .name = "lazy refcounts", }, +{ +.type = QCOW2_FEAT_TYPE_AUTOCLEAR, +.bit = QCOW2_AUTOCLEAR_BITMAPS_BITNR, +.name = "bitmaps", +}, +{ +.type = QCOW2_FEAT_TYPE_AUTOCLEAR, +.bit = QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR, +.name = "raw external data", +}, }; ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, diff --git a/tests/qemu-iotests/031.out b/tests/qemu-iotests/031.out index d535e407bc..46f97c5a4e 100644 --- a/tests/qemu-iotests/031.out +++ b/tests/qemu-iotests/031.out @@ -117,7 +117,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data Header extension: @@ -150,7 +150,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data Header extension: @@ -164,7 +164,7 @@ No errors were found on the image. 
magic 0x514649fb version 3 -backing_file_offset 0x178 +backing_file_offset 0x1d8 backing_file_size 0x17 cluster_bits 16 size 67108864 @@ -188,7 +188,7 @@ data 'host_device' Header extension: magic 0x6803f857 -length192 +length288 data Header extension: diff --git a/tests/qemu-iotests/036.out b/tests/qemu-iotests/036.out index 0b52b934e1..23b699ce06 100644 --- a/tests/qemu-iotests/036.out +++ b/tests/qemu-iotests/036.out @@ -26,7 +26,7 @@ compatible_features [] autoclear_features[63] Header extension: magic 0x6803f857 -length192 +length288 data @@ -38,7 +38,7 @@ compatible_features [] autoclear_features[] Header extension: magic 0x6803f857 -length192 +length288 data *** done diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index 8b3091a412..413cc4e0f4 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -26,7 +26,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data magic 0x514649fb @@ -84,7 +84,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data magic
[PULL 07/10] sheepdog: Consistently set bdrv_has_zero_init_truncate
From: Eric Blake block_int.h claims that .bdrv_has_zero_init must return 0 if .bdrv_has_zero_init_truncate does likewise; but this is violated if only the former callback is provided if .bdrv_co_truncate also exists. When adding the latter callback, it was mistakenly added to only one of the three possible sheepdog instantiations. Fixes: 1dcaf527 Signed-off-by: Eric Blake Message-Id: <20200324174233.1622067-5-ebl...@redhat.com> Reviewed-by: John Snow Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Max Reitz --- block/sheepdog.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/sheepdog.c b/block/sheepdog.c index a8a7e32a41..59f7ebb171 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -3271,6 +3271,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_create = sd_co_create, .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, +.bdrv_has_zero_init_truncate = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_co_truncate = sd_co_truncate, @@ -3309,6 +3310,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_create = sd_co_create, .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, +.bdrv_has_zero_init_truncate = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_co_truncate = sd_co_truncate, -- 2.25.1
[PULL 02/10] block: pass BlockDriver reference to the .bdrv_co_create
From: Maxim Levitsky This will allow the reuse of a single generic .bdrv_co_create implementation for several drivers. No functional changes. Signed-off-by: Maxim Levitsky Message-Id: <20200326011218.29230-2-mlevi...@redhat.com> Reviewed-by: Denis V. Lunev Signed-off-by: Max Reitz --- include/block/block_int.h | 3 ++- block.c | 3 ++- block/crypto.c| 3 ++- block/file-posix.c| 4 +++- block/file-win32.c| 4 +++- block/gluster.c | 3 ++- block/nfs.c | 4 +++- block/parallels.c | 3 ++- block/qcow.c | 3 ++- block/qcow2.c | 4 +++- block/qed.c | 3 ++- block/raw-format.c| 4 +++- block/rbd.c | 3 ++- block/sheepdog.c | 4 +++- block/ssh.c | 4 +++- block/vdi.c | 4 +++- block/vhdx.c | 3 ++- block/vmdk.c | 4 +++- block/vpc.c | 6 -- 19 files changed, 49 insertions(+), 20 deletions(-) diff --git a/include/block/block_int.h b/include/block/block_int.h index ae9c4da4d0..57c8ea24b2 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -135,7 +135,8 @@ struct BlockDriver { void (*bdrv_close)(BlockDriverState *bs); int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts, Error **errp); -int coroutine_fn (*bdrv_co_create_opts)(const char *filename, +int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv, +const char *filename, QemuOpts *opts, Error **errp); int (*bdrv_make_empty)(BlockDriverState *bs); diff --git a/block.c b/block.c index cccae5add9..ff23e20443 100644 --- a/block.c +++ b/block.c @@ -483,7 +483,8 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque) CreateCo *cco = opaque; assert(cco->drv); -ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, _err); +ret = cco->drv->bdrv_co_create_opts(cco->drv, +cco->filename, cco->opts, _err); error_propagate(>err, local_err); cco->ret = ret; } diff --git a/block/crypto.c b/block/crypto.c index 4425ebeb47..d577f89659 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -601,7 +601,8 @@ fail: return ret; } -static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename, +static 
int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, + const char *filename, QemuOpts *opts, Error **errp) { diff --git a/block/file-posix.c b/block/file-posix.c index 9bc3838b2a..65bc980bc4 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2405,7 +2405,9 @@ out: return result; } -static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, +static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + const char *filename, + QemuOpts *opts, Error **errp) { BlockdevCreateOptions options; diff --git a/block/file-win32.c b/block/file-win32.c index 77e8ff7b68..15859839a1 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -588,7 +588,9 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp) return 0; } -static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, +static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + const char *filename, + QemuOpts *opts, Error **errp) { BlockdevCreateOptions options; diff --git a/block/gluster.c b/block/gluster.c index 4fa4a77a47..0aa1f2cda4 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1130,7 +1130,8 @@ out: return ret; } -static int coroutine_fn qemu_gluster_co_create_opts(const char *filename, +static int coroutine_fn qemu_gluster_co_create_opts(BlockDriver *drv, +const char *filename, QemuOpts *opts, Error **errp) { diff --git a/block/nfs.c b/block/nfs.c index 9a6311e270..cc2413d5ab 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -662,7 +662,9 @@ out: return ret; } -static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts, +static int coroutine_fn nfs_file_co_create_opts(BlockDriver *drv, +const char *url, +QemuOpts *opts,
[PULL 04/10] qcow2: Comment typo fixes
From: Eric Blake Various trivial typos noticed while working on this file. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Message-Id: <20200324174233.1622067-2-ebl...@redhat.com> Signed-off-by: Max Reitz --- block/qcow2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5f65fce924..b565cf912e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -177,7 +177,7 @@ static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset, } -/* +/* * read qcow2 extension and fill bs * start reading from start_offset * finish reading upon magic of value 0 or when end_offset reached @@ -3255,7 +3255,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) * inconsistency later. * * We do need a refcount table because growing the refcount table means - * allocating two new refcount blocks - the seconds of which would be at + * allocating two new refcount blocks - the second of which would be at * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file * size for any qcow2 image. */ @@ -3500,7 +3500,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) goto out; } -/* Want a backing file? There you go.*/ +/* Want a backing file? There you go. */ if (qcow2_opts->has_backing_file) { const char *backing_format = NULL; -- 2.25.1
[PULL 00/10] Block patches for 5.0-rc1
The following changes since commit 01e38186ecb1fc6275720c5425332eed280ea93d: Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20200325b' into staging (2020-03-26 09:28:11 +) are available in the Git repository at: https://github.com/XanClic/qemu.git tags/pull-block-2020-03-26 for you to fetch changes up to a507c51790fa955c1fccd4deca3c50476a862b83: iotests/138: Test leaks/corruptions fixed report (2020-03-26 14:52:43 +0100) Block patches for 5.0-rc1: - Fix qemu-img convert with a host device or iscsi target - Use-after-free fix in mirror - Some minor qcow2 fixes - Minor sheepdog fix - Minor qemu-img check report fix Eric Blake (4): qcow2: Comment typo fixes qcow2: List autoclear bit names in header qcow2: Avoid feature name extension on small cluster size sheepdog: Consistently set bdrv_has_zero_init_truncate Max Reitz (3): qemu-img: Fix check's leak/corruption fix report iotests: Add poke_file_[bl]e functions iotests/138: Test leaks/corruptions fixed report Maxim Levitsky (2): block: pass BlockDriver reference to the .bdrv_co_create block: trickle down the fallback image creation function use to the block drivers Vladimir Sementsov-Ogievskiy (1): block/mirror: fix use after free of local_err docs/interop/qcow2.txt | 3 ++- include/block/block.h| 1 + include/block/block_int.h| 14 +++- block.c | 38 +++-- block/crypto.c | 3 ++- block/file-posix.c | 11 -- block/file-win32.c | 4 +++- block/gluster.c | 3 ++- block/iscsi.c| 16 -- block/mirror.c | 1 + block/nbd.c | 6 ++ block/nfs.c | 4 +++- block/nvme.c | 3 +++ block/parallels.c| 3 ++- block/qcow.c | 3 ++- block/qcow2.c| 33 +++-- block/qed.c | 3 ++- block/raw-format.c | 4 +++- block/rbd.c | 3 ++- block/sheepdog.c | 6 +- block/ssh.c | 4 +++- block/vdi.c | 4 +++- block/vhdx.c | 3 ++- block/vmdk.c | 4 +++- block/vpc.c | 6 -- qemu-img.c | 9 ++-- tests/qemu-iotests/031.out | 8 +++ tests/qemu-iotests/036 | 6 -- tests/qemu-iotests/036.out | 4 ++-- tests/qemu-iotests/061 | 6 -- tests/qemu-iotests/061.out | 14 ++-- 
tests/qemu-iotests/138 | 41 ++-- tests/qemu-iotests/138.out | 14 tests/qemu-iotests/common.rc | 24 + 34 files changed, 233 insertions(+), 76 deletions(-) -- 2.25.1
[PULL 03/10] block: trickle down the fallback image creation function use to the block drivers
From: Maxim Levitsky Instead of checking the .bdrv_co_create_opts to see if we need the fallback, just implement the .bdrv_co_create_opts in the drivers that need it. This way we don't break various places that need to know if the underlying protocol/format really supports image creation, and this way we still allow some drivers to not support image creation. Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 Note that technically this driver reverts the image creation fallback for the vxhs driver since I don't have a means to test it, and IMHO it is better to leave it not supported as it was prior to generic image creation patches. Also drop iscsi_create_opts which was left accidentally. Signed-off-by: Maxim Levitsky Message-Id: <20200326011218.29230-3-mlevi...@redhat.com> Reviewed-by: Denis V. Lunev [mreitz: Fixed alignment, and moved bdrv_co_create_opts_simple() and bdrv_create_opts_simple from block.h into block_int.h] Signed-off-by: Max Reitz --- include/block/block.h | 1 + include/block/block_int.h | 11 +++ block.c | 35 --- block/file-posix.c| 7 ++- block/iscsi.c | 16 block/nbd.c | 6 ++ block/nvme.c | 3 +++ 7 files changed, 51 insertions(+), 28 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index e569a4d747..b05995fe9c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -283,6 +283,7 @@ BlockDriver *bdrv_find_format(const char *format_name); int bdrv_create(BlockDriver *drv, const char* filename, QemuOpts *opts, Error **errp); int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); + BlockDriverState *bdrv_new(void); void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, Error **errp); diff --git a/include/block/block_int.h b/include/block/block_int.h index 57c8ea24b2..4c3587ea19 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1331,4 +1331,15 @@ int refresh_total_sectors(BlockDriverState *bs, int64_t 
hint); void bdrv_set_monitor_owned(BlockDriverState *bs); BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp); +/** + * Simple implementation of bdrv_co_create_opts for protocol drivers + * which only support creation via opening a file + * (usually existing raw storage device) + */ +int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, +const char *filename, +QemuOpts *opts, +Error **errp); +extern QemuOptsList bdrv_create_opts_simple; + #endif /* BLOCK_INT_H */ diff --git a/block.c b/block.c index ff23e20443..af3faf664e 100644 --- a/block.c +++ b/block.c @@ -598,8 +598,15 @@ static int create_file_fallback_zero_first_sector(BlockBackend *blk, return 0; } -static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, - QemuOpts *opts, Error **errp) +/** + * Simple implementation of bdrv_co_create_opts for protocol drivers + * which only support creation via opening a file + * (usually existing raw storage device) + */ +int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, +const char *filename, +QemuOpts *opts, +Error **errp) { BlockBackend *blk; QDict *options; @@ -663,11 +670,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) return -ENOENT; } -if (drv->bdrv_co_create_opts) { -return bdrv_create(drv, filename, opts, errp); -} else { -return bdrv_create_file_fallback(filename, drv, opts, errp); -} +return bdrv_create(drv, filename, opts, errp); } int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp) @@ -1592,9 +1595,9 @@ QemuOptsList bdrv_runtime_opts = { }, }; -static QemuOptsList fallback_create_opts = { -.name = "fallback-create-opts", -.head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head), +QemuOptsList bdrv_create_opts_simple = { +.name = "simple-create-opts", +.head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head), .desc = { { .name = BLOCK_OPT_SIZE, @@ -5963,13 +5966,15 @@ void bdrv_img_create(const char *filename, const char *fmt, return; } +if 
(!proto_drv->create_opts) { +error_setg(errp, "Protocol driver '%s' does not support image creation", + proto_drv->format_name); +return; +} + /* Create parameter list */ create_opts = qemu_opts_append(create_opts, drv->create_opts); -if (proto_drv->create_opts) { -create_opts =
[PULL 01/10] block/mirror: fix use after free of local_err
From: Vladimir Sementsov-Ogievskiy local_err is used again in mirror_exit_common() after bdrv_set_backing_hd(), so we must zero it. Otherwise, a subsequent attempt to set the non-NULL local_err will crash. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200324153630.11882-3-vsement...@virtuozzo.com> Reviewed-by: Eric Blake Reviewed-by: John Snow Signed-off-by: Max Reitz --- block/mirror.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mirror.c b/block/mirror.c index 447051dbc6..6203e5946e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -678,6 +678,7 @@ static int mirror_exit_common(Job *job) bdrv_set_backing_hd(target_bs, backing, &local_err); if (local_err) { error_report_err(local_err); +local_err = NULL; ret = -EPERM; } } -- 2.25.1
booke206_tlb_ways() returning 0
Hi; Coverity points out a possible issue in booke206_get_tlbm() (this is CID 1421942): static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int tlbn, target_ulong ea, int way) { int r; uint32_t ways = booke206_tlb_ways(env, tlbn); int ways_bits = ctz32(ways); int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); int i; way &= ways - 1; ea >>= MAS2_EPN_SHIFT; ea &= (1 << (tlb_bits - ways_bits)) - 1; r = (ea << ways_bits) | way; Here if booke206_tlb_ways() returns zero, then ways_bits() will be 32 and the shift left 'ea << ways_bits' is undefined behaviour. My first assumption was that booke206_tlb_ways() is not supposed to ever return 0 (it's looking at what I think are read-only system registers, and it doesn't make much sense to have a zero-way TLB). So I tried adding an assert: diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 88d94495554..aedb6bcb265 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2403,6 +2403,7 @@ static inline int booke206_tlb_ways(CPUPPCState *env, int tlbn) { uint32_t tlbncfg = env->spr[SPR_BOOKE_TLB0CFG + tlbn]; int r = tlbncfg >> TLBnCFG_ASSOC_SHIFT; +assert(r > 0); return r; } However, this isn't right, because it causes one of the check-acceptance tests to fail, with this backtrace: #3 0x7074d412 in __GI___assert_fail (assertion=0x560a0d7d "r > 0", file=0x560a0d40 "/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h", line=2406, function=0x560a1720 <__PRETTY_FUNCTION__.20811> "booke206_tlb_ways") at assert.c:101 #4 0x55a9939b in booke206_tlb_ways (env=0x56e52a30, tlbn=2) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h:2406 #5 0x55a9b3ac in mmubooke206_get_physical_address (env=0x56e52a30, ctx=0x7fffd77008a0, address=9223380835095282947, rw=0, access_type=0, mmu_idx=1) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1093 #6 0x55a9c25d in get_physical_address_wtlb (env=0x56e52a30, ctx=0x7fffd77008a0, eaddr=9223380835095282947, rw=0, access_type=0, mmu_idx=1) at 
/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1455 #7 0x55a9c82b in cpu_ppc_handle_mmu_fault (env=0x56e52a30, address=9223380835095282947, rw=0, mmu_idx=1) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1597 #8 0x55a9f975 in ppc_cpu_tlb_fill (cs=0x56e49560, addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, mmu_idx=1, probe=false, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:3053 #9 0x558e1422 in tlb_fill (cpu=0x56e49560, addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, mmu_idx=1, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1017 #10 0x558e279b in load_helper (env=0x56e52a30, addr=9223380835095282947, oi=1, retaddr=140735610345781, op=MO_8, code_read=false, full_load=0x558e2b3a ) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1534 #11 0x558e2b80 in full_ldub_mmu (env=0x56e52a30, addr=9223380835095282947, oi=1, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1624 #12 0x558e2bb4 in helper_ret_ldub_mmu (env=0x56e52a30, addr=9223380835095282947, oi=1, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1630 #13 0x7fff900fd9c6 in code_gen_buffer () #14 0x558f9915 in cpu_tb_exec (cpu=0x56e49560, itb=0x7fff900fd780 ) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:172 #15 0x558fa732 in cpu_loop_exec_tb (cpu=0x56e49560, tb=0x7fff900fd780 , last_tb=0x7fffd7701078, tb_exit=0x7fffd7701070) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:619 #16 0x558faa4c in cpu_exec (cpu=0x56e49560) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:732 #17 0x558bcf29 in tcg_cpu_exec (cpu=0x56e49560) at /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1405 #18 0x558bd77f in qemu_tcg_cpu_thread_fn (arg=0x56e49560) at /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1713 
#19 0x55f2ff3f in qemu_thread_start (args=0x56e8dd10) at /home/petmay01/linaro/qemu-from-laptop/qemu/util/qemu-thread-posix.c:519 #20 0x70b156db in start_thread (arg=0x7fffd7704700) at pthread_create.c:463 #21 0x7083e88f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 So under what circumstances can booke206_tlb_ways() validly return 0? Should booke206_get_tlbm() cope with a zero return, or can it assert() that it will never call booke206_tlb_ways() in a way that will cause one? thanks -- PMM
Re: [PATCH RFC 9/9] KVM: Dirty ring support
* Peter Xu (pet...@redhat.com) wrote: > On Wed, Mar 25, 2020 at 08:41:44PM +, Dr. David Alan Gilbert wrote: > > [...] > > > > +enum KVMReaperState { > > > +KVM_REAPER_NONE = 0, > > > +/* The reaper is sleeping */ > > > +KVM_REAPER_WAIT, > > > +/* The reaper is reaping for dirty pages */ > > > +KVM_REAPER_REAPING, > > > +}; > > > > That probably needs to be KVMDirtyRingReaperState > > given there are many things that could be reaped. > > Sure. > > > > > > +/* > > > + * KVM reaper instance, responsible for collecting the KVM dirty bits > > > + * via the dirty ring. > > > + */ > > > +struct KVMDirtyRingReaper { > > > +/* The reaper thread */ > > > +QemuThread reaper_thr; > > > +/* > > > + * Telling the reaper thread to wakeup. This should be used as a > > > + * generic interface to kick the reaper thread, like, in vcpu > > > + * threads where it gets a exit due to ring full. > > > + */ > > > +EventNotifier reaper_event; > > > > I think I'd just used a simple semaphore for this type of thing. > > I'm actually uncertain on which is cheaper... > > At the meantime, I wanted to poll two handles at the same time below > (in kvm_dirty_ring_reaper_thread). I don't know how to do that with > semaphore. Could it? If you're OK with EventNotifier stick with it; it's just I'm used to doing with it with a semaphore; e.g. a flag then the semaphore - but that's fine. > [...] > > > > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu) > > > (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; > > > } > > > > > > +if (s->kvm_dirty_gfn_count) { > > > +cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size, > > > + PROT_READ | PROT_WRITE, MAP_SHARED, > > > > Is the MAP_SHARED required? > > Yes it's required. It's the same when we map the per-vcpu kvm_run. 
> > If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace > writes to the dirty gfns the 1st time, it'll copy the current dirty > ring page in the kernel and from now on QEMU will never be able to see > what the kernel writes to the dirty gfn pages. MAP_SHARED means the > userspace and the kernel shares exactly the same page(s). OK, worth a comment. > > > > > + cpu->kvm_fd, > > > + PAGE_SIZE * > > > KVM_DIRTY_LOG_PAGE_OFFSET); > > > +if (cpu->kvm_dirty_gfns == MAP_FAILED) { > > > +ret = -errno; > > > +DPRINTF("mmap'ing vcpu dirty gfns failed\n"); > > > > Include errno? > > Will do. > > [...] > > > > +static uint64_t kvm_dirty_ring_reap(KVMState *s) > > > +{ > > > +KVMMemoryListener *kml; > > > +int ret, i, locked_count = s->nr_as; > > > +CPUState *cpu; > > > +uint64_t total = 0; > > > + > > > +/* > > > + * We need to lock all kvm slots for all address spaces here, > > > + * because: > > > + * > > > + * (1) We need to mark dirty for dirty bitmaps in multiple slots > > > + * and for tons of pages, so it's better to take the lock here > > > + * once rather than once per page. And more importantly, > > > + * > > > + * (2) We must _NOT_ publish dirty bits to the other threads > > > + * (e.g., the migration thread) via the kvm memory slot dirty > > > + * bitmaps before correctly re-protect those dirtied pages. > > > + * Otherwise we can have potential risk of data corruption if > > > + * the page data is read in the other thread before we do > > > + * reset below. > > > + */ > > > +for (i = 0; i < s->nr_as; i++) { > > > +kml = s->as[i].ml; > > > +if (!kml) { > > > +/* > > > + * This is tricky - we grow s->as[] dynamically now. Take > > > + * care of that case. We also assumed the as[] will fill > > > + * one by one starting from zero. Without this, we race > > > + * with register_smram_listener. > > > + * > > > + * TODO: make all these prettier... 
> > > + */ > > > +locked_count = i; > > > +break; > > > +} > > > +kvm_slots_lock(kml); > > > +} > > > + > > > +CPU_FOREACH(cpu) { > > > +total += kvm_dirty_ring_reap_one(s, cpu); > > > +} > > > + > > > +if (total) { > > > +ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS); > > > +assert(ret == total); > > > +} > > > + > > > +/* Unlock whatever locks that we have locked */ > > > +for (i = 0; i < locked_count; i++) { > > > +kvm_slots_unlock(s->as[i].ml); > > > +} > > > + > > > +CPU_FOREACH(cpu) { > > > +if (cpu->kvm_dirty_ring_full) { > > > +qemu_sem_post(>kvm_dirty_ring_avail); > > > +} > > > > Why do you need to wait until here - couldn't you release > > each vcpu after you've reaped it? > > We probably still need to wait.
[PATCH] s390x: protvirt: Fix stray error_report_err in s390_machine_protect
In case the protection of the machine fails at s390_pv_vm_enable(), we'll currently report the local_error variable. Problem is that there's no migration blocker error that we can report at this point so the pointer is always NULL which leads to a SEGFAULT. Let's remove the error report. Signed-off-by: Janosch Frank Reported-by: Marc Hartmayer Fixes: 520935eedf941da3 ("s390x: protvirt: Add migration blocker") --- hw/s390x/s390-virtio-ccw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 3cf19c99f3468b7d..855ecf370d6e82fa 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -358,7 +358,6 @@ static int s390_machine_protect(S390CcwMachineState *ms) rc = s390_pv_vm_enable(); if (rc) { qemu_balloon_inhibit(false); -error_report_err(local_err); migrate_del_blocker(pv_mig_blocker); error_free_or_abort(_mig_blocker); return rc; -- 2.25.1
Re: [PATCH] acpi: pcihp: fix left shift undefined behavior in acpi_pcihp_eject_slot()
On Thu, Mar 26, 2020 at 09:56:24AM -0400, Igor Mammedov wrote: > Coverity spots subj in following guest triggered code path > pci_write(, data = 0) -> acpi_pcihp_eject_slot(,slots = 0) > uint32_t slot = ctz32(slots) > ... > ... = ~(1U << slot) > where 'slot' value is 32 in case 'slots' bitmap is empty. > 'slots' is a bitmap and empty one shouldn't do anything > so return early doing nothing if resulted slot value is > not valid (i.e. not in 0-31 range) > > Reported-by: Peter Maydell > Signed-off-by: Igor Mammedov tagged, thanks! > --- > hw/acpi/pcihp.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c > index 4dcef372bf..0dc963e983 100644 > --- a/hw/acpi/pcihp.c > +++ b/hw/acpi/pcihp.c > @@ -154,7 +154,7 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, > unsigned bsel, unsigned slo > > trace_acpi_pci_eject_slot(bsel, slot); > > -if (!bus) { > +if (!bus || slot > 31) { > return; > } > > -- > 2.18.1
RE: [PATCH v1 12/22] intel_iommu: add PASID cache management infrastructure
> From: Liu, Yi L > Sent: Thursday, March 26, 2020 2:15 PM > To: 'Peter Xu' > Subject: RE: [PATCH v1 12/22] intel_iommu: add PASID cache management > infrastructure > > > From: Peter Xu > > Sent: Wednesday, March 25, 2020 10:52 PM > > To: Liu, Yi L > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache management > > infrastructure > > > > On Wed, Mar 25, 2020 at 12:20:21PM +, Liu, Yi L wrote: > > > > From: Peter Xu > > > > Sent: Wednesday, March 25, 2020 1:32 AM > > > > To: Liu, Yi L > > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache > > > > management infrastructure > > > > > > > > On Sun, Mar 22, 2020 at 05:36:09AM -0700, Liu Yi L wrote: > > > > > This patch adds a PASID cache management infrastructure based on > > > > > new added structure VTDPASIDAddressSpace, which is used to track > > > > > the PASID usage and future PASID tagged DMA address translation > > > > > support in vIOMMU. [...] > > > > > > > > > > > > > > > > > +/* > > > > > + * Step 2: loop all the exisitng vtd_dev_icx instances. > > > > > + * Ideally, needs to loop all devices to find if there is any new > > > > > + * PASID binding regards to the PASID cache invalidation request. > > > > > + * But it is enough to loop the devices which are backed by host > > > > > + * IOMMU. For devices backed by vIOMMU (a.k.a emulated devices), > > > > > + * if new PASID happened on them, their vtd_pasid_as instance > > > > > could > > > > > + * be created during future vIOMMU DMA translation. 
> > > > > + */ > > > > > +QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) { > > > > > +VTDPASIDAddressSpace *vtd_pasid_as; > > > > > +VTDPASIDCacheEntry *pc_entry; > > > > > +VTDPASIDEntry pe; > > > > > +VTDBus *vtd_bus = vtd_dev_icx->vtd_bus; > > > > > +uint16_t devfn = vtd_dev_icx->devfn; > > > > > +int bus_n = pci_bus_num(vtd_bus->bus); > > > > > + > > > > > +/* i) fetch vtd_pasid_as and check if it is valid */ > > > > > +vtd_pasid_as = vtd_add_find_pasid_as(s, vtd_bus, > > > > > + devfn, pasid); > > > > > > > > I don't feel like it's correct here... > > > > > > > > Assuming we have two devices assigned D1, D2. D1 uses PASID=1, D2 > > > > uses > > PASID=2. > > > > When invalidating against PASID=1, are you also going to create a > > > > VTDPASIDAddressSpace also for D2 with PASID=1? > > > > > > Answer is no. Before going forward, let's see if the below flow looks > > > good to you. > > > > > > Let me add one more device besides D1 and D2. Say device D3 which > > > also uses PASID=1. And assume it begins with no PASID usage in guest. > > > > > > Then the flow from scratch is: > > > > > > a) guest IOMMU driver setup PASID entry for D1 with PASID=1, > > >then invalidates against PASID #1 > > > b) trap to QEMU, and comes to this function. Since there is > > >no previous pasid cache invalidation, so the Step 1 of this > > >function has nothing to do, then goes to Step 2 which is to > > >loop all assigned devices and check if the guest pasid entry > > >is present. In this loop, should find D1's pasid entry for > > >PASID#1 is present. So create the VTDPASIDAddressSpace and > > >initialize its pasid_cache_entry and pasid_cache_gen fields. > > > c) guest IOMMU driver setup PASID entry for D2 with PASID=2, > > >then invalidates against PASID #2 > > > d) same with b), only difference is the Step 1 of this function > > >will loop the VTDPASIDAddressSpace created in b), but its > > >pasid is 1 which is not the target of current pasid cache > > >invalidation. 
Same with b), in Step 2, it will create a > > >VTDPASIDAddressSpace for D2+PASID#2 > > > e) guest IOMMU driver setup PASID entry for D3 with PASID=1, > > >then invalidates against PASID #1 > > > f) trap to QEMU and comes to this function. Step 1 loops two > > >VTDPASIDAddressSpace created in b) and d), and it finds there > > >is a VTDPASIDAddressSpace whose pasid is 1. vtd_flush_pasid() > > >will check if the cached pasid entry is the same with the one > > >in guest memory. In this flow, it should be the same, so > > >vtd_flush_pasid() will do nothing for it. Then comes to Step 2, > > >it loops D1, D2, D3. > > >- For D1, no need to do more since there is already a > > > VTDPASIDAddressSpace created for D1+PASID#1. > > >- For D2, its guest pasid entry for PASID#1 is not present, so > > > no need to do anything for it. > > >- For D3, its guest pasid entry for PASID#1 is present and it > > > is exactly the reason for this invalidation. So create a > > > VTDPASIDAddressSpace for and init the pasid_cache_entry and > > > pasid_cache_gen fields. > > > > > > > I feel like we shouldn't create VTDPASIDAddressSpace only if it > > > > existed, say, until when we reach vtd_dev_get_pe_from_pasid() below with > retcode==0. > > > > > >
[PATCH] acpi: pcihp: fix left shift undefined behavior in acpi_pcihp_eject_slot()
Coverity spots subj in following guest triggered code path pci_write(, data = 0) -> acpi_pcihp_eject_slot(,slots = 0) uint32_t slot = ctz32(slots) ... ... = ~(1U << slot) where 'slot' value is 32 in case 'slots' bitmap is empty. 'slots' is a bitmap and empty one shouldn't do anything so return early doing nothing if resulted slot value is not valid (i.e. not in 0-31 range) Reported-by: Peter Maydell Signed-off-by: Igor Mammedov --- hw/acpi/pcihp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 4dcef372bf..0dc963e983 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -154,7 +154,7 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo trace_acpi_pci_eject_slot(bsel, slot); -if (!bus) { +if (!bus || slot > 31) { return; } -- 2.18.1
Re: [PATCH for-5.0 v2 0/3] qemu-img: Fix check's leak/corruption fix report
On 24.03.20 18:27, Max Reitz wrote: > Branch: https://github.com/XanClic/qemu.git fix-check-result-v2 > Branch: https://git.xanclic.moe/XanClic/qemu.git fix-check-result-v2 > > v1: https://lists.nongnu.org/archive/html/qemu-block/2020-02/msg01418.html Thanks for the review again, applied to my block branch. Max signature.asc Description: OpenPGP digital signature
Re: [PATCH 2/2] util/bufferiszero: improve avx2 accelerator
On Thu, 2020-03-26 at 08:26 -0500, Eric Blake wrote: > On 3/25/20 9:09 PM, Hu, Robert wrote: > > (Don't know why my Linux-Evolution missed this mail.) > > > -Original Message- > > > Long line; it's nice to wrap commit messages around column 70 or > > > so (because > > > reading 'git log' in an 80-column window adds indentation). > > > > > > > [Hu, Robert] > > I think I set my vim on wrap. This probably escaped by paste. > > I ran checkpatch.pl on the patches before sending. It escaped check > > but didn't > > escaped your eagle eye Thank you. > > checkpatch doesn't flag commit message long lines. Maybe it could > be > patched to do so, but it's not at the top of my list to write that > patch. > > > > > > > I just fix a boudary case on his original patch. > > > > > > boundary > > > > [Hu, Robert] > > Emm... again spell error. Usually I would paste descriptions into > > some editors > > with spell check, but forgot this time. > > Vim doesn't have spell check I think. What editor would you suggest > > me to > > integrate with git editing? > > I'm an emacs user, so I have no suggestions for vim, but I'd be very > surprised if there were not some vim expert online that could figure > out > how to wire in a spell-checker to vim. Google quickly finds: > https://www.ostechnix.com/use-spell-check-feature-vim-text-editor/ > nice, thanks:)
[PATCH] qemu-user: fix build with LLVM lld 10
lld 10.0.0 introduced a new linker option --image-base equivalent to the GNU -Ttext-segment one, hence use it when available. This fixes the build of QEMU on systems using lld 10 or greater. Signed-off-by: Dimitry Andric Signed-off-by: Roger Pau Monné --- Cc: Laurent Vivier Cc: Richard Henderson Cc: "Philippe Mathieu-Daudé" Cc: Paolo Bonzini Cc: "Daniel P. Berrangé" --- configure | 42 +++--- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/configure b/configure index da09c35895..92d57d84fa 100755 --- a/configure +++ b/configure @@ -6514,27 +6514,31 @@ if ( [ "$linux_user" = yes ] || [ "$bsd_user" = yes ] ) && [ "$pie" = no ]; then cat > $TMPC &1; then -error_exit \ -"We need to link the QEMU user mode binaries at a" \ -"specific text address. Unfortunately your linker" \ -"doesn't support either the -Ttext-segment option or" \ -"printing the default linker script with --verbose." \ -"If you don't want the user mode binaries, pass the" \ -"--disable-user option to configure." - fi + textseg_ldflags="-Wl,-Ttext-segment=$textseg_addr" + if ! compile_prog "" "$textseg_ldflags"; then +# In case ld does not support -Ttext-segment, edit the default linker +# script via sed to set the .text start addr. This is needed on FreeBSD +# at least. +if ! $ld --verbose >/dev/null 2>&1; then + error_exit \ + "We need to link the QEMU user mode binaries at a" \ + "specific text address. Unfortunately your linker" \ + "doesn't support either the --image-base or -Ttext-segment" \ + "options or printing the default linker script with" \ + "--verbose. If you don't want the user mode binaries," \ + "pass the --disable-user option to configure." +fi - $ld --verbose | sed \ --e '1,/==/d' \ --e '/==/,$d' \ --e "s/[.] = [0-9a-fx]* [+] SIZEOF_HEADERS/. 
= $textseg_addr + SIZEOF_HEADERS/" \ --e "s/__executable_start = [0-9a-fx]*/__executable_start = $textseg_addr/" > config-host.ld - textseg_ldflags="-Wl,-T../config-host.ld" +$ld --verbose | sed \ + -e '1,/==/d' \ + -e '/==/,$d' \ + -e "s/[.] = [0-9a-fx]* [+] SIZEOF_HEADERS/. = $textseg_addr + SIZEOF_HEADERS/" \ + -e "s/__executable_start = [0-9a-fx]*/__executable_start = $textseg_addr/" > config-host.ld +textseg_ldflags="-Wl,-T../config-host.ld" + fi fi fi fi -- 2.26.0
Re: [PATCH RFC 8/9] KVM: Add dirty-ring-size property
* Peter Xu (pet...@redhat.com) wrote: > On Wed, Mar 25, 2020 at 08:00:31PM +, Dr. David Alan Gilbert wrote: > > > @@ -2077,6 +2079,33 @@ static int kvm_init(MachineState *ms) > > > s->memory_listener.listener.coalesced_io_add = > > > kvm_coalesce_mmio_region; > > > s->memory_listener.listener.coalesced_io_del = > > > kvm_uncoalesce_mmio_region; > > > > > > +/* > > > + * Enable KVM dirty ring if supported, otherwise fall back to > > > + * dirty logging mode > > > + */ > > > +if (s->kvm_dirty_ring_size > 0) { > > > +/* Read the max supported pages */ > > > +ret = kvm_vm_check_extension(kvm_state, KVM_CAP_DIRTY_LOG_RING); > > > +if (ret > 0) { > > > +if (s->kvm_dirty_ring_size > ret) { > > > +error_report("KVM dirty ring size %d too big (maximum is > > > %d). " > > > + "Please use a smaller value.", > > > + s->kvm_dirty_ring_size, ret); > > > +goto err; > > > +} > > > + > > > +ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, > > > +s->kvm_dirty_ring_size); > > > +if (ret) { > > > +error_report("Enabling of KVM dirty ring failed: %d", > > > ret); > > > +goto err; > > > +} > > > + > > > +s->kvm_dirty_gfn_count = > > > +s->kvm_dirty_ring_size / sizeof(struct kvm_dirty_gfn); > > > > What happens if I was to pass dirty-ring-size=1 ? > > Then the count would be 0 and things would get upset somewhere? > > Do you need to check for a minimum postive value? > > The above kvm_vm_enable_cap() should fail directly and QEMU will stop. > Yes we should check it, but kernel will do that in all cases, so I > just didn't do that explicitly again in the userspace. We probably should have that check since you can give them a more obvious error message. > I was planning this to be an advanced feature so the user of this > parameter should know the rules to pass values in. Advanced users just make advanced mistakes :-) I did wonder if perhaps this option should be a count of entries rather than a byte size. 
> Of course we can introduce another global knob to enable/disable this > feature as a whole, then I can offer a default value for the size > parameter. I just didn't bother that in this RFC version, but I can > switch to that if that's preferred. > > [...] > > > > @@ -3065,6 +3123,12 @@ static void kvm_accel_class_init(ObjectClass *oc, > > > void *data) > > > NULL, NULL, _abort); > > > object_class_property_set_description(oc, "kvm-shadow-mem", > > > "KVM shadow MMU size", _abort); > > > + > > > +object_class_property_add(oc, "dirty-ring-size", "int", > > > +kvm_get_dirty_ring_size, kvm_set_dirty_ring_size, > > > +NULL, NULL, _abort); > > > > I don't think someone passing in a non-number should cause an abort; > > it should exit, but I don't think it should abort/core. > > It won't - _abort is for the operation to add the property. It > should never fail. > > User illegal input will look like this (a graceful exit): > > $ ./x86_64-softmmu/qemu-system-x86_64 -accel kvm,dirty-ring-size=a > qemu-system-x86_64: -accel kvm,dirty-ring-size=a: Parameter 'dirty-ring-size' > expects int64 OK good. 
> > > > > +object_class_property_set_description(oc, "dirty-ring-size", > > > +"KVM dirty ring size (<=0 to disable)", _abort); > > > } > > > > > > static const TypeInfo kvm_accel_type = { > > > diff --git a/qemu-options.hx b/qemu-options.hx > > > index 224a8e8712..140bd38726 100644 > > > --- a/qemu-options.hx > > > +++ b/qemu-options.hx > > > @@ -119,6 +119,7 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel, > > > "kernel-irqchip=on|off|split controls accelerated > > > irqchip support (default=on)\n" > > > "kvm-shadow-mem=size of KVM shadow MMU in bytes\n" > > > "tb-size=n (TCG translation block cache size)\n" > > > +"dirty-ring-size=n (KVM dirty ring size in Bytes)\n" > > > "thread=single|multi (enable multi-threaded TCG)\n", > > > QEMU_ARCH_ALL) > > > STEXI > > > @item -accel @var{name}[,prop=@var{value}[,...]] > > > @@ -140,6 +141,8 @@ irqchip completely is not recommended except for > > > debugging purposes. > > > Defines the size of the KVM shadow MMU. > > > @item tb-size=@var{n} > > > Controls the size (in MiB) of the TCG translation block cache. > > > +@item dirty-ring-size=@val{n} > > > +Controls the size (in Bytes) of KVM dirty ring (<=0 to disable). > > > > I don't see the point in allowing < 0 ; I'd ban it. > > Sure; I can switch to an uint64. Great. Dave > Thanks, > > -- > Peter Xu > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On 26.03.20 14:35, Eric Blake wrote: > On 3/26/20 8:28 AM, Kevin Wolf wrote: >> Am 26.03.2020 um 14:20 hat Eric Blake geschrieben: +++ b/block/file-posix.c @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = _create_opts_simple, >>> >>> I'd drop the leading & for consistency with the rest of this struct >>> initializer. >> >> This one isn't a function pointer, so I think the & is necessary. > > Ah, right. Visual pattern-matching failed me, since I didn't read the > actual types in the .h file. > > Hmm - is it possible to write the patch in such a way that .create_opts > can be left NULL when .bdrv_co_create_opts is bdrv_co_create_opts_simple? Setting .create_opts is actually the main point of this series, so we don’t have to look for and fix all places that decide whether a block driver is capable of image creation based on whether it’s set or not. Max signature.asc Description: OpenPGP digital signature
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, 26 Mar 2020 09:31:09 -0400 "Michael S. Tsirkin" wrote: > On Thu, Mar 26, 2020 at 09:28:27AM -0400, Michael S. Tsirkin wrote: > > On Thu, Mar 26, 2020 at 02:23:17PM +0100, Igor Mammedov wrote: > > > On Thu, 26 Mar 2020 11:52:36 + > > > Peter Maydell wrote: > > > > > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > > > and then the code that does '1U << slot' is C undefined behaviour > > > > because it's an oversized shift. (This is CID 1421896.) > > > > > > > > Since the pci_write() function in this file can call > > > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > > > I think we need to handle 'slots == 0' safely. But what should > > > > the behaviour be? > > > > > > it also uncovers a bug, where we are not able to eject slot 0 on bridge, > > > > > > And that is by design. A standard PCI SHPC register can support up to 31 > > hotpluggable slots. So we chose slot 0 as non hotpluggable. > > It's consistent across SHPC, PCI-E, so I made ACPI match. > > Sorry I was confused. It's a PCI thing, PCI-E does not have > slot numbers for downstream ports at all. Scratch that, it was mistake on my part where I tests with a change that masks 0 and wrongly at that. slot 0 on bridge can be removed just fine > > > You can't hot-add it either. > > > > > can be reproduced with: > > > > > > -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global > > > PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device > > > virtio-net-pci,bus=pci.1,addr=0,id=netdev12 > > > > > > (monitor) device_del netdev12 > > > (monitor) qtree # still shows the device > > > > > > reason is that acpi_pcihp_eject_slot() > > >if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) > > > > > > so device is not deleted > > > > We should probably teach QEMU that some slots aren't hotpluggable > > even if device in it is hotpluggable in theory. 
But that is > > a separate issue. > > > > > > thanks > > > > -- PMM > > > > >
Re: [PATCH 0/2] Fix the generic image creation code
On 26.03.20 13:23, Max Reitz wrote: > On 26.03.20 02:12, Maxim Levitsky wrote: >> The recent patches from Max Reitz allowed some block drivers to not >> provide the .bdrv_co_create_opts and still allow qemu-img to >> create/format images as long as the image is already existing >> (that is the case with various block storage drivers like nbd/iscsi/nvme, >> etc) >> >> However it was found out that some places in the code depend on the >> .bdrv_co_create_opts/.create_opts to be != NULL to decide if to allow >> image creation. >> >> To avoid adding failback code to all these places, just make generic failback >> code be used by the drivers that need it, so that for outside user, there >> is no diffirence if failback was used or not. >> >> Best regards, >> Maxim Levitsky >> >> Maxim Levitsky (2): >> block: pass BlockDriver reference to the .bdrv_co_create >> block: trickle down the fallback image creation function use to the >> block drivers > > Thanks, fixed the function parameter alignment, moved the declarations > from block.h into block_int.h, and applied the series to my block branch: (And the spelling fixes suggested by Eric) > https://git.xanclic.moe/XanClic/qemu/commits/branch/block > > Max > signature.asc Description: OpenPGP digital signature
Re: [PATCH v6 7/7] virtio-net: add migration support for RSS and hash report
Code looks OK but patchew testing shows failures. If they are false positives pls reply to that mail. On Thu, Mar 26, 2020 at 02:34:39PM +0200, Yuri Benditovich wrote: > ping > > On Fri, Mar 20, 2020 at 1:58 PM Yuri Benditovich > wrote: > > Save and restore RSS/hash report configuration. > > Signed-off-by: Yuri Benditovich > --- > hw/net/virtio-net.c | 37 + > 1 file changed, 37 insertions(+) > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c > index a0614ad4e6..7de7587abd 100644 > --- a/hw/net/virtio-net.c > +++ b/hw/net/virtio-net.c > @@ -2842,6 +2842,13 @@ static int virtio_net_post_load_device(void > *opaque, > int version_id) > } > } > > + if (n->rss_data.enabled) { > + trace_virtio_net_rss_enable(n->rss_data.hash_types, > + n->rss_data.indirections_len, > + sizeof(n->rss_data.key)); > + } else { > + trace_virtio_net_rss_disable(); > + } > return 0; > } > > @@ -3019,6 +3026,32 @@ static const VMStateDescription > vmstate_virtio_net_has_vnet = { > }, > }; > > +static bool virtio_net_rss_needed(void *opaque) > +{ > + return VIRTIO_NET(opaque)->rss_data.enabled; > +} > + > +static const VMStateDescription vmstate_virtio_net_rss = { > + .name = "virtio-net-device/rss", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = virtio_net_rss_needed, > + .fields = (VMStateField[]) { > + VMSTATE_BOOL(rss_data.enabled, VirtIONet), > + VMSTATE_BOOL(rss_data.redirect, VirtIONet), > + VMSTATE_BOOL(rss_data.populate_hash, VirtIONet), > + VMSTATE_UINT32(rss_data.hash_types, VirtIONet), > + VMSTATE_UINT16(rss_data.indirections_len, VirtIONet), > + VMSTATE_UINT16(rss_data.default_queue, VirtIONet), > + VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet, > + VIRTIO_NET_RSS_MAX_KEY_SIZE), > + VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, > VirtIONet, > + rss_data.indirections_len, 0, > + vmstate_info_uint16, uint16_t), > + VMSTATE_END_OF_LIST() > + }, > +}; > + > static const VMStateDescription vmstate_virtio_net_device = { > .name = "virtio-net-device", > 
.version_id = VIRTIO_NET_VM_VERSION, > @@ -3069,6 +3102,10 @@ static const VMStateDescription > vmstate_virtio_net_device = { > has_ctrl_guest_offloads), > VMSTATE_END_OF_LIST() > }, > + .subsections = (const VMStateDescription * []) { > + _virtio_net_rss, > + NULL > + } > }; > > static NetClientInfo net_virtio_info = { > -- > 2.17.1 > >
Re: [PATCH for-5.0 v2 0/4] bug fixes extracted from larger qcow2 zero bit work
On 24.03.20 18:42, Eric Blake wrote: > My proposal [1] to add an autoclear all-zero-content bit to the qcow2 > format has now stalled into 5.1 territory, but several patches in my > initial proposal are uncontroversial and obvious bug fixes worth > having in 5.0. > > [1] https://lists.gnu.org/archive/html/qemu-devel/2020-01/msg08075.html > > Eric Blake (4): > qcow2: Comment typo fixes > qcow2: List autoclear bit names in header > qcow2: Avoid feature name extension on small cluster size > sheepdog: Consistently set bdrv_has_zero_init_truncate OK, so my only real gripe was with the question whether we could make patch 3 decide dynamically when to include or not include the feature name table. There’s little difference in practice: Right now, we could probably get away with including it in images with 4k clusters, then it would automatically switch to 8k clusters minimum (which is the limit chosen in patch 3 as-is), and at some theoretical point in the far, far future it would switch 16k clusters minimum. I don’t think anyone really cares about whether the feature name table is in images with 4k clusters or not, and I think we still have a couple of decades before we the table gets too big for images with 8k clusters (and we probably won’t be using qcow2 then anymore). So tl;dr: There’s no practical benefit of deciding dynamically, hence I’m taking this series as-is: https://git.xanclic.moe/XanClic/qemu/commits/branch/block Thanks! Max signature.asc Description: OpenPGP digital signature
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On 3/26/20 8:28 AM, Kevin Wolf wrote: Am 26.03.2020 um 14:20 hat Eric Blake geschrieben: +++ b/block/file-posix.c @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, +.bdrv_co_create_opts = bdrv_co_create_opts_simple, +.create_opts = _create_opts_simple, I'd drop the leading & for consistency with the rest of this struct initializer. This one isn't a function pointer, so I think the & is necessary. Ah, right. Visual pattern-matching failed me, since I didn't read the actual types in the .h file. Hmm - is it possible to write the patch in such a way that .create_opts can be left NULL when .bdrv_co_create_opts is bdrv_co_create_opts_simple? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v6 0/7] reference implementation of RSS and hash report
On Fri, Mar 20, 2020 at 01:57:44PM +0200, Yuri Benditovich wrote: > Support for VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT > features in QEMU for reference purpose. > Implements Toeplitz hash calculation for incoming > packets according to configuration provided by driver. > Uses calculated hash for decision on receive virtqueue > and/or reports the hash in the virtio header Series: Reviewed-by: Michael S. Tsirkin > Changes from v5: > RSS migration state moved to subsection and migrated > only if enabled (patch 7) > Updated sign off (patch 6) > > Yuri Benditovich (7): > virtio-net: introduce RSS and hash report features > virtio-net: implement RSS configuration command > virtio-net: implement RX RSS processing > tap: allow extended virtio header with hash info > virtio-net: reference implementation of hash report > vmstate.h: provide VMSTATE_VARRAY_UINT16_ALLOC macro > virtio-net: add migration support for RSS and hash report > > hw/net/trace-events| 3 + > hw/net/virtio-net.c| 448 +++-- > include/hw/virtio/virtio-net.h | 16 ++ > include/migration/vmstate.h| 10 + > net/tap.c | 11 +- > 5 files changed, 460 insertions(+), 28 deletions(-) > > -- > 2.17.1
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On Thu, 2020-03-26 at 08:20 -0500, Eric Blake wrote: > On 3/25/20 8:12 PM, Maxim Levitsky wrote: > > Instead of checking the .bdrv_co_create_opts to see if we need the failback, > > fallback 100% true. > > > just implement the .bdrv_co_create_opts in the drivers that need it. > > > > This way we don't break various places that need to know if the underlying > > protocol/format really supports image creation, and this way we still > > allow some drivers to not support image creation. > > > > Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd > > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 > > > > Note that technically this driver reverts the image creation failback > > fallback > > > for the vxhs driver since I don't have a means to test it, > > and IMHO it is better to leave it not supported as it was prior to > > generic image creation patches. > > > > Also drop iscsi_create_opts which was left accidently > > accidentally True. I did a spell check on the commit message, but I guess I updated it afterward with this. > > > > > Signed-off-by: Maxim Levitsky > > --- > > +++ b/block/file-posix.c > > @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { > > .bdrv_reopen_prepare = raw_reopen_prepare, > > .bdrv_reopen_commit = raw_reopen_commit, > > .bdrv_reopen_abort = raw_reopen_abort, > > +.bdrv_co_create_opts = bdrv_co_create_opts_simple, > > +.create_opts = _create_opts_simple, > > I'd drop the leading & for consistency with the rest of this struct > initializer. Can I? This is struct reference and I think that only for function references, the leading & is optional. Best regards, Maxim Levitsky
Re: [PATCH v6 0/7] reference implementation of RSS and hash report
On Fri, Mar 20, 2020 at 06:38:51AM -0700, no-re...@patchew.org wrote: > qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or > directory > qemu-system-x86_64: falling back to tcg > = > ==8831==ERROR: AddressSanitizer: stack-buffer-overflow on address > 0x7ffc0dacbe80 at pc 0x55ee01d9e92a bp 0x7ffc0dac7080 sp 0x7ffc0dac6830 > READ of size 1518 at 0x7ffc0dacbe80 thread T0 > #0 0x55ee01d9e929 in __asan_memcpy > (/tmp/qemu-test/build/x86_64-softmmu/qemu-system-x86_64+0x1acb929) > #1 0x55ee04694dad in iov_from_buf_full /tmp/qemu-test/src/util/iov.c:33:13 Hmm this seems to be iov related so could be triggered by the new patches. Could you investigate pls? -- MST
RE: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb invalidation to host
> From: Peter Xu > Sent: Thursday, March 26, 2020 9:23 PM > To: Liu, Yi L > Subject: Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb > invalidation to host > > On Thu, Mar 26, 2020 at 09:02:48AM -0400, Peter Xu wrote: > > [...] > > > > > > > +static inline bool vtd_pasid_cache_valid( > > > > > > + VTDPASIDAddressSpace *vtd_pasid_as) { > > > > > > +return vtd_pasid_as->iommu_state && > > ^ > > > > > > > > > > > > This check can be dropped because always true? > > > > > > > > > > If you agree with both the changes, please add: > > > > > > > > > > Reviewed-by: Peter Xu > > > > > > > > I think the code should ensure all the pasid_as in hash table is > > > > valid. And we can since all the operations are under protection of > > > > iommu_lock. > > > > > > > Peter, > > > > > > I think my reply was wrong. pasid_as in has table may be stale since > > > the per pasid_as cache_gen may be not identical with the cache_gen > > > in iommu_state. e.g. vtd_pasid_cache_reset() only increases the > > > cache_gen in iommu_state. So there will be pasid_as in hash table > > > which has cached pasid entry but its cache_gen is not equal to the > > > one in iommu_state. For such pasid_as, we should treat it as stale. > > > So I guess the vtd_pasid_cache_valid() is still necessary. > > > > I guess you misread my comment. :) > > > > I was saying the "vtd_pasid_as->iommu_state" check is not needed, > > because iommu_state was always set if the address space is created. > > vtd_pasid_cache_valid() is needed. > > > > Also, please double confirm that vtd_pasid_cache_reset() should drop > > all the address spaces (as I think it should), not "only increase the > > cache_gen". IMHO you should only increase the cache_gen in the PSI > > hook (vtd_pasid_cache_psi()) only. > > Sorry, I mean GSI (vtd_pasid_cache_gsi), not PSI. Got it.. Really confused me. :-) Regards, Yi Liu
Re: [PATCH v6 7/7] virtio-net: add migration support for RSS and hash report
Yuri Benditovich wrote: > Save and restore RSS/hash report configuration. > > Signed-off-by: Yuri Benditovich Reviewed-by: Juan Quintela sorry, hadn't seen it. vmstate parts are right.
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, Mar 26, 2020 at 01:50:41PM +0100, Igor Mammedov wrote: > On Thu, 26 Mar 2020 13:29:01 +0100 > Igor Mammedov wrote: > > > On Thu, 26 Mar 2020 11:52:36 + > > Peter Maydell wrote: > > > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > > and then the code that does '1U << slot' is C undefined behaviour > > > because it's an oversized shift. (This is CID 1421896.) > > > > > > Since the pci_write() function in this file can call > > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > > I think we need to handle 'slots == 0' safely. But what should > > > the behaviour be? > > > > 0 is not valid value, we should ignore and return early in this case > > like we do with bsel. I'll post a path shortly. > well, looking more that is only true for main bus, for bridges it can be > slot number can be zero, It can but we don't allow slot zero hotplug with SHPC so it's easier if we don't allow this with ACPI either. > then AML left shifts it and writes into B0EJ > which traps into pci_write(, data) and that is supposed to eject > slot 0 according to guest(AML). > > Michael, > what's your take on it? > > > > > > > > > thanks > > > -- PMM > > > > > > >
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, Mar 26, 2020 at 09:28:27AM -0400, Michael S. Tsirkin wrote: > On Thu, Mar 26, 2020 at 02:23:17PM +0100, Igor Mammedov wrote: > > On Thu, 26 Mar 2020 11:52:36 + > > Peter Maydell wrote: > > > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > > and then the code that does '1U << slot' is C undefined behaviour > > > because it's an oversized shift. (This is CID 1421896.) > > > > > > Since the pci_write() function in this file can call > > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > > I think we need to handle 'slots == 0' safely. But what should > > > the behaviour be? > > > > it also uncovers a bug, where we are not able to eject slot 0 on bridge, > > > And that is by design. A standard PCI SHPC register can support up to 31 > hotpluggable slots. So we chose slot 0 as non hotpluggable. > It's consistent across SHPC, PCI-E, so I made ACPI match. Sorry I was confused. It's a PCI thing, PCI-E does not have slot numbers for downstream ports at all. > You can't hot-add it either. > > > can be reproduced with: > > > > -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global > > PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device > > virtio-net-pci,bus=pci.1,addr=0,id=netdev12 > > > > (monitor) device_del netdev12 > > (monitor) qtree # still shows the device > > > > reason is that acpi_pcihp_eject_slot() > >if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) > > > > so device is not deleted > > We should probably teach QEMU that some slots aren't hotpluggable > even if device in it is hotpluggable in theory. But that is > a separate issue. > > > > thanks > > > -- PMM > > >
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
Am 26.03.2020 um 14:20 hat Eric Blake geschrieben: > > +++ b/block/file-posix.c > > @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { > > .bdrv_reopen_prepare = raw_reopen_prepare, > > .bdrv_reopen_commit = raw_reopen_commit, > > .bdrv_reopen_abort = raw_reopen_abort, > > +.bdrv_co_create_opts = bdrv_co_create_opts_simple, > > +.create_opts = &bdrv_create_opts_simple, > > I'd drop the leading & for consistency with the rest of this struct > initializer. This one isn't a function pointer, so I think the & is necessary. Kevin
Re: [PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create
On 3/26/20 8:22 AM, Maxim Levitsky wrote: On Thu, 2020-03-26 at 08:18 -0500, Eric Blake wrote: On 3/25/20 8:12 PM, Maxim Levitsky wrote: This will allow to reuse a single generic .bdrv_co_create "allow to ${verb}" is not idiomatic, better is "allow ${subject} to ${verb}" or "allow ${verb}ing". In this case, I'd go with: This will allow the reuse of a single... I agree! This commit will probably go as is but next time I'll keep it in mind! Max hasn't sent the pull request yet; there's still time for him to amend his staging queue if he wants. But yeah, it's not a huge deal if the patch goes in without spelling/grammar polish. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, Mar 26, 2020 at 02:23:17PM +0100, Igor Mammedov wrote: > On Thu, 26 Mar 2020 11:52:36 + > Peter Maydell wrote: > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > and then the code that does '1U << slot' is C undefined behaviour > > because it's an oversized shift. (This is CID 1421896.) > > > > Since the pci_write() function in this file can call > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > I think we need to handle 'slots == 0' safely. But what should > > the behaviour be? > > it also uncovers a bug, where we are not able to eject slot 0 on bridge, And that is by design. A standard PCI SHPC register can support up to 31 hotpluggable slots. So we chose slot 0 as non hotpluggable. It's consistent across SHPC, PCI-E, so I made ACPI match. You can't hot-add it either. > can be reproduced with: > > -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global > PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device > virtio-net-pci,bus=pci.1,addr=0,id=netdev12 > > (monitor) device_del netdev12 > (monitor) qtree # still shows the device > > reason is that acpi_pcihp_eject_slot() >if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) > > so device is not deleted We should probably teach QEMU that some slots aren't hotpluggable even if device in it is hotpluggable in theory. But that is a separate issue. > > thanks > > -- PMM > >
Re: [PATCH 2/2] util/bufferiszero: improve avx2 accelerator
On 3/25/20 9:09 PM, Hu, Robert wrote: (Don't know why my Linux-Evolution missed this mail.) -Original Message- Long line; it's nice to wrap commit messages around column 70 or so (because reading 'git log' in an 80-column window adds indentation). [Hu, Robert] I think I set my vim on wrap. This probably escaped by paste. I ran checkpatch.pl on the patches before sending. It escaped check but didn't escaped your eagle eye Thank you. checkpatch doesn't flag commit message long lines. Maybe it could be patched to do so, but it's not at the top of my list to write that patch. I just fix a boudary case on his original patch. boundary [Hu, Robert] Emm... again spell error. Usually I would paste descriptions into some editors with spell check, but forgot this time. Vim doesn't have spell check I think. What editor would you suggest me to integrate with git editing? I'm an emacs user, so I have no suggestions for vim, but I'd be very surprised if there were not some vim expert online that could figure out how to wire in a spell-checker to vim. Google quickly finds: https://www.ostechnix.com/use-spell-check-feature-vim-text-editor/ -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
RE: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb invalidation to host
> From: Peter Xu > Sent: Thursday, March 26, 2020 9:03 PM > To: Liu, Yi L > Subject: Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb > invalidation to host > > On Thu, Mar 26, 2020 at 05:41:39AM +, Liu, Yi L wrote: > > > From: Liu, Yi L > > > Sent: Wednesday, March 25, 2020 9:22 PM > > > To: 'Peter Xu' > > > Subject: RE: [PATCH v1 20/22] intel_iommu: propagate PASID-based > > > iotlb invalidation to host > > > > > > > From: Peter Xu > > > > Sent: Wednesday, March 25, 2020 2:34 AM > > > > To: Liu, Yi L > > > > Subject: Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based > > > > iotlb invalidation to host > > > > > > > > On Sun, Mar 22, 2020 at 05:36:17AM -0700, Liu Yi L wrote: > > > > > This patch propagates PASID-based iotlb invalidation to host. > > > > > > > > > > Intel VT-d 3.0 supports nested translation in PASID granular. > > > > > Guest SVA support could be implemented by configuring nested > > > > > translation on specific PASID. This is also known as dual stage > > > > > DMA translation. > > > > > > > > > > Under such configuration, guest owns the GVA->GPA translation > > > > > which is configured as first level page table in host side for a > > > > > specific pasid, and host owns GPA->HPA translation. As guest > > > > > owns first level translation table, piotlb invalidation should > > > > > be propagated to host since host IOMMU will cache first level > > > > > page table related mappings during DMA address translation. > > > > > > > > > > This patch traps the guest PASID-based iotlb flush and propagate > > > > > it to host. 
> > > > > > > > > > Cc: Kevin Tian > > > > > Cc: Jacob Pan > > > > > Cc: Peter Xu > > > > > Cc: Yi Sun > > > > > Cc: Paolo Bonzini > > > > > Cc: Richard Henderson > > > > > Cc: Eduardo Habkost > > > > > Signed-off-by: Liu Yi L > > > > > --- > > > > > hw/i386/intel_iommu.c | 139 > > > > + > > > > > hw/i386/intel_iommu_internal.h | 7 +++ > > > > > 2 files changed, 146 insertions(+) > > > > > > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index > > > > > b9ac07d..10d314d 100644 > > > > > --- a/hw/i386/intel_iommu.c > > > > > +++ b/hw/i386/intel_iommu.c > > > > > @@ -3134,15 +3134,154 @@ static bool > > > > vtd_process_pasid_desc(IntelIOMMUState *s, > > > > > return (ret == 0) ? true : false; } > > > > > > > > > > +/** > > > > > + * Caller of this function should hold iommu_lock. > > > > > + */ > > > > > +static void vtd_invalidate_piotlb(IntelIOMMUState *s, > > > > > + VTDBus *vtd_bus, > > > > > + int devfn, > > > > > + DualIOMMUStage1Cache > > > > > +*stage1_cache) { > > > > > +VTDHostIOMMUContext *vtd_dev_icx; > > > > > +HostIOMMUContext *host_icx; > > > > > + > > > > > +vtd_dev_icx = vtd_bus->dev_icx[devfn]; > > > > > +if (!vtd_dev_icx) { > > > > > +goto out; > > > > > +} > > > > > +host_icx = vtd_dev_icx->host_icx; > > > > > +if (!host_icx) { > > > > > +goto out; > > > > > +} > > > > > +if (host_iommu_ctx_flush_stage1_cache(host_icx, stage1_cache)) { > > > > > +error_report("Cache flush failed"); > > > > > > > > I think this should not easily be triggered by the guest, but just > > > > in case... Let's use > > > > error_report_once() to be safe. > > > > > > Agreed. > > > > > > > > +} > > > > > +out: > > > > > +return; > > > > > +} > > > > > + > > > > > +static inline bool vtd_pasid_cache_valid( > > > > > + VTDPASIDAddressSpace *vtd_pasid_as) { > > > > > +return vtd_pasid_as->iommu_state && > ^ > > > > > > > > > This check can be dropped because always true? 
> > > > > > > > If you agree with both the changes, please add: > > > > > > > > Reviewed-by: Peter Xu > > > > > > I think the code should ensure all the pasid_as in hash table is > > > valid. And we can since all the operations are under protection of > > > iommu_lock. > > > > > Peter, > > > > I think my reply was wrong. pasid_as in has table may be stale since > > the per pasid_as cache_gen may be not identical with the cache_gen in > > iommu_state. e.g. vtd_pasid_cache_reset() only increases the cache_gen > > in iommu_state. So there will be pasid_as in hash table which has > > cached pasid entry but its cache_gen is not equal to the one in > > iommu_state. For such pasid_as, we should treat it as stale. > > So I guess the vtd_pasid_cache_valid() is still necessary. > > I guess you misread my comment. :) > > I was saying the "vtd_pasid_as->iommu_state" check is not needed, because > iommu_state was always set if the address space is created. > vtd_pasid_cache_valid() is needed. ok, I see. > Also, please double confirm that vtd_pasid_cache_reset() should drop all the > address spaces (as I think it should), not "only increase the cache_gen". yes, I'm just
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, 26 Mar 2020 11:52:36 + Peter Maydell wrote: > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > is passed a zero 'slots' argument then ctz32(slots) will return 32, > and then the code that does '1U << slot' is C undefined behaviour > because it's an oversized shift. (This is CID 1421896.) > > Since the pci_write() function in this file can call > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > I think we need to handle 'slots == 0' safely. But what should > the behaviour be? it also uncovers a bug, where we are not able to eject slot 0 on bridge, can be reproduced with: -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device virtio-net-pci,bus=pci.1,addr=0,id=netdev12 (monitor) device_del netdev12 (monitor) qtree # still shows the device reason is that acpi_pcihp_eject_slot() if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) so device is not deleted > thanks > -- PMM >
Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb invalidation to host
On Thu, Mar 26, 2020 at 09:02:48AM -0400, Peter Xu wrote: [...] > > > > > +static inline bool vtd_pasid_cache_valid( > > > > > + VTDPASIDAddressSpace *vtd_pasid_as) { > > > > > +return vtd_pasid_as->iommu_state && > ^ > > > > > > > > > This check can be dropped because always true? > > > > > > > > If you agree with both the changes, please add: > > > > > > > > Reviewed-by: Peter Xu > > > > > > I think the code should ensure all the pasid_as in hash table is valid. > > > And we can > > > since all the operations are under protection of iommu_lock. > > > > > Peter, > > > > I think my reply was wrong. pasid_as in hash table may be stale since > > the per pasid_as cache_gen may be not identical with the cache_gen > > in iommu_state. e.g. vtd_pasid_cache_reset() only increases the > > cache_gen in iommu_state. So there will be pasid_as in hash table > > which has cached pasid entry but its cache_gen is not equal to the > > one in iommu_state. For such pasid_as, we should treat it as stale. > > So I guess the vtd_pasid_cache_valid() is still necessary. > > I guess you misread my comment. :) > > I was saying the "vtd_pasid_as->iommu_state" check is not needed, > because iommu_state was always set if the address space is created. > vtd_pasid_cache_valid() is needed. > > Also, please double confirm that vtd_pasid_cache_reset() should drop > all the address spaces (as I think it should), not "only increase the > cache_gen". IMHO you should only increase the cache_gen in the PSI > hook (vtd_pasid_cache_psi()) only. Sorry, I mean GSI (vtd_pasid_cache_gsi), not PSI. -- Peter Xu
Re: [PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create
On Thu, 2020-03-26 at 08:18 -0500, Eric Blake wrote: > On 3/25/20 8:12 PM, Maxim Levitsky wrote: > > This will allow to reuse a single generic .bdrv_co_create > > "allow to ${verb}" is not idiomatic, better is "allow ${subject} to > ${verb}" or "allow ${verb}ing". In this case, I'd go with: > > This will allow the reuse of a single... I agree! This commit will probably go as is but next time I'll keep it in mind! Best regards, Maxim Levitsky > > > implementation for several drivers. > > No functional changes. > > > > Signed-off-by: Maxim Levitsky > > ---
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On 3/25/20 8:12 PM, Maxim Levitsky wrote: Instead of checking the .bdrv_co_create_opts to see if we need the failback, fallback just implement the .bdrv_co_create_opts in the drivers that need it. This way we don't break various places that need to know if the underlying protocol/format really supports image creation, and this way we still allow some drivers to not support image creation. Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 Note that technically this driver reverts the image creation failback fallback for the vxhs driver since I don't have a means to test it, and IMHO it is better to leave it not supported as it was prior to generic image creation patches. Also drop iscsi_create_opts which was left accidently accidentally Signed-off-by: Maxim Levitsky --- +++ b/block/file-posix.c @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, +.bdrv_co_create_opts = bdrv_co_create_opts_simple, +.create_opts = _create_opts_simple, I'd drop the leading & for consistency with the rest of this struct initializer. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create
On 3/25/20 8:12 PM, Maxim Levitsky wrote: This will allow to reuse a single generic .bdrv_co_create "allow to ${verb}" is not idiomatic, better is "allow ${subject} to ${verb}" or "allow ${verb}ing". In this case, I'd go with: This will allow the reuse of a single... implementation for several drivers. No functional changes. Signed-off-by: Maxim Levitsky --- -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
RE: [RFC v6 05/24] memory: Introduce IOMMU Memory Region inject_faults API
Hi Eric, I'm also considering how to inject iommu fault to vIOMMU. As our previous discussion (long time ago), MemoryRegion way doesn't work well for VTd case. So I'd like see your opinion on the proposal below: I've a patch to make vIOMMUs register PCIIOMMUOps to PCI layer. Current usage is to get address space and set/unset HostIOMMUContext (added by me). I think it may be also nice to add the fault injection callback in the PCIIOMMUOps. Thoughts? https://patchwork.ozlabs.org/patch/1259645/ Regards, Yi Liu > From: Eric Auger > Sent: Saturday, March 21, 2020 12:58 AM > To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-devel@nongnu.org; > Subject: [RFC v6 05/24] memory: Introduce IOMMU Memory Region inject_faults > API > > This new API allows to inject @count iommu_faults into > the IOMMU memory region. > > Signed-off-by: Eric Auger > --- > include/exec/memory.h | 25 + > memory.c | 10 ++ > 2 files changed, 35 insertions(+) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index f2c773163f..141a5dc197 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -57,6 +57,8 @@ struct MemoryRegionMmio { > CPUWriteMemoryFunc *write[3]; > }; > > +struct iommu_fault; > + > typedef struct IOMMUTLBEntry IOMMUTLBEntry; > > /* See address_space_translate: bit 0 is read, bit 1 is write. 
*/ > @@ -357,6 +359,19 @@ typedef struct IOMMUMemoryRegionClass { > * @iommu: the IOMMUMemoryRegion > */ > int (*num_indexes)(IOMMUMemoryRegion *iommu); > + > +/* > + * Inject @count faults into the IOMMU memory region > + * > + * Optional method: if this method is not provided, then > + * memory_region_injection_faults() will return -ENOENT > + * > + * @iommu: the IOMMU memory region to inject the faults in > + * @count: number of faults to inject > + * @buf: fault buffer > + */ > +int (*inject_faults)(IOMMUMemoryRegion *iommu, int count, > + struct iommu_fault *buf); > } IOMMUMemoryRegionClass; > > typedef struct CoalescedMemoryRange CoalescedMemoryRange; > @@ -1365,6 +1380,16 @@ int > memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, > */ > int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); > > +/** > + * memory_region_inject_faults : inject @count faults stored in @buf > + * > + * @iommu_mr: the IOMMU memory region > + * @count: number of faults to be injected > + * @buf: buffer containing the faults > + */ > +int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, > +struct iommu_fault *buf); > + > /** > * memory_region_name: get a memory region's name > * > diff --git a/memory.c b/memory.c > index 09be40edd2..9cdd77e0de 100644 > --- a/memory.c > +++ b/memory.c > @@ -2001,6 +2001,16 @@ int > memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) > return imrc->num_indexes(iommu_mr); > } > > +int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, > +struct iommu_fault *buf) > +{ > +IOMMUMemoryRegionClass *imrc = > IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > +if (!imrc->inject_faults) { > +return -ENOENT; > +} > +return imrc->inject_faults(iommu_mr, count, buf); > +} > + > void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) > { > uint8_t mask = 1 << client; > -- > 2.20.1
[Bug 1867519] Re: qemu 4.2 segfaults on VF detach
** Merge proposal linked: https://code.launchpad.net/~paelzer/ubuntu/+source/qemu/+git/qemu/+merge/381033 -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1867519 Title: qemu 4.2 segfaults on VF detach Status in QEMU: Fix Committed Status in qemu package in Ubuntu: Fix Released Bug description: After updating Ubuntu 20.04 to the Beta version, we get the following error and the virtual machines stucks when detaching PCI devices using virsh command: Error: error: Failed to detach device from /tmp/vf_interface_attached.xml error: internal error: End of file from qemu monitor steps to reproduce: 1. create a VM over Ubuntu 20.04 (5.4.0-14-generic) 2. attach PCI device to this VM (Mellanox VF for example) 3. try to detaching the PCI device using virsh command: a. create a pci interface xml file: b. #virsh detach-device - Ubuntu release: Description:Ubuntu Focal Fossa (development branch) Release:20.04 - Package ver: libvirt0: Installed: 6.0.0-0ubuntu3 Candidate: 6.0.0-0ubuntu5 Version table: 6.0.0-0ubuntu5 500 500 http://il.archive.ubuntu.com/ubuntu focal/main amd64 Packages *** 6.0.0-0ubuntu3 100 100 /var/lib/dpkg/status - What you expected to happen: PCI device detached without any errors. - What happened instead: getting the errors above and he VM stuck additional info: after downgrading the libvirt0 package and all the dependent packages to 5.4 the previous, version, seems that the issue disappeared To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1867519/+subscriptions
Re: [PATCH v2] block: make BlockConf.*_size properties 32-bit
On 3/26/20 1:52 AM, Roman Kagan wrote: Devices (virtio-blk, scsi, etc.) and the block layer are happy to use 32-bit for logical_block_size, physical_block_size, and min_io_size. However, the properties in BlockConf are defined as uint16_t limiting the values to 32768. This appears unnecessary tight, and we've seen bigger block sizes handy at times. Make them 32 bit instead and lift the limitation up to 2 MiB which appears to be good enough for everybody. and matches the current qemu limit for qcow2 cluster sizes As the values can now be fairly big and awkward to type, make the property setter accept common size suffixes (k, m). Signed-off-by: Roman Kagan --- v1 -> v2: - cap the property at 2 MiB [Eric] - accept size suffixes +++ b/hw/core/qdev-properties.c @@ -14,6 +14,7 @@ #include "qapi/visitor.h" #include "chardev/char.h" #include "qemu/uuid.h" +#include "qemu/units.h" void qdev_prop_set_after_realize(DeviceState *dev, const char *name, Error **errp) @@ -729,30 +730,39 @@ const PropertyInfo qdev_prop_pci_devfn = { /* --- blocksize --- */ +/* lower limit is sector size */ +#define MIN_BLOCK_SIZE 512 +#define MIN_BLOCK_SIZE_STR "512 B" +/* upper limit is arbitrary, 2 MiB looks sufficient */ +#define MAX_BLOCK_SIZE (2 * MiB) +#define MAX_BLOCK_SIZE_STR "2 MiB" For this comment, I might add that it matches our limit for qcow2 cluster size. Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org