[PATCH v8 06/74] cpu: introduce process_queued_cpu_work_locked
From: "Emilio G. Cota" This completes the conversion to cpu_mutex_lock/unlock in the file. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus-common.c | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index bf5794cc17..f4a0f9ba3b 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -333,20 +333,19 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, queue_work_on_cpu(cpu, wi); } -void process_queued_cpu_work(CPUState *cpu) +/* Called with the CPU's lock held */ +static void process_queued_cpu_work_locked(CPUState *cpu) { struct qemu_work_item *wi; bool has_bql = qemu_mutex_iothread_locked(); -qemu_mutex_lock(>lock); if (QSIMPLEQ_EMPTY(>work_list)) { -qemu_mutex_unlock(>lock); return; } while (!QSIMPLEQ_EMPTY(>work_list)) { wi = QSIMPLEQ_FIRST(>work_list); QSIMPLEQ_REMOVE_HEAD(>work_list, node); -qemu_mutex_unlock(>lock); +cpu_mutex_unlock(cpu); if (wi->exclusive) { /* Running work items outside the BQL avoids the following deadlock: * 1) start_exclusive() is called with the BQL taken while another @@ -372,13 +371,19 @@ void process_queued_cpu_work(CPUState *cpu) qemu_mutex_unlock_iothread(); } } -qemu_mutex_lock(>lock); +cpu_mutex_lock(cpu); if (wi->free) { g_free(wi); } else { atomic_mb_set(>done, true); } } -qemu_mutex_unlock(>lock); qemu_cond_broadcast(>cond); } + +void process_queued_cpu_work(CPUState *cpu) +{ +cpu_mutex_lock(cpu); +process_queued_cpu_work_locked(cpu); +cpu_mutex_unlock(cpu); +} -- 2.17.1
[PATCH v8 04/74] cpu: make qemu_work_cond per-cpu
From: "Emilio G. Cota" This eliminates the need to use the BQL to queue CPU work. While at it, give the per-cpu field a generic name ("cond") since it will soon be used for more than just queueing CPU work. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus-common.c | 72 ++- cpus.c| 2 +- hw/core/cpu.c | 1 + include/hw/core/cpu.h | 6 ++-- 4 files changed, 63 insertions(+), 18 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index f75cae23d9..9031c31005 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -26,7 +26,6 @@ static QemuMutex qemu_cpu_list_lock; static QemuCond exclusive_cond; static QemuCond exclusive_resume; -static QemuCond qemu_work_cond; /* >= 1 if a thread is inside start_exclusive/end_exclusive. Written * under qemu_cpu_list_lock, read with atomic operations. @@ -42,7 +41,6 @@ void qemu_init_cpu_list(void) qemu_mutex_init(_cpu_list_lock); qemu_cond_init(_cond); qemu_cond_init(_resume); -qemu_cond_init(_work_cond); } void cpu_list_lock(void) @@ -105,23 +103,37 @@ struct qemu_work_item { bool free, exclusive, done; }; -static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +/* Called with the CPU's lock held */ +static void queue_work_on_cpu_locked(CPUState *cpu, struct qemu_work_item *wi) { -qemu_mutex_lock(>lock); QSIMPLEQ_INSERT_TAIL(>work_list, wi, node); wi->done = false; -qemu_mutex_unlock(>lock); qemu_cpu_kick(cpu); } -void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, - QemuMutex *mutex) +static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) +{ +cpu_mutex_lock(cpu); +queue_work_on_cpu_locked(cpu, wi); +cpu_mutex_unlock(cpu); +} + +void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) { struct qemu_work_item wi; +bool has_bql = qemu_mutex_iothread_locked(); + +g_assert(no_cpu_mutex_locked()); if (qemu_cpu_is_self(cpu)) { -func(cpu, data); +if (has_bql) { +func(cpu, data); +} else { 
+qemu_mutex_lock_iothread(); +func(cpu, data); +qemu_mutex_unlock_iothread(); +} return; } @@ -131,13 +143,34 @@ void do_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data, wi.free = false; wi.exclusive = false; -queue_work_on_cpu(cpu, ); +cpu_mutex_lock(cpu); +queue_work_on_cpu_locked(cpu, ); + +/* + * We are going to sleep on the CPU lock, so release the BQL. + * + * During the transition to per-CPU locks, we release the BQL _after_ + * having kicked the destination CPU (from queue_work_on_cpu_locked above). + * This makes sure that the enqueued work will be seen by the CPU + * after being woken up from the kick, since the CPU sleeps on the BQL. + * Once we complete the transition to per-CPU locks, we will release + * the BQL earlier in this function. + */ +if (has_bql) { +qemu_mutex_unlock_iothread(); +} + while (!atomic_mb_read()) { CPUState *self_cpu = current_cpu; -qemu_cond_wait(_work_cond, mutex); +qemu_cond_wait(>cond, >lock); current_cpu = self_cpu; } +cpu_mutex_unlock(cpu); + +if (has_bql) { +qemu_mutex_lock_iothread(); +} } void async_run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data) @@ -303,6 +336,7 @@ void async_safe_run_on_cpu(CPUState *cpu, run_on_cpu_func func, void process_queued_cpu_work(CPUState *cpu) { struct qemu_work_item *wi; +bool has_bql = qemu_mutex_iothread_locked(); qemu_mutex_lock(>lock); if (QSIMPLEQ_EMPTY(>work_list)) { @@ -320,13 +354,23 @@ void process_queued_cpu_work(CPUState *cpu) * BQL, so it goes to sleep; start_exclusive() is sleeping too, so * neither CPU can proceed. 
*/ -qemu_mutex_unlock_iothread(); +if (has_bql) { +qemu_mutex_unlock_iothread(); +} start_exclusive(); wi->func(cpu, wi->data); end_exclusive(); -qemu_mutex_lock_iothread(); +if (has_bql) { +qemu_mutex_lock_iothread(); +} } else { -wi->func(cpu, wi->data); +if (has_bql) { +wi->func(cpu, wi->data); +} else { +qemu_mutex_lock_iothread(); +wi->func(cpu, wi->data); +qemu_mutex_unlock_iothread(); +} } qemu_mutex_lock(>lock); if (wi->free) { @@ -336,5 +380,5 @@ void process_queued_cpu_work(CPUState *cpu) } } qemu_mutex_unlock(>lock); -qemu_cond_broadcast(_work_cond); +
[PATCH v8 01/74] cpu: convert queued work to a QSIMPLEQ
From: "Emilio G. Cota" Instead of open-coding it. While at it, make sure that all accesses to the list are performed while holding the list's lock. Reviewed-by: Richard Henderson Reviewed-by: Alex Bennée Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus-common.c | 25 - cpus.c| 14 -- hw/core/cpu.c | 1 + include/hw/core/cpu.h | 6 +++--- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/cpus-common.c b/cpus-common.c index eaf590cb38..3d659df464 100644 --- a/cpus-common.c +++ b/cpus-common.c @@ -99,7 +99,7 @@ void cpu_list_remove(CPUState *cpu) } struct qemu_work_item { -struct qemu_work_item *next; +QSIMPLEQ_ENTRY(qemu_work_item) node; run_on_cpu_func func; run_on_cpu_data data; bool free, exclusive, done; @@ -108,13 +108,7 @@ struct qemu_work_item { static void queue_work_on_cpu(CPUState *cpu, struct qemu_work_item *wi) { qemu_mutex_lock(>work_mutex); -if (cpu->queued_work_first == NULL) { -cpu->queued_work_first = wi; -} else { -cpu->queued_work_last->next = wi; -} -cpu->queued_work_last = wi; -wi->next = NULL; +QSIMPLEQ_INSERT_TAIL(>work_list, wi, node); wi->done = false; qemu_mutex_unlock(>work_mutex); @@ -310,17 +304,14 @@ void process_queued_cpu_work(CPUState *cpu) { struct qemu_work_item *wi; -if (cpu->queued_work_first == NULL) { +qemu_mutex_lock(>work_mutex); +if (QSIMPLEQ_EMPTY(>work_list)) { +qemu_mutex_unlock(>work_mutex); return; } - -qemu_mutex_lock(>work_mutex); -while (cpu->queued_work_first != NULL) { -wi = cpu->queued_work_first; -cpu->queued_work_first = wi->next; -if (!cpu->queued_work_first) { -cpu->queued_work_last = NULL; -} +while (!QSIMPLEQ_EMPTY(>work_list)) { +wi = QSIMPLEQ_FIRST(>work_list); +QSIMPLEQ_REMOVE_HEAD(>work_list, node); qemu_mutex_unlock(>work_mutex); if (wi->exclusive) { /* Running work items outside the BQL avoids the following deadlock: diff --git a/cpus.c b/cpus.c index ef441bdf62..151abbc04c 100644 --- a/cpus.c +++ b/cpus.c @@ -96,9 +96,19 @@ bool cpu_is_stopped(CPUState *cpu) return 
cpu->stopped || !runstate_is_running(); } +static inline bool cpu_work_list_empty(CPUState *cpu) +{ +bool ret; + +qemu_mutex_lock(>work_mutex); +ret = QSIMPLEQ_EMPTY(>work_list); +qemu_mutex_unlock(>work_mutex); +return ret; +} + static bool cpu_thread_is_idle(CPUState *cpu) { -if (cpu->stop || cpu->queued_work_first) { +if (cpu->stop || !cpu_work_list_empty(cpu)) { return false; } if (cpu_is_stopped(cpu)) { @@ -1490,7 +1500,7 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg) cpu = first_cpu; } -while (cpu && !cpu->queued_work_first && !cpu->exit_request) { +while (cpu && cpu_work_list_empty(cpu) && !cpu->exit_request) { atomic_mb_set(_current_rr_cpu, cpu); current_cpu = cpu; diff --git a/hw/core/cpu.c b/hw/core/cpu.c index 786a1bec8a..2fc6aa2159 100644 --- a/hw/core/cpu.c +++ b/hw/core/cpu.c @@ -368,6 +368,7 @@ static void cpu_common_initfn(Object *obj) cpu->nr_threads = 1; qemu_mutex_init(>work_mutex); +QSIMPLEQ_INIT(>work_list); QTAILQ_INIT(>breakpoints); QTAILQ_INIT(>watchpoints); diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 5bf94d28cf..398b65159e 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -331,8 +331,8 @@ struct qemu_work_item; * @opaque: User data. * @mem_io_pc: Host Program Counter at which the memory was accessed. * @kvm_fd: vCPU file descriptor for KVM. - * @work_mutex: Lock to prevent multiple access to queued_work_*. - * @queued_work_first: First asynchronous work pending. + * @work_mutex: Lock to prevent multiple access to @work_list. + * @work_list: List of pending asynchronous work. * @trace_dstate_delayed: Delayed changes to trace_dstate (includes all changes *to @trace_dstate). * @trace_dstate: Dynamic tracing state of events for this vCPU (bitmask). @@ -376,7 +376,7 @@ struct CPUState { sigjmp_buf jmp_env; QemuMutex work_mutex; -struct qemu_work_item *queued_work_first, *queued_work_last; +QSIMPLEQ_HEAD(, qemu_work_item) work_list; CPUAddressSpace *cpu_ases; int num_ases; -- 2.17.1
[PATCH v8 03/74] cpu: introduce cpu_mutex_lock/unlock
From: "Emilio G. Cota" The few direct users of >lock will be converted soon. The per-thread bitmap introduced here might seem unnecessary, since a bool could just do. However, once we complete the conversion to per-vCPU locks, we will need to cover the use case where all vCPUs are locked by the same thread, which explains why the bitmap is introduced here. Reviewed-by: Richard Henderson Signed-off-by: Emilio G. Cota Signed-off-by: Robert Foley --- cpus.c| 48 +-- include/hw/core/cpu.h | 33 + stubs/Makefile.objs | 1 + stubs/cpu-lock.c | 28 + 4 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 stubs/cpu-lock.c diff --git a/cpus.c b/cpus.c index 71bd2f5d55..633734fb5c 100644 --- a/cpus.c +++ b/cpus.c @@ -91,6 +91,47 @@ static unsigned int throttle_percentage; #define CPU_THROTTLE_PCT_MAX 99 #define CPU_THROTTLE_TIMESLICE_NS 1000 +/* XXX: is this really the max number of CPUs? */ +#define CPU_LOCK_BITMAP_SIZE 2048 + +/* + * Note: we index the bitmap with cpu->cpu_index + 1 so that the logic + * also works during early CPU initialization, when cpu->cpu_index is set to + * UNASSIGNED_CPU_INDEX == -1. 
+ */ +static __thread DECLARE_BITMAP(cpu_lock_bitmap, CPU_LOCK_BITMAP_SIZE); + +bool no_cpu_mutex_locked(void) +{ +return bitmap_empty(cpu_lock_bitmap, CPU_LOCK_BITMAP_SIZE); +} + +void cpu_mutex_lock_impl(CPUState *cpu, const char *file, int line) +{ +/* coverity gets confused by the indirect function call */ +#ifdef __COVERITY__ +qemu_mutex_lock_impl(>lock, file, line); +#else +QemuMutexLockFunc f = atomic_read(_mutex_lock_func); + +g_assert(!cpu_mutex_locked(cpu)); +set_bit(cpu->cpu_index + 1, cpu_lock_bitmap); +f(>lock, file, line); +#endif +} + +void cpu_mutex_unlock_impl(CPUState *cpu, const char *file, int line) +{ +g_assert(cpu_mutex_locked(cpu)); +qemu_mutex_unlock_impl(>lock, file, line); +clear_bit(cpu->cpu_index + 1, cpu_lock_bitmap); +} + +bool cpu_mutex_locked(const CPUState *cpu) +{ +return test_bit(cpu->cpu_index + 1, cpu_lock_bitmap); +} + bool cpu_is_stopped(CPUState *cpu) { return cpu->stopped || !runstate_is_running(); @@ -100,9 +141,9 @@ static inline bool cpu_work_list_empty(CPUState *cpu) { bool ret; -qemu_mutex_lock(>lock); +cpu_mutex_lock(cpu); ret = QSIMPLEQ_EMPTY(>work_list); -qemu_mutex_unlock(>lock); +cpu_mutex_unlock(cpu); return ret; } @@ -1837,6 +1878,9 @@ void qemu_mutex_lock_iothread_impl(const char *file, int line) { QemuMutexLockFunc bql_lock = atomic_read(_bql_mutex_lock_func); +/* enforce locking order */ +g_assert(no_cpu_mutex_locked()); + g_assert(!qemu_mutex_iothread_locked()); bql_lock(_global_mutex, file, line); iothread_locked = true; diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h index 0b75fdb093..4521bba7a5 100644 --- a/include/hw/core/cpu.h +++ b/include/hw/core/cpu.h @@ -457,6 +457,39 @@ extern CPUTailQ cpus; extern __thread CPUState *current_cpu; +/** + * cpu_mutex_lock - lock a CPU's mutex + * @cpu: the CPU whose mutex is to be locked + * + * To avoid deadlock, a CPU's mutex must be acquired after the BQL. 
+ */ +#define cpu_mutex_lock(cpu) \ +cpu_mutex_lock_impl(cpu, __FILE__, __LINE__) +void cpu_mutex_lock_impl(CPUState *cpu, const char *file, int line); + +/** + * cpu_mutex_unlock - unlock a CPU's mutex + * @cpu: the CPU whose mutex is to be unlocked + */ +#define cpu_mutex_unlock(cpu) \ +cpu_mutex_unlock_impl(cpu, __FILE__, __LINE__) +void cpu_mutex_unlock_impl(CPUState *cpu, const char *file, int line); + +/** + * cpu_mutex_locked - check whether a CPU's mutex is locked + * @cpu: the CPU of interest + * + * Returns true if the calling thread is currently holding the CPU's mutex. + */ +bool cpu_mutex_locked(const CPUState *cpu); + +/** + * no_cpu_mutex_locked - check whether any CPU mutex is held + * + * Returns true if the calling thread is not holding any CPU mutex. + */ +bool no_cpu_mutex_locked(void); + static inline void cpu_tb_jmp_cache_clear(CPUState *cpu) { unsigned int i; diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs index 45be5dc0ed..d2dd6c94cc 100644 --- a/stubs/Makefile.objs +++ b/stubs/Makefile.objs @@ -5,6 +5,7 @@ stub-obj-y += blockdev-close-all-bdrv-states.o stub-obj-y += clock-warp.o stub-obj-y += cpu-get-clock.o stub-obj-y += cpu-get-icount.o +stub-obj-y += cpu-lock.o stub-obj-y += dump.o stub-obj-y += error-printf.o stub-obj-y += fdset.o diff --git a/stubs/cpu-lock.c b/stubs/cpu-lock.c new file mode 100644 index 00..ca2ea8a9c2 --- /dev/null +++ b/stubs/cpu-lock.c @@ -0,0 +1,28 @@ +#include "qemu/osdep.h" +#include "hw/core/cpu.h" + +void cpu_mutex_lock_impl(CPUState *cpu, const char *file, int line) +{ +/* coverity gets confused by the indirect function call */ +#ifdef
[PATCH v8 00/74] per-CPU locks
V7: https://lists.gnu.org/archive/html/qemu-devel/2019-03/msg00786.html This is a continuation of the series created by Emilio Cota. We are picking up this patch set with the goal to apply any fixes or updates needed to get this accepted. Quoting an earlier patch in the series: "For context, the goal of this series is to substitute the BQL for the per-CPU locks in many places, notably the execution loop in cpus.c. This leads to better scalability for MTTCG, since CPUs don't have to acquire a contended global lock (the BQL) every time they stop executing code. See the last commit for some performance numbers." Listed below are the changes for this version of the patch, aside from the merge related changes. Changes for V8: - Fixed issue where in rr mode we could destroy the BQL twice. Added new function cpu_mutex_destroy(). - Removed g_assert(qemu_mutex_iothread_locked()) from qemu_tcg_rr_all_cpu_threads_idle(). There is an existing case where we call qemu_tcg_rr_all_cpu_threads_idle() without the BQL held, so we cannot assert on the lock here. - Found/fixed bug that had been hit in testing previously during the last consideration of this patch. We reproduced the issue hit in the qtest: bios-tables-test. The issue was introduced by dropping the BQL, and found us (very rarely) missing the condition variable wakeup in qemu_tcg_rr_cpu_thread_fn(). - ppc: convert to cpu_halted - Converted new code for cpu_halted and cpu_halted_set. - hw/semihosting: convert to cpu_halted_set - Added this patch as this code was new and needed converting. - ppc/translate_init.inc.c - Translated some new code here to use cpu_has_work_with_iothread_lock. - ppc/sapr_hcall.c Translated new code to cpu_halted - i386/hax-all.c - converted new code to cpu_interrupt_request and cpu_halted - mips/kvm.c - converted new code to cpu_halted - Some changes were related to files that moved, cpu.c and cpu.h moved to hw/core/, and some changes needed to be put there manually during the merge. Emilio G. 
Cota (69): cpu: convert queued work to a QSIMPLEQ cpu: rename cpu->work_mutex to cpu->lock cpu: introduce cpu_mutex_lock/unlock cpu: make qemu_work_cond per-cpu cpu: move run_on_cpu to cpus-common cpu: introduce process_queued_cpu_work_locked cpu: make per-CPU locks an alias of the BQL in TCG rr mode tcg-runtime: define helper_cpu_halted_set ppc: convert to helper_cpu_halted_set cris: convert to helper_cpu_halted_set hppa: convert to helper_cpu_halted_set m68k: convert to helper_cpu_halted_set alpha: convert to helper_cpu_halted_set microblaze: convert to helper_cpu_halted_set cpu: define cpu_halted helpers tcg-runtime: convert to cpu_halted_set arm: convert to cpu_halted ppc: convert to cpu_halted sh4: convert to cpu_halted i386: convert to cpu_halted lm32: convert to cpu_halted m68k: convert to cpu_halted mips: convert to cpu_halted riscv: convert to cpu_halted s390x: convert to cpu_halted sparc: convert to cpu_halted xtensa: convert to cpu_halted gdbstub: convert to cpu_halted openrisc: convert to cpu_halted cpu-exec: convert to cpu_halted cpu: convert to cpu_halted cpu: define cpu_interrupt_request helpers exec: use cpu_reset_interrupt arm: convert to cpu_interrupt_request i386: convert to cpu_interrupt_request i386/kvm: convert to cpu_interrupt_request i386/hax-all: convert to cpu_interrupt_request i386/whpx-all: convert to cpu_interrupt_request i386/hvf: convert to cpu_request_interrupt ppc: convert to cpu_interrupt_request sh4: convert to cpu_interrupt_request cris: convert to cpu_interrupt_request hppa: convert to cpu_interrupt_request lm32: convert to cpu_interrupt_request m68k: convert to cpu_interrupt_request mips: convert to cpu_interrupt_request nios: convert to cpu_interrupt_request s390x: convert to cpu_interrupt_request alpha: convert to cpu_interrupt_request moxie: convert to cpu_interrupt_request sparc: convert to cpu_interrupt_request openrisc: convert to cpu_interrupt_request unicore32: convert to cpu_interrupt_request microblaze: convert to 
cpu_interrupt_request accel/tcg: convert to cpu_interrupt_request cpu: convert to interrupt_request cpu: call .cpu_has_work with the CPU lock held cpu: introduce cpu_has_work_with_iothread_lock ppc: convert to cpu_has_work_with_iothread_lock mips: convert to cpu_has_work_with_iothread_lock s390x: convert to cpu_has_work_with_iothread_lock riscv: convert to cpu_has_work_with_iothread_lock sparc: convert to cpu_has_work_with_iothread_lock xtensa: convert to cpu_has_work_with_iothread_lock cpu: rename all_cpu_threads_idle to qemu_tcg_rr_all_cpu_threads_idle cpu: protect CPU state with cpu->lock instead of the BQL cpus-common: release BQL earlier in run_on_cpu cpu: add async_run_on_cpu_no_bql cputlb: queue async flush jobs without the BQL Paolo Bonzini (4): ppc: use cpu_reset_interrupt i386: use cpu_reset_interrupt
Re: [PATCH v16 QEMU 13/16] vfio: Add function to start and stop dirty pages tracking
On Wed, 25 Mar 2020 02:39:11 +0530 Kirti Wankhede wrote: > Call VFIO_IOMMU_DIRTY_PAGES ioctl to start and stop dirty pages tracking > for VFIO devices. > > Signed-off-by: Kirti Wankhede > --- > hw/vfio/migration.c | 36 > 1 file changed, 36 insertions(+) > > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > index ab295d25620e..1827b7cfb316 100644 > --- a/hw/vfio/migration.c > +++ b/hw/vfio/migration.c > @@ -9,6 +9,7 @@ > > #include "qemu/osdep.h" > #include "qemu/main-loop.h" > +#include > #include > > #include "sysemu/runstate.h" > @@ -296,6 +297,32 @@ static int vfio_load_device_config_state(QEMUFile *f, > void *opaque) > return qemu_file_get_error(f); > } > > +static int vfio_start_dirty_page_tracking(VFIODevice *vbasedev, bool start) > +{ > +int ret; > +VFIOContainer *container = vbasedev->group->container; > +struct vfio_iommu_type1_dirty_bitmap dirty = { > +.argsz = sizeof(dirty), > +}; > + > +if (start) { > +if (vbasedev->device_state & VFIO_DEVICE_STATE_SAVING) { > +dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_START; > +} else { > +return 0; > +} > +} else { > +dirty.flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP; > +} Dirty logging and device saving are logically separate, why do we link them here? Why do we return success when we want to start logging if we haven't started logging? > + > +ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty); > +if (ret) { > +error_report("Failed to set dirty tracking flag 0x%x errno: %d", > + dirty.flags, errno); > +} > +return ret; > +} > + > /* -- */ > > static int vfio_save_setup(QEMUFile *f, void *opaque) > @@ -330,6 +357,11 @@ static int vfio_save_setup(QEMUFile *f, void *opaque) > */ > qemu_put_be64(f, migration->region.size); > > +ret = vfio_start_dirty_page_tracking(vbasedev, true); > +if (ret) { > +return ret; > +} > + Haven't we corrupted the migration stream by exiting here? Maybe this implies the entire migration fails, therefore we don't need to add the end marker? 
Thanks, Alex > qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE); > > ret = qemu_file_get_error(f); > @@ -346,6 +378,8 @@ static void vfio_save_cleanup(void *opaque) > VFIODevice *vbasedev = opaque; > VFIOMigration *migration = vbasedev->migration; > > +vfio_start_dirty_page_tracking(vbasedev, false); > + > if (migration->region.mmaps) { > vfio_region_unmap(&migration->region); > } > @@ -669,6 +703,8 @@ static void vfio_migration_state_notifier(Notifier > *notifier, void *data) > if (ret) { > error_report("%s: Failed to set state RUNNING", vbasedev->name); > } > + > +vfio_start_dirty_page_tracking(vbasedev, false); > } > } >
[Bug 1693050] Re: xhci HCIVERSION register read emulation incorrectly handled
Odd, this should be fixed by commit 6ee021d41 and 36960b4d66..98f52cdbb5c. -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1693050 Title: xhci HCIVERSION register read emulation incorrectly handled Status in QEMU: New Bug description: We had an illumos user trying to run illumos in QEMU 2.9.0 with the qemu-xhci device enabled. Note, that while this was discovered against QEMU 2.9.0, from my current read of the HEAD, it is still present. The illumos bug at https://www.illumos.org/issues/8173 has additional information on how the user invoked qemu. While investigating the problem we found that the illumos driver was reading a version of 0x when reading the HCIVERSION register from qemu. In the illumos driver we're performing a 16-bit read of the version register at offset 0x2. From looking around at other OSes, while Linux performs a 4 byte read at offset 0x0 and masks out the version, others that care about the version are doing a two byte read, though not all actually act on the version and some just discard the information. The user who hit this was able to enable tracing (note the tracing file is attached to the illumos bug linked previously) and we hit the unimplemented register read with offset 0x2 at http://git.qemu.org/?p=qemu.git;a=blob;f=hw/usb/hcd- xhci.c;hb=HEAD#l2960. The xhci register specifies today that its allowed for users to do 1-4 byte reads; however, that it implements only four byte reads in its implementation (http://git.qemu.org/?p=qemu.git;a=blob;f=hw/usb/hcd- xhci.c;hb=HEAD#l). Hence why when we read the HCIVERSION register at offset 0x2, it isn't handled in xhci_cap_read() which then returns zeros. From digging into this, I think that we're coming into memory_region_dispatch_read() and then memory_region_dispatch_read1(). 
However, I don't see anything in either the general memory region logic or in the xhci_cap_read() function that would deal with adjusting the offset that we're reading at and then masking it off again. While the access_with_adjusted_size() attempts to deal with this, it never adjusts the hardware address that's passed in to be a multiple of the implementation defined offset that I can see. I suspect that the FIXME at line 582 (http://git.qemu.org/?p=qemu.git;a=blob;f=memory.c;hb=HEAD#l582) and the implementation in the xhci code is the crux of the problem. For the time being we're working around this in the illumos driver, but I wanted to point this out such that it might be helpful for other systems which are assuming that they can do the two byte read like on hardware. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1693050/+subscriptions
Re: [PATCH] i386/cpu: Expand MAX_FIXED_COUNTERS from 3 to 4 to for Icelake
On 17/03/20 06:54, Like Xu wrote: > In the Intel SDM, "Table 18-2. Association of Fixed-Function > Performance Counters with Architectural Performance Events", > we may have a new fixed counter 'TOPDOWN.SLOTS' (since Icelake), > which counts the number of available slots for an unhalted > logical processor. Check commit 6017608936 in the kernel tree. > > Signed-off-by: Like Xu > --- > target/i386/cpu.h | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index 576f309bbf..ec2b67d425 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -1185,7 +1185,7 @@ typedef struct { > #define CPU_NB_REGS CPU_NB_REGS32 > #endif > > -#define MAX_FIXED_COUNTERS 3 > +#define MAX_FIXED_COUNTERS 4 > #define MAX_GP_COUNTERS(MSR_IA32_PERF_STATUS - MSR_P6_EVNTSEL0) > > #define TARGET_INSN_START_EXTRA_WORDS 1 > Hi Like, the problem with this patch is that it breaks live migration; the vmstate_msr_architectural_pmu record hardcodes MAX_FIXED_COUNTERS as the number of registers. So it's more complicated, you need to add a new subsection (following vmstate_msr_architectural_pmu) and transmit it only if the 4th counter is nonzero (instead of the more complicated check in pmu_enable_needed). Just to be safe, I'd make the new subsection hold 16 counters and bump MAX_FIXED_COUNTERS to 16. Thanks, Paolo
Re: [PATCH] qcow2: Remove unused fields from BDRVQcow2State
On 3/26/20 6:07 PM, Kevin Wolf wrote: These fields were already removed in commit c3c10f72, but then commit b58deb34 revived them probably due to bad merge conflict resolution. They are still unused, so remove them again. Fixes: b58deb344ddff3b9d8b265bf73a65274767ee5f4 Signed-off-by: Kevin Wolf --- block/qcow2.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/qcow2.h b/block/qcow2.h index 0942126232..f4de0a27d5 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,9 +301,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; -uint8_t *cluster_cache; -uint8_t *cluster_data; -uint64_t cluster_cache_offset; QLIST_HEAD(, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; Reviewed-by: Philippe Mathieu-Daudé
Re: [PATCH v2 2/2] mirror: Wait only for in-flight operations
On 26.03.20 16:36, Kevin Wolf wrote: > mirror_wait_for_free_in_flight_slot() just picks a random operation to > wait for. However, a MirrorOp is already in s->ops_in_flight when > mirror_co_read() waits for free slots, so if not enough slots are > immediately available, an operation can end up waiting for itself, or > two or more operations can wait for each other to complete, which > results in a hang. > > Fix this by adding a flag to MirrorOp that tells us if the request is > already in flight (and therefore occupies slots that it will later > free), and picking only such operations for waiting. > > Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 > Signed-off-by: Kevin Wolf > --- > block/mirror.c | 9 - > 1 file changed, 8 insertions(+), 1 deletion(-) > > diff --git a/block/mirror.c b/block/mirror.c > index 393131b135..88414d1653 100644 > --- a/block/mirror.c > +++ b/block/mirror.c [...] > @@ -1318,6 +1324,7 @@ static MirrorOp *coroutine_fn > active_write_prepare(MirrorBlockJob *s, > .offset = offset, > .bytes = bytes, > .is_active_write= true, > +.is_in_flight = true, > }; > qemu_co_queue_init(>waiting_requests); > QTAILQ_INSERT_TAIL(>ops_in_flight, op, next); > There is a mirror_wait_on_conflicts() call after this. I was a bit worried about dependencies there. But I don’t think there’s any problem, because mirror_wait_for_any_operation() is only called by: (1) mirror_wait_for_free_in_flight_slot(), which makes it look for non-active operations only, and (2) mirror_run(), which specifically waits for all active operations to settle, so it makes sense to wait for all of them, even when they are still doing their own dependency-waiting. But still, I’m not sure whether this is conceptually the best thing to do. 
I think what we actually want is for mirror_wait_for_free_in_flight_slot() to only wait for in-flight operations; but the call in mirror_run() that waits for active-mirror operations wants to wait for all active-mirror operations, not just the ones that are in flight. So I think conceptually it would make more sense to set is_in_flight only after mirror_wait_on_conflicts(), and ensure that the mirror_wait_for_any_operation() call from mirror_run() ignores is_in_flight. E.g. by having another parameter “bool in_flight” for mirror_wait_for_any_operation() that chooses whether to check is_in_flight or whether to ignore it. In practice, @in_flight would always be the same as @active, but they are different things. But that would mean we would always ignore is_in_flight for active-mirror operations. In practice, there’s no difference to what this patch does, i.e. to just let active-mirror operations have is_in_flight to be always true and let mirror_wait_for_any_operation() check is_in_flight unconditionally. So I don’t know. Maybe this is a start: Functionally-reviewed-by: Max Reitz Max signature.asc Description: OpenPGP digital signature
Re: [PATCH v6 23/61] target/riscv: vector integer merge and move instructions
On Tue, Mar 17, 2020 at 8:53 AM LIU Zhiwei wrote: > > Signed-off-by: LIU Zhiwei Reviewed-by: Alistair Francis Alistair > --- > target/riscv/helper.h | 17 > target/riscv/insn32.decode | 7 ++ > target/riscv/insn_trans/trans_rvv.inc.c | 121 > target/riscv/vector_helper.c| 100 > 4 files changed, 245 insertions(+) > > diff --git a/target/riscv/helper.h b/target/riscv/helper.h > index 1f0d3d60e3..f378db9cbf 100644 > --- a/target/riscv/helper.h > +++ b/target/riscv/helper.h > @@ -665,3 +665,20 @@ DEF_HELPER_6(vwmaccsu_vx_w, void, ptr, ptr, tl, ptr, > env, i32) > DEF_HELPER_6(vwmaccus_vx_b, void, ptr, ptr, tl, ptr, env, i32) > DEF_HELPER_6(vwmaccus_vx_h, void, ptr, ptr, tl, ptr, env, i32) > DEF_HELPER_6(vwmaccus_vx_w, void, ptr, ptr, tl, ptr, env, i32) > + > +DEF_HELPER_6(vmerge_vvm_b, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vvm_h, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vvm_w, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vvm_d, void, ptr, ptr, ptr, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_b, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_h, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_w, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_6(vmerge_vxm_d, void, ptr, ptr, tl, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_b, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_h, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_w, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_v_d, void, ptr, ptr, env, i32) > +DEF_HELPER_4(vmv_v_x_b, void, ptr, i64, env, i32) > +DEF_HELPER_4(vmv_v_x_h, void, ptr, i64, env, i32) > +DEF_HELPER_4(vmv_v_x_w, void, ptr, i64, env, i32) > +DEF_HELPER_4(vmv_v_x_d, void, ptr, i64, env, i32) > diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode > index 9735ac3565..adb76956c9 100644 > --- a/target/riscv/insn32.decode > +++ b/target/riscv/insn32.decode > @@ -71,6 +71,7 @@ > @r_nfvm ... ... vm:1 . . ... . ... %nf %rs2 %rs1 %rd > @r_vm.. vm:1 . . ... . ... 
%rs2 %rs1 %rd > @r_vm_1 .. . . . ... . ... vm=1 %rs2 %rs1 %rd > +@r_vm_0 .. . . . ... . ... vm=0 %rs2 %rs1 %rd > @r_wdvm . wd:1 vm:1 . . ... . ... %rs2 %rs1 %rd > @r2_zimm . zimm:11 . ... . ... %rs1 %rd > > @@ -400,6 +401,12 @@ vwmacc_vx 01 . . . 110 . 1010111 > @r_vm > vwmaccsu_vv 10 . . . 010 . 1010111 @r_vm > vwmaccsu_vx 10 . . . 110 . 1010111 @r_vm > vwmaccus_vx 11 . . . 110 . 1010111 @r_vm > +vmv_v_v 010111 1 0 . 000 . 1010111 @r2 > +vmv_v_x 010111 1 0 . 100 . 1010111 @r2 > +vmv_v_i 010111 1 0 . 011 . 1010111 @r2 > +vmerge_vvm 010111 0 . . 000 . 1010111 @r_vm_0 > +vmerge_vxm 010111 0 . . 100 . 1010111 @r_vm_0 > +vmerge_vim 010111 0 . . 011 . 1010111 @r_vm_0 > > vsetvli 0 ... . 111 . 1010111 @r2_zimm > vsetvl 100 . . 111 . 1010111 @r > diff --git a/target/riscv/insn_trans/trans_rvv.inc.c > b/target/riscv/insn_trans/trans_rvv.inc.c > index 269d04c7fb..42ef59364f 100644 > --- a/target/riscv/insn_trans/trans_rvv.inc.c > +++ b/target/riscv/insn_trans/trans_rvv.inc.c > @@ -1499,3 +1499,124 @@ GEN_OPIVX_WIDEN_TRANS(vwmaccu_vx) > GEN_OPIVX_WIDEN_TRANS(vwmacc_vx) > GEN_OPIVX_WIDEN_TRANS(vwmaccsu_vx) > GEN_OPIVX_WIDEN_TRANS(vwmaccus_vx) > + > +/* Vector Integer Merge and Move Instructions */ > +static bool trans_vmv_v_v(DisasContext *s, arg_vmv_v_v *a) > +{ > +if (vext_check_isa_ill(s) && > +vext_check_reg(s, a->rd, false) && > +vext_check_reg(s, a->rs1, false)) { > + > +if (s->vl_eq_vlmax) { > +tcg_gen_gvec_mov(s->sew, vreg_ofs(s, a->rd), > + vreg_ofs(s, a->rs1), > + MAXSZ(s), MAXSZ(s)); > +} else { > +uint32_t data = FIELD_DP32(0, VDATA, LMUL, s->lmul); > +static gen_helper_gvec_2_ptr * const fns[4] = { > +gen_helper_vmv_v_v_b, gen_helper_vmv_v_v_h, > +gen_helper_vmv_v_v_w, gen_helper_vmv_v_v_d, > +}; > + > +tcg_gen_gvec_2_ptr(vreg_ofs(s, a->rd), vreg_ofs(s, a->rs1), > + cpu_env, 0, s->vlen / 8, data, fns[s->sew]); > +} > +return true; > +} > +return false; > +} > + > +typedef void gen_helper_vmv_vx(TCGv_ptr, TCGv_i64, TCGv_env, TCGv_i32); > +static bool 
trans_vmv_v_x(DisasContext *s, arg_vmv_v_x *a) > +{ > +if (vext_check_isa_ill(s) && > +vext_check_reg(s, a->rd, false)) { > + > +TCGv s1 = tcg_temp_new(); > +gen_get_gpr(s1, a->rs1); > + > +if (s->vl_eq_vlmax) { > +#ifdef TARGET_RISCV64 > +
[Bug 1869241] [NEW] svn checkout fails with E000075 "Value too large for defined data type"
Public bug reported: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: svn: E000075: Can't read directory '...': Value too large for defined data type This is the repository I'm trying to check out: https://svn.baycom.de/repos/vdr-mcli-plugin/ qemu-arm-static version is 4.2.0 I've also tried older versions without change. Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 ** Affects: qemu Importance: Undecided Status: New ** Description changed: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: - svn: E000075: Can't read directory '...': Value too large for + svn: E000075: Can't read directory '...': Value too large for defined data type qemu-arm-static version is 4.2.0 I've also tried older versions without change. Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: - https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L7228 + https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 ** Description changed: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: svn: E000075: Can't read directory '...': Value too large for defined data type + + This is the repository I'm trying to check out: + https://svn.baycom.de/repos/vdr-mcli-plugin/ qemu-arm-static version is 4.2.0 I've also tried older versions without change. 
Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1869241 Title: svn checkout fails with E000075 "Value too large for defined data type" Status in QEMU: New Bug description: I try to autobuild for ARM7 with qemu-arm-static. Part of this is downloading source via SVN. Whenever I try to download a SVN repository I get the following output: svn: E000075: Can't read directory '...': Value too large for defined data type This is the repository I'm trying to check out: https://svn.baycom.de/repos/vdr-mcli-plugin/ qemu-arm-static version is 4.2.0 I've also tried older versions without change. Platform I try to emulate is armv7h (Arch Linux ARM for Raspberry Pi 2) Host system is AMD64 This can be reproduced 100% of the time. Here I have the issue happening on Travis CI: https://travis-ci.com/github/VDR4Arch/vdr4arch/jobs/304839747#L9944 To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1869241/+subscriptions
Re: [PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
On 26/03/20 18:14, Peter Maydell wrote: >> +#ifndef atomic_fetch_add >> #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) >> #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, >> __ATOMIC_SEQ_CST) >> +#endif > > This will work around FreeBSD's current implementation in particular, > but I don't think there's anything in the C11 spec that mandates that > atomic_fetch_add() and friends have to be macros and not simply > functions... That's not a problem as long as they are all functions, the macros would simply override the function-based implementation. Paolo
Re: [PATCH v4 0/2] Replaced locks with lock guard macros
On Wed, 25 Mar 2020 at 04:35, Daniel Brodsky wrote: > > There may be ways to rewrite that expression to avoid triggering the > > warning on a 32-bit platform. Untested, but does this help: > > > > if (sizeof(mask) > 4 && mask <= 0xu) { > > > > -- > > Eric Blake, Principal Software Engineer > > Red Hat, Inc. +1-919-301-3226 > > Virtualization: qemu.org | libvirt.org > > > Unfortunately, the compiler still catches that the statement is always > true for 32-bit. > An alternative might be to convert cases like this to a macro instead, > which doesn't > cause any errors. My preference is to add -Wno-tautological-type-limit-compare in configure, so we don't have to work around this same issue elsewhere in the code base. r~
[Bug 1869073] Re: qemu-arm-static crashes "segmentation fault" when running "git clone -s"
Actually this one magically went good. I'm testing in Virtual Box as my ARM64 host. Maybe something went wrong there. After rebooting the whole machine today "git" works well. Will reopen if it happens again... ** Changed in: qemu Status: New => Incomplete -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1869073 Title: qemu-arm-static crashes "segmentation fault" when running "git clone -s" Status in QEMU: Incomplete Bug description: I want to use qemu-arm-static to cross-compile software. The compiler itself is a native cross-compiler connected via "distcc". The problem is that a script tries to do some stuff with "git" and with a "git clone -s" command the whole story reproducibly stops with a "segmentation fault". I don't know how to properly debug the issue but it happens 100% of the time that I get the "crash" or git just hangs forever with 100% CPU usage. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1869073/+subscriptions
Re: [PATCH v9 1/9] hw/vfio/common: Remove error print on mmio region translation by viommu
On Thu, 26 Mar 2020 18:35:48 +0100 Auger Eric wrote: > Hi Alex, > > On 3/24/20 12:08 AM, Alex Williamson wrote: > > [Cc +dwg who originated this warning] > > > > On Mon, 23 Mar 2020 14:16:09 +0530 > > Bharat Bhushan wrote: > > > >> On ARM, the MSI doorbell is translated by the virtual IOMMU. > >> As such address_space_translate() returns the MSI controller > >> MMIO region and we get an "iommu map to non memory area" > >> message. Let's remove this latter. > >> > >> Signed-off-by: Eric Auger > >> Signed-off-by: Bharat Bhushan > >> --- > >> hw/vfio/common.c | 2 -- > >> 1 file changed, 2 deletions(-) > >> > >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c > >> index 5ca11488d6..c586edf47a 100644 > >> --- a/hw/vfio/common.c > >> +++ b/hw/vfio/common.c > >> @@ -426,8 +426,6 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void > >> **vaddr, > >> , , writable, > >> MEMTXATTRS_UNSPECIFIED); > >> if (!memory_region_is_ram(mr)) { > >> -error_report("iommu map to non memory area %"HWADDR_PRIx"", > >> - xlat); > >> return false; > >> } > >> > > > > I'm a bit confused here, I think we need more justification beyond "we > > hit this warning and we don't want to because it's ok in this one > > special case, therefore remove it". I assume the special case is that > > the device MSI address is managed via the SET_IRQS ioctl and therefore > > we won't actually get DMAs to this range. > Yes exactly. The guest creates a mapping between one giova and this gpa > (corresponding to the MSI controller doorbell) because MSIs are mapped > on ARM. But practically the physical device is programmed with an host > chosen iova that maps onto the physical MSI controller's doorbell. so > the device never performs DMA accesses to this range. > > But I imagine the case that > > was in mind when adding this warning was general peer-to-peer between > > and assigned and emulated device. > yes makes sense. 
> > Maybe there's an argument to be made > > that such a p2p mapping might also be used in a non-vIOMMU case. We > > skip creating those mappings and drivers continue to work, maybe > > because nobody attempts to do p2p DMA with the types of devices we > > emulate, maybe because p2p DMA is not absolutely reliable on bare metal > > and drivers test it before using it. > MSI doorbells are mapped using the IOMMU_MMIO flag (dma-iommu.c > iommu_dma_get_msi_page). > One idea could be to pass that flag through the IOMMU Notifier mechanism > into the iotlb->perm. Eventually when we get this in vfio_get_vaddr() we > would not print the warning. Could that make sense? Yeah, if we can identify a valid case that doesn't need a warning, that's fine by me. Thanks, Alex
Re: [PATCH v16 QEMU 05/16] vfio: Add migration region initialization and finalize function
* Kirti Wankhede (kwankh...@nvidia.com) wrote: > - Migration functions are implemented for VFIO_DEVICE_TYPE_PCI device in this > patch series. > - VFIO device supports migration or not is decided based of migration region > query. If migration region query is successful and migration region > initialization is successful then migration is supported else migration is > blocked. > > Signed-off-by: Kirti Wankhede > Reviewed-by: Neo Jia > --- > hw/vfio/Makefile.objs | 2 +- > hw/vfio/migration.c | 138 > ++ > hw/vfio/trace-events | 3 + > include/hw/vfio/vfio-common.h | 9 +++ > 4 files changed, 151 insertions(+), 1 deletion(-) > create mode 100644 hw/vfio/migration.c > > diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs > index 9bb1c09e8477..8b296c889ed9 100644 > --- a/hw/vfio/Makefile.objs > +++ b/hw/vfio/Makefile.objs > @@ -1,4 +1,4 @@ > -obj-y += common.o spapr.o > +obj-y += common.o spapr.o migration.o > obj-$(CONFIG_VFIO_PCI) += pci.o pci-quirks.o display.o > obj-$(CONFIG_VFIO_CCW) += ccw.o > obj-$(CONFIG_VFIO_PLATFORM) += platform.o > diff --git a/hw/vfio/migration.c b/hw/vfio/migration.c > new file mode 100644 > index ..a078dcf1dd8f > --- /dev/null > +++ b/hw/vfio/migration.c > @@ -0,0 +1,138 @@ > +/* > + * Migration support for VFIO devices > + * > + * Copyright NVIDIA, Inc. 2019 Time flies by... > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. Are you sure you want this to be V2 only? Most code added to qemu now is v2 or later. 
> + */ > + > +#include "qemu/osdep.h" > +#include > + > +#include "hw/vfio/vfio-common.h" > +#include "cpu.h" > +#include "migration/migration.h" > +#include "migration/qemu-file.h" > +#include "migration/register.h" > +#include "migration/blocker.h" > +#include "migration/misc.h" > +#include "qapi/error.h" > +#include "exec/ramlist.h" > +#include "exec/ram_addr.h" > +#include "pci.h" > +#include "trace.h" > + > +static void vfio_migration_region_exit(VFIODevice *vbasedev) > +{ > +VFIOMigration *migration = vbasedev->migration; > + > +if (!migration) { > +return; > +} > + > +if (migration->region.size) { > +vfio_region_exit(>region); > +vfio_region_finalize(>region); > +} > +} > + > +static int vfio_migration_region_init(VFIODevice *vbasedev, int index) > +{ > +VFIOMigration *migration = vbasedev->migration; > +Object *obj = NULL; > +int ret = -EINVAL; > + > +if (!vbasedev->ops->vfio_get_object) { > +return ret; > +} > + > +obj = vbasedev->ops->vfio_get_object(vbasedev); > +if (!obj) { > +return ret; > +} > + > +ret = vfio_region_setup(obj, vbasedev, >region, index, > +"migration"); > +if (ret) { > +error_report("%s: Failed to setup VFIO migration region %d: %s", > + vbasedev->name, index, strerror(-ret)); > +goto err; > +} > + > +if (!migration->region.size) { > +ret = -EINVAL; > +error_report("%s: Invalid region size of VFIO migration region %d: > %s", > + vbasedev->name, index, strerror(-ret)); > +goto err; > +} > + > +return 0; > + > +err: > +vfio_migration_region_exit(vbasedev); > +return ret; > +} > + > +static int vfio_migration_init(VFIODevice *vbasedev, > + struct vfio_region_info *info) > +{ > +int ret; > + > +vbasedev->migration = g_new0(VFIOMigration, 1); > + > +ret = vfio_migration_region_init(vbasedev, info->index); > +if (ret) { > +error_report("%s: Failed to initialise migration region", > + vbasedev->name); > +g_free(vbasedev->migration); > +vbasedev->migration = NULL; > +return ret; > +} > + > +return 0; > +} > + > +/* -- */ > + > +int 
vfio_migration_probe(VFIODevice *vbasedev, Error **errp) > +{ > +struct vfio_region_info *info; > +Error *local_err = NULL; > +int ret; > + > +ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_MIGRATION, > + VFIO_REGION_SUBTYPE_MIGRATION, ); > +if (ret) { > +goto add_blocker; > +} > + > +ret = vfio_migration_init(vbasedev, info); > +if (ret) { > +goto add_blocker; > +} > + > +trace_vfio_migration_probe(vbasedev->name, info->index); > +return 0; > + > +add_blocker: > +error_setg(>migration_blocker, > + "VFIO device doesn't support migration"); > +ret = migrate_add_blocker(vbasedev->migration_blocker, _err); > +if (local_err) { > +error_propagate(errp, local_err); > +error_free(vbasedev->migration_blocker); > +} > +return ret; > +} > + > +void vfio_migration_finalize(VFIODevice
Re: [RFC v6 01/24] update-linux-headers: Import iommu.h
Hi Yi, On 3/26/20 1:58 PM, Liu, Yi L wrote: >> From: Eric Auger >> Sent: Saturday, March 21, 2020 12:58 AM >> To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-devel@nongnu.org; >> Subject: [RFC v6 01/24] update-linux-headers: Import iommu.h >> >> Update the script to import the new iommu.h uapi header. >> >> Signed-off-by: Eric Auger >> --- >> scripts/update-linux-headers.sh | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/scripts/update-linux-headers.sh >> b/scripts/update-linux-headers.sh index >> 29c27f4681..5b64ee3912 100755 >> --- a/scripts/update-linux-headers.sh >> +++ b/scripts/update-linux-headers.sh >> @@ -141,7 +141,7 @@ done >> >> rm -rf "$output/linux-headers/linux" >> mkdir -p "$output/linux-headers/linux" >> -for header in kvm.h vfio.h vfio_ccw.h vhost.h \ >> +for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \ >>psci.h psp-sev.h userfaultfd.h mman.h; do >> cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux" >> done > > Hi Eric, > > This patch already got acked by from Cornelia. :-) > > https://patchwork.ozlabs.org/patch/1259643/ thanks for the heads-up! Every little step ... ;-) Thanks Eric > > Regards, > Yi Liu >
Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices
* Kirti Wankhede (kwankh...@nvidia.com) wrote: > These functions save and restore PCI device specific data - config > space of PCI device. > Tested save and restore with MSI and MSIX type. > > Signed-off-by: Kirti Wankhede > Reviewed-by: Neo Jia > --- > hw/vfio/pci.c | 163 > ++ > include/hw/vfio/vfio-common.h | 2 + > 2 files changed, 165 insertions(+) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 6c77c12e44b9..8deb11e87ef7 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -41,6 +41,7 @@ > #include "trace.h" > #include "qapi/error.h" > #include "migration/blocker.h" > +#include "migration/qemu-file.h" > > #define TYPE_VFIO_PCI "vfio-pci" > #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) > } > } > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr) > +{ > +PCIDevice *pdev = >pdev; > +VFIOBAR *bar = >bars[nr]; > +uint64_t addr; > +uint32_t addr_lo, addr_hi = 0; > + > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */ > +if (!bar->size) { > +return 0; > +} > + > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, 4); > + > +addr_lo = addr_lo & (bar->ioport ? 
PCI_BASE_ADDRESS_IO_MASK : > + PCI_BASE_ADDRESS_MEM_MASK); > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) { > +addr_hi = pci_default_read_config(pdev, > + PCI_BASE_ADDRESS_0 + (nr + 1) * 4, > 4); > +} > + > +addr = ((uint64_t)addr_hi << 32) | addr_lo; > + > +if (!QEMU_IS_ALIGNED(addr, bar->size)) { > +return -EINVAL; > +} > + > +return 0; > +} > + > +static int vfio_bars_validate(VFIOPCIDevice *vdev) > +{ > +int i, ret; > + > +for (i = 0; i < PCI_ROM_SLOT; i++) { > +ret = vfio_bar_validate(vdev, i); > +if (ret) { > +error_report("vfio: BAR address %d validation failed", i); > +return ret; > +} > +} > +return 0; > +} > + > static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) > { > VFIOBAR *bar = >bars[nr]; > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice > *vbasedev) > return OBJECT(vdev); > } > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > +{ > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > +PCIDevice *pdev = >pdev; > +uint16_t pci_cmd; > +int i; > + > +for (i = 0; i < PCI_ROM_SLOT; i++) { > +uint32_t bar; > + > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); > +qemu_put_be32(f, bar); > +} > + > +qemu_put_be32(f, vdev->interrupt); > +if (vdev->interrupt == VFIO_INT_MSI) { > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > +bool msi_64bit; > + > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + > PCI_MSI_FLAGS, > +2); > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT); > + > +msi_addr_lo = pci_default_read_config(pdev, > + pdev->msi_cap + PCI_MSI_ADDRESS_LO, > 4); > +qemu_put_be32(f, msi_addr_lo); > + > +if (msi_64bit) { > +msi_addr_hi = pci_default_read_config(pdev, > + pdev->msi_cap + > PCI_MSI_ADDRESS_HI, > + 4); > +} > +qemu_put_be32(f, msi_addr_hi); > + > +msi_data = pci_default_read_config(pdev, > +pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : > PCI_MSI_DATA_32), > +2); > +qemu_put_be32(f, msi_data); > +} else if (vdev->interrupt == VFIO_INT_MSIX) { > +uint16_t offset; > + > +/* save enable bit and maskall bit */ > +offset = pci_default_read_config(pdev, > + pdev->msix_cap + PCI_MSIX_FLAGS + 1, > 2); > +qemu_put_be16(f, offset); > +msix_save(pdev, f); > +} > +pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > +qemu_put_be16(f, pci_cmd); > +} > + > +static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) > +{ > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > +PCIDevice *pdev = >pdev; > +uint32_t interrupt_type; > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > +uint16_t pci_cmd; > +bool msi_64bit; > +int i, ret; > + > +/* retore pci bar configuration */ > +pci_cmd = pci_default_read_config(pdev, PCI_COMMAND, 2); > +vfio_pci_write_config(pdev, PCI_COMMAND, > +pci_cmd & (!(PCI_COMMAND_IO | PCI_COMMAND_MEMORY)), > 2); > +for (i = 0; i < PCI_ROM_SLOT; i++) { > +uint32_t
Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices
On Thu, 26 Mar 2020 17:29:26 + "Dr. David Alan Gilbert" wrote: > * Alex Williamson (alex.william...@redhat.com) wrote: > > On Wed, 25 Mar 2020 02:39:02 +0530 > > Kirti Wankhede wrote: > > > > > These functions save and restore PCI device specific data - config > > > space of PCI device. > > > Tested save and restore with MSI and MSIX type. > > > > > > Signed-off-by: Kirti Wankhede > > > Reviewed-by: Neo Jia > > > --- > > > hw/vfio/pci.c | 163 > > > ++ > > > include/hw/vfio/vfio-common.h | 2 + > > > 2 files changed, 165 insertions(+) > > > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > index 6c77c12e44b9..8deb11e87ef7 100644 > > > --- a/hw/vfio/pci.c > > > +++ b/hw/vfio/pci.c > > > @@ -41,6 +41,7 @@ > > > #include "trace.h" > > > #include "qapi/error.h" > > > #include "migration/blocker.h" > > > +#include "migration/qemu-file.h" > > > > > > #define TYPE_VFIO_PCI "vfio-pci" > > > #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > > > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) > > > } > > > } > > > > > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr) > > > +{ > > > +PCIDevice *pdev = >pdev; > > > +VFIOBAR *bar = >bars[nr]; > > > +uint64_t addr; > > > +uint32_t addr_lo, addr_hi = 0; > > > + > > > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */ > > > +if (!bar->size) { > > > +return 0; > > > +} > > > + > > > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, > > > 4); > > > + > > > +addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK : > > > + PCI_BASE_ADDRESS_MEM_MASK); > > > > Nit, &= or combine with previous set. > > > > > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) { > > > +addr_hi = pci_default_read_config(pdev, > > > + PCI_BASE_ADDRESS_0 + (nr + 1) * > > > 4, 4); > > > +} > > > + > > > +addr = ((uint64_t)addr_hi << 32) | addr_lo; > > > > Could we use a union? 
> > > > > + > > > +if (!QEMU_IS_ALIGNED(addr, bar->size)) { > > > +return -EINVAL; > > > +} > > > > What specifically are we validating here? This should be true no > > matter what we wrote to the BAR or else BAR emulation is broken. The > > bits that could make this unaligned are not implemented in the BAR. > > That I think is based on a comment I asked a few versions back. > Remember the value being checked here is a value loaded from the > migration stream; it could be garbage, so it's good to do whatever > checks you can. It's not the migration stream though, we're reading it from config space emulation. The migration stream could have written absolutely anything to the device BAR and this test should still be ok. PCI BARs are naturally aligned by definition. The address bits that could make the value unaligned are not implemented. This is why we can determine the size of the BAR by writing -1 to it. Thanks, Alex > > > + > > > +return 0; > > > +} > > > + > > > +static int vfio_bars_validate(VFIOPCIDevice *vdev) > > > +{ > > > +int i, ret; > > > + > > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > > +ret = vfio_bar_validate(vdev, i); > > > +if (ret) { > > > +error_report("vfio: BAR address %d validation failed", i); > > > +return ret; > > > +} > > > +} > > > +return 0; > > > +} > > > + > > > static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) > > > { > > > VFIOBAR *bar = >bars[nr]; > > > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice > > > *vbasedev) > > > return OBJECT(vdev); > > > } > > > > > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > > +{ > > > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, > > > vbasedev); > > > +PCIDevice *pdev = >pdev; > > > +uint16_t pci_cmd; > > > +int i; > > > + > > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > > +uint32_t bar; > > > + > > > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, > > > 4); > > > +qemu_put_be32(f, bar); > > > +} > > > + > > 
> +qemu_put_be32(f, vdev->interrupt); > > > +if (vdev->interrupt == VFIO_INT_MSI) { > > > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > > > +bool msi_64bit; > > > + > > > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + > > > PCI_MSI_FLAGS, > > > +2); > > > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT); > > > + > > > +msi_addr_lo = pci_default_read_config(pdev, > > > + pdev->msi_cap + > > > PCI_MSI_ADDRESS_LO, 4); > > > +qemu_put_be32(f, msi_addr_lo); > > > + > > > +if (msi_64bit) { > > >
Re: [PATCH v9 1/9] hw/vfio/common: Remove error print on mmio region translation by viommu
Hi Alex, On 3/24/20 12:08 AM, Alex Williamson wrote: > [Cc +dwg who originated this warning] > > On Mon, 23 Mar 2020 14:16:09 +0530 > Bharat Bhushan wrote: > >> On ARM, the MSI doorbell is translated by the virtual IOMMU. >> As such address_space_translate() returns the MSI controller >> MMIO region and we get an "iommu map to non memory area" >> message. Let's remove this latter. >> >> Signed-off-by: Eric Auger >> Signed-off-by: Bharat Bhushan >> --- >> hw/vfio/common.c | 2 -- >> 1 file changed, 2 deletions(-) >> >> diff --git a/hw/vfio/common.c b/hw/vfio/common.c >> index 5ca11488d6..c586edf47a 100644 >> --- a/hw/vfio/common.c >> +++ b/hw/vfio/common.c >> @@ -426,8 +426,6 @@ static bool vfio_get_vaddr(IOMMUTLBEntry *iotlb, void >> **vaddr, >> , , writable, >> MEMTXATTRS_UNSPECIFIED); >> if (!memory_region_is_ram(mr)) { >> -error_report("iommu map to non memory area %"HWADDR_PRIx"", >> - xlat); >> return false; >> } >> > > I'm a bit confused here, I think we need more justification beyond "we > hit this warning and we don't want to because it's ok in this one > special case, therefore remove it". I assume the special case is that > the device MSI address is managed via the SET_IRQS ioctl and therefore > we won't actually get DMAs to this range. Yes exactly. The guest creates a mapping between one giova and this gpa (corresponding to the MSI controller doorbell) because MSIs are mapped on ARM. But practically the physical device is programmed with an host chosen iova that maps onto the physical MSI controller's doorbell. so the device never performs DMA accesses to this range. But I imagine the case that > was in mind when adding this warning was general peer-to-peer between > and assigned and emulated device. yes makes sense. Maybe there's an argument to be made > that such a p2p mapping might also be used in a non-vIOMMU case. 
We > skip creating those mappings and drivers continue to work, maybe > because nobody attempts to do p2p DMA with the types of devices we > emulate, maybe because p2p DMA is not absolutely reliable on bare metal > and drivers test it before using it. MSI doorbells are mapped using the IOMMU_MMIO flag (dma-iommu.c iommu_dma_get_msi_page). One idea could be to pass that flag through the IOMMU Notifier mechanism into the iotlb->perm. Eventually when we get this in vfio_get_vaddr() we would not print the warning. Could that make sense? Thanks Eric Anyway, I need a better argument > why this is ok. Thanks, > > Alex >
Re: [PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
Peter Maydell writes: > On Thu, 26 Mar 2020 at 17:01, Alex Bennée wrote: >> >> Deep inside the FreeBSD netmap headers we end up including stdatomic.h >> which clashes with qemu's atomic functions which are modelled along >> the C11 standard. To avoid a massive rename lets just ifdef around the >> problem. >> >> Signed-off-by: Alex Bennée >> --- >> include/qemu/atomic.h | 6 ++ >> 1 file changed, 6 insertions(+) >> >> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h >> index f9cd24c8994..ff72db51154 100644 >> --- a/include/qemu/atomic.h >> +++ b/include/qemu/atomic.h >> @@ -208,11 +208,14 @@ >> /* Provide shorter names for GCC atomic builtins, return old value */ >> #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) >> #define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST) >> + >> +#ifndef atomic_fetch_add >> #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, >> __ATOMIC_SEQ_CST) >> #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) >> #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, >> __ATOMIC_SEQ_CST) >> +#endif > > This will work around FreeBSD's current implementation in particular, > but I don't think there's anything in the C11 spec that mandates that > atomic_fetch_add() and friends have to be macros and not simply > functions... Sure there are two alternative options: - Move to using stdatomic headers - on Linux they seem to be C++ only - Rename all out atomic functions - seems a bit of a big patch for rc releases I suspect we should look at option two for 5.1 -- Alex Bennée
Re: [PATCH v16 QEMU 04/16] vfio: Add save and load functions for VFIO PCI devices
* Alex Williamson (alex.william...@redhat.com) wrote: > On Wed, 25 Mar 2020 02:39:02 +0530 > Kirti Wankhede wrote: > > > These functions save and restore PCI device specific data - config > > space of PCI device. > > Tested save and restore with MSI and MSIX type. > > > > Signed-off-by: Kirti Wankhede > > Reviewed-by: Neo Jia > > --- > > hw/vfio/pci.c | 163 > > ++ > > include/hw/vfio/vfio-common.h | 2 + > > 2 files changed, 165 insertions(+) > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > index 6c77c12e44b9..8deb11e87ef7 100644 > > --- a/hw/vfio/pci.c > > +++ b/hw/vfio/pci.c > > @@ -41,6 +41,7 @@ > > #include "trace.h" > > #include "qapi/error.h" > > #include "migration/blocker.h" > > +#include "migration/qemu-file.h" > > > > #define TYPE_VFIO_PCI "vfio-pci" > > #define PCI_VFIO(obj)OBJECT_CHECK(VFIOPCIDevice, obj, TYPE_VFIO_PCI) > > @@ -1632,6 +1633,50 @@ static void vfio_bars_prepare(VFIOPCIDevice *vdev) > > } > > } > > > > +static int vfio_bar_validate(VFIOPCIDevice *vdev, int nr) > > +{ > > +PCIDevice *pdev = >pdev; > > +VFIOBAR *bar = >bars[nr]; > > +uint64_t addr; > > +uint32_t addr_lo, addr_hi = 0; > > + > > +/* Skip unimplemented BARs and the upper half of 64bit BARS. */ > > +if (!bar->size) { > > +return 0; > > +} > > + > > +addr_lo = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + nr * 4, > > 4); > > + > > +addr_lo = addr_lo & (bar->ioport ? PCI_BASE_ADDRESS_IO_MASK : > > + PCI_BASE_ADDRESS_MEM_MASK); > > Nit, &= or combine with previous set. > > > +if (bar->type == PCI_BASE_ADDRESS_MEM_TYPE_64) { > > +addr_hi = pci_default_read_config(pdev, > > + PCI_BASE_ADDRESS_0 + (nr + 1) * > > 4, 4); > > +} > > + > > +addr = ((uint64_t)addr_hi << 32) | addr_lo; > > Could we use a union? > > > + > > +if (!QEMU_IS_ALIGNED(addr, bar->size)) { > > +return -EINVAL; > > +} > > What specifically are we validating here? This should be true no > matter what we wrote to the BAR or else BAR emulation is broken. 
The > bits that could make this unaligned are not implemented in the BAR. That I think is based on a comment I asked a few versions back. Remember the value being checked here is a value loaded from the migration stream; it could be garbage, so it's good to do whatever checks you can. > > + > > +return 0; > > +} > > + > > +static int vfio_bars_validate(VFIOPCIDevice *vdev) > > +{ > > +int i, ret; > > + > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > +ret = vfio_bar_validate(vdev, i); > > +if (ret) { > > +error_report("vfio: BAR address %d validation failed", i); > > +return ret; > > +} > > +} > > +return 0; > > +} > > + > > static void vfio_bar_register(VFIOPCIDevice *vdev, int nr) > > { > > VFIOBAR *bar = >bars[nr]; > > @@ -2414,11 +2459,129 @@ static Object *vfio_pci_get_object(VFIODevice > > *vbasedev) > > return OBJECT(vdev); > > } > > > > +static void vfio_pci_save_config(VFIODevice *vbasedev, QEMUFile *f) > > +{ > > +VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); > > +PCIDevice *pdev = >pdev; > > +uint16_t pci_cmd; > > +int i; > > + > > +for (i = 0; i < PCI_ROM_SLOT; i++) { > > +uint32_t bar; > > + > > +bar = pci_default_read_config(pdev, PCI_BASE_ADDRESS_0 + i * 4, 4); > > +qemu_put_be32(f, bar); > > +} > > + > > +qemu_put_be32(f, vdev->interrupt); > > +if (vdev->interrupt == VFIO_INT_MSI) { > > +uint32_t msi_flags, msi_addr_lo, msi_addr_hi = 0, msi_data; > > +bool msi_64bit; > > + > > +msi_flags = pci_default_read_config(pdev, pdev->msi_cap + > > PCI_MSI_FLAGS, > > +2); > > +msi_64bit = (msi_flags & PCI_MSI_FLAGS_64BIT); > > + > > +msi_addr_lo = pci_default_read_config(pdev, > > + pdev->msi_cap + > > PCI_MSI_ADDRESS_LO, 4); > > +qemu_put_be32(f, msi_addr_lo); > > + > > +if (msi_64bit) { > > +msi_addr_hi = pci_default_read_config(pdev, > > + pdev->msi_cap + > > PCI_MSI_ADDRESS_HI, > > + 4); > > +} > > +qemu_put_be32(f, msi_addr_hi); > > + > > +msi_data = pci_default_read_config(pdev, > > +pdev->msi_cap + (msi_64bit ? 
PCI_MSI_DATA_64 : > > PCI_MSI_DATA_32), > > +2); > > +qemu_put_be32(f, msi_data); > > Isn't the data field only a u16? > > > +} else if (vdev->interrupt == VFIO_INT_MSIX) { > > +uint16_t offset; > > + > > +/* save enable bit and maskall bit */ > > +offset = pci_default_read_config(pdev, > > +
Re: [PATCH v2 0/3] Fix some AIO context locking in jobs
Patchew URL: https://patchew.org/QEMU/20200326155628.859862-1-s.rei...@proxmox.com/ Hi, This series failed the asan build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #!/bin/bash export ARCH=x86_64 make docker-image-fedora V=1 NETWORK=1 time make docker-test-debug@fedora TARGET_LIST=x86_64-softmmu J=14 NETWORK=1 === TEST SCRIPT END === PASS 1 fdc-test /x86_64/fdc/cmos PASS 2 fdc-test /x86_64/fdc/no_media_on_start PASS 3 fdc-test /x86_64/fdc/read_without_media ==6158==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 4 fdc-test /x86_64/fdc/media_change PASS 5 fdc-test /x86_64/fdc/sense_interrupt PASS 6 fdc-test /x86_64/fdc/relative_seek --- PASS 32 test-opts-visitor /visitor/opts/range/beyond PASS 33 test-opts-visitor /visitor/opts/dict/unvisited MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} tests/test-coroutine -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-coroutine" ==6219==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! ==6219==WARNING: ASan is ignoring requested __asan_handle_no_return: stack top: 0x7ffcb9e2d000; bottom 0x7fc4e542; size: 0x0037d4a0d000 (239790510080) False positive error reports may follow For details see https://github.com/google/sanitizers/issues/189 PASS 1 test-coroutine /basic/no-dangling-access --- PASS 11 test-aio /aio/event/wait PASS 12 test-aio /aio/event/flush PASS 13 test-aio /aio/event/wait/no-flush-cb ==6234==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! 
PASS 14 test-aio /aio/timer/schedule PASS 15 test-aio /aio/coroutine/queue-chaining PASS 16 test-aio /aio-gsource/flush --- PASS 12 fdc-test /x86_64/fdc/read_no_dma_19 PASS 13 fdc-test /x86_64/fdc/fuzz-registers MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} QTEST_QEMU_BINARY=x86_64-softmmu/qemu-system-x86_64 QTEST_QEMU_IMG=qemu-img tests/qtest/ide-test -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="ide-test" ==6242==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 28 test-aio /aio-gsource/timer/schedule MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} tests/test-aio-multithread -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-aio-multithread" PASS 1 ide-test /x86_64/ide/identify ==6249==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 1 test-aio-multithread /aio/multi/lifecycle ==6251==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 2 ide-test /x86_64/ide/flush PASS 2 test-aio-multithread /aio/multi/schedule ==6268==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 3 ide-test /x86_64/ide/bmdma/simple_rw ==6279==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 3 test-aio-multithread /aio/multi/mutex/contended PASS 4 ide-test /x86_64/ide/bmdma/trim ==6290==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! 
PASS 4 test-aio-multithread /aio/multi/mutex/handoff PASS 5 test-aio-multithread /aio/multi/mutex/mcs PASS 6 test-aio-multithread /aio/multi/mutex/pthread --- PASS 6 test-throttle /throttle/detach_attach PASS 7 test-throttle /throttle/config_functions PASS 8 test-throttle /throttle/accounting ==6307==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 9 test-throttle /throttle/groups PASS 10 test-throttle /throttle/config/enabled PASS 11 test-throttle /throttle/config/conflicting --- PASS 14 test-throttle /throttle/config/max PASS 15 test-throttle /throttle/config/iops_size MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))} tests/test-thread-pool -m=quick -k --tap < /dev/null | ./scripts/tap-driver.pl --test-name="test-thread-pool" ==6311==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 1 test-thread-pool /thread-pool/submit PASS 2 test-thread-pool /thread-pool/submit-aio PASS 3 test-thread-pool /thread-pool/submit-co PASS 4 test-thread-pool /thread-pool/submit-many ==6378==WARNING: ASan doesn't fully support makecontext/swapcontext functions and may produce false positives in some cases! PASS 5 test-thread-pool /thread-pool/cancel PASS 6 test-thread-pool /thread-pool/cancel-async MALLOC_PERTURB_=${MALLOC_PERTURB_:-$(( ${RANDOM:-0} % 255 + 1))}
Re: qemu-system-aarch64 windows binary run Arm64 defconfig kernel not working
hi Peter: > Are you using a 32-bit Windows qemu binary or a 64-bit one? > (I'm wondering if this is a 32-bit vs 64-bit bug.) unfortunately, 32-bit Windows qemu binary still fail on booting upstream arm64 defconfig kernel. in below log, I 1st purposely type wrong option, such that you can tell where the binary located. Then as you can see "synchronous external abort" still happens. thanks for ur kind help, $ qemu-system-aarch64 -v C:\Program Files (x86)\qemu\qemu-system-aarch64.exe: -v: invalid option $ qemu-system-aarch64 --version QEMU emulator version 4.2.0 (v4.2.0-11797-g2890edc853-dirty) Copyright (c) 2003-2019 Fabrice Bellard and the QEMU Project developers $ ./run_script/run.sh [ 0.00] Booting Linux on physical CPU 0x00 [0x411fd070] [ 0.00] Linux version 5.5.0-rc7-00017-gb87a2ead99b2 (guest@ubuntu14_04) (gcc version 6.1.1 20160711 (Linaro GCC 6.1-2016.08)) #1 SMP PREEMPT Mon Mar 23 15:05:29 CST 2020 [ 0.00] Machine model: linux,dummy-virt [ 0.00] efi: Getting EFI parameters from FDT: [ 0.00] efi: UEFI not found. [ 0.00] cma: Reserved 32 MiB at 0x7e00 [ 0.00] NUMA: No NUMA configuration found [ 0.00] NUMA: Faking a node at [mem 0x4000-0x7fff] [ 0.00] NUMA: NODE_DATA [mem 0x7ddf3100-0x7ddf4fff] [ 0.00] Zone ranges: [ 0.00] DMA [mem 0x4000-0x7fff] [ 0.00] DMA32 empty [ 0.00] Normal empty [ 0.00] Movable zone start for each node [ 0.00] Early memory node ranges [ 0.00] node 0: [mem 0x4000-0x7fff] [ 0.00] Initmem setup node 0 [mem 0x4000-0x7fff] [ 0.00] psci: probing for conduit method from DT. [ 0.00] psci: PSCIv0.2 detected in firmware. 
[ 0.00] psci: Using standard PSCI v0.2 function IDs [ 0.00] psci: Trusted OS migration not required [ 0.00] percpu: Embedded 22 pages/cpu s53016 r8192 d28904 u90112 [ 0.00] Detected PIPT I-cache on CPU0 [ 0.00] CPU features: detected: ARM erratum 832075 [ 0.00] CPU features: detected: ARM erratum 834220 [ 0.00] CPU features: detected: EL2 vector hardening [ 0.00] ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware [ 0.00] CPU features: detected: ARM erratum 1319367 [ 0.00] Built 1 zonelists, mobility grouping on. Total pages: 258048 [ 0.00] Policy zone: DMA [ 0.00] Kernel command line: console=ttyAMA0 root=/dev/vda [ 0.00] Dentry cache hash table entries: 131072 (order: 8, 1048576 bytes, linear) [ 0.00] Inode-cache hash table entries: 65536 (order: 7, 524288 bytes, linear) [ 0.00] mem auto-init: stack:off, heap alloc:off, heap free:off [ 0.00] Memory: 968848K/1048576K available (12284K kernel code, 1922K rwdata, 6776K rodata, 5184K init, 458K bss, 46960K reserved, 32768K cma-reserved) [ 0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1 [ 0.00] rcu: Preemptible hierarchical RCU implementation. [ 0.00] rcu: RCU restricting CPUs from NR_CPUS=256 to nr_cpu_ids=2. [ 0.00] Tasks RCU enabled. [ 0.00] rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies. [ 0.00] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2 [ 0.00] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0 [ 0.00] GICv2m: range[mem 0x0802-0x08020fff], SPI[80:143] [ 0.00] random: get_random_bytes called from start_kernel+0x2b8/0x454 with crng_init=0 [ 0.00] arch_timer: cp15 timer(s) running at 62.50MHz (virt). [ 0.00] clocksource: arch_sys_counter: mask: 0xff max_cycles: 0x1cd42e208c, max_idle_ns: 881590405314 ns [ 0.000313] sched_clock: 56 bits at 62MHz, resolution 16ns, wraps every 4398046511096ns [ 0.012734] Console: colour dummy device 80x25 [ 0.016333] Calibrating delay loop (skipped), value calculated using timer frequency.. 
125.00 BogoMIPS (lpj=25) [ 0.016608] pid_max: default: 32768 minimum: 301 [ 0.018603] LSM: Security Framework initializing [ 0.020522] Mount-cache hash table entries: 2048 (order: 2, 16384 bytes, linear) [ 0.020596] Mountpoint-cache hash table entries: 2048 (order: 2, 16384 bytes, linear) [ 0.109764] ASID allocator initialised with 32768 entries [ 0.118473] rcu: Hierarchical SRCU implementation. [ 0.132871] EFI services will not be available. [ 0.134747] smp: Bringing up secondary CPUs ... [ 0.178926] Detected PIPT I-cache on CPU1 [ 0.180097] CPU1: Booted secondary processor 0x01 [0x411fd070] [ 0.185193] smp: Brought up 1 node, 2 CPUs [ 0.185252] SMP: Total of 2 processors activated. [ 0.185318] CPU features: detected: 32-bit EL0 Support [ 0.185433] CPU features: detected: CRC32 instructions [ 0.215643] CPU: All CPU(s) started at EL1 [ 0.216146] alternatives: patching kernel code [ 0.248177] devtmpfs: initialized [ 0.268462] KASLR disabled due to lack of seed [ 0.274809] clocksource: jiffies: mask: 0x max_cycles: 0x, max_idle_ns: 764504178510 ns [ 0.275044] futex hash table entries: 512 (order: 3, 32768
Re: [PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
On Thu, 26 Mar 2020 at 17:01, Alex Bennée wrote: > > Deep inside the FreeBSD netmap headers we end up including stdatomic.h > which clashes with qemu's atomic functions which are modelled along > the C11 standard. To avoid a massive rename lets just ifdef around the > problem. > > Signed-off-by: Alex Bennée > --- > include/qemu/atomic.h | 6 ++ > 1 file changed, 6 insertions(+) > > diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h > index f9cd24c8994..ff72db51154 100644 > --- a/include/qemu/atomic.h > +++ b/include/qemu/atomic.h > @@ -208,11 +208,14 @@ > /* Provide shorter names for GCC atomic builtins, return old value */ > #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) > #define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST) > + > +#ifndef atomic_fetch_add > #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) > #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) > +#endif This will work around FreeBSD's current implementation in particular, but I don't think there's anything in the C11 spec that mandates that atomic_fetch_add() and friends have to be macros and not simply functions... thanks -- PMM
Re: [PATCH] qcow2: Remove unused fields from BDRVQcow2State
On 3/26/20 12:07 PM, Kevin Wolf wrote: These fields were already removed in commit c3c10f72, but then commit b58deb34 revived them probably due to bad merge conflict resolution. They are still unused, so remove them again. Fixes: b58deb344ddff3b9d8b265bf73a65274767ee5f4 Signed-off-by: Kevin Wolf --- block/qcow2.h | 3 --- 1 file changed, 3 deletions(-) Reviewed-by: Eric Blake diff --git a/block/qcow2.h b/block/qcow2.h index 0942126232..f4de0a27d5 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,9 +301,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; -uint8_t *cluster_cache; -uint8_t *cluster_data; -uint64_t cluster_cache_offset; QLIST_HEAD(, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v2 0/3] Fix some AIO context locking in jobs
Patchew URL: https://patchew.org/QEMU/20200326155628.859862-1-s.rei...@proxmox.com/ Hi, This series failed the docker-quick@centos7 build test. Please find the testing commands and their output below. If you have Docker installed, you can probably reproduce it locally. === TEST SCRIPT BEGIN === #!/bin/bash make docker-image-centos7 V=1 NETWORK=1 time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1 === TEST SCRIPT END === TESTcheck-unit: tests/test-bdrv-graph-mod TESTcheck-unit: tests/test-blockjob qemu: qemu_mutex_unlock_impl: Operation not permitted ERROR - too few tests run (expected 8, got 7) make: *** [check-unit] Error 1 make: *** Waiting for unfinished jobs TESTiotest-qcow2: 020 TESTiotest-qcow2: 021 --- raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=2724b2af546443c7a60d0f061f1beab7', '-u', '1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-7887uhcy/src/docker-src.2020-03-26-12.56.41.28979:/var/tmp/qemu:z,ro', 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit status 2. filter=--filter=label=com.qemu.instance.uuid=2724b2af546443c7a60d0f061f1beab7 make[1]: *** [docker-run] Error 1 make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-7887uhcy/src' make: *** [docker-run-test-quick@centos7] Error 2 real14m17.758s user0m8.171s The full log is available at http://patchew.org/logs/20200326155628.859862-1-s.rei...@proxmox.com/testing.docker-quick@centos7/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
[PATCH] qcow2: Remove unused fields from BDRVQcow2State
These fields were already removed in commit c3c10f72, but then commit b58deb34 revived them probably due to bad merge conflict resolution. They are still unused, so remove them again. Fixes: b58deb344ddff3b9d8b265bf73a65274767ee5f4 Signed-off-by: Kevin Wolf --- block/qcow2.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/qcow2.h b/block/qcow2.h index 0942126232..f4de0a27d5 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -301,9 +301,6 @@ typedef struct BDRVQcow2State { QEMUTimer *cache_clean_timer; unsigned cache_clean_interval; -uint8_t *cluster_cache; -uint8_t *cluster_data; -uint64_t cluster_cache_offset; QLIST_HEAD(, QCowL2Meta) cluster_allocs; uint64_t *refcount_table; -- 2.20.1
[PATCH] qemu/atomic.h: add #ifdef guards for stdatomic.h
Deep inside the FreeBSD netmap headers we end up including stdatomic.h which clashes with qemu's atomic functions which are modelled along the C11 standard. To avoid a massive rename lets just ifdef around the problem. Signed-off-by: Alex Bennée --- include/qemu/atomic.h | 6 ++ 1 file changed, 6 insertions(+) diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h index f9cd24c8994..ff72db51154 100644 --- a/include/qemu/atomic.h +++ b/include/qemu/atomic.h @@ -208,11 +208,14 @@ /* Provide shorter names for GCC atomic builtins, return old value */ #define atomic_fetch_inc(ptr) __atomic_fetch_add(ptr, 1, __ATOMIC_SEQ_CST) #define atomic_fetch_dec(ptr) __atomic_fetch_sub(ptr, 1, __ATOMIC_SEQ_CST) + +#ifndef atomic_fetch_add #define atomic_fetch_add(ptr, n) __atomic_fetch_add(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_sub(ptr, n) __atomic_fetch_sub(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_and(ptr, n) __atomic_fetch_and(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_or(ptr, n) __atomic_fetch_or(ptr, n, __ATOMIC_SEQ_CST) #define atomic_fetch_xor(ptr, n) __atomic_fetch_xor(ptr, n, __ATOMIC_SEQ_CST) +#endif #define atomic_inc_fetch(ptr)__atomic_add_fetch(ptr, 1, __ATOMIC_SEQ_CST) #define atomic_dec_fetch(ptr)__atomic_sub_fetch(ptr, 1, __ATOMIC_SEQ_CST) @@ -392,11 +395,14 @@ /* Provide shorter names for GCC atomic builtins. */ #define atomic_fetch_inc(ptr) __sync_fetch_and_add(ptr, 1) #define atomic_fetch_dec(ptr) __sync_fetch_and_add(ptr, -1) + +#ifndef atomic_fetch_add #define atomic_fetch_add(ptr, n) __sync_fetch_and_add(ptr, n) #define atomic_fetch_sub(ptr, n) __sync_fetch_and_sub(ptr, n) #define atomic_fetch_and(ptr, n) __sync_fetch_and_and(ptr, n) #define atomic_fetch_or(ptr, n) __sync_fetch_and_or(ptr, n) #define atomic_fetch_xor(ptr, n) __sync_fetch_and_xor(ptr, n) +#endif #define atomic_inc_fetch(ptr) __sync_add_and_fetch(ptr, 1) #define atomic_dec_fetch(ptr) __sync_add_and_fetch(ptr, -1) -- 2.20.1
Re: [PULL 00/10] Block patches for 5.0-rc1
On Thu, 26 Mar 2020 at 14:29, Max Reitz wrote: > > The following changes since commit 01e38186ecb1fc6275720c5425332eed280ea93d: > > Merge remote-tracking branch > 'remotes/dgilbert/tags/pull-migration-20200325b' into staging (2020-03-26 > 09:28:11 +) > > are available in the Git repository at: > > https://github.com/XanClic/qemu.git tags/pull-block-2020-03-26 > > for you to fetch changes up to a507c51790fa955c1fccd4deca3c50476a862b83: > > iotests/138: Test leaks/corruptions fixed report (2020-03-26 14:52:43 +0100) > > > Block patches for 5.0-rc1: > - Fix qemu-img convert with a host device or iscsi target > - Use-after-free fix in mirror > - Some minor qcow2 fixes > - Minor sheepdog fix > - Minor qemu-img check report fix > > Applied, thanks. Please update the changelog at https://wiki.qemu.org/ChangeLog/5.0 for any user-visible changes. -- PMM
Re: [PATCH] s390x: protvirt: Fix stray error_report_err in s390_machine_protect
On Thu, 26 Mar 2020 10:05:05 -0400 Janosch Frank wrote: > In case the protection of the machine fails at s390_pv_vm_enable(), > we'll currently report the local_error variable. Problem is that > there's no migration blocker error that we can report at this point so > the pointer is always NULL which leads to a SEGFAULT. > > Let's remove the error report. Yes, s390_pvm_vm_enable() will already moan on error. > > Signed-off-by: Janosch Frank > Reported-by: Marc Hartmayer > Fixes: 520935eedf941da3 ("s390x: protvirt: Add migration blocker") Note to self: update commit ID before I send a pull request (I will at least need to rebase for a headers update...) > --- > hw/s390x/s390-virtio-ccw.c | 1 - > 1 file changed, 1 deletion(-) > > diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c > index 3cf19c99f3468b7d..855ecf370d6e82fa 100644 > --- a/hw/s390x/s390-virtio-ccw.c > +++ b/hw/s390x/s390-virtio-ccw.c > @@ -358,7 +358,6 @@ static int s390_machine_protect(S390CcwMachineState *ms) > rc = s390_pv_vm_enable(); > if (rc) { > qemu_balloon_inhibit(false); > -error_report_err(local_err); > migrate_del_blocker(pv_mig_blocker); > error_free_or_abort(_mig_blocker); > return rc; Thanks, applied.
Re: [PATCH] qemu-user: fix build with LLVM lld 10
On 3/26/20 6:43 AM, Roger Pau Monne wrote: > lld 10.0.0 introduced a new linker option --image-base equivalent to > the GNU -Ttext-segment one, hence use it when available. > > This fixes the build of QEMU on systems using lld 10 or greater. > > Signed-off-by: Dimitry Andric > Signed-off-by: Roger Pau Monné > --- > Cc: Laurent Vivier > Cc: Richard Henderson > Cc: "Philippe Mathieu-Daudé" > Cc: Paolo Bonzini > Cc: "Daniel P. Berrangé" > --- The Plan is still to drop this whole section of code. However, it's still blocked on getting the x86_64 vsyscall patches upstream. r~
Re: [PATCH v9 3/4] linux-user: Support futex_time64
On Wed, Mar 25, 2020 at 11:22 PM Laurent Vivier wrote: > > Le 25/03/2020 à 18:41, Alistair Francis a écrit : > > On Wed, Mar 18, 2020 at 3:54 PM Alistair Francis > > wrote: > >> > >> Add support for host and target futex_time64. If futex_time64 exists on > >> the host we try that first before falling back to the standard futex > >> syscall. > >> > >> Signed-off-by: Alistair Francis > > > > @Laurent did you see this? > > > > I guess it's a little late for 5.0 but it would be nice to support. > > Yes, I've seen your patch. > > I think it can go into 5.0 RC because it's a bug fix and it is really > needed because the 32bit futex will be broken if host timespec uses > 64bit field. > > But I need to review and test before. Thanks! Alistair > > Thanks, > Laurent
Re: booke206_tlb_ways() returning 0
Le 26/03/2020 à 15:22, Peter Maydell a écrit : > Hi; Coverity points out a possible issue in booke206_get_tlbm() > (this is CID 1421942): > > > static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int > tlbn, > target_ulong ea, int way) > { > int r; > uint32_t ways = booke206_tlb_ways(env, tlbn); > int ways_bits = ctz32(ways); > int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); > int i; > > way &= ways - 1; > ea >>= MAS2_EPN_SHIFT; > ea &= (1 << (tlb_bits - ways_bits)) - 1; > r = (ea << ways_bits) | way; > > Here if booke206_tlb_ways() returns zero, then ways_bits() > will be 32 and the shift left 'ea << ways_bits' is undefined > behaviour. > > My first assumption was that booke206_tlb_ways() is not supposed > to ever return 0 (it's looking at what I think are read-only > system registers, and it doesn't make much sense to have > a zero-way TLB). So I tried adding an assert: > > diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h > index 88d94495554..aedb6bcb265 100644 > --- a/target/ppc/cpu.h > +++ b/target/ppc/cpu.h > @@ -2403,6 +2403,7 @@ static inline int booke206_tlb_ways(CPUPPCState > *env, int tlbn) > { > uint32_t tlbncfg = env->spr[SPR_BOOKE_TLB0CFG + tlbn]; > int r = tlbncfg >> TLBnCFG_ASSOC_SHIFT; > +assert(r > 0); > return r; > } > > However, this isn't right, because it causes one of the check-acceptance > tests to fail, with this backtrace: > > #3 0x7074d412 in __GI___assert_fail (assertion=0x560a0d7d > "r > 0", file=0x560a0d40 > "/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h", > line=2406, function=0x560a1720 <__PRETTY_FUNCTION__.20811> > "booke206_tlb_ways") at assert.c:101 > #4 0x55a9939b in booke206_tlb_ways (env=0x56e52a30, > tlbn=2) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h:2406 > #5 0x55a9b3ac in mmubooke206_get_physical_address > (env=0x56e52a30, ctx=0x7fffd77008a0, address=9223380835095282947, > rw=0, access_type=0, mmu_idx=1) at > 
/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1093 > #6 0x55a9c25d in get_physical_address_wtlb > (env=0x56e52a30, ctx=0x7fffd77008a0, eaddr=9223380835095282947, > rw=0, access_type=0, mmu_idx=1) at > /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1455 > #7 0x55a9c82b in cpu_ppc_handle_mmu_fault > (env=0x56e52a30, address=9223380835095282947, rw=0, mmu_idx=1) > at > /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1597 > #8 0x55a9f975 in ppc_cpu_tlb_fill (cs=0x56e49560, > addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, > mmu_idx=1, probe=false, retaddr=140735610345781) at > /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:3053 > #9 0x558e1422 in tlb_fill (cpu=0x56e49560, > addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, > mmu_idx=1, retaddr=140735610345781) at > /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1017 > #10 0x558e279b in load_helper (env=0x56e52a30, > addr=9223380835095282947, oi=1, retaddr=140735610345781, op=MO_8, > code_read=false, full_load=0x558e2b3a ) at > /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1534 > #11 0x558e2b80 in full_ldub_mmu (env=0x56e52a30, > addr=9223380835095282947, oi=1, retaddr=140735610345781) > at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1624 > #12 0x558e2bb4 in helper_ret_ldub_mmu (env=0x56e52a30, > addr=9223380835095282947, oi=1, retaddr=140735610345781) > at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1630 > #13 0x7fff900fd9c6 in code_gen_buffer () > #14 0x558f9915 in cpu_tb_exec (cpu=0x56e49560, > itb=0x7fff900fd780 ) > at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:172 > #15 0x558fa732 in cpu_loop_exec_tb (cpu=0x56e49560, > tb=0x7fff900fd780 , last_tb=0x7fffd7701078, > tb_exit=0x7fffd7701070) at > /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:619 > #16 0x558faa4c in cpu_exec (cpu=0x56e49560) at > 
/home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:732 > #17 0x558bcf29 in tcg_cpu_exec (cpu=0x56e49560) at > /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1405 > #18 0x558bd77f in qemu_tcg_cpu_thread_fn (arg=0x56e49560) > at /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1713 > #19 0x55f2ff3f in qemu_thread_start (args=0x56e8dd10) at > /home/petmay01/linaro/qemu-from-laptop/qemu/util/qemu-thread-posix.c:519 > #20 0x70b156db in start_thread (arg=0x7fffd7704700) at > pthread_create.c:463 > #21 0x7083e88f in clone () at > ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 > > So under what circumstances can booke206_tlb_ways() > validly return 0? Should booke206_get_tlbm() cope with
Re: [PULL 0/6] Linux user for 5.0 patches
Le 26/03/2020 à 16:42, Peter Maydell a écrit : > On Thu, 26 Mar 2020 at 07:24, Laurent Vivier wrote: >> >> The following changes since commit 736cf607e40674776d752acc201f565723e86045: >> >> Update version for v5.0.0-rc0 release (2020-03-24 17:50:00 +) >> >> are available in the Git repository at: >> >> git://github.com/vivier/qemu.git tags/linux-user-for-5.0-pull-request >> >> for you to fetch changes up to a52f5f87bece827a338d6eb3332e3def86fb9c33: >> >> linux-user: Flush out implementation of gettimeofday (2020-03-26 08:08:54 >> +0100) >> >> >> Emulate x86_64 vsyscalls >> Fix syscall_nr.h cleanup >> >> > > Still fails :-( I would say it was expected... I think your build dir is corrupted by a previous build. You should have old .o file without .d file, and thus the .o file is not refreshed (check the date of cpu_loop.o). We cannot cleanup that before starting the build. The purpose of the cleanup in configure was to avoid this kind of problem but I did it badly. If you want to check incremental build, cleanup your build dir, checkout v4.20 or master, build it, and then build the PR branch. it will work: it's tested. If you build 4.2.0, then master and then PR, the build dir will be corrupted by the build of master that removes the .d files without removing the .o files. So when you build the PR, cpu_loop.o file remains and is not rebuild with the new values in cpu.h. We can't do anything to avoid that now, except removing all .o files under *-linux-user... Sorry for the mess... Thanks, Laurent
Re: [PATCH RFC 9/9] KVM: Dirty ring support
On Thu, Mar 26, 2020 at 02:14:36PM +, Dr. David Alan Gilbert wrote: > * Peter Xu (pet...@redhat.com) wrote: > > On Wed, Mar 25, 2020 at 08:41:44PM +, Dr. David Alan Gilbert wrote: > > > > [...] > > > > > > +enum KVMReaperState { > > > > +KVM_REAPER_NONE = 0, > > > > +/* The reaper is sleeping */ > > > > +KVM_REAPER_WAIT, > > > > +/* The reaper is reaping for dirty pages */ > > > > +KVM_REAPER_REAPING, > > > > +}; > > > > > > That probably needs to be KVMDirtyRingReaperState > > > given there are many things that could be reaped. > > > > Sure. > > > > > > > > > +/* > > > > + * KVM reaper instance, responsible for collecting the KVM dirty bits > > > > + * via the dirty ring. > > > > + */ > > > > +struct KVMDirtyRingReaper { > > > > +/* The reaper thread */ > > > > +QemuThread reaper_thr; > > > > +/* > > > > + * Telling the reaper thread to wakeup. This should be used as a > > > > + * generic interface to kick the reaper thread, like, in vcpu > > > > + * threads where it gets a exit due to ring full. > > > > + */ > > > > +EventNotifier reaper_event; > > > > > > I think I'd just used a simple semaphore for this type of thing. > > > > I'm actually uncertain on which is cheaper... > > > > At the meantime, I wanted to poll two handles at the same time below > > (in kvm_dirty_ring_reaper_thread). I don't know how to do that with > > semaphore. Could it? > > If you're OK with EventNotifier stick with it; it's just I'm used > to doing with it with a semaphore; e.g. a flag then the semaphore - but > that's fine. Ah yes flags could work, though we probably need to be careful with flags and use atomic accesses to avoid race conditions of flag lost. Then I'll keep it, thanks. > > > [...] 
> > > > > > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu) > > > > (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; > > > > } > > > > > > > > +if (s->kvm_dirty_gfn_count) { > > > > +cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size, > > > > + PROT_READ | PROT_WRITE, MAP_SHARED, > > > > > > Is the MAP_SHARED required? > > > > Yes it's required. It's the same when we map the per-vcpu kvm_run. > > > > If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace > > writes to the dirty gfns the 1st time, it'll copy the current dirty > > ring page in the kernel and from now on QEMU will never be able to see > > what the kernel writes to the dirty gfn pages. MAP_SHARED means the > > userspace and the kernel shares exactly the same page(s). > > OK, worth a comment. Sure. -- Peter Xu
Re: [PATCH v9 2/9] memory: Add interface to set iommu page size mask
Hi Bharat, On 3/23/20 9:46 AM, Bharat Bhushan wrote: > Allow to set page size mask to be supported by iommu. by iommu memory region. I mean this is not global to the IOMMU. > This is required to expose page size mask compatible with > host with virtio-iommu. > > Signed-off-by: Bharat Bhushan > --- > include/exec/memory.h | 20 > memory.c | 10 ++ > 2 files changed, 30 insertions(+) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index e85b7de99a..063c424854 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -355,6 +355,16 @@ typedef struct IOMMUMemoryRegionClass { > * @iommu: the IOMMUMemoryRegion > */ > int (*num_indexes)(IOMMUMemoryRegion *iommu); > + > +/* > + * Set supported IOMMU page size > + * > + * Optional method: if this is supported then set page size that > + * can be supported by IOMMU. This is called to set supported page > + * size as per host Linux. What about: If supported, allows to restrict the page size mask that can be supported with a given IOMMU memory region. For example, this allows to propagate host physical IOMMU page size mask limitations to the virtual IOMMU (vfio assignment with virtual iommu). > + */ > + void (*iommu_set_page_size_mask)(IOMMUMemoryRegion *iommu, > + uint64_t page_size_mask); > } IOMMUMemoryRegionClass; > > typedef struct CoalescedMemoryRange CoalescedMemoryRange; > @@ -1363,6 +1373,16 @@ int > memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, > */ > int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); > > +/** > + * memory_region_iommu_set_page_size_mask: set the supported pages > + * size by iommu. 
supported page sizes for a given IOMMU memory region > + * > + * @iommu_mr: the memory region IOMMU memory region > + * @page_size_mask: supported page size mask > + */ > +void memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, > +uint64_t page_size_mask); > + > /** > * memory_region_name: get a memory region's name > * > diff --git a/memory.c b/memory.c > index aeaa8dcc9e..14c8783084 100644 > --- a/memory.c > +++ b/memory.c > @@ -1833,6 +1833,16 @@ static int > memory_region_update_iommu_notify_flags(IOMMUMemoryRegion *iommu_mr, > return ret; > } > > +void memory_region_iommu_set_page_size_mask(IOMMUMemoryRegion *iommu_mr, > +uint64_t page_size_mask) > +{ > +IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > + > +if (imrc->iommu_set_page_size_mask) { > +imrc->iommu_set_page_size_mask(iommu_mr, page_size_mask); Shouldn't it return an int in case the setting cannot be applied? > +} > +} > + > int memory_region_register_iommu_notifier(MemoryRegion *mr, >IOMMUNotifier *n, Error **errp) > { > Thanks Eric
Re: [PATCH RFC 8/9] KVM: Add dirty-ring-size property
On Thu, Mar 26, 2020 at 01:41:44PM +, Dr. David Alan Gilbert wrote: > * Peter Xu (pet...@redhat.com) wrote: > > On Wed, Mar 25, 2020 at 08:00:31PM +, Dr. David Alan Gilbert wrote: > > > > @@ -2077,6 +2079,33 @@ static int kvm_init(MachineState *ms) > > > > s->memory_listener.listener.coalesced_io_add = > > > > kvm_coalesce_mmio_region; > > > > s->memory_listener.listener.coalesced_io_del = > > > > kvm_uncoalesce_mmio_region; > > > > > > > > +/* > > > > + * Enable KVM dirty ring if supported, otherwise fall back to > > > > + * dirty logging mode > > > > + */ > > > > +if (s->kvm_dirty_ring_size > 0) { > > > > +/* Read the max supported pages */ > > > > +ret = kvm_vm_check_extension(kvm_state, > > > > KVM_CAP_DIRTY_LOG_RING); > > > > +if (ret > 0) { > > > > +if (s->kvm_dirty_ring_size > ret) { > > > > +error_report("KVM dirty ring size %d too big (maximum > > > > is %d). " > > > > + "Please use a smaller value.", > > > > + s->kvm_dirty_ring_size, ret); > > > > +goto err; > > > > +} > > > > + > > > > +ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, > > > > +s->kvm_dirty_ring_size); > > > > +if (ret) { > > > > +error_report("Enabling of KVM dirty ring failed: %d", > > > > ret); > > > > +goto err; > > > > +} > > > > + > > > > +s->kvm_dirty_gfn_count = > > > > +s->kvm_dirty_ring_size / sizeof(struct kvm_dirty_gfn); > > > > > > What happens if I was to pass dirty-ring-size=1 ? > > > Then the count would be 0 and things would get upset somewhere? > > > Do you need to check for a minimum postive value? > > > > The above kvm_vm_enable_cap() should fail directly and QEMU will stop. > > Yes we should check it, but kernel will do that in all cases, so I > > just didn't do that explicitly again in the userspace. > > We probably should have that check since you can give them a more > obvious error message. Yes we can. Or I can enhance the error message when we failed with kvm_vm_enable_cap() so the user will get the important hints. 
I think maybe that's more important than the explicit check itself. > > > I was planning this to be an advanced feature so the user of this > > parameter should know the rules to pass values in. > > Advanced users just make advanced mistakes :-) > > I did wonder if perhaps this option should be a count of entries rather > than a byte size. Sometimes it's easier to know "how many bytes we used", while instead sometimes we want to know "how many dirty addresses we can track". But sure I can switch, considering the users might be more interested in the latter. :) Thanks, -- Peter Xu
[PATCH v2 3/3] replication: acquire aio context before calling job_cancel_sync
job_cancel_sync requires the job's lock to be held, all other callers already do this (replication_stop, drive_backup_abort, blockdev_backup_abort, job_cancel_sync_all, cancel_common). Signed-off-by: Stefan Reiter --- block/replication.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/replication.c b/block/replication.c index 413d95407d..6c809cda4e 100644 --- a/block/replication.c +++ b/block/replication.c @@ -144,12 +144,16 @@ fail: static void replication_close(BlockDriverState *bs) { BDRVReplicationState *s = bs->opaque; +Job *commit_job = >commit_job->job; +AioContext *commit_ctx = commit_job->aio_context; if (s->stage == BLOCK_REPLICATION_RUNNING) { replication_stop(s->rs, false, NULL); } if (s->stage == BLOCK_REPLICATION_FAILOVER) { -job_cancel_sync(>commit_job->job); +aio_context_acquire(commit_ctx); +job_cancel_sync(commit_job); +aio_context_release(commit_ctx); } if (s->mode == REPLICATION_MODE_SECONDARY) { -- 2.26.0
[PATCH v2 2/3] job: take each job's lock individually in job_txn_apply
All callers of job_txn_apply hold a single job's lock, but different jobs within a transaction can have different contexts, thus we need to lock each one individually before applying the callback function. Similar to job_completed_txn_abort this also requires releasing the caller's context before and reacquiring it after to avoid recursive locks which might break AIO_WAIT_WHILE in the callback. Signed-off-by: Stefan Reiter --- job.c | 32 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/job.c b/job.c index 134a07b92e..e0966162fa 100644 --- a/job.c +++ b/job.c @@ -136,17 +136,33 @@ static void job_txn_del_job(Job *job) } } -static int job_txn_apply(JobTxn *txn, int fn(Job *)) +static int job_txn_apply(Job *job, int fn(Job *)) { -Job *job, *next; +AioContext *outer_ctx = job->aio_context; +AioContext *inner_ctx; +Job *other_job, *next; +JobTxn *txn = job->txn; int rc = 0; -QLIST_FOREACH_SAFE(job, >jobs, txn_list, next) { -rc = fn(job); +/* + * Similar to job_completed_txn_abort, we take each job's lock before + * applying fn, but since we assume that outer_ctx is held by the caller, + * we need to release it here to avoid holding the lock twice - which would + * break AIO_WAIT_WHILE from within fn. 
+ */ +aio_context_release(outer_ctx); + +QLIST_FOREACH_SAFE(other_job, >jobs, txn_list, next) { +inner_ctx = other_job->aio_context; +aio_context_acquire(inner_ctx); +rc = fn(other_job); +aio_context_release(inner_ctx); if (rc) { break; } } + +aio_context_acquire(outer_ctx); return rc; } @@ -774,11 +790,11 @@ static void job_do_finalize(Job *job) assert(job && job->txn); /* prepare the transaction to complete */ -rc = job_txn_apply(job->txn, job_prepare); +rc = job_txn_apply(job, job_prepare); if (rc) { job_completed_txn_abort(job); } else { -job_txn_apply(job->txn, job_finalize_single); +job_txn_apply(job, job_finalize_single); } } @@ -824,10 +840,10 @@ static void job_completed_txn_success(Job *job) assert(other_job->ret == 0); } -job_txn_apply(txn, job_transition_to_pending); +job_txn_apply(job, job_transition_to_pending); /* If no jobs need manual finalization, automatically do so */ -if (job_txn_apply(txn, job_needs_finalize) == 0) { +if (job_txn_apply(job, job_needs_finalize) == 0) { job_do_finalize(job); } } -- 2.26.0
[PATCH v2 1/3] backup: don't acquire aio_context in backup_clean
All code-paths leading to backup_clean (via job_clean) have the job's context already acquired. The job's context is guaranteed to be the same as the one used by backup_top via backup_job_create. Since the previous logic effectively acquired the lock twice, this broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock once, thus deadlocking with the IO thread. This is a partial revert of 0abf2581717a19. Signed-off-by: Stefan Reiter --- block/backup.c | 4 1 file changed, 4 deletions(-) diff --git a/block/backup.c b/block/backup.c index 7430ca5883..a7a7dcaf4c 100644 --- a/block/backup.c +++ b/block/backup.c @@ -126,11 +126,7 @@ static void backup_abort(Job *job) static void backup_clean(Job *job) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); -AioContext *aio_context = bdrv_get_aio_context(s->backup_top); - -aio_context_acquire(aio_context); bdrv_backup_top_drop(s->backup_top); -aio_context_release(aio_context); } void backup_do_checkpoint(BlockJob *job, Error **errp) -- 2.26.0
[PATCH v2 0/3] Fix some AIO context locking in jobs
Contains three separate but related patches cleaning up and fixing some issues regarding aio_context_acquire/aio_context_release for jobs. Mostly affects blockjobs running for devices that have IO threads enabled AFAICT. This is based on the discussions here: https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07929.html I *think* the second patch also fixes the hangs on backup abort that I and Dietmar noticed in v1, but I'm not sure, they were somewhat intermittent before too. Changes from v1: * fixed commit message for patch 1 * added patches 2 and 3 Stefan Reiter (3): backup: don't acquire aio_context in backup_clean job: take each job's lock individually in job_txn_apply replication: acquire aio context before calling job_cancel_sync block/backup.c | 4 block/replication.c | 6 +- job.c | 32 3 files changed, 29 insertions(+), 13 deletions(-) -- 2.26.0
Re: [PATCH v1 12/22] intel_iommu: add PASID cache management infrastructure
On Thu, Mar 26, 2020 at 01:57:10PM +, Liu, Yi L wrote: > > From: Liu, Yi L > > Sent: Thursday, March 26, 2020 2:15 PM > > To: 'Peter Xu' > > Subject: RE: [PATCH v1 12/22] intel_iommu: add PASID cache management > > infrastructure > > > > > From: Peter Xu > > > Sent: Wednesday, March 25, 2020 10:52 PM > > > To: Liu, Yi L > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache management > > > infrastructure > > > > > > On Wed, Mar 25, 2020 at 12:20:21PM +, Liu, Yi L wrote: > > > > > From: Peter Xu > > > > > Sent: Wednesday, March 25, 2020 1:32 AM > > > > > To: Liu, Yi L > > > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache > > > > > management infrastructure > > > > > > > > > > On Sun, Mar 22, 2020 at 05:36:09AM -0700, Liu Yi L wrote: > > > > > > This patch adds a PASID cache management infrastructure based on > > > > > > new added structure VTDPASIDAddressSpace, which is used to track > > > > > > the PASID usage and future PASID tagged DMA address translation > > > > > > support in vIOMMU. > [...] > > > > > > > > > > > > > > > > > > > > > +/* > > > > > > + * Step 2: loop all the exisitng vtd_dev_icx instances. > > > > > > + * Ideally, needs to loop all devices to find if there is any > > > > > > new > > > > > > + * PASID binding regards to the PASID cache invalidation > > > > > > request. > > > > > > + * But it is enough to loop the devices which are backed by > > > > > > host > > > > > > + * IOMMU. For devices backed by vIOMMU (a.k.a emulated > > > > > > devices), > > > > > > + * if new PASID happened on them, their vtd_pasid_as instance > > > > > > could > > > > > > + * be created during future vIOMMU DMA translation. 
> > > > > > + */ > > > > > > +QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) { > > > > > > +VTDPASIDAddressSpace *vtd_pasid_as; > > > > > > +VTDPASIDCacheEntry *pc_entry; > > > > > > +VTDPASIDEntry pe; > > > > > > +VTDBus *vtd_bus = vtd_dev_icx->vtd_bus; > > > > > > +uint16_t devfn = vtd_dev_icx->devfn; > > > > > > +int bus_n = pci_bus_num(vtd_bus->bus); > > > > > > + > > > > > > +/* i) fetch vtd_pasid_as and check if it is valid */ > > > > > > +vtd_pasid_as = vtd_add_find_pasid_as(s, vtd_bus, > > > > > > + devfn, pasid); > > > > > > > > > > I don't feel like it's correct here... > > > > > > > > > > Assuming we have two devices assigned D1, D2. D1 uses PASID=1, D2 > > > > > uses > > > PASID=2. > > > > > When invalidating against PASID=1, are you also going to create a > > > > > VTDPASIDAddressSpace also for D2 with PASID=1? > > > > > > > > Answer is no. Before going forward, let's see if the below flow looks > > > > good to you. > > > > > > > > Let me add one more device besides D1 and D2. Say device D3 which > > > > also uses PASID=1. And assume it begins with no PASID usage in guest. > > > > > > > > Then the flow from scratch is: > > > > > > > > a) guest IOMMU driver setup PASID entry for D1 with PASID=1, > > > >then invalidates against PASID #1 > > > > b) trap to QEMU, and comes to this function. Since there is > > > >no previous pasid cache invalidation, so the Step 1 of this > > > >function has nothing to do, then goes to Step 2 which is to > > > >loop all assigned devices and check if the guest pasid entry > > > >is present. In this loop, should find D1's pasid entry for > > > >PASID#1 is present. So create the VTDPASIDAddressSpace and > > > >initialize its pasid_cache_entry and pasid_cache_gen fields. 
> > > > c) guest IOMMU driver setup PASID entry for D2 with PASID=2, > > > >then invalidates against PASID #2 > > > > d) same with b), only difference is the Step 1 of this function > > > >will loop the VTDPASIDAddressSpace created in b), but its > > > >pasid is 1 which is not the target of current pasid cache > > > >invalidation. Same with b), in Step 2, it will create a > > > >VTDPASIDAddressSpace for D2+PASID#2 > > > > e) guest IOMMU driver setup PASID entry for D3 with PASID=1, > > > >then invalidates against PASID #1 > > > > f) trap to QEMU and comes to this function. Step 1 loops two > > > >VTDPASIDAddressSpace created in b) and d), and it finds there > > > >is a VTDPASIDAddressSpace whose pasid is 1. vtd_flush_pasid() > > > >will check if the cached pasid entry is the same with the one > > > >in guest memory. In this flow, it should be the same, so > > > >vtd_flush_pasid() will do nothing for it. Then comes to Step 2, > > > >it loops D1, D2, D3. > > > >- For D1, no need to do more since there is already a > > > > VTDPASIDAddressSpace created for D1+PASID#1. > > > >- For D2, its guest pasid entry for PASID#1 is not present, so > > > > no need to do anything for it. > > > >- For D3, its guest pasid entry for PASID#1 is present and it > > > > is exactly the reason for this
Re: [PATCH v2 0/2] mirror: Fix hang (operation waiting for itself/circular dependency)
On 3/26/20 10:36 AM, Kevin Wolf wrote: The recent fix didn't actually fix the whole problem. Operations can not only wait for themselves, but we can also end up with circular dependencies like two operations waiting for each other to complete. This reverts the first fix and implements another approach. v2: - Mark active mirror operations as in-flight, too Kevin Wolf (2): Revert "mirror: Don't let an operation wait for itself" mirror: Wait only for in-flight operations Series: Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v9 4/9] virtio-iommu: set supported page size mask
Hi Bharat, On 3/23/20 9:46 AM, Bharat Bhushan wrote: > Add optional interface to set page size mask. > Currently this is set global configuration and not > per endpoint. This allows to override the page size mask per end-point? > > Signed-off-by: Bharat Bhushan > --- > include/hw/virtio/virtio-iommu.h | 1 + > hw/virtio/virtio-iommu.c | 9 + > 2 files changed, 10 insertions(+) > > diff --git a/include/hw/virtio/virtio-iommu.h > b/include/hw/virtio/virtio-iommu.h > index 6f67f1020a..4efa09610a 100644 > --- a/include/hw/virtio/virtio-iommu.h > +++ b/include/hw/virtio/virtio-iommu.h > @@ -35,6 +35,7 @@ typedef struct IOMMUDevice { > void *viommu; > PCIBus *bus; > int devfn; > +uint64_t page_size_mask; > IOMMUMemoryRegion iommu_mr; > AddressSpace as; > } IOMMUDevice; > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 4cee8083bc..a28818202c 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -650,6 +650,14 @@ static gint int_cmp(gconstpointer a, gconstpointer b, > gpointer user_data) > return (ua > ub) - (ua < ub); > } > > +static void virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr, > +uint64_t page_size_mask) > +{ > +IOMMUDevice *sdev = container_of(mr, IOMMUDevice, iommu_mr); > + > +sdev->page_size_mask = page_size_mask; > +} > + > static void virtio_iommu_device_realize(DeviceState *dev, Error **errp) > { > VirtIODevice *vdev = VIRTIO_DEVICE(dev); > @@ -865,6 +873,7 @@ static void > virtio_iommu_memory_region_class_init(ObjectClass *klass, > IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass); > > imrc->translate = virtio_iommu_translate; > +imrc->iommu_set_page_size_mask = virtio_iommu_set_page_size_mask; > } > > static const TypeInfo virtio_iommu_info = { > Thanks Eric
Re: [PATCH v9 8/9] virtio-iommu: Implement probe request
Hi Bharat On 3/23/20 9:46 AM, Bharat Bhushan wrote: > This patch implements the PROBE request. Currently supported > page size mask per endpoint is returned. Also append a NONE > property in the end. > > Signed-off-by: Bharat Bhushan > Signed-off-by: Eric Auger > --- > include/standard-headers/linux/virtio_iommu.h | 6 + Changes to virtio_iommu.h should be in a separate patch you should use ./scripts/update-linux-headers.sh See for instance: ddda37483d linux-headers: update until the uapi updates are not upstream you can link to your kernel branch and mention this is a temporary linux header update or partial if you just want to pick up the iommu.h changes. > hw/virtio/virtio-iommu.c | 161 +- > hw/virtio/trace-events| 2 + > 3 files changed, 166 insertions(+), 3 deletions(-) > > diff --git a/include/standard-headers/linux/virtio_iommu.h > b/include/standard-headers/linux/virtio_iommu.h > index b9443b83a1..8a0d47b907 100644 > --- a/include/standard-headers/linux/virtio_iommu.h > +++ b/include/standard-headers/linux/virtio_iommu.h > @@ -111,6 +111,7 @@ struct virtio_iommu_req_unmap { > > #define VIRTIO_IOMMU_PROBE_T_NONE0 > #define VIRTIO_IOMMU_PROBE_T_RESV_MEM1 > +#define VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK 2 > > #define VIRTIO_IOMMU_PROBE_T_MASK0xfff > > @@ -130,6 +131,11 @@ struct virtio_iommu_probe_resv_mem { > uint64_tend; > }; > > +struct virtio_iommu_probe_pgsize_mask { > + struct virtio_iommu_probe_property head; > + uint64_tpgsize_bitmap; > +}; > + > struct virtio_iommu_req_probe { > struct virtio_iommu_req_headhead; > uint32_tendpoint; > diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c > index 747e3cf1da..63fbacdcdc 100644 > --- a/hw/virtio/virtio-iommu.c > +++ b/hw/virtio/virtio-iommu.c > @@ -38,6 +38,10 @@ > > /* Max size */ > #define VIOMMU_DEFAULT_QUEUE_SIZE 256 > +#define VIOMMU_PROBE_SIZE 512 > + > +#define SUPPORTED_PROBE_PROPERTIES (\ > +1 << VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK) > > typedef struct VirtIOIOMMUDomain { > uint32_t id; > 
@@ -62,6 +66,13 @@ typedef struct VirtIOIOMMUMapping { > uint32_t flags; > } VirtIOIOMMUMapping; > > +typedef struct VirtIOIOMMUPropBuffer { > +VirtIOIOMMUEndpoint *endpoint; > +size_t filled; > +uint8_t *start; > +bool error; > +} VirtIOIOMMUPropBuffer; > + > static inline uint16_t virtio_iommu_get_bdf(IOMMUDevice *dev) > { > return PCI_BUILD_BDF(pci_bus_num(dev->bus), dev->devfn); > @@ -490,6 +501,114 @@ static int virtio_iommu_unmap(VirtIOIOMMU *s, > return ret; > } > > +static int virtio_iommu_fill_none_prop(VirtIOIOMMUPropBuffer *bufstate) > +{ > +struct virtio_iommu_probe_property *prop; > + > +prop = (struct virtio_iommu_probe_property *) > +(bufstate->start + bufstate->filled); > +prop->type = 0; > +prop->length = 0; > +bufstate->filled += sizeof(*prop); > +trace_virtio_iommu_fill_none_property(bufstate->endpoint->id); > +return 0; > +} > + > +static int virtio_iommu_fill_page_size_mask(VirtIOIOMMUPropBuffer *bufstate) > +{ > +struct virtio_iommu_probe_pgsize_mask *page_size_mask; > +size_t prop_size = sizeof(*page_size_mask); > +VirtIOIOMMUEndpoint *ep = bufstate->endpoint; > +VirtIOIOMMU *s = ep->viommu; > +IOMMUDevice *sdev; > + > +if (bufstate->filled + prop_size >= VIOMMU_PROBE_SIZE) { > +bufstate->error = true; > +/* get the traversal stopped by returning true */ > +return true; > +} > + > +page_size_mask = (struct virtio_iommu_probe_pgsize_mask *) > + (bufstate->start + bufstate->filled); > + > +page_size_mask->head.type = VIRTIO_IOMMU_PROBE_T_PAGE_SIZE_MASK; > +page_size_mask->head.length = prop_size; > +QLIST_FOREACH(sdev, >notifiers_list, next) { > +if (ep->id == sdev->devfn) { > +page_size_mask->pgsize_bitmap = sdev->page_size_mask; > + } > +} > +bufstate->filled += sizeof(*page_size_mask); > +trace_virtio_iommu_fill_pgsize_mask_property(bufstate->endpoint->id, > + > page_size_mask->pgsize_bitmap, > + bufstate->filled); > +return false; > +} > + > +/* Fill the properties[] buffer with properties of type @type */ > +static int 
virtio_iommu_fill_property(int type, > + VirtIOIOMMUPropBuffer *bufstate) > +{ > +int ret = -ENOSPC; > + > +if (bufstate->filled + sizeof(struct virtio_iommu_probe_property) > +>= VIOMMU_PROBE_SIZE) { > +/* no space left for the header */ > +bufstate->error = true; > +goto out; > +} > + > +switch (type) { > +case
Re: [PULL 0/6] Linux user for 5.0 patches
On Thu, 26 Mar 2020 at 07:24, Laurent Vivier wrote: > > The following changes since commit 736cf607e40674776d752acc201f565723e86045: > > Update version for v5.0.0-rc0 release (2020-03-24 17:50:00 +) > > are available in the Git repository at: > > git://github.com/vivier/qemu.git tags/linux-user-for-5.0-pull-request > > for you to fetch changes up to a52f5f87bece827a338d6eb3332e3def86fb9c33: > > linux-user: Flush out implementation of gettimeofday (2020-03-26 08:08:54 > +0100) > > > Emulate x86_64 vsyscalls > Fix syscall_nr.h cleanup > > Still fails :-( /home/petmay01/linaro/qemu-for-merges/build/all-linux-static/x86_64-linux-user/qemu-x86_64 -L ./gnemul/qemu-x86_64 x86_64/ls -l dummyfile qemu: 0x40008117e9: unhandled CPU exception 0x101 - aborting RAX=003f RBX=6e34 RCX=004000800b18 RDX=004000813180 RSI=0064 RDI=004000800670 RBP=6f40 RSP=004000800668 R8 = R9 =004000800b44 R10=004000801a18 R11=004000801260 R12=0040008008c0 R13=0008 R14=00400040 R15=0040008032d0 RIP=0040008117e9 RFL=0246 [---Z-P-] CPL=3 II=0 A20=1 SMM=0 HLT=0 ES = CS =0033 00effb00 DPL=3 CS64 [-RA] SS =002b 00cff300 DPL=3 DS [-WA] DS = FS = GS = LDT= 8200 DPL=0 LDT TR = 8b00 DPL=0 TSS64-busy GDT= 00400091a000 007f IDT= 004000919000 01ff CR0=80010001 CR2= CR3= CR4=0220 DR0= DR1= DR2= DR3= DR6=0ff0 DR7=0400 EFER=0500 Makefile:6: recipe for target 'test' failed thanks -- PMM
[PATCH v2 2/2] mirror: Wait only for in-flight operations
mirror_wait_for_free_in_flight_slot() just picks a random operation to wait for. However, a MirrorOp is already in s->ops_in_flight when mirror_co_read() waits for free slots, so if not enough slots are immediately available, an operation can end up waiting for itself, or two or more operations can wait for each other to complete, which results in a hang. Fix this by adding a flag to MirrorOp that tells us if the request is already in flight (and therefore occupies slots that it will later free), and picking only such operations for waiting. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692 Signed-off-by: Kevin Wolf --- block/mirror.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/mirror.c b/block/mirror.c index 393131b135..88414d1653 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -102,6 +102,7 @@ struct MirrorOp { bool is_pseudo_op; bool is_active_write; +bool is_in_flight; CoQueue waiting_requests; Coroutine *co; @@ -293,7 +294,9 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) * caller of this function. Since there is only one pseudo op * at any given time, we will always find some real operation * to wait on. */ -if (!op->is_pseudo_op && op->is_active_write == active) { +if (!op->is_pseudo_op && op->is_in_flight && +op->is_active_write == active) +{ qemu_co_queue_wait(>waiting_requests, NULL); return; } @@ -367,6 +370,7 @@ static void coroutine_fn mirror_co_read(void *opaque) /* Copy the dirty cluster. */ s->in_flight++; s->bytes_in_flight += op->bytes; +op->is_in_flight = true; trace_mirror_one_iteration(s, op->offset, op->bytes); ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes, @@ -382,6 +386,7 @@ static void coroutine_fn mirror_co_zero(void *opaque) op->s->in_flight++; op->s->bytes_in_flight += op->bytes; *op->bytes_handled = op->bytes; +op->is_in_flight = true; ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes, op->s->unmap ? 
BDRV_REQ_MAY_UNMAP : 0); @@ -396,6 +401,7 @@ static void coroutine_fn mirror_co_discard(void *opaque) op->s->in_flight++; op->s->bytes_in_flight += op->bytes; *op->bytes_handled = op->bytes; +op->is_in_flight = true; ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes); mirror_write_complete(op, ret); @@ -1318,6 +1324,7 @@ static MirrorOp *coroutine_fn active_write_prepare(MirrorBlockJob *s, .offset = offset, .bytes = bytes, .is_active_write= true, +.is_in_flight = true, }; qemu_co_queue_init(>waiting_requests); QTAILQ_INSERT_TAIL(>ops_in_flight, op, next); -- 2.20.1
[PATCH v2 1/2] Revert "mirror: Don't let an operation wait for itself"
This reverts commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca. The fix was incomplete as it only protected against requests waiting for themselves, but not against requests waiting for each other. We need a different solution. Signed-off-by: Kevin Wolf --- block/mirror.c | 21 + 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/block/mirror.c b/block/mirror.c index 447051dbc6..393131b135 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -283,14 +283,11 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t *offset, } static inline void coroutine_fn -mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) +mirror_wait_for_any_operation(MirrorBlockJob *s, bool active) { MirrorOp *op; QTAILQ_FOREACH(op, >ops_in_flight, next) { -if (self == op) { -continue; -} /* Do not wait on pseudo ops, because it may in turn wait on * some other operation to start, which may in fact be the * caller of this function. Since there is only one pseudo op @@ -305,10 +302,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active) } static inline void coroutine_fn -mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self) +mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s) { /* Only non-active operations use up in-flight slots */ -mirror_wait_for_any_operation(s, self, false); +mirror_wait_for_any_operation(s, false); } /* Perform a mirror copy operation. 
@@ -351,7 +348,7 @@ static void coroutine_fn mirror_co_read(void *opaque) while (s->buf_free_count < nb_chunks) { trace_mirror_yield_in_flight(s, op->offset, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, op); +mirror_wait_for_free_in_flight_slot(s); } /* Now make a QEMUIOVector taking enough granularity-sized chunks @@ -558,7 +555,7 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) while (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield_in_flight(s, offset, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, pseudo_op); +mirror_wait_for_free_in_flight_slot(s); } if (s->ret < 0) { @@ -612,7 +609,7 @@ static void mirror_free_init(MirrorBlockJob *s) static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s) { while (s->in_flight > 0) { -mirror_wait_for_free_in_flight_slot(s, NULL); +mirror_wait_for_free_in_flight_slot(s); } } @@ -809,7 +806,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s) if (s->in_flight >= MAX_IN_FLIGHT) { trace_mirror_yield(s, UINT64_MAX, s->buf_free_count, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, NULL); +mirror_wait_for_free_in_flight_slot(s); continue; } @@ -962,7 +959,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) /* Do not start passive operations while there are active * writes in progress */ while (s->in_active_write_counter) { -mirror_wait_for_any_operation(s, NULL, true); +mirror_wait_for_any_operation(s, true); } if (s->ret < 0) { @@ -988,7 +985,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp) if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight); -mirror_wait_for_free_in_flight_slot(s, NULL); +mirror_wait_for_free_in_flight_slot(s); continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); -- 2.20.1
[PATCH v2 0/2] mirror: Fix hang (operation waiting for itself/circular dependency)
The recent fix didn't actually fix the whole problem. Operations can not only wait for themselves, but we can also end up with circular dependencies like two operations waiting for each other to complete. This reverts the first fix and implements another approach. v2: - Mark active mirror operations as in-flight, too Kevin Wolf (2): Revert "mirror: Don't let an operation wait for itself" mirror: Wait only for in-flight operations block/mirror.c | 30 +- 1 file changed, 17 insertions(+), 13 deletions(-) -- 2.20.1
[PATCH] gdbstub: fix compiler complaining
./gdbstub.c: In function ‘handle_query_thread_extra’: /usr/include/glib-2.0/glib/glib-autocleanups.h:28:10: error: ‘cpu_name’ may be used uninitialized in this function [-Werror=maybe-uninitialized] g_free (*pp); ^ ./gdbstub.c:2063:26: note: ‘cpu_name’ was declared here g_autofree char *cpu_name; ^ cc1: all warnings being treated as errors Signed-off-by: Denis Plotnikov --- gdbstub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gdbstub.c b/gdbstub.c index 013fb1ac0f..171e150950 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -2060,8 +2060,8 @@ static void handle_query_thread_extra(GdbCmdContext *gdb_ctx, void *user_ctx) /* Print the CPU model and name in multiprocess mode */ ObjectClass *oc = object_get_class(OBJECT(cpu)); const char *cpu_model = object_class_get_name(oc); -g_autofree char *cpu_name; -cpu_name = object_get_canonical_path_component(OBJECT(cpu)); +g_autofree char *cpu_name = +object_get_canonical_path_component(OBJECT(cpu)); g_string_printf(rs, "%s %s [%s]", cpu_model, cpu_name, cpu->halted ? "halted " : "running"); } else { -- 2.17.0
[Bug 1869006] Re: PCIe cards passthrough to TCG guest works on 2GB of guest memory but fails on 4GB (vfio_dma_map invalid arg)
I wanted to try the same on machine without MSI. But my desktop refuses to boot into sane environment with pci=nomsi kernel option. -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1869006 Title: PCIe cards passthrough to TCG guest works on 2GB of guest memory but fails on 4GB (vfio_dma_map invalid arg) Status in QEMU: New Bug description: During one meeting coworker asked "did someone tried to passthrough PCIe card to other arch guest?" and I decided to check it. Plugged SATA and USB3 controllers into spare slots on mainboard and started playing. On 1GB VM instance it worked (both cold- and hot- plugged). On 4GB one it did not: Błąd podczas uruchamiania domeny: internal error: process exited while connecting to monitor: 2020-03-25T13:43:39.107524Z qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: VFIO_MAP_DMA: -22 2020-03-25T13:43:39.107560Z qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: vfio :29:00.0: failed to setup container for group 28: memory listener initialization failed: Region mach-virt.ram: vfio_dma_map(0x563169753c80, 0x4000, 0x1, 0x7fb2a3e0) = -22 (Invalid argument) Traceback (most recent call last): File "/usr/share/virt-manager/virtManager/asyncjob.py", line 75, in cb_wrapper callback(asyncjob, *args, **kwargs) File "/usr/share/virt-manager/virtManager/asyncjob.py", line 111, in tmpcb callback(*args, **kwargs) File "/usr/share/virt-manager/virtManager/object/libvirtobject.py", line 66, in newfn ret = fn(self, *args, **kwargs) File "/usr/share/virt-manager/virtManager/object/domain.py", line 1279, in startup self._backend.create() File "/usr/lib64/python3.8/site-packages/libvirt.py", line 1234, in create if ret == -1: raise libvirtError ('virDomainCreate() failed', dom=self) libvirt.libvirtError: internal error: process exited while connecting to monitor: 2020-03-25T13:43:39.107524Z 
qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: VFIO_MAP_DMA: -22 2020-03-25T13:43:39.107560Z qemu-system-aarch64: -device vfio-pci,host=:29:00.0,id=hostdev0,bus=pci.3,addr=0x0: vfio :29:00.0: failed to setup container for group 28: memory listener initialization failed: Region mach-virt.ram: vfio_dma_map(0x563169753c80, 0x4000, 0x1, 0x7fb2a3e0) = -22 (Invalid argument) I played with memory and 3054 MB is maximum value possible to boot VM with coldplugged host PCIe cards. To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1869006/+subscriptions
[PULL 10/10] iotests/138: Test leaks/corruptions fixed report
Test that qemu-img check reports the number of leaks and corruptions fixed in its JSON report (after a successful run). While touching the _unsupported_imgopts line, adjust the note on why data_file does not work with this test: The current comment sounds a bit like it is a mistake for qemu-img check not to check external data files' refcounts. But there are no such refcounts, so it is no mistake. Just say that qemu-img check does not do much for external data files, and this is why this test does not work with them. Signed-off-by: Max Reitz Message-Id: <20200324172757.1173824-4-mre...@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- tests/qemu-iotests/138 | 41 -- tests/qemu-iotests/138.out | 14 + 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/138 b/tests/qemu-iotests/138 index 54b01046ad..1d5b0bed6d 100755 --- a/tests/qemu-iotests/138 +++ b/tests/qemu-iotests/138 @@ -41,8 +41,10 @@ _supported_fmt qcow2 _supported_proto file _supported_os Linux # With an external data file, data clusters are not refcounted -# (and so qemu-img check does not check their refcount) -_unsupported_imgopts data_file +# (so qemu-img check would not do much); +# we want to modify the refcounts, so we need them to have a specific +# format (namely u16) +_unsupported_imgopts data_file 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' echo echo '=== Check on an image with a multiple of 2^32 clusters ===' @@ -65,6 +67,41 @@ poke_file "$TEST_IMG" $((2048 + 8)) "\x00\x80\x00\x00\x00\x00\x00\x00" # allocate memory", we have an error showing that l2 entry is invalid. 
_check_test_img +echo +echo '=== Check leaks-fixed/corruptions-fixed report' +echo + +# After leaks and corruptions were fixed, those numbers should be +# reported by qemu-img check +_make_test_img 64k + +# Allocate data cluster +$QEMU_IO -c 'write 0 64k' "$TEST_IMG" | _filter_qemu_io + +reftable_ofs=$(peek_file_be "$TEST_IMG" 48 8) +refblock_ofs=$(peek_file_be "$TEST_IMG" $reftable_ofs 8) + +# Introduce a leak: Make the image header's refcount 2 +poke_file_be "$TEST_IMG" "$refblock_ofs" 2 2 + +l1_ofs=$(peek_file_be "$TEST_IMG" 40 8) + +# Introduce a corruption: Drop the COPIED flag from the (first) L1 entry +l1_entry=$(peek_file_be "$TEST_IMG" $l1_ofs 8) +l1_entry=$((l1_entry & ~(1 << 63))) +poke_file_be "$TEST_IMG" $l1_ofs 8 $l1_entry + +echo +# Should print the number of corruptions and leaks fixed +# (Filter out all JSON fields (recognizable by their four-space +# indentation), but keep the "-fixed" fields (by removing two spaces +# from their indentation)) +# (Also filter out the L1 entry, because why not) +_check_test_img -r all --output=json \ +| sed -e 's/^ \(.*\)-fixed"/\1-fixed"/' \ + -e '/^/d' \ + -e "s/\\([^0-9a-f]\\)$(printf %x $l1_entry)\\([^0-9a-f]\\)/\1L1_ENTRY_VALUE\2/" + # success, all done echo "*** done" rm -f $seq.full diff --git a/tests/qemu-iotests/138.out b/tests/qemu-iotests/138.out index aca7d47a80..79681e7cc9 100644 --- a/tests/qemu-iotests/138.out +++ b/tests/qemu-iotests/138.out @@ -9,4 +9,18 @@ ERROR: counting reference for region exceeding the end of the file by one cluste 1 errors were found on the image. Data may be corrupted, or further writes to the image may corrupt it. 
+ +=== Check leaks-fixed/corruptions-fixed report + +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) + +Leaked cluster 0 refcount=2 reference=1 +Repairing cluster 0 refcount=2 reference=1 +Repairing OFLAG_COPIED L2 cluster: l1_index=0 l1_entry=L1_ENTRY_VALUE refcount=1 +{ + "corruptions-fixed": 1, + "leaks-fixed": 1, +} *** done -- 2.25.1
[PULL 08/10] qemu-img: Fix check's leak/corruption fix report
There are two problems with qemu-img check's report on how many leaks and/or corruptions have been fixed: (1) ImageCheck.has_leaks_fixed and ImageCheck.has_corruptions_fixed are only true when ImageCheck.leaks or ImageCheck.corruptions (respectively) are non-zero. qcow2's check implementation will set the latter to zero after it has fixed leaks and corruptions, though, so leaks-fixed and corruptions-fixed are actually never reported after successful repairs. We should always report them when they are non-zero, just like all the other fields of ImageCheck. (2) After something has been fixed and we run the check a second time, leaks_fixed and corruptions_fixed are taken from the first run; but has_leaks_fixed and has_corruptions_fixed are not. The second run actually cannot fix anything, so with (1) fixed, has_leaks_fixed and has_corruptions_fixed will always be false here. (With (1) unfixed, they will at least be false on successful runs, because then the number of leaks and corruptions found in the second run should be 0.) We should save has_leaks_fixed and has_corruptions_fixed just like we save leaks_fixed and corruptions_fixed. 
Signed-off-by: Max Reitz Reviewed-by: Eric Blake Message-Id: <20200324172757.1173824-2-mre...@redhat.com> Signed-off-by: Max Reitz --- qemu-img.c | 9 +++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index afddf33f08..b167376bd7 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -647,9 +647,9 @@ static int collect_image_check(BlockDriverState *bs, check->leaks= result.leaks; check->has_leaks= result.leaks != 0; check->corruptions_fixed= result.corruptions_fixed; -check->has_corruptions_fixed= result.corruptions != 0; +check->has_corruptions_fixed= result.corruptions_fixed != 0; check->leaks_fixed = result.leaks_fixed; -check->has_leaks_fixed = result.leaks != 0; +check->has_leaks_fixed = result.leaks_fixed != 0; check->image_end_offset = result.image_end_offset; check->has_image_end_offset = result.image_end_offset != 0; check->total_clusters = result.bfi.total_clusters; @@ -803,9 +803,12 @@ static int img_check(int argc, char **argv) if (check->corruptions_fixed || check->leaks_fixed) { int corruptions_fixed, leaks_fixed; +bool has_leaks_fixed, has_corruptions_fixed; leaks_fixed = check->leaks_fixed; +has_leaks_fixed = check->has_leaks_fixed; corruptions_fixed = check->corruptions_fixed; +has_corruptions_fixed = check->has_corruptions_fixed; if (output_format == OFORMAT_HUMAN) { qprintf(quiet, @@ -822,7 +825,9 @@ static int img_check(int argc, char **argv) ret = collect_image_check(bs, check, filename, fmt, 0); check->leaks_fixed = leaks_fixed; +check->has_leaks_fixed = has_leaks_fixed; check->corruptions_fixed= corruptions_fixed; +check->has_corruptions_fixed = has_corruptions_fixed; } if (!ret) { -- 2.25.1
[PULL 06/10] qcow2: Avoid feature name extension on small cluster size
From: Eric Blake As the feature name table can be quite large (over 9k if all 64 bits of all three feature fields have names; a mere 8 features leaves only 8 bytes for a backing file name in a 512-byte cluster), it is unwise to emit this optional header in images with small cluster sizes. Update iotest 036 to skip running on small cluster sizes; meanwhile, note that iotest 061 never passed on alternative cluster sizes (however, I limited this patch to tests with output affected by adding feature names, rather than auditing for other tests that are not robust to alternative cluster sizes). Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Message-Id: <20200324174233.1622067-4-ebl...@redhat.com> Signed-off-by: Max Reitz --- block/qcow2.c | 11 +-- tests/qemu-iotests/036 | 6 -- tests/qemu-iotests/061 | 6 -- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index b74cbeb047..2bb536b014 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2823,8 +2823,15 @@ int qcow2_update_header(BlockDriverState *bs) buflen -= ret; } -/* Feature table */ -if (s->qcow_version >= 3) { +/* + * Feature table. A mere 8 feature names occupies 392 bytes, and + * when coupled with the v3 minimum header of 104 bytes plus the + * 8-byte end-of-extension marker, that would leave only 8 bytes + * for a backing file name in an image with 512-byte clusters. + * Thus, we choose to omit this header for cluster sizes 4k and + * smaller. 
+ */ +if (s->qcow_version >= 3 && s->cluster_size > 4096) { static const Qcow2Feature features[] = { { .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, diff --git a/tests/qemu-iotests/036 b/tests/qemu-iotests/036 index 512598421c..cf522de7a1 100755 --- a/tests/qemu-iotests/036 +++ b/tests/qemu-iotests/036 @@ -44,8 +44,10 @@ trap "_cleanup; exit \$status" 0 1 2 3 15 _supported_fmt qcow2 _supported_proto file # Only qcow2v3 and later supports feature bits; -# qcow2.py does not support external data files -_unsupported_imgopts 'compat=0.10' data_file +# qcow2.py does not support external data files; +# this test requires a cluster size large enough for the feature table +_unsupported_imgopts 'compat=0.10' data_file \ +'cluster_size=\(512\|1024\|2048\|4096\)' echo echo === Image with unknown incompatible feature bit === diff --git a/tests/qemu-iotests/061 b/tests/qemu-iotests/061 index 36b040491f..ce285d3084 100755 --- a/tests/qemu-iotests/061 +++ b/tests/qemu-iotests/061 @@ -44,8 +44,10 @@ _supported_os Linux # Conversion between different compat versions can only really work # with refcount_bits=16; # we have explicit tests for data_file here, but the whole test does -# not work with it -_unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file +# not work with it; +# we have explicit tests for various cluster sizes, the remaining tests +# require the default 64k cluster +_unsupported_imgopts 'refcount_bits=\([^1]\|.\([^6]\|$\)\)' data_file cluster_size echo echo "=== Testing version downgrade with zero expansion ===" -- 2.25.1
[PULL 09/10] iotests: Add poke_file_[bl]e functions
Similarly to peek_file_[bl]e, we may want to write binary integers into a file. Currently, this often means messing around with poke_file and raw binary strings. I hope these functions make it a bit more comfortable. Signed-off-by: Max Reitz Code-suggested-by: Eric Blake Message-Id: <20200324172757.1173824-3-mre...@redhat.com> Reviewed-by: Eric Blake Signed-off-by: Max Reitz --- tests/qemu-iotests/common.rc | 24 1 file changed, 24 insertions(+) diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index 4c246c0450..bf3b9fdea0 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -53,6 +53,30 @@ poke_file() printf "$3" | dd "of=$1" bs=1 "seek=$2" conv=notrunc &>/dev/null } +# poke_file_le $img_filename $offset $byte_width $value +# Example: poke_file_le "$TEST_IMG" 512 2 65534 +poke_file_le() +{ +local img=$1 ofs=$2 len=$3 val=$4 str='' + +while ((len--)); do +str+=$(printf '\\x%02x' $((val & 0xff))) +val=$((val >> 8)) +done + +poke_file "$img" "$ofs" "$str" +} + +# poke_file_be $img_filename $offset $byte_width $value +# Example: poke_file_be "$TEST_IMG" 512 2 65279 +poke_file_be() +{ +local img=$1 ofs=$2 len=$3 val=$4 +local str=$(printf "%0$((len * 2))x\n" $val | sed 's/\(..\)/\\x\1/g') + +poke_file "$img" "$ofs" "$str" +} + # peek_file_le 'test.img' 512 2 => 65534 peek_file_le() { -- 2.25.1
[PULL 05/10] qcow2: List autoclear bit names in header
From: Eric Blake The feature table is supposed to advertise the name of all feature bits that we support; however, we forgot to update the table for autoclear bits. While at it, move the table to read-only memory in code, and tweak the qcow2 spec to name the second autoclear bit. Update iotests that are affected by the longer header length. Fixes: 88ddffae Fixes: 93c24936 Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200324174233.1622067-3-ebl...@redhat.com> Signed-off-by: Max Reitz --- docs/interop/qcow2.txt | 3 ++- block/qcow2.c | 12 +++- tests/qemu-iotests/031.out | 8 tests/qemu-iotests/036.out | 4 ++-- tests/qemu-iotests/061.out | 14 +++--- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/docs/interop/qcow2.txt b/docs/interop/qcow2.txt index 5597e24474..640e0eca40 100644 --- a/docs/interop/qcow2.txt +++ b/docs/interop/qcow2.txt @@ -143,7 +143,8 @@ the next fields through header_length. bit is unset, the bitmaps extension data must be considered inconsistent. -Bit 1: If this bit is set, the external data file can +Bit 1: Raw external data bit +If this bit is set, the external data file can be read as a consistent standalone raw image without looking at the qcow2 metadata. 
diff --git a/block/qcow2.c b/block/qcow2.c index b565cf912e..b74cbeb047 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -2825,7 +2825,7 @@ int qcow2_update_header(BlockDriverState *bs) /* Feature table */ if (s->qcow_version >= 3) { -Qcow2Feature features[] = { +static const Qcow2Feature features[] = { { .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, .bit = QCOW2_INCOMPAT_DIRTY_BITNR, @@ -2846,6 +2846,16 @@ int qcow2_update_header(BlockDriverState *bs) .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, .name = "lazy refcounts", }, +{ +.type = QCOW2_FEAT_TYPE_AUTOCLEAR, +.bit = QCOW2_AUTOCLEAR_BITMAPS_BITNR, +.name = "bitmaps", +}, +{ +.type = QCOW2_FEAT_TYPE_AUTOCLEAR, +.bit = QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR, +.name = "raw external data", +}, }; ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, diff --git a/tests/qemu-iotests/031.out b/tests/qemu-iotests/031.out index d535e407bc..46f97c5a4e 100644 --- a/tests/qemu-iotests/031.out +++ b/tests/qemu-iotests/031.out @@ -117,7 +117,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data Header extension: @@ -150,7 +150,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data Header extension: @@ -164,7 +164,7 @@ No errors were found on the image. 
magic 0x514649fb version 3 -backing_file_offset 0x178 +backing_file_offset 0x1d8 backing_file_size 0x17 cluster_bits 16 size 67108864 @@ -188,7 +188,7 @@ data 'host_device' Header extension: magic 0x6803f857 -length192 +length288 data Header extension: diff --git a/tests/qemu-iotests/036.out b/tests/qemu-iotests/036.out index 0b52b934e1..23b699ce06 100644 --- a/tests/qemu-iotests/036.out +++ b/tests/qemu-iotests/036.out @@ -26,7 +26,7 @@ compatible_features [] autoclear_features[63] Header extension: magic 0x6803f857 -length192 +length288 data @@ -38,7 +38,7 @@ compatible_features [] autoclear_features[] Header extension: magic 0x6803f857 -length192 +length288 data *** done diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out index 8b3091a412..413cc4e0f4 100644 --- a/tests/qemu-iotests/061.out +++ b/tests/qemu-iotests/061.out @@ -26,7 +26,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data magic 0x514649fb @@ -84,7 +84,7 @@ header_length 104 Header extension: magic 0x6803f857 -length192 +length288 data magic
[PULL 07/10] sheepdog: Consistently set bdrv_has_zero_init_truncate
From: Eric Blake block_int.h claims that .bdrv_has_zero_init must return 0 if .bdrv_has_zero_init_truncate does likewise; but this is violated if only the former callback is provided if .bdrv_co_truncate also exists. When adding the latter callback, it was mistakenly added to only one of the three possible sheepdog instantiations. Fixes: 1dcaf527 Signed-off-by: Eric Blake Message-Id: <20200324174233.1622067-5-ebl...@redhat.com> Reviewed-by: John Snow Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Max Reitz --- block/sheepdog.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/sheepdog.c b/block/sheepdog.c index a8a7e32a41..59f7ebb171 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -3271,6 +3271,7 @@ static BlockDriver bdrv_sheepdog_tcp = { .bdrv_co_create = sd_co_create, .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, +.bdrv_has_zero_init_truncate = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_co_truncate = sd_co_truncate, @@ -3309,6 +3310,7 @@ static BlockDriver bdrv_sheepdog_unix = { .bdrv_co_create = sd_co_create, .bdrv_co_create_opts = sd_co_create_opts, .bdrv_has_zero_init = bdrv_has_zero_init_1, +.bdrv_has_zero_init_truncate = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_co_truncate = sd_co_truncate, -- 2.25.1
[PULL 02/10] block: pass BlockDriver reference to the .bdrv_co_create
From: Maxim Levitsky This will allow the reuse of a single generic .bdrv_co_create implementation for several drivers. No functional changes. Signed-off-by: Maxim Levitsky Message-Id: <20200326011218.29230-2-mlevi...@redhat.com> Reviewed-by: Denis V. Lunev Signed-off-by: Max Reitz --- include/block/block_int.h | 3 ++- block.c | 3 ++- block/crypto.c| 3 ++- block/file-posix.c| 4 +++- block/file-win32.c| 4 +++- block/gluster.c | 3 ++- block/nfs.c | 4 +++- block/parallels.c | 3 ++- block/qcow.c | 3 ++- block/qcow2.c | 4 +++- block/qed.c | 3 ++- block/raw-format.c| 4 +++- block/rbd.c | 3 ++- block/sheepdog.c | 4 +++- block/ssh.c | 4 +++- block/vdi.c | 4 +++- block/vhdx.c | 3 ++- block/vmdk.c | 4 +++- block/vpc.c | 6 -- 19 files changed, 49 insertions(+), 20 deletions(-) diff --git a/include/block/block_int.h b/include/block/block_int.h index ae9c4da4d0..57c8ea24b2 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -135,7 +135,8 @@ struct BlockDriver { void (*bdrv_close)(BlockDriverState *bs); int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts, Error **errp); -int coroutine_fn (*bdrv_co_create_opts)(const char *filename, +int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv, +const char *filename, QemuOpts *opts, Error **errp); int (*bdrv_make_empty)(BlockDriverState *bs); diff --git a/block.c b/block.c index cccae5add9..ff23e20443 100644 --- a/block.c +++ b/block.c @@ -483,7 +483,8 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque) CreateCo *cco = opaque; assert(cco->drv); -ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, _err); +ret = cco->drv->bdrv_co_create_opts(cco->drv, +cco->filename, cco->opts, _err); error_propagate(>err, local_err); cco->ret = ret; } diff --git a/block/crypto.c b/block/crypto.c index 4425ebeb47..d577f89659 100644 --- a/block/crypto.c +++ b/block/crypto.c @@ -601,7 +601,8 @@ fail: return ret; } -static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename, +static 
int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv, + const char *filename, QemuOpts *opts, Error **errp) { diff --git a/block/file-posix.c b/block/file-posix.c index 9bc3838b2a..65bc980bc4 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -2405,7 +2405,9 @@ out: return result; } -static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, +static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + const char *filename, + QemuOpts *opts, Error **errp) { BlockdevCreateOptions options; diff --git a/block/file-win32.c b/block/file-win32.c index 77e8ff7b68..15859839a1 100644 --- a/block/file-win32.c +++ b/block/file-win32.c @@ -588,7 +588,9 @@ static int raw_co_create(BlockdevCreateOptions *options, Error **errp) return 0; } -static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts *opts, +static int coroutine_fn raw_co_create_opts(BlockDriver *drv, + const char *filename, + QemuOpts *opts, Error **errp) { BlockdevCreateOptions options; diff --git a/block/gluster.c b/block/gluster.c index 4fa4a77a47..0aa1f2cda4 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -1130,7 +1130,8 @@ out: return ret; } -static int coroutine_fn qemu_gluster_co_create_opts(const char *filename, +static int coroutine_fn qemu_gluster_co_create_opts(BlockDriver *drv, +const char *filename, QemuOpts *opts, Error **errp) { diff --git a/block/nfs.c b/block/nfs.c index 9a6311e270..cc2413d5ab 100644 --- a/block/nfs.c +++ b/block/nfs.c @@ -662,7 +662,9 @@ out: return ret; } -static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts *opts, +static int coroutine_fn nfs_file_co_create_opts(BlockDriver *drv, +const char *url, +QemuOpts *opts,
[PULL 04/10] qcow2: Comment typo fixes
From: Eric Blake Various trivial typos noticed while working on this file. Signed-off-by: Eric Blake Reviewed-by: Vladimir Sementsov-Ogievskiy Reviewed-by: Alberto Garcia Message-Id: <20200324174233.1622067-2-ebl...@redhat.com> Signed-off-by: Max Reitz --- block/qcow2.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/qcow2.c b/block/qcow2.c index 5f65fce924..b565cf912e 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -177,7 +177,7 @@ static ssize_t qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset, } -/* +/* * read qcow2 extension and fill bs * start reading from start_offset * finish reading upon magic of value 0 or when end_offset reached @@ -3255,7 +3255,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) * inconsistency later. * * We do need a refcount table because growing the refcount table means - * allocating two new refcount blocks - the seconds of which would be at + * allocating two new refcount blocks - the second of which would be at * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file * size for any qcow2 image. */ @@ -3500,7 +3500,7 @@ qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp) goto out; } -/* Want a backing file? There you go.*/ +/* Want a backing file? There you go. */ if (qcow2_opts->has_backing_file) { const char *backing_format = NULL; -- 2.25.1
[PULL 00/10] Block patches for 5.0-rc1
The following changes since commit 01e38186ecb1fc6275720c5425332eed280ea93d: Merge remote-tracking branch 'remotes/dgilbert/tags/pull-migration-20200325b' into staging (2020-03-26 09:28:11 +) are available in the Git repository at: https://github.com/XanClic/qemu.git tags/pull-block-2020-03-26 for you to fetch changes up to a507c51790fa955c1fccd4deca3c50476a862b83: iotests/138: Test leaks/corruptions fixed report (2020-03-26 14:52:43 +0100) Block patches for 5.0-rc1: - Fix qemu-img convert with a host device or iscsi target - Use-after-free fix in mirror - Some minor qcow2 fixes - Minor sheepdog fix - Minor qemu-img check report fix Eric Blake (4): qcow2: Comment typo fixes qcow2: List autoclear bit names in header qcow2: Avoid feature name extension on small cluster size sheepdog: Consistently set bdrv_has_zero_init_truncate Max Reitz (3): qemu-img: Fix check's leak/corruption fix report iotests: Add poke_file_[bl]e functions iotests/138: Test leaks/corruptions fixed report Maxim Levitsky (2): block: pass BlockDriver reference to the .bdrv_co_create block: trickle down the fallback image creation function use to the block drivers Vladimir Sementsov-Ogievskiy (1): block/mirror: fix use after free of local_err docs/interop/qcow2.txt | 3 ++- include/block/block.h| 1 + include/block/block_int.h| 14 +++- block.c | 38 +++-- block/crypto.c | 3 ++- block/file-posix.c | 11 -- block/file-win32.c | 4 +++- block/gluster.c | 3 ++- block/iscsi.c| 16 -- block/mirror.c | 1 + block/nbd.c | 6 ++ block/nfs.c | 4 +++- block/nvme.c | 3 +++ block/parallels.c| 3 ++- block/qcow.c | 3 ++- block/qcow2.c| 33 +++-- block/qed.c | 3 ++- block/raw-format.c | 4 +++- block/rbd.c | 3 ++- block/sheepdog.c | 6 +- block/ssh.c | 4 +++- block/vdi.c | 4 +++- block/vhdx.c | 3 ++- block/vmdk.c | 4 +++- block/vpc.c | 6 -- qemu-img.c | 9 ++-- tests/qemu-iotests/031.out | 8 +++ tests/qemu-iotests/036 | 6 -- tests/qemu-iotests/036.out | 4 ++-- tests/qemu-iotests/061 | 6 -- tests/qemu-iotests/061.out | 14 ++-- 
tests/qemu-iotests/138 | 41 ++-- tests/qemu-iotests/138.out | 14 tests/qemu-iotests/common.rc | 24 + 34 files changed, 233 insertions(+), 76 deletions(-) -- 2.25.1
[PULL 03/10] block: trickle down the fallback image creation function use to the block drivers
From: Maxim Levitsky Instead of checking the .bdrv_co_create_opts to see if we need the fallback, just implement the .bdrv_co_create_opts in the drivers that need it. This way we don't break various places that need to know if the underlying protocol/format really supports image creation, and this way we still allow some drivers to not support image creation. Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 Note that technically this driver reverts the image creation fallback for the vxhs driver since I don't have a means to test it, and IMHO it is better to leave it not supported as it was prior to generic image creation patches. Also drop iscsi_create_opts which was left accidentally. Signed-off-by: Maxim Levitsky Message-Id: <20200326011218.29230-3-mlevi...@redhat.com> Reviewed-by: Denis V. Lunev [mreitz: Fixed alignment, and moved bdrv_co_create_opts_simple() and bdrv_create_opts_simple from block.h into block_int.h] Signed-off-by: Max Reitz --- include/block/block.h | 1 + include/block/block_int.h | 11 +++ block.c | 35 --- block/file-posix.c| 7 ++- block/iscsi.c | 16 block/nbd.c | 6 ++ block/nvme.c | 3 +++ 7 files changed, 51 insertions(+), 28 deletions(-) diff --git a/include/block/block.h b/include/block/block.h index e569a4d747..b05995fe9c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -283,6 +283,7 @@ BlockDriver *bdrv_find_format(const char *format_name); int bdrv_create(BlockDriver *drv, const char* filename, QemuOpts *opts, Error **errp); int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); + BlockDriverState *bdrv_new(void); void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, Error **errp); diff --git a/include/block/block_int.h b/include/block/block_int.h index 57c8ea24b2..4c3587ea19 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -1331,4 +1331,15 @@ int refresh_total_sectors(BlockDriverState *bs, int64_t 
hint); void bdrv_set_monitor_owned(BlockDriverState *bs); BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp); +/** + * Simple implementation of bdrv_co_create_opts for protocol drivers + * which only support creation via opening a file + * (usually existing raw storage device) + */ +int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, +const char *filename, +QemuOpts *opts, +Error **errp); +extern QemuOptsList bdrv_create_opts_simple; + #endif /* BLOCK_INT_H */ diff --git a/block.c b/block.c index ff23e20443..af3faf664e 100644 --- a/block.c +++ b/block.c @@ -598,8 +598,15 @@ static int create_file_fallback_zero_first_sector(BlockBackend *blk, return 0; } -static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv, - QemuOpts *opts, Error **errp) +/** + * Simple implementation of bdrv_co_create_opts for protocol drivers + * which only support creation via opening a file + * (usually existing raw storage device) + */ +int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv, +const char *filename, +QemuOpts *opts, +Error **errp) { BlockBackend *blk; QDict *options; @@ -663,11 +670,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp) return -ENOENT; } -if (drv->bdrv_co_create_opts) { -return bdrv_create(drv, filename, opts, errp); -} else { -return bdrv_create_file_fallback(filename, drv, opts, errp); -} +return bdrv_create(drv, filename, opts, errp); } int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp) @@ -1592,9 +1595,9 @@ QemuOptsList bdrv_runtime_opts = { }, }; -static QemuOptsList fallback_create_opts = { -.name = "fallback-create-opts", -.head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head), +QemuOptsList bdrv_create_opts_simple = { +.name = "simple-create-opts", +.head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head), .desc = { { .name = BLOCK_OPT_SIZE, @@ -5963,13 +5966,15 @@ void bdrv_img_create(const char *filename, const char *fmt, return; } +if 
(!proto_drv->create_opts) { +error_setg(errp, "Protocol driver '%s' does not support image creation", + proto_drv->format_name); +return; +} + /* Create parameter list */ create_opts = qemu_opts_append(create_opts, drv->create_opts); -if (proto_drv->create_opts) { -create_opts =
[PULL 01/10] block/mirror: fix use after free of local_err
From: Vladimir Sementsov-Ogievskiy local_err is used again in mirror_exit_common() after bdrv_set_backing_hd(), so we must zero it. Otherwise, a subsequent attempt to set the non-NULL local_err will crash. Signed-off-by: Vladimir Sementsov-Ogievskiy Message-Id: <20200324153630.11882-3-vsement...@virtuozzo.com> Reviewed-by: Eric Blake Reviewed-by: John Snow Signed-off-by: Max Reitz --- block/mirror.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/mirror.c b/block/mirror.c index 447051dbc6..6203e5946e 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -678,6 +678,7 @@ static int mirror_exit_common(Job *job) bdrv_set_backing_hd(target_bs, backing, &local_err); if (local_err) { error_report_err(local_err); +local_err = NULL; ret = -EPERM; } } -- 2.25.1
booke206_tlb_ways() returning 0
Hi; Coverity points out a possible issue in booke206_get_tlbm() (this is CID 1421942): static inline ppcmas_tlb_t *booke206_get_tlbm(CPUPPCState *env, const int tlbn, target_ulong ea, int way) { int r; uint32_t ways = booke206_tlb_ways(env, tlbn); int ways_bits = ctz32(ways); int tlb_bits = ctz32(booke206_tlb_size(env, tlbn)); int i; way &= ways - 1; ea >>= MAS2_EPN_SHIFT; ea &= (1 << (tlb_bits - ways_bits)) - 1; r = (ea << ways_bits) | way; Here if booke206_tlb_ways() returns zero, then ways_bits() will be 32 and the shift left 'ea << ways_bits' is undefined behaviour. My first assumption was that booke206_tlb_ways() is not supposed to ever return 0 (it's looking at what I think are read-only system registers, and it doesn't make much sense to have a zero-way TLB). So I tried adding an assert: diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h index 88d94495554..aedb6bcb265 100644 --- a/target/ppc/cpu.h +++ b/target/ppc/cpu.h @@ -2403,6 +2403,7 @@ static inline int booke206_tlb_ways(CPUPPCState *env, int tlbn) { uint32_t tlbncfg = env->spr[SPR_BOOKE_TLB0CFG + tlbn]; int r = tlbncfg >> TLBnCFG_ASSOC_SHIFT; +assert(r > 0); return r; } However, this isn't right, because it causes one of the check-acceptance tests to fail, with this backtrace: #3 0x7074d412 in __GI___assert_fail (assertion=0x560a0d7d "r > 0", file=0x560a0d40 "/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h", line=2406, function=0x560a1720 <__PRETTY_FUNCTION__.20811> "booke206_tlb_ways") at assert.c:101 #4 0x55a9939b in booke206_tlb_ways (env=0x56e52a30, tlbn=2) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/cpu.h:2406 #5 0x55a9b3ac in mmubooke206_get_physical_address (env=0x56e52a30, ctx=0x7fffd77008a0, address=9223380835095282947, rw=0, access_type=0, mmu_idx=1) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1093 #6 0x55a9c25d in get_physical_address_wtlb (env=0x56e52a30, ctx=0x7fffd77008a0, eaddr=9223380835095282947, rw=0, access_type=0, mmu_idx=1) at 
/home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1455 #7 0x55a9c82b in cpu_ppc_handle_mmu_fault (env=0x56e52a30, address=9223380835095282947, rw=0, mmu_idx=1) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:1597 #8 0x55a9f975 in ppc_cpu_tlb_fill (cs=0x56e49560, addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, mmu_idx=1, probe=false, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/target/ppc/mmu_helper.c:3053 #9 0x558e1422 in tlb_fill (cpu=0x56e49560, addr=9223380835095282947, size=1, access_type=MMU_DATA_LOAD, mmu_idx=1, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1017 #10 0x558e279b in load_helper (env=0x56e52a30, addr=9223380835095282947, oi=1, retaddr=140735610345781, op=MO_8, code_read=false, full_load=0x558e2b3a ) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1534 #11 0x558e2b80 in full_ldub_mmu (env=0x56e52a30, addr=9223380835095282947, oi=1, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1624 #12 0x558e2bb4 in helper_ret_ldub_mmu (env=0x56e52a30, addr=9223380835095282947, oi=1, retaddr=140735610345781) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cputlb.c:1630 #13 0x7fff900fd9c6 in code_gen_buffer () #14 0x558f9915 in cpu_tb_exec (cpu=0x56e49560, itb=0x7fff900fd780 ) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:172 #15 0x558fa732 in cpu_loop_exec_tb (cpu=0x56e49560, tb=0x7fff900fd780 , last_tb=0x7fffd7701078, tb_exit=0x7fffd7701070) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:619 #16 0x558faa4c in cpu_exec (cpu=0x56e49560) at /home/petmay01/linaro/qemu-from-laptop/qemu/accel/tcg/cpu-exec.c:732 #17 0x558bcf29 in tcg_cpu_exec (cpu=0x56e49560) at /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1405 #18 0x558bd77f in qemu_tcg_cpu_thread_fn (arg=0x56e49560) at /home/petmay01/linaro/qemu-from-laptop/qemu/cpus.c:1713 
#19 0x55f2ff3f in qemu_thread_start (args=0x56e8dd10) at /home/petmay01/linaro/qemu-from-laptop/qemu/util/qemu-thread-posix.c:519 #20 0x70b156db in start_thread (arg=0x7fffd7704700) at pthread_create.c:463 #21 0x7083e88f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 So under what circumstances can booke206_tlb_ways() validly return 0? Should booke206_get_tlbm() cope with a zero return, or can it assert() that it will never call booke206_tlb_ways() in a way that will cause one? thanks -- PMM
Re: [PATCH RFC 9/9] KVM: Dirty ring support
* Peter Xu (pet...@redhat.com) wrote: > On Wed, Mar 25, 2020 at 08:41:44PM +, Dr. David Alan Gilbert wrote: > > [...] > > > > +enum KVMReaperState { > > > +KVM_REAPER_NONE = 0, > > > +/* The reaper is sleeping */ > > > +KVM_REAPER_WAIT, > > > +/* The reaper is reaping for dirty pages */ > > > +KVM_REAPER_REAPING, > > > +}; > > > > That probably needs to be KVMDirtyRingReaperState > > given there are many things that could be reaped. > > Sure. > > > > > > +/* > > > + * KVM reaper instance, responsible for collecting the KVM dirty bits > > > + * via the dirty ring. > > > + */ > > > +struct KVMDirtyRingReaper { > > > +/* The reaper thread */ > > > +QemuThread reaper_thr; > > > +/* > > > + * Telling the reaper thread to wakeup. This should be used as a > > > + * generic interface to kick the reaper thread, like, in vcpu > > > + * threads where it gets a exit due to ring full. > > > + */ > > > +EventNotifier reaper_event; > > > > I think I'd just used a simple semaphore for this type of thing. > > I'm actually uncertain on which is cheaper... > > At the meantime, I wanted to poll two handles at the same time below > (in kvm_dirty_ring_reaper_thread). I don't know how to do that with > semaphore. Could it? If you're OK with EventNotifier stick with it; it's just I'm used to doing with it with a semaphore; e.g. a flag then the semaphore - but that's fine. > [...] > > > > @@ -412,6 +460,18 @@ int kvm_init_vcpu(CPUState *cpu) > > > (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE; > > > } > > > > > > +if (s->kvm_dirty_gfn_count) { > > > +cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_size, > > > + PROT_READ | PROT_WRITE, MAP_SHARED, > > > > Is the MAP_SHARED required? > > Yes it's required. It's the same when we map the per-vcpu kvm_run. 
> > If we use MAP_PRIVATE, it'll be in a COW fashion - when the userspace > writes to the dirty gfns the 1st time, it'll copy the current dirty > ring page in the kernel and from now on QEMU will never be able to see > what the kernel writes to the dirty gfn pages. MAP_SHARED means the > userspace and the kernel shares exactly the same page(s). OK, worth a comment. > > > > > + cpu->kvm_fd, > > > + PAGE_SIZE * > > > KVM_DIRTY_LOG_PAGE_OFFSET); > > > +if (cpu->kvm_dirty_gfns == MAP_FAILED) { > > > +ret = -errno; > > > +DPRINTF("mmap'ing vcpu dirty gfns failed\n"); > > > > Include errno? > > Will do. > > [...] > > > > +static uint64_t kvm_dirty_ring_reap(KVMState *s) > > > +{ > > > +KVMMemoryListener *kml; > > > +int ret, i, locked_count = s->nr_as; > > > +CPUState *cpu; > > > +uint64_t total = 0; > > > + > > > +/* > > > + * We need to lock all kvm slots for all address spaces here, > > > + * because: > > > + * > > > + * (1) We need to mark dirty for dirty bitmaps in multiple slots > > > + * and for tons of pages, so it's better to take the lock here > > > + * once rather than once per page. And more importantly, > > > + * > > > + * (2) We must _NOT_ publish dirty bits to the other threads > > > + * (e.g., the migration thread) via the kvm memory slot dirty > > > + * bitmaps before correctly re-protect those dirtied pages. > > > + * Otherwise we can have potential risk of data corruption if > > > + * the page data is read in the other thread before we do > > > + * reset below. > > > + */ > > > +for (i = 0; i < s->nr_as; i++) { > > > +kml = s->as[i].ml; > > > +if (!kml) { > > > +/* > > > + * This is tricky - we grow s->as[] dynamically now. Take > > > + * care of that case. We also assumed the as[] will fill > > > + * one by one starting from zero. Without this, we race > > > + * with register_smram_listener. > > > + * > > > + * TODO: make all these prettier... 
> > > + */ > > > +locked_count = i; > > > +break; > > > +} > > > +kvm_slots_lock(kml); > > > +} > > > + > > > +CPU_FOREACH(cpu) { > > > +total += kvm_dirty_ring_reap_one(s, cpu); > > > +} > > > + > > > +if (total) { > > > +ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS); > > > +assert(ret == total); > > > +} > > > + > > > +/* Unlock whatever locks that we have locked */ > > > +for (i = 0; i < locked_count; i++) { > > > +kvm_slots_unlock(s->as[i].ml); > > > +} > > > + > > > +CPU_FOREACH(cpu) { > > > +if (cpu->kvm_dirty_ring_full) { > > > +qemu_sem_post(>kvm_dirty_ring_avail); > > > +} > > > > Why do you need to wait until here - couldn't you release > > each vcpu after you've reaped it? > > We probably still need to wait.
[PATCH] s390x: protvirt: Fix stray error_report_err in s390_machine_protect
In case the protection of the machine fails at s390_pv_vm_enable(), we'll currently report the local_error variable. Problem is that there's no migration blocker error that we can report at this point so the pointer is always NULL which leads to a SEGFAULT. Let's remove the error report. Signed-off-by: Janosch Frank Reported-by: Marc Hartmayer Fixes: 520935eedf941da3 ("s390x: protvirt: Add migration blocker") --- hw/s390x/s390-virtio-ccw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c index 3cf19c99f3468b7d..855ecf370d6e82fa 100644 --- a/hw/s390x/s390-virtio-ccw.c +++ b/hw/s390x/s390-virtio-ccw.c @@ -358,7 +358,6 @@ static int s390_machine_protect(S390CcwMachineState *ms) rc = s390_pv_vm_enable(); if (rc) { qemu_balloon_inhibit(false); -error_report_err(local_err); migrate_del_blocker(pv_mig_blocker); error_free_or_abort(_mig_blocker); return rc; -- 2.25.1
Re: [PATCH] acpi: pcihp: fix left shift undefined behavior in acpi_pcihp_eject_slot()
On Thu, Mar 26, 2020 at 09:56:24AM -0400, Igor Mammedov wrote: > Coverity spots subj in following guest triggered code path > pci_write(, data = 0) -> acpi_pcihp_eject_slot(,slots = 0) > uint32_t slot = ctz32(slots) > ... > ... = ~(1U << slot) > where 'slot' value is 32 in case 'slots' bitmap is empty. > 'slots' is a bitmap and empty one shouldn't do anything > so return early doing nothing if resulted slot value is > not valid (i.e. not in 0-31 range) > > Reported-by: Peter Maydell > Signed-off-by: Igor Mammedov tagged, thanks! > --- > hw/acpi/pcihp.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c > index 4dcef372bf..0dc963e983 100644 > --- a/hw/acpi/pcihp.c > +++ b/hw/acpi/pcihp.c > @@ -154,7 +154,7 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, > unsigned bsel, unsigned slo > > trace_acpi_pci_eject_slot(bsel, slot); > > -if (!bus) { > +if (!bus || slot > 31) { > return; > } > > -- > 2.18.1
RE: [PATCH v1 12/22] intel_iommu: add PASID cache management infrastructure
> From: Liu, Yi L > Sent: Thursday, March 26, 2020 2:15 PM > To: 'Peter Xu' > Subject: RE: [PATCH v1 12/22] intel_iommu: add PASID cache management > infrastructure > > > From: Peter Xu > > Sent: Wednesday, March 25, 2020 10:52 PM > > To: Liu, Yi L > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache management > > infrastructure > > > > On Wed, Mar 25, 2020 at 12:20:21PM +, Liu, Yi L wrote: > > > > From: Peter Xu > > > > Sent: Wednesday, March 25, 2020 1:32 AM > > > > To: Liu, Yi L > > > > Subject: Re: [PATCH v1 12/22] intel_iommu: add PASID cache > > > > management infrastructure > > > > > > > > On Sun, Mar 22, 2020 at 05:36:09AM -0700, Liu Yi L wrote: > > > > > This patch adds a PASID cache management infrastructure based on > > > > > new added structure VTDPASIDAddressSpace, which is used to track > > > > > the PASID usage and future PASID tagged DMA address translation > > > > > support in vIOMMU. [...] > > > > > > > > > > > > > > > > > +/* > > > > > + * Step 2: loop all the exisitng vtd_dev_icx instances. > > > > > + * Ideally, needs to loop all devices to find if there is any new > > > > > + * PASID binding regards to the PASID cache invalidation request. > > > > > + * But it is enough to loop the devices which are backed by host > > > > > + * IOMMU. For devices backed by vIOMMU (a.k.a emulated devices), > > > > > + * if new PASID happened on them, their vtd_pasid_as instance > > > > > could > > > > > + * be created during future vIOMMU DMA translation. 
> > > > > + */ > > > > > +QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) { > > > > > +VTDPASIDAddressSpace *vtd_pasid_as; > > > > > +VTDPASIDCacheEntry *pc_entry; > > > > > +VTDPASIDEntry pe; > > > > > +VTDBus *vtd_bus = vtd_dev_icx->vtd_bus; > > > > > +uint16_t devfn = vtd_dev_icx->devfn; > > > > > +int bus_n = pci_bus_num(vtd_bus->bus); > > > > > + > > > > > +/* i) fetch vtd_pasid_as and check if it is valid */ > > > > > +vtd_pasid_as = vtd_add_find_pasid_as(s, vtd_bus, > > > > > + devfn, pasid); > > > > > > > > I don't feel like it's correct here... > > > > > > > > Assuming we have two devices assigned D1, D2. D1 uses PASID=1, D2 > > > > uses > > PASID=2. > > > > When invalidating against PASID=1, are you also going to create a > > > > VTDPASIDAddressSpace also for D2 with PASID=1? > > > > > > Answer is no. Before going forward, let's see if the below flow looks > > > good to you. > > > > > > Let me add one more device besides D1 and D2. Say device D3 which > > > also uses PASID=1. And assume it begins with no PASID usage in guest. > > > > > > Then the flow from scratch is: > > > > > > a) guest IOMMU driver setup PASID entry for D1 with PASID=1, > > >then invalidates against PASID #1 > > > b) trap to QEMU, and comes to this function. Since there is > > >no previous pasid cache invalidation, so the Step 1 of this > > >function has nothing to do, then goes to Step 2 which is to > > >loop all assigned devices and check if the guest pasid entry > > >is present. In this loop, should find D1's pasid entry for > > >PASID#1 is present. So create the VTDPASIDAddressSpace and > > >initialize its pasid_cache_entry and pasid_cache_gen fields. > > > c) guest IOMMU driver setup PASID entry for D2 with PASID=2, > > >then invalidates against PASID #2 > > > d) same with b), only difference is the Step 1 of this function > > >will loop the VTDPASIDAddressSpace created in b), but its > > >pasid is 1 which is not the target of current pasid cache > > >invalidation. 
Same with b), in Step 2, it will create a > > >VTDPASIDAddressSpace for D2+PASID#2 > > > e) guest IOMMU driver setup PASID entry for D3 with PASID=1, > > >then invalidates against PASID #1 > > > f) trap to QEMU and comes to this function. Step 1 loops two > > >VTDPASIDAddressSpace created in b) and d), and it finds there > > >is a VTDPASIDAddressSpace whose pasid is 1. vtd_flush_pasid() > > >will check if the cached pasid entry is the same with the one > > >in guest memory. In this flow, it should be the same, so > > >vtd_flush_pasid() will do nothing for it. Then comes to Step 2, > > >it loops D1, D2, D3. > > >- For D1, no need to do more since there is already a > > > VTDPASIDAddressSpace created for D1+PASID#1. > > >- For D2, its guest pasid entry for PASID#1 is not present, so > > > no need to do anything for it. > > >- For D3, its guest pasid entry for PASID#1 is present and it > > > is exactly the reason for this invalidation. So create a > > > VTDPASIDAddressSpace for and init the pasid_cache_entry and > > > pasid_cache_gen fields. > > > > > > > I feel like we shouldn't create VTDPASIDAddressSpace only if it > > > > existed, say, until when we reach vtd_dev_get_pe_from_pasid() below with > retcode==0. > > > > > >
[PATCH] acpi: pcihp: fix left shift undefined behavior in acpi_pcihp_eject_slot()
Coverity spots subj in following guest triggered code path pci_write(, data = 0) -> acpi_pcihp_eject_slot(,slots = 0) uint32_t slot = ctz32(slots) ... ... = ~(1U << slot) where 'slot' value is 32 in case 'slots' bitmap is empty. 'slots' is a bitmap and empty one shouldn't do anything so return early doing nothing if resulted slot value is not valid (i.e. not in 0-31 range) Reported-by: Peter Maydell Signed-off-by: Igor Mammedov --- hw/acpi/pcihp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/acpi/pcihp.c b/hw/acpi/pcihp.c index 4dcef372bf..0dc963e983 100644 --- a/hw/acpi/pcihp.c +++ b/hw/acpi/pcihp.c @@ -154,7 +154,7 @@ static void acpi_pcihp_eject_slot(AcpiPciHpState *s, unsigned bsel, unsigned slo trace_acpi_pci_eject_slot(bsel, slot); -if (!bus) { +if (!bus || slot > 31) { return; } -- 2.18.1
Re: [PATCH for-5.0 v2 0/3] qemu-img: Fix check's leak/corruption fix report
On 24.03.20 18:27, Max Reitz wrote: > Branch: https://github.com/XanClic/qemu.git fix-check-result-v2 > Branch: https://git.xanclic.moe/XanClic/qemu.git fix-check-result-v2 > > v1: https://lists.nongnu.org/archive/html/qemu-block/2020-02/msg01418.html Thanks for the review again, applied to my block branch. Max signature.asc Description: OpenPGP digital signature
Re: [PATCH 2/2] util/bufferiszero: improve avx2 accelerator
On Thu, 2020-03-26 at 08:26 -0500, Eric Blake wrote: > On 3/25/20 9:09 PM, Hu, Robert wrote: > > (Don't know why my Linux-Evolution missed this mail.) > > > -Original Message- > > > Long line; it's nice to wrap commit messages around column 70 or > > > so (because > > > reading 'git log' in an 80-column window adds indentation). > > > > > > > [Hu, Robert] > > I think I set my vim on wrap. This probably escaped by paste. > > I ran checkpatch.pl on the patches before sending. It escaped check > > but didn't > > escaped your eagle eye Thank you. > > checkpatch doesn't flag commit message long lines. Maybe it could > be > patched to do so, but it's not at the top of my list to write that > patch. > > > > > > > I just fix a boudary case on his original patch. > > > > > > boundary > > > > [Hu, Robert] > > Emm... again spell error. Usually I would paste descriptions into > > some editors > > with spell check, but forgot this time. > > Vim doesn't have spell check I think. What editor would you suggest > > me to > > integrate with git editing? > > I'm an emacs user, so I have no suggestions for vim, but I'd be very > surprised if there were not some vim expert online that could figure > out > how to wire in a spell-checker to vim. Google quickly finds: > https://www.ostechnix.com/use-spell-check-feature-vim-text-editor/ > nice, thanks:)
[PATCH] qemu-user: fix build with LLVM lld 10
lld 10.0.0 introduced a new linker option --image-base equivalent to the GNU -Ttext-segment one, hence use it when available. This fixes the build of QEMU on systems using lld 10 or greater. Signed-off-by: Dimitry Andric Signed-off-by: Roger Pau Monné --- Cc: Laurent Vivier Cc: Richard Henderson Cc: "Philippe Mathieu-Daudé" Cc: Paolo Bonzini Cc: "Daniel P. Berrangé" --- configure | 42 +++--- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/configure b/configure index da09c35895..92d57d84fa 100755 --- a/configure +++ b/configure @@ -6514,27 +6514,31 @@ if ( [ "$linux_user" = yes ] || [ "$bsd_user" = yes ] ) && [ "$pie" = no ]; then cat > $TMPC &1; then -error_exit \ -"We need to link the QEMU user mode binaries at a" \ -"specific text address. Unfortunately your linker" \ -"doesn't support either the -Ttext-segment option or" \ -"printing the default linker script with --verbose." \ -"If you don't want the user mode binaries, pass the" \ -"--disable-user option to configure." - fi + textseg_ldflags="-Wl,-Ttext-segment=$textseg_addr" + if ! compile_prog "" "$textseg_ldflags"; then +# In case ld does not support -Ttext-segment, edit the default linker +# script via sed to set the .text start addr. This is needed on FreeBSD +# at least. +if ! $ld --verbose >/dev/null 2>&1; then + error_exit \ + "We need to link the QEMU user mode binaries at a" \ + "specific text address. Unfortunately your linker" \ + "doesn't support either the --image-base or -Ttext-segment" \ + "options or printing the default linker script with" \ + "--verbose. If you don't want the user mode binaries," \ + "pass the --disable-user option to configure." +fi - $ld --verbose | sed \ --e '1,/==/d' \ --e '/==/,$d' \ --e "s/[.] = [0-9a-fx]* [+] SIZEOF_HEADERS/. 
= $textseg_addr + SIZEOF_HEADERS/" \ --e "s/__executable_start = [0-9a-fx]*/__executable_start = $textseg_addr/" > config-host.ld - textseg_ldflags="-Wl,-T../config-host.ld" +$ld --verbose | sed \ + -e '1,/==/d' \ + -e '/==/,$d' \ + -e "s/[.] = [0-9a-fx]* [+] SIZEOF_HEADERS/. = $textseg_addr + SIZEOF_HEADERS/" \ + -e "s/__executable_start = [0-9a-fx]*/__executable_start = $textseg_addr/" > config-host.ld +textseg_ldflags="-Wl,-T../config-host.ld" + fi fi fi fi -- 2.26.0
Re: [PATCH RFC 8/9] KVM: Add dirty-ring-size property
* Peter Xu (pet...@redhat.com) wrote: > On Wed, Mar 25, 2020 at 08:00:31PM +, Dr. David Alan Gilbert wrote: > > > @@ -2077,6 +2079,33 @@ static int kvm_init(MachineState *ms) > > > s->memory_listener.listener.coalesced_io_add = > > > kvm_coalesce_mmio_region; > > > s->memory_listener.listener.coalesced_io_del = > > > kvm_uncoalesce_mmio_region; > > > > > > +/* > > > + * Enable KVM dirty ring if supported, otherwise fall back to > > > + * dirty logging mode > > > + */ > > > +if (s->kvm_dirty_ring_size > 0) { > > > +/* Read the max supported pages */ > > > +ret = kvm_vm_check_extension(kvm_state, KVM_CAP_DIRTY_LOG_RING); > > > +if (ret > 0) { > > > +if (s->kvm_dirty_ring_size > ret) { > > > +error_report("KVM dirty ring size %d too big (maximum is > > > %d). " > > > + "Please use a smaller value.", > > > + s->kvm_dirty_ring_size, ret); > > > +goto err; > > > +} > > > + > > > +ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, > > > +s->kvm_dirty_ring_size); > > > +if (ret) { > > > +error_report("Enabling of KVM dirty ring failed: %d", > > > ret); > > > +goto err; > > > +} > > > + > > > +s->kvm_dirty_gfn_count = > > > +s->kvm_dirty_ring_size / sizeof(struct kvm_dirty_gfn); > > > > What happens if I was to pass dirty-ring-size=1 ? > > Then the count would be 0 and things would get upset somewhere? > > Do you need to check for a minimum postive value? > > The above kvm_vm_enable_cap() should fail directly and QEMU will stop. > Yes we should check it, but kernel will do that in all cases, so I > just didn't do that explicitly again in the userspace. We probably should have that check since you can give them a more obvious error message. > I was planning this to be an advanced feature so the user of this > parameter should know the rules to pass values in. Advanced users just make advanced mistakes :-) I did wonder if perhaps this option should be a count of entries rather than a byte size. 
> Of course we can introduce another global knob to enable/disable this > feature as a whole, then I can offer a default value for the size > parameter. I just didn't bother that in this RFC version, but I can > switch to that if that's preferred. > > [...] > > > > @@ -3065,6 +3123,12 @@ static void kvm_accel_class_init(ObjectClass *oc, > > > void *data) > > > NULL, NULL, _abort); > > > object_class_property_set_description(oc, "kvm-shadow-mem", > > > "KVM shadow MMU size", _abort); > > > + > > > +object_class_property_add(oc, "dirty-ring-size", "int", > > > +kvm_get_dirty_ring_size, kvm_set_dirty_ring_size, > > > +NULL, NULL, _abort); > > > > I don't think someone passing in a non-number should cause an abort; > > it should exit, but I don't think it should abort/core. > > It won't - _abort is for the operation to add the property. It > should never fail. > > User illegal input will look like this (a graceful exit): > > $ ./x86_64-softmmu/qemu-system-x86_64 -accel kvm,dirty-ring-size=a > qemu-system-x86_64: -accel kvm,dirty-ring-size=a: Parameter 'dirty-ring-size' > expects int64 OK good. 
> > > > > +object_class_property_set_description(oc, "dirty-ring-size", > > > +"KVM dirty ring size (<=0 to disable)", _abort); > > > } > > > > > > static const TypeInfo kvm_accel_type = { > > > diff --git a/qemu-options.hx b/qemu-options.hx > > > index 224a8e8712..140bd38726 100644 > > > --- a/qemu-options.hx > > > +++ b/qemu-options.hx > > > @@ -119,6 +119,7 @@ DEF("accel", HAS_ARG, QEMU_OPTION_accel, > > > "kernel-irqchip=on|off|split controls accelerated > > > irqchip support (default=on)\n" > > > "kvm-shadow-mem=size of KVM shadow MMU in bytes\n" > > > "tb-size=n (TCG translation block cache size)\n" > > > +"dirty-ring-size=n (KVM dirty ring size in Bytes)\n" > > > "thread=single|multi (enable multi-threaded TCG)\n", > > > QEMU_ARCH_ALL) > > > STEXI > > > @item -accel @var{name}[,prop=@var{value}[,...]] > > > @@ -140,6 +141,8 @@ irqchip completely is not recommended except for > > > debugging purposes. > > > Defines the size of the KVM shadow MMU. > > > @item tb-size=@var{n} > > > Controls the size (in MiB) of the TCG translation block cache. > > > +@item dirty-ring-size=@val{n} > > > +Controls the size (in Bytes) of KVM dirty ring (<=0 to disable). > > > > I don't see the point in allowing < 0 ; I'd ban it. > > Sure; I can switch to an uint64. Great. Dave > Thanks, > > -- > Peter Xu > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On 26.03.20 14:35, Eric Blake wrote: > On 3/26/20 8:28 AM, Kevin Wolf wrote: >> Am 26.03.2020 um 14:20 hat Eric Blake geschrieben: +++ b/block/file-posix.c @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_co_create_opts = bdrv_co_create_opts_simple, + .create_opts = _create_opts_simple, >>> >>> I'd drop the leading & for consistency with the rest of this struct >>> initializer. >> >> This one isn't a function pointer, so I think the & is necessary. > > Ah, right. Visual pattern-matching failed me, since I didn't read the > actual types in the .h file. > > Hmm - is it possible to write the patch in such a way that .create_opts > can be left NULL when .bdrv_co_create_opts is bdrv_co_create_opts_simple? Setting .create_opts is actually the main point of this series, so we don’t have to look for and fix all places that decide whether a block driver is capable of image creation based on whether it’s set or not. Max signature.asc Description: OpenPGP digital signature
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, 26 Mar 2020 09:31:09 -0400 "Michael S. Tsirkin" wrote: > On Thu, Mar 26, 2020 at 09:28:27AM -0400, Michael S. Tsirkin wrote: > > On Thu, Mar 26, 2020 at 02:23:17PM +0100, Igor Mammedov wrote: > > > On Thu, 26 Mar 2020 11:52:36 + > > > Peter Maydell wrote: > > > > > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > > > and then the code that does '1U << slot' is C undefined behaviour > > > > because it's an oversized shift. (This is CID 1421896.) > > > > > > > > Since the pci_write() function in this file can call > > > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > > > I think we need to handle 'slots == 0' safely. But what should > > > > the behaviour be? > > > > > > it also uncovers a bug, where we are not able to eject slot 0 on bridge, > > > > > > And that is by design. A standard PCI SHPC register can support up to 31 > > hotpluggable slots. So we chose slot 0 as non hotpluggable. > > It's consistent across SHPC, PCI-E, so I made ACPI match. > > Sorry I was confused. It's a PCI thing, PCI-E does not have > slot numbers for downstream ports at all. Scratch that, it was mistake on my part where I tests with a change that masks 0 and wrongly at that. slot 0 on bridge can be removed just fine > > > You can't hot-add it either. > > > > > can be reproduced with: > > > > > > -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global > > > PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device > > > virtio-net-pci,bus=pci.1,addr=0,id=netdev12 > > > > > > (monitor) device_del netdev12 > > > (monitor) qtree # still shows the device > > > > > > reason is that acpi_pcihp_eject_slot() > > >if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) > > > > > > so device is not deleted > > > > We should probably teach QEMU that some slots aren't hotpluggable > > even if device in it is hotpluggable in theory. 
But that is > > a separate issue. > > > > > > thanks > > > > -- PMM > > > > >
Re: [PATCH 0/2] Fix the generic image creation code
On 26.03.20 13:23, Max Reitz wrote: > On 26.03.20 02:12, Maxim Levitsky wrote: >> The recent patches from Max Reitz allowed some block drivers to not >> provide the .bdrv_co_create_opts and still allow qemu-img to >> create/format images as long as the image is already existing >> (that is the case with various block storage drivers like nbd/iscsi/nvme, >> etc) >> >> However it was found out that some places in the code depend on the >> .bdrv_co_create_opts/.create_opts to be != NULL to decide if to allow >> image creation. >> >> To avoid adding failback code to all these places, just make generic failback >> code be used by the drivers that need it, so that for outside user, there >> is no diffirence if failback was used or not. >> >> Best regards, >> Maxim Levitsky >> >> Maxim Levitsky (2): >> block: pass BlockDriver reference to the .bdrv_co_create >> block: trickle down the fallback image creation function use to the >> block drivers > > Thanks, fixed the function parameter alignment, moved the declarations > from block.h into block_int.h, and applied the series to my block branch: (And the spelling fixes suggested by Eric) > https://git.xanclic.moe/XanClic/qemu/commits/branch/block > > Max > signature.asc Description: OpenPGP digital signature
Re: [PATCH v6 7/7] virtio-net: add migration support for RSS and hash report
Code looks OK but patchew testing shows failures. If they are false positives pls reply to that mail. On Thu, Mar 26, 2020 at 02:34:39PM +0200, Yuri Benditovich wrote: > ping > > On Fri, Mar 20, 2020 at 1:58 PM Yuri Benditovich > wrote: > > Save and restore RSS/hash report configuration. > > Signed-off-by: Yuri Benditovich > --- > hw/net/virtio-net.c | 37 + > 1 file changed, 37 insertions(+) > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c > index a0614ad4e6..7de7587abd 100644 > --- a/hw/net/virtio-net.c > +++ b/hw/net/virtio-net.c > @@ -2842,6 +2842,13 @@ static int virtio_net_post_load_device(void > *opaque, > int version_id) > } > } > > + if (n->rss_data.enabled) { > + trace_virtio_net_rss_enable(n->rss_data.hash_types, > + n->rss_data.indirections_len, > + sizeof(n->rss_data.key)); > + } else { > + trace_virtio_net_rss_disable(); > + } > return 0; > } > > @@ -3019,6 +3026,32 @@ static const VMStateDescription > vmstate_virtio_net_has_vnet = { > }, > }; > > +static bool virtio_net_rss_needed(void *opaque) > +{ > + return VIRTIO_NET(opaque)->rss_data.enabled; > +} > + > +static const VMStateDescription vmstate_virtio_net_rss = { > + .name = "virtio-net-device/rss", > + .version_id = 1, > + .minimum_version_id = 1, > + .needed = virtio_net_rss_needed, > + .fields = (VMStateField[]) { > + VMSTATE_BOOL(rss_data.enabled, VirtIONet), > + VMSTATE_BOOL(rss_data.redirect, VirtIONet), > + VMSTATE_BOOL(rss_data.populate_hash, VirtIONet), > + VMSTATE_UINT32(rss_data.hash_types, VirtIONet), > + VMSTATE_UINT16(rss_data.indirections_len, VirtIONet), > + VMSTATE_UINT16(rss_data.default_queue, VirtIONet), > + VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet, > + VIRTIO_NET_RSS_MAX_KEY_SIZE), > + VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, > VirtIONet, > + rss_data.indirections_len, 0, > + vmstate_info_uint16, uint16_t), > + VMSTATE_END_OF_LIST() > + }, > +}; > + > static const VMStateDescription vmstate_virtio_net_device = { > .name = "virtio-net-device", > 
.version_id = VIRTIO_NET_VM_VERSION, > @@ -3069,6 +3102,10 @@ static const VMStateDescription > vmstate_virtio_net_device = { > has_ctrl_guest_offloads), > VMSTATE_END_OF_LIST() > }, > + .subsections = (const VMStateDescription * []) { > + _virtio_net_rss, > + NULL > + } > }; > > static NetClientInfo net_virtio_info = { > -- > 2.17.1 > >
Re: [PATCH for-5.0 v2 0/4] bug fixes extracted from larger qcow2 zero bit work
On 24.03.20 18:42, Eric Blake wrote: > My proposal [1] to add an autoclear all-zero-content bit to the qcow2 > format has now stalled into 5.1 territory, but several patches in my > initial proposal are uncontroversial and obvious bug fixes worth > having in 5.0. > > [1] https://lists.gnu.org/archive/html/qemu-devel/2020-01/msg08075.html > > Eric Blake (4): > qcow2: Comment typo fixes > qcow2: List autoclear bit names in header > qcow2: Avoid feature name extension on small cluster size > sheepdog: Consistently set bdrv_has_zero_init_truncate OK, so my only real gripe was with the question whether we could make patch 3 decide dynamically when to include or not include the feature name table. There’s little difference in practice: Right now, we could probably get away with including it in images with 4k clusters, then it would automatically switch to 8k clusters minimum (which is the limit chosen in patch 3 as-is), and at some theoretical point in the far, far future it would switch 16k clusters minimum. I don’t think anyone really cares about whether the feature name table is in images with 4k clusters or not, and I think we still have a couple of decades before we the table gets too big for images with 8k clusters (and we probably won’t be using qcow2 then anymore). So tl;dr: There’s no practical benefit of deciding dynamically, hence I’m taking this series as-is: https://git.xanclic.moe/XanClic/qemu/commits/branch/block Thanks! Max signature.asc Description: OpenPGP digital signature
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On 3/26/20 8:28 AM, Kevin Wolf wrote: Am 26.03.2020 um 14:20 hat Eric Blake geschrieben: +++ b/block/file-posix.c @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, +.bdrv_co_create_opts = bdrv_co_create_opts_simple, +.create_opts = _create_opts_simple, I'd drop the leading & for consistency with the rest of this struct initializer. This one isn't a function pointer, so I think the & is necessary. Ah, right. Visual pattern-matching failed me, since I didn't read the actual types in the .h file. Hmm - is it possible to write the patch in such a way that .create_opts can be left NULL when .bdrv_co_create_opts is bdrv_co_create_opts_simple? -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v6 0/7] reference implementation of RSS and hash report
On Fri, Mar 20, 2020 at 01:57:44PM +0200, Yuri Benditovich wrote: > Support for VIRTIO_NET_F_RSS and VIRTIO_NET_F_HASH_REPORT > features in QEMU for reference purpose. > Implements Toeplitz hash calculation for incoming > packets according to configuration provided by driver. > Uses calculated hash for decision on receive virtqueue > and/or reports the hash in the virtio header Series: Reviewed-by: Michael S. Tsirkin > Changes from v5: > RSS migration state moved to subsection and migrated > only if enabled (patch 7) > Updated sign off (patch 6) > > Yuri Benditovich (7): > virtio-net: introduce RSS and hash report features > virtio-net: implement RSS configuration command > virtio-net: implement RX RSS processing > tap: allow extended virtio header with hash info > virtio-net: reference implementation of hash report > vmstate.h: provide VMSTATE_VARRAY_UINT16_ALLOC macro > virtio-net: add migration support for RSS and hash report > > hw/net/trace-events| 3 + > hw/net/virtio-net.c| 448 +++-- > include/hw/virtio/virtio-net.h | 16 ++ > include/migration/vmstate.h| 10 + > net/tap.c | 11 +- > 5 files changed, 460 insertions(+), 28 deletions(-) > > -- > 2.17.1
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On Thu, 2020-03-26 at 08:20 -0500, Eric Blake wrote: > On 3/25/20 8:12 PM, Maxim Levitsky wrote: > > Instead of checking the .bdrv_co_create_opts to see if we need the failback, > > fallback 100% true. > > > just implement the .bdrv_co_create_opts in the drivers that need it. > > > > This way we don't break various places that need to know if the underlying > > protocol/format really supports image creation, and this way we still > > allow some drivers to not support image creation. > > > > Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd > > Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 > > > > Note that technically this driver reverts the image creation failback > > fallback > > > for the vxhs driver since I don't have a means to test it, > > and IMHO it is better to leave it not supported as it was prior to > > generic image creation patches. > > > > Also drop iscsi_create_opts which was left accidently > > accidentally True. I did a spell check on the commit message, but I guess I updated it afterward with this. > > > > > Signed-off-by: Maxim Levitsky > > --- > > +++ b/block/file-posix.c > > @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { > > .bdrv_reopen_prepare = raw_reopen_prepare, > > .bdrv_reopen_commit = raw_reopen_commit, > > .bdrv_reopen_abort = raw_reopen_abort, > > +.bdrv_co_create_opts = bdrv_co_create_opts_simple, > > +.create_opts = _create_opts_simple, > > I'd drop the leading & for consistency with the rest of this struct > initializer. Can I? This is struct reference and I think that only for function references, the leading & is optional. Best regards, Maxim Levitsky
Re: [PATCH v6 0/7] reference implementation of RSS and hash report
On Fri, Mar 20, 2020 at 06:38:51AM -0700, no-re...@patchew.org wrote: > qemu-system-x86_64: -accel kvm: failed to initialize kvm: No such file or > directory > qemu-system-x86_64: falling back to tcg > = > ==8831==ERROR: AddressSanitizer: stack-buffer-overflow on address > 0x7ffc0dacbe80 at pc 0x55ee01d9e92a bp 0x7ffc0dac7080 sp 0x7ffc0dac6830 > READ of size 1518 at 0x7ffc0dacbe80 thread T0 > #0 0x55ee01d9e929 in __asan_memcpy > (/tmp/qemu-test/build/x86_64-softmmu/qemu-system-x86_64+0x1acb929) > #1 0x55ee04694dad in iov_from_buf_full /tmp/qemu-test/src/util/iov.c:33:13 Hmm this seems to be iov related so could be triggered by the new patches. Could you investigate pls? -- MST
RE: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb invalidation to host
> From: Peter Xu > Sent: Thursday, March 26, 2020 9:23 PM > To: Liu, Yi L > Subject: Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb > invalidation to host > > On Thu, Mar 26, 2020 at 09:02:48AM -0400, Peter Xu wrote: > > [...] > > > > > > > +static inline bool vtd_pasid_cache_valid( > > > > > > + VTDPASIDAddressSpace *vtd_pasid_as) { > > > > > > +return vtd_pasid_as->iommu_state && > > ^ > > > > > > > > > > > > This check can be dropped because always true? > > > > > > > > > > If you agree with both the changes, please add: > > > > > > > > > > Reviewed-by: Peter Xu > > > > > > > > I think the code should ensure all the pasid_as in hash table is > > > > valid. And we can since all the operations are under protection of > > > > iommu_lock. > > > > > > > Peter, > > > > > > I think my reply was wrong. pasid_as in has table may be stale since > > > the per pasid_as cache_gen may be not identical with the cache_gen > > > in iommu_state. e.g. vtd_pasid_cache_reset() only increases the > > > cache_gen in iommu_state. So there will be pasid_as in hash table > > > which has cached pasid entry but its cache_gen is not equal to the > > > one in iommu_state. For such pasid_as, we should treat it as stale. > > > So I guess the vtd_pasid_cache_valid() is still necessary. > > > > I guess you misread my comment. :) > > > > I was saying the "vtd_pasid_as->iommu_state" check is not needed, > > because iommu_state was always set if the address space is created. > > vtd_pasid_cache_valid() is needed. > > > > Also, please double confirm that vtd_pasid_cache_reset() should drop > > all the address spaces (as I think it should), not "only increase the > > cache_gen". IMHO you should only increase the cache_gen in the PSI > > hook (vtd_pasid_cache_psi()) only. > > Sorry, I mean GSI (vtd_pasid_cache_gsi), not PSI. Got it.. Really confused me. :-) Regards, Yi Liu
Re: [PATCH v6 7/7] virtio-net: add migration support for RSS and hash report
Yuri Benditovich wrote: > Save and restore RSS/hash report configuration. > > Signed-off-by: Yuri Benditovich Reviewed-by: Juan Quintela sorry, hadn't seen it. vmstate parts are right.
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, Mar 26, 2020 at 01:50:41PM +0100, Igor Mammedov wrote: > On Thu, 26 Mar 2020 13:29:01 +0100 > Igor Mammedov wrote: > > > On Thu, 26 Mar 2020 11:52:36 + > > Peter Maydell wrote: > > > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > > and then the code that does '1U << slot' is C undefined behaviour > > > because it's an oversized shift. (This is CID 1421896.) > > > > > > Since the pci_write() function in this file can call > > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > > I think we need to handle 'slots == 0' safely. But what should > > > the behaviour be? > > > > 0 is not valid value, we should ignore and return early in this case > > like we do with bsel. I'll post a path shortly. > well, looking more that is only true for main bus, for bridges it can be > slot number can be zero, It can but we don't allow slot zero hotplug with SHPC so it's easier if we don't allow this with ACPI either. > then AML left shifts it and writes into B0EJ > which traps into pci_write(, data) and that is supposed to eject > slot 0 according to guest(AML). > > Michael, > what's your take on it? > > > > > > > > > thanks > > > -- PMM > > > > > > >
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, Mar 26, 2020 at 09:28:27AM -0400, Michael S. Tsirkin wrote: > On Thu, Mar 26, 2020 at 02:23:17PM +0100, Igor Mammedov wrote: > > On Thu, 26 Mar 2020 11:52:36 + > > Peter Maydell wrote: > > > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > > and then the code that does '1U << slot' is C undefined behaviour > > > because it's an oversized shift. (This is CID 1421896.) > > > > > > Since the pci_write() function in this file can call > > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > > I think we need to handle 'slots == 0' safely. But what should > > > the behaviour be? > > > > it also uncovers a bug, where we are not able to eject slot 0 on bridge, > > > And that is by design. A standard PCI SHPC register can support up to 31 > hotpluggable slots. So we chose slot 0 as non hotpluggable. > It's consistent across SHPC, PCI-E, so I made ACPI match. Sorry I was confused. It's a PCI thing, PCI-E does not have slot numbers for downstream ports at all. > You can't hot-add it either. > > > can be reproduced with: > > > > -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global > > PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device > > virtio-net-pci,bus=pci.1,addr=0,id=netdev12 > > > > (monitor) device_del netdev12 > > (monitor) qtree # still shows the device > > > > reason is that acpi_pcihp_eject_slot() > >if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) > > > > so device is not deleted > > We should probably teach QEMU that some slots aren't hotpluggable > even if device in it is hotpluggable in theory. But that is > a separate issue. > > > > thanks > > > -- PMM > > >
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
Am 26.03.2020 um 14:20 hat Eric Blake geschrieben: > > +++ b/block/file-posix.c > > @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { > > .bdrv_reopen_prepare = raw_reopen_prepare, > > .bdrv_reopen_commit = raw_reopen_commit, > > .bdrv_reopen_abort = raw_reopen_abort, > > +.bdrv_co_create_opts = bdrv_co_create_opts_simple, > > +.create_opts = &bdrv_create_opts_simple, > > I'd drop the leading & for consistency with the rest of this struct > initializer. This one isn't a function pointer, so I think the & is necessary. Kevin
Re: [PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create
On 3/26/20 8:22 AM, Maxim Levitsky wrote: On Thu, 2020-03-26 at 08:18 -0500, Eric Blake wrote: On 3/25/20 8:12 PM, Maxim Levitsky wrote: This will allow to reuse a single generic .bdrv_co_create "allow to ${verb}" is not idiomatic, better is "allow ${subject} to ${verb}" or "allow ${verb}ing". In this case, I'd go with: This will allow the reuse of a single... I agree! This commit will probably go as is but next time I'll keep it in mind! Max hasn't sent the pull request yet; there's still time for him to amend his staging queue if he wants. But yeah, it's not a huge deal if the patch goes in without spelling/grammar polish. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, Mar 26, 2020 at 02:23:17PM +0100, Igor Mammedov wrote: > On Thu, 26 Mar 2020 11:52:36 + > Peter Maydell wrote: > > > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > > is passed a zero 'slots' argument then ctz32(slots) will return 32, > > and then the code that does '1U << slot' is C undefined behaviour > > because it's an oversized shift. (This is CID 1421896.) > > > > Since the pci_write() function in this file can call > > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > > I think we need to handle 'slots == 0' safely. But what should > > the behaviour be? > > it also uncovers a bug, where we are not able to eject slot 0 on bridge, And that is by design. A standard PCI SHPC register can support up to 31 hotpluggable slots. So we chose slot 0 as non hotpluggable. It's consistent across SHPC, PCI-E, so I made ACPI match. You can't hot-add it either. > can be reproduced with: > > -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global > PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device > virtio-net-pci,bus=pci.1,addr=0,id=netdev12 > > (monitor) device_del netdev12 > (monitor) qtree # still shows the device > > reason is that acpi_pcihp_eject_slot() >if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) > > so device is not deleted We should probably teach QEMU that some slots aren't hotpluggable even if device in it is hotpluggable in theory. But that is a separate issue. > > thanks > > -- PMM > >
Re: [PATCH 2/2] util/bufferiszero: improve avx2 accelerator
On 3/25/20 9:09 PM, Hu, Robert wrote: (Don't know why my Linux-Evolution missed this mail.) -Original Message- Long line; it's nice to wrap commit messages around column 70 or so (because reading 'git log' in an 80-column window adds indentation). [Hu, Robert] I think I set my vim on wrap. This probably escaped by paste. I ran checkpatch.pl on the patches before sending. It escaped check but didn't escaped your eagle eye Thank you. checkpatch doesn't flag commit message long lines. Maybe it could be patched to do so, but it's not at the top of my list to write that patch. I just fix a boudary case on his original patch. boundary [Hu, Robert] Emm... again spell error. Usually I would paste descriptions into some editors with spell check, but forgot this time. Vim doesn't have spell check I think. What editor would you suggest me to integrate with git editing? I'm an emacs user, so I have no suggestions for vim, but I'd be very surprised if there were not some vim expert online that could figure out how to wire in a spell-checker to vim. Google quickly finds: https://www.ostechnix.com/use-spell-check-feature-vim-text-editor/ -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
RE: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb invalidation to host
> From: Peter Xu > Sent: Thursday, March 26, 2020 9:03 PM > To: Liu, Yi L > Subject: Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb > invalidation to host > > On Thu, Mar 26, 2020 at 05:41:39AM +, Liu, Yi L wrote: > > > From: Liu, Yi L > > > Sent: Wednesday, March 25, 2020 9:22 PM > > > To: 'Peter Xu' > > > Subject: RE: [PATCH v1 20/22] intel_iommu: propagate PASID-based > > > iotlb invalidation to host > > > > > > > From: Peter Xu > > > > Sent: Wednesday, March 25, 2020 2:34 AM > > > > To: Liu, Yi L > > > > Subject: Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based > > > > iotlb invalidation to host > > > > > > > > On Sun, Mar 22, 2020 at 05:36:17AM -0700, Liu Yi L wrote: > > > > > This patch propagates PASID-based iotlb invalidation to host. > > > > > > > > > > Intel VT-d 3.0 supports nested translation in PASID granular. > > > > > Guest SVA support could be implemented by configuring nested > > > > > translation on specific PASID. This is also known as dual stage > > > > > DMA translation. > > > > > > > > > > Under such configuration, guest owns the GVA->GPA translation > > > > > which is configured as first level page table in host side for a > > > > > specific pasid, and host owns GPA->HPA translation. As guest > > > > > owns first level translation table, piotlb invalidation should > > > > > be propagated to host since host IOMMU will cache first level > > > > > page table related mappings during DMA address translation. > > > > > > > > > > This patch traps the guest PASID-based iotlb flush and propagate > > > > > it to host. 
> > > > > > > > > > Cc: Kevin Tian > > > > > Cc: Jacob Pan > > > > > Cc: Peter Xu > > > > > Cc: Yi Sun > > > > > Cc: Paolo Bonzini > > > > > Cc: Richard Henderson > > > > > Cc: Eduardo Habkost > > > > > Signed-off-by: Liu Yi L > > > > > --- > > > > > hw/i386/intel_iommu.c | 139 > > > > + > > > > > hw/i386/intel_iommu_internal.h | 7 +++ > > > > > 2 files changed, 146 insertions(+) > > > > > > > > > > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index > > > > > b9ac07d..10d314d 100644 > > > > > --- a/hw/i386/intel_iommu.c > > > > > +++ b/hw/i386/intel_iommu.c > > > > > @@ -3134,15 +3134,154 @@ static bool > > > > vtd_process_pasid_desc(IntelIOMMUState *s, > > > > > return (ret == 0) ? true : false; } > > > > > > > > > > +/** > > > > > + * Caller of this function should hold iommu_lock. > > > > > + */ > > > > > +static void vtd_invalidate_piotlb(IntelIOMMUState *s, > > > > > + VTDBus *vtd_bus, > > > > > + int devfn, > > > > > + DualIOMMUStage1Cache > > > > > +*stage1_cache) { > > > > > +VTDHostIOMMUContext *vtd_dev_icx; > > > > > +HostIOMMUContext *host_icx; > > > > > + > > > > > +vtd_dev_icx = vtd_bus->dev_icx[devfn]; > > > > > +if (!vtd_dev_icx) { > > > > > +goto out; > > > > > +} > > > > > +host_icx = vtd_dev_icx->host_icx; > > > > > +if (!host_icx) { > > > > > +goto out; > > > > > +} > > > > > +if (host_iommu_ctx_flush_stage1_cache(host_icx, stage1_cache)) { > > > > > +error_report("Cache flush failed"); > > > > > > > > I think this should not easily be triggered by the guest, but just > > > > in case... Let's use > > > > error_report_once() to be safe. > > > > > > Agreed. > > > > > > > > +} > > > > > +out: > > > > > +return; > > > > > +} > > > > > + > > > > > +static inline bool vtd_pasid_cache_valid( > > > > > + VTDPASIDAddressSpace *vtd_pasid_as) { > > > > > +return vtd_pasid_as->iommu_state && > ^ > > > > > > > > > This check can be dropped because always true? 
> > > > > > > > If you agree with both the changes, please add: > > > > > > > > Reviewed-by: Peter Xu > > > > > > I think the code should ensure all the pasid_as in hash table is > > > valid. And we can since all the operations are under protection of > > > iommu_lock. > > > > > Peter, > > > > I think my reply was wrong. pasid_as in has table may be stale since > > the per pasid_as cache_gen may be not identical with the cache_gen in > > iommu_state. e.g. vtd_pasid_cache_reset() only increases the cache_gen > > in iommu_state. So there will be pasid_as in hash table which has > > cached pasid entry but its cache_gen is not equal to the one in > > iommu_state. For such pasid_as, we should treat it as stale. > > So I guess the vtd_pasid_cache_valid() is still necessary. > > I guess you misread my comment. :) > > I was saying the "vtd_pasid_as->iommu_state" check is not needed, because > iommu_state was always set if the address space is created. > vtd_pasid_cache_valid() is needed. ok, I see. > Also, please double confirm that vtd_pasid_cache_reset() should drop all the > address spaces (as I think it should), not "only increase the cache_gen". yes, I'm just
Re: acpi_pcihp_eject_slot() bug if passed 'slots == 0'
On Thu, 26 Mar 2020 11:52:36 + Peter Maydell wrote: > Hi; Coverity spots that if hw/acpi/pcihp.c:acpi_pcihp_eject_slot() > is passed a zero 'slots' argument then ctz32(slots) will return 32, > and then the code that does '1U << slot' is C undefined behaviour > because it's an oversized shift. (This is CID 1421896.) > > Since the pci_write() function in this file can call > acpi_pcihp_eject_slot() with an arbitrary value from the guest, > I think we need to handle 'slots == 0' safely. But what should > the behaviour be? it also uncovers a bug, where we are not able to eject slot 0 on bridge, can be reproduced with: -enable-kvm -m 4G -device pci-bridge,chassis_nr=1 -global PIIX4_PM.acpi-pci-hotplug-with-bridge-support=on -device virtio-net-pci,bus=pci.1,addr=0,id=netdev12 (monitor) device_del netdev12 (monitor) qtree # still shows the device reason is that acpi_pcihp_eject_slot() if (PCI_SLOT(dev->devfn) == slot) { # doesn't match (0 != 32) so device is not deleted > thanks > -- PMM >
Re: [PATCH v1 20/22] intel_iommu: propagate PASID-based iotlb invalidation to host
On Thu, Mar 26, 2020 at 09:02:48AM -0400, Peter Xu wrote: [...] > > > > > +static inline bool vtd_pasid_cache_valid( > > > > > + VTDPASIDAddressSpace *vtd_pasid_as) { > > > > > +return vtd_pasid_as->iommu_state && > ^ > > > > > > > > > This check can be dropped because always true? > > > > > > > > If you agree with both the changes, please add: > > > > > > > > Reviewed-by: Peter Xu > > > > > > I think the code should ensure all the pasid_as in hash table is valid. > > > And we can > > > since all the operations are under protection of iommu_lock. > > > > > Peter, > > > > I think my reply was wrong. pasid_as in hash table may be stale since > > the per pasid_as cache_gen may be not identical with the cache_gen > > in iommu_state. e.g. vtd_pasid_cache_reset() only increases the > > cache_gen in iommu_state. So there will be pasid_as in hash table > > which has cached pasid entry but its cache_gen is not equal to the > > one in iommu_state. For such pasid_as, we should treat it as stale. > > So I guess the vtd_pasid_cache_valid() is still necessary. > > I guess you misread my comment. :) > > I was saying the "vtd_pasid_as->iommu_state" check is not needed, > because iommu_state was always set if the address space is created. > vtd_pasid_cache_valid() is needed. > > Also, please double confirm that vtd_pasid_cache_reset() should drop > all the address spaces (as I think it should), not "only increase the > cache_gen". IMHO you should only increase the cache_gen in the PSI > hook (vtd_pasid_cache_psi()) only. Sorry, I mean GSI (vtd_pasid_cache_gsi), not PSI. -- Peter Xu
Re: [PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create
On Thu, 2020-03-26 at 08:18 -0500, Eric Blake wrote: > On 3/25/20 8:12 PM, Maxim Levitsky wrote: > > This will allow to reuse a single generic .bdrv_co_create > > "allow to ${verb}" is not idiomatic, better is "allow ${subject} to > ${verb}" or "allow ${verb}ing". In this case, I'd go with: > > This will allow the reuse of a single... I agree! This commit will probably go as is but next time I'll keep it in mind! Best regards, Maxim Levitsky > > > implementation for several drivers. > > No functional changes. > > > > Signed-off-by: Maxim Levitsky > > ---
Re: [PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers
On 3/25/20 8:12 PM, Maxim Levitsky wrote: Instead of checking the .bdrv_co_create_opts to see if we need the failback, fallback just implement the .bdrv_co_create_opts in the drivers that need it. This way we don't break various places that need to know if the underlying protocol/format really supports image creation, and this way we still allow some drivers to not support image creation. Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007 Note that technically this driver reverts the image creation failback fallback for the vxhs driver since I don't have a means to test it, and IMHO it is better to leave it not supported as it was prior to generic image creation patches. Also drop iscsi_create_opts which was left accidently accidentally Signed-off-by: Maxim Levitsky --- +++ b/block/file-posix.c @@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, .bdrv_reopen_abort = raw_reopen_abort, +.bdrv_co_create_opts = bdrv_co_create_opts_simple, +.create_opts = _create_opts_simple, I'd drop the leading & for consistency with the rest of this struct initializer. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create
On 3/25/20 8:12 PM, Maxim Levitsky wrote: This will allow to reuse a single generic .bdrv_co_create "allow to ${verb}" is not idiomatic, better is "allow ${subject} to ${verb}" or "allow ${verb}ing". In this case, I'd go with: This will allow the reuse of a single... implementation for several drivers. No functional changes. Signed-off-by: Maxim Levitsky --- -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
RE: [RFC v6 05/24] memory: Introduce IOMMU Memory Region inject_faults API
Hi Eric, I'm also considering how to inject iommu fault to vIOMMU. As our previous discussion (long time ago), MemoryRegion way doesn't work well for VTd case. So I'd like see your opinion on the proposal below: I've a patch to make vIOMMUs register PCIIOMMUOps to PCI layer. Current usage is to get address space and set/unset HostIOMMUContext (added by me). I think it may be also nice to add the fault injection callback in the PCIIOMMUOps. Thoughts? https://patchwork.ozlabs.org/patch/1259645/ Regards, Yi Liu > From: Eric Auger > Sent: Saturday, March 21, 2020 12:58 AM > To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-devel@nongnu.org; > Subject: [RFC v6 05/24] memory: Introduce IOMMU Memory Region inject_faults > API > > This new API allows to inject @count iommu_faults into > the IOMMU memory region. > > Signed-off-by: Eric Auger > --- > include/exec/memory.h | 25 + > memory.c | 10 ++ > 2 files changed, 35 insertions(+) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index f2c773163f..141a5dc197 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -57,6 +57,8 @@ struct MemoryRegionMmio { > CPUWriteMemoryFunc *write[3]; > }; > > +struct iommu_fault; > + > typedef struct IOMMUTLBEntry IOMMUTLBEntry; > > /* See address_space_translate: bit 0 is read, bit 1 is write. 
*/ > @@ -357,6 +359,19 @@ typedef struct IOMMUMemoryRegionClass { > * @iommu: the IOMMUMemoryRegion > */ > int (*num_indexes)(IOMMUMemoryRegion *iommu); > + > +/* > + * Inject @count faults into the IOMMU memory region > + * > + * Optional method: if this method is not provided, then > + * memory_region_injection_faults() will return -ENOENT > + * > + * @iommu: the IOMMU memory region to inject the faults in > + * @count: number of faults to inject > + * @buf: fault buffer > + */ > +int (*inject_faults)(IOMMUMemoryRegion *iommu, int count, > + struct iommu_fault *buf); > } IOMMUMemoryRegionClass; > > typedef struct CoalescedMemoryRange CoalescedMemoryRange; > @@ -1365,6 +1380,16 @@ int > memory_region_iommu_attrs_to_index(IOMMUMemoryRegion *iommu_mr, > */ > int memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr); > > +/** > + * memory_region_inject_faults : inject @count faults stored in @buf > + * > + * @iommu_mr: the IOMMU memory region > + * @count: number of faults to be injected > + * @buf: buffer containing the faults > + */ > +int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, > +struct iommu_fault *buf); > + > /** > * memory_region_name: get a memory region's name > * > diff --git a/memory.c b/memory.c > index 09be40edd2..9cdd77e0de 100644 > --- a/memory.c > +++ b/memory.c > @@ -2001,6 +2001,16 @@ int > memory_region_iommu_num_indexes(IOMMUMemoryRegion *iommu_mr) > return imrc->num_indexes(iommu_mr); > } > > +int memory_region_inject_faults(IOMMUMemoryRegion *iommu_mr, int count, > +struct iommu_fault *buf) > +{ > +IOMMUMemoryRegionClass *imrc = > IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr); > +if (!imrc->inject_faults) { > +return -ENOENT; > +} > +return imrc->inject_faults(iommu_mr, count, buf); > +} > + > void memory_region_set_log(MemoryRegion *mr, bool log, unsigned client) > { > uint8_t mask = 1 << client; > -- > 2.20.1
[Bug 1867519] Re: qemu 4.2 segfaults on VF detach
** Merge proposal linked: https://code.launchpad.net/~paelzer/ubuntu/+source/qemu/+git/qemu/+merge/381033 -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1867519 Title: qemu 4.2 segfaults on VF detach Status in QEMU: Fix Committed Status in qemu package in Ubuntu: Fix Released Bug description: After updating Ubuntu 20.04 to the Beta version, we get the following error and the virtual machines stucks when detaching PCI devices using virsh command: Error: error: Failed to detach device from /tmp/vf_interface_attached.xml error: internal error: End of file from qemu monitor steps to reproduce: 1. create a VM over Ubuntu 20.04 (5.4.0-14-generic) 2. attach PCI device to this VM (Mellanox VF for example) 3. try to detaching the PCI device using virsh command: a. create a pci interface xml file: b. #virsh detach-device - Ubuntu release: Description:Ubuntu Focal Fossa (development branch) Release:20.04 - Package ver: libvirt0: Installed: 6.0.0-0ubuntu3 Candidate: 6.0.0-0ubuntu5 Version table: 6.0.0-0ubuntu5 500 500 http://il.archive.ubuntu.com/ubuntu focal/main amd64 Packages *** 6.0.0-0ubuntu3 100 100 /var/lib/dpkg/status - What you expected to happen: PCI device detached without any errors. - What happened instead: getting the errors above and he VM stuck additional info: after downgrading the libvirt0 package and all the dependent packages to 5.4 the previous, version, seems that the issue disappeared To manage notifications about this bug go to: https://bugs.launchpad.net/qemu/+bug/1867519/+subscriptions
Re: [PATCH v2] block: make BlockConf.*_size properties 32-bit
On 3/26/20 1:52 AM, Roman Kagan wrote: Devices (virtio-blk, scsi, etc.) and the block layer are happy to use 32-bit for logical_block_size, physical_block_size, and min_io_size. However, the properties in BlockConf are defined as uint16_t limiting the values to 32768. This appears unnecessary tight, and we've seen bigger block sizes handy at times. Make them 32 bit instead and lift the limitation up to 2 MiB which appears to be good enough for everybody. and matches the current qemu limit for qcow2 cluster sizes As the values can now be fairly big and awkward to type, make the property setter accept common size suffixes (k, m). Signed-off-by: Roman Kagan --- v1 -> v2: - cap the property at 2 MiB [Eric] - accept size suffixes +++ b/hw/core/qdev-properties.c @@ -14,6 +14,7 @@ #include "qapi/visitor.h" #include "chardev/char.h" #include "qemu/uuid.h" +#include "qemu/units.h" void qdev_prop_set_after_realize(DeviceState *dev, const char *name, Error **errp) @@ -729,30 +730,39 @@ const PropertyInfo qdev_prop_pci_devfn = { /* --- blocksize --- */ +/* lower limit is sector size */ +#define MIN_BLOCK_SIZE 512 +#define MIN_BLOCK_SIZE_STR "512 B" +/* upper limit is arbitrary, 2 MiB looks sufficient */ +#define MAX_BLOCK_SIZE (2 * MiB) +#define MAX_BLOCK_SIZE_STR "2 MiB" For this comment, I might add that it matches our limit for qcow2 cluster size. Reviewed-by: Eric Blake -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org