date:20240415

RE: [PATCH v2 05/10] vfio: Implement get_host_iommu_info() callback

2024-04-15 Thread Duan, Zhenzhong



>-----Original Message-----
>From: Cédric Le Goater 
>Subject: Re: [PATCH v2 05/10] vfio: Implement get_host_iommu_info()
>callback
>
>On 4/8/24 10:12, Zhenzhong Duan wrote:
>> Utilize iova_ranges to calculate host IOMMU address width and
>> package it in HIOD_LEGACY_INFO for vIOMMU usage.
>>
>> HIOD_LEGACY_INFO will be used by both VFIO and VDPA so declare
>> it in host_iommu_device.h.
>>
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/sysemu/host_iommu_device.h | 10 ++
>>   hw/vfio/container.c| 24 
>>   2 files changed, 34 insertions(+)
>>
>> diff --git a/include/sysemu/host_iommu_device.h
>b/include/sysemu/host_iommu_device.h
>> index 22ccbe3a5d..beb8be8231 100644
>> --- a/include/sysemu/host_iommu_device.h
>> +++ b/include/sysemu/host_iommu_device.h
>> @@ -16,4 +16,14 @@ struct HostIOMMUDeviceClass {
>>   int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data,
>uint32_t len,
>>  Error **errp);
>>   };
>> +
>> +/*
>> + * Define the format of host IOMMU related info that current VFIO
>> + * or VDPA can provide to vIOMMU.
>> + *
>> + * @aw_bits: Host IOMMU address width. 0xff if no limitation.
>> + */
>> +typedef struct HIOD_LEGACY_INFO {
>
>Please use CamelCase names.

Sure.

>
>> +uint8_t aw_bits;
>> +} HIOD_LEGACY_INFO;
>>   #endif
>> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
>> index 44018ef085..ba0ad4a41b 100644
>> --- a/hw/vfio/container.c
>> +++ b/hw/vfio/container.c
>> @@ -1143,8 +1143,32 @@ static void
>vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
>>   vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
>>   };
>>
>> +static int hiod_legacy_vfio_get_host_iommu_info(HostIOMMUDevice
>*hiod,
>> +void *data, uint32_t len,
>> +Error **errp)
>> +{
>> +VFIODevice *vbasedev = HIOD_LEGACY_VFIO(hiod)->vdev;
>> +/* iova_ranges is a sorted list */
>> +GList *l = g_list_last(vbasedev->bcontainer->iova_ranges);
>> +HIOD_LEGACY_INFO *info = data;
>> +
>> +assert(sizeof(HIOD_LEGACY_INFO) <= len);
>> +
>> +if (l) {
>> +Range *range = l->data;
>> +info->aw_bits = find_last_bit(&range->upb, BITS_PER_LONG) + 1;
>
>There is a comment in range.h saying:
>
> /*
>  * Do not access members directly, use the functions!
>
>Please introduce a new helper.

Sure, thanks for pointing that out.

BRs.
Zhenzhong

>
>
>Thanks,
>
>C.
>
>
>
>> +} else {
>> +info->aw_bits = 0xff;
>> +}
>> +
>> +return 0;
>> +}
>> +
>>   static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
>>   {
>> +HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
>> +
>> +hioc->get_host_iommu_info =
>hiod_legacy_vfio_get_host_iommu_info;
>>   };
>>
>>   static const TypeInfo types[] = {

Re: [PATCH v9 13/20] virtio-net: Return an error when vhost cannot enable RSS

2024-04-15 Thread Yuri Benditovich

On Tue, Apr 16, 2024 at 7:00 AM Jason Wang  wrote:
>
> On Mon, Apr 15, 2024 at 10:05 PM Yuri Benditovich
>  wrote:
> >
> > On Wed, Apr 3, 2024 at 2:11 PM Akihiko Odaki  
> > wrote:
> > >
> > > vhost requires eBPF for RSS. When eBPF is not available, virtio-net
> > > implicitly disables RSS even if the user explicitly requests it. Return
> > > an error instead of implicitly disabling RSS if RSS is requested but not
> > > available.
> > >
> > > Signed-off-by: Akihiko Odaki 
> > > ---
> > >  hw/net/virtio-net.c | 97 
> > > ++---
> > >  1 file changed, 48 insertions(+), 49 deletions(-)
> > >
> > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > > index 61b49e335dea..3d53eba88cfc 100644
> > > --- a/hw/net/virtio-net.c
> > > +++ b/hw/net/virtio-net.c
> > > @@ -793,9 +793,6 @@ static uint64_t virtio_net_get_features(VirtIODevice 
> > > *vdev, uint64_t features,
> > >  return features;
> > >  }
> > >
> > > > -if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
> > > > -virtio_clear_feature(&features, VIRTIO_NET_F_RSS);
> > > -}
> > >  features = vhost_net_get_features(get_vhost_net(nc->peer), features);
> > >  vdev->backend_features = features;
> > >
> > > @@ -3591,6 +3588,50 @@ static bool 
> > > failover_hide_primary_device(DeviceListener *listener,
> > > >  return qatomic_read(&n->failover_primary_hidden);
> > >  }
> > >
> > > +static void virtio_net_device_unrealize(DeviceState *dev)
> > > +{
> > > +VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > > +VirtIONet *n = VIRTIO_NET(dev);
> > > +int i, max_queue_pairs;
> > > +
> > > +if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> > > +virtio_net_unload_ebpf(n);
> > > +}
> > > +
> > > +/* This will stop vhost backend if appropriate. */
> > > +virtio_net_set_status(vdev, 0);
> > > +
> > > +g_free(n->netclient_name);
> > > +n->netclient_name = NULL;
> > > +g_free(n->netclient_type);
> > > +n->netclient_type = NULL;
> > > +
> > > +g_free(n->mac_table.macs);
> > > +g_free(n->vlans);
> > > +
> > > +if (n->failover) {
> > > +qobject_unref(n->primary_opts);
> > > > +device_listener_unregister(&n->primary_listener);
> > > > +migration_remove_notifier(&n->migration_state);
> > > +} else {
> > > +assert(n->primary_opts == NULL);
> > > +}
> > > +
> > > +max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
> > > +for (i = 0; i < max_queue_pairs; i++) {
> > > +virtio_net_del_queue(n, i);
> > > +}
> > > +/* delete also control vq */
> > > +virtio_del_queue(vdev, max_queue_pairs * 2);
> > > > +qemu_announce_timer_del(&n->announce_timer, false);
> > > +g_free(n->vqs);
> > > +qemu_del_nic(n->nic);
> > > +virtio_net_rsc_cleanup(n);
> > > +g_free(n->rss_data.indirections_table);
> > > +net_rx_pkt_uninit(n->rx_pkt);
> > > +virtio_cleanup(vdev);
> > > +}
> > > +
> > >  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> > >  {
> > >  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > > @@ -3760,53 +3801,11 @@ static void virtio_net_device_realize(DeviceState 
> > > *dev, Error **errp)
> > >
> > > >  net_rx_pkt_init(&n->rx_pkt);
> > >
> > > -if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> > > -virtio_net_load_ebpf(n);
> > > -}
> > > -}
> > > -
> > > -static void virtio_net_device_unrealize(DeviceState *dev)
> > > -{
> > > -VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > > -VirtIONet *n = VIRTIO_NET(dev);
> > > -int i, max_queue_pairs;
> > > -
> > > -if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> > > -virtio_net_unload_ebpf(n);
> > > +if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS) &&
> > > +!virtio_net_load_ebpf(n) && get_vhost_net(nc->peer)) {
> > > +virtio_net_device_unrealize(dev);
> > > +error_setg(errp, "Can't load eBPF RSS for vhost");
> > >  }
> >
> > As I already mentioned, I think it is an extremely bad idea to
> > fail to run qemu for a reason such as the absence of one feature.
> > What I suggest is:
> > 1. Redefine rss as tri-state (off|auto|on)
> > 2. Fail to run only if rss is on and not available via ebpf
> > 3. On auto - silently drop it
>
> "Auto" might be problematic for migration compatibility which is hard to
> be used by management layers like libvirt. The reason is that there's
> no way for libvirt to know if it is supported by device or not.

In terms of migration every feature that somehow depends on the kernel
is problematic, not only RSS. Last time we added the USO feature - is
it different?
And in terms of migration "rss=on" is problematic the same way as "rss=auto".
Can you please show one scenario of migration where they will behave
differently? And in terms of regular experience there is a big advantage.


>
> Thanks
>
> > 4. The same with 'hash' option - it is not compatible with vhost (at
> > least at the

RE: [PATCH v2 03/10] backends/iommufd: Introduce abstract HIODIOMMUFD device

2024-04-15 Thread Duan, Zhenzhong



>-Original Message-
>From: Cédric Le Goater 
>Subject: Re: [PATCH v2 03/10] backends/iommufd: Introduce abstract
>HIODIOMMUFD device
>
>On 4/8/24 10:12, Zhenzhong Duan wrote:
>> HIODIOMMUFD represents a host IOMMU device under iommufd backend.
>>
>> Currently it includes only public iommufd handle and device id.
>> which could be used to get hw IOMMU information.
>>
>> When nested translation is supported in future, vIOMMU is going
>> to have iommufd related operations like attaching/detaching hwpt,
>> So IOMMUFDDevice interface will be further extended at that time.
>>
>> VFIO and VDPA device have different way of attaching/detaching hwpt.
>> So HIODIOMMUFD is still an abstract class which will be inherited by
>> VFIO and VDPA device.
>>
>> Introduce a helper hiod_iommufd_init() to initialize HIODIOMMUFD
>> device.
>>
>> Suggested-by: Cédric Le Goater 
>> Originally-by: Yi Liu 
>> Signed-off-by: Yi Sun 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/sysemu/iommufd.h | 22 +++
>>   backends/iommufd.c   | 47 ++--
>>   2 files changed, 53 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
>> index 9af27ebd6c..71c53cbb45 100644
>> --- a/include/sysemu/iommufd.h
>> +++ b/include/sysemu/iommufd.h
>> @@ -4,6 +4,7 @@
>>   #include "qom/object.h"
>>   #include "exec/hwaddr.h"
>>   #include "exec/cpu-common.h"
>> +#include "sysemu/host_iommu_device.h"
>>
>>   #define TYPE_IOMMUFD_BACKEND "iommufd"
>>   OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass,
>IOMMUFD_BACKEND)
>> @@ -33,4 +34,25 @@ int
>iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>hwaddr iova,
>>   ram_addr_t size, void *vaddr, bool readonly);
>>   int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t
>ioas_id,
>> hwaddr iova, ram_addr_t size);
>> +
>> +#define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
>
>Please keep TYPE_HOST_IOMMU_DEVICE

Sure.

>
>> +OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass,
>HIOD_IOMMUFD)
>> +
>> +struct HIODIOMMUFD {
>> +/*< private >*/
>> +HostIOMMUDevice parent;
>> +void *opaque;
>> +
>> +/*< public >*/
>> +IOMMUFDBackend *iommufd;
>> +uint32_t devid;
>> +};
>> +
>> +struct HIODIOMMUFDClass {
>> +/*< private >*/
>> +HostIOMMUDeviceClass parent_class;
>> +};
>
>This new class doesn't seem useful. Do you have plans for handlers ?

Yes, In nesting series 
https://github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_nesting_rfcv2/
This commit 
https://github.com/yiliu1765/qemu/commit/581fc900aa296988eaa48abee6d68d3670faf8c9
implement [at|de]tach_hwpt handlers.

So I add an extra layer of abstract HIODIOMMUFDClass.

>
>> +
>> +void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend
>*iommufd,
>> +   uint32_t devid);
>>   #endif
>> diff --git a/backends/iommufd.c b/backends/iommufd.c
>> index 62a79fa6b0..ef8b3a808b 100644
>> --- a/backends/iommufd.c
>> +++ b/backends/iommufd.c
>> @@ -212,23 +212,38 @@ int
>iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
>>   return ret;
>>   }
>>
>> -static const TypeInfo iommufd_backend_info = {
>> -.name = TYPE_IOMMUFD_BACKEND,
>> -.parent = TYPE_OBJECT,
>> -.instance_size = sizeof(IOMMUFDBackend),
>> -.instance_init = iommufd_backend_init,
>> -.instance_finalize = iommufd_backend_finalize,
>> -.class_size = sizeof(IOMMUFDBackendClass),
>> -.class_init = iommufd_backend_class_init,
>> -.interfaces = (InterfaceInfo[]) {
>> -{ TYPE_USER_CREATABLE },
>> -{ }
>> -}
>> -};
>> +void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend
>*iommufd,
>> +   uint32_t devid)
>> +{
>> +idev->iommufd = iommufd;
>> +idev->devid = devid;
>> +}
>
>This routine doesn't seem useful. I wonder if we shouldn't introduce
>properties. I'm not sure this is useful either.

This routine is called in patch8 to initialize iommu, devid and ioas(in future 
nesting series).
I didn't choose properties as HIODIOMMUFD is not user creatable, property is a 
bit heavy
here. But I'm fine to use it if you prefer.

Thanks
Zhenzhong

>
>
>> -static void register_types(void)
>> +static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
>>   {
>> -type_register_static(&iommufd_backend_info);
>>   }
>>
>> -type_init(register_types);
>> +static const TypeInfo types[] = {
>> +{
>> +.name = TYPE_IOMMUFD_BACKEND,
>> +.parent = TYPE_OBJECT,
>> +.instance_size = sizeof(IOMMUFDBackend),
>> +.instance_init = iommufd_backend_init,
>> +.instance_finalize = iommufd_backend_finalize,
>> +.class_size = sizeof(IOMMUFDBackendClass),
>> +.class_init = iommufd_backend_class_init,
>> +.interfaces = (InterfaceInfo[]) {
>> +{ TYPE_USER_CREATABLE },
>> +{ }
>> +}
>> +}, {
>> +

[PATCH 3/7] accel/tcg: Return the TranslationBlock from cpu_unwind_state_data

2024-04-15 Thread Richard Henderson

Fix the i386 get_memio_eip function to use tb->cflags
instead of cs->tcg_cflags.

Signed-off-by: Richard Henderson 
---
 include/exec/cpu-common.h | 9 +
 accel/tcg/translate-all.c | 9 +
 target/i386/helper.c  | 6 --
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 6346df17ce..f056132cab 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -176,12 +176,13 @@ void list_cpus(void);
  * @host_pc: the host pc within the translation
  * @data: output data
  *
- * Attempt to load the the unwind state for a host pc occurring in
- * translated code.  If @host_pc is not in translated code, the
- * function returns false; otherwise @data is loaded.
+ * Attempt to load the the unwind state for a host pc occurring in translated
+ * code.  If @host_pc is not in translated code, the function returns NULL;
+ * otherwise @data is loaded and the TranslationBlock is returned.
  * This is the same unwind info as given to restore_state_to_opc.
  */
-bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data);
+const TranslationBlock *cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc,
+  uint64_t *data);
 
 /**
  * cpu_restore_state:
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 83cc14fbde..c745bc5b6c 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -243,15 +243,16 @@ bool cpu_restore_state(CPUState *cpu, uintptr_t host_pc)
 return false;
 }
 
-bool cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data)
+const TranslationBlock *
+cpu_unwind_state_data(CPUState *cpu, uintptr_t host_pc, uint64_t *data)
 {
 if (in_code_gen_buffer((const void *)(host_pc - tcg_splitwx_diff))) {
 TranslationBlock *tb = tcg_tb_lookup(host_pc);
-if (tb) {
-return cpu_unwind_data_from_tb(tb, host_pc, data) >= 0;
+if (tb && cpu_unwind_data_from_tb(tb, host_pc, data) >= 0) {
+return tb;
 }
 }
-return false;
+return NULL;
 }
 
 void page_init(void)
diff --git a/target/i386/helper.c b/target/i386/helper.c
index 23ccb23a5b..eaa691a851 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -517,13 +517,15 @@ static inline target_ulong get_memio_eip(CPUX86State *env)
 #ifdef CONFIG_TCG
 uint64_t data[TARGET_INSN_START_WORDS];
 CPUState *cs = env_cpu(env);
+const TranslationBlock *tb;
 
-if (!cpu_unwind_state_data(cs, cs->mem_io_pc, data)) {
+tb = cpu_unwind_state_data(cs, cs->mem_io_pc, data);
+if (!tb) {
 return env->eip;
 }
 
 /* Per x86_restore_state_to_opc. */
-if (cs->tcg_cflags & CF_PCREL) {
+if (tb->cflags & CF_PCREL) {
 return (env->eip & TARGET_PAGE_MASK) | data[0];
 } else {
 return data[0] - env->segs[R_CS].base;
-- 
2.34.1

[PATCH 6/7] target/i386: Introduce cpu_compute_eflags_ccop

2024-04-15 Thread Richard Henderson

This is a generalization of cpu_compute_eflags, with a dynamic
value of cc_op, and is thus tcg specific.

Signed-off-by: Richard Henderson 
---
 target/i386/cpu.h   |  2 ++
 target/i386/tcg/cc_helper.c | 10 ++
 2 files changed, 12 insertions(+)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 6b05738079..285f26d99d 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2379,6 +2379,8 @@ void cpu_x86_inject_mce(Monitor *mon, X86CPU *cpu, int 
bank,
 
 uint32_t cpu_cc_compute_all(CPUX86State *env1);
 
+uint32_t cpu_compute_eflags_ccop(CPUX86State *env, CCOp op);
+
 static inline uint32_t cpu_compute_eflags(CPUX86State *env)
 {
 uint32_t eflags = env->eflags;
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index f76e9cb8cf..8203682ca8 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -225,6 +225,16 @@ uint32_t cpu_cc_compute_all(CPUX86State *env)
 return helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, CC_OP);
 }
 
+uint32_t cpu_compute_eflags_ccop(CPUX86State *env, CCOp op)
+{
+uint32_t eflags;
+
+eflags = helper_cc_compute_all(CC_DST, CC_SRC, CC_SRC2, op);
+eflags |= env->df & DF_MASK;
+eflags |= env->eflags & ~(VM_MASK | RF_MASK);
+return eflags;
+}
+
 target_ulong helper_cc_compute_c(target_ulong dst, target_ulong src1,
  target_ulong src2, int op)
 {
-- 
2.34.1

[PATCH 1/7] tcg: Introduce INDEX_op_plugin_pc

2024-04-15 Thread Richard Henderson

Add an opcode to find a code address within the current insn,
for later use with unwinding.  Generate the code generically
using tcg_reg_alloc_do_movi.

Signed-off-by: Richard Henderson 
---
 include/tcg/tcg-op-common.h |  1 +
 include/tcg/tcg-opc.h   |  1 +
 tcg/tcg-op.c|  5 +
 tcg/tcg.c   | 10 ++
 4 files changed, 17 insertions(+)

diff --git a/include/tcg/tcg-op-common.h b/include/tcg/tcg-op-common.h
index 009e2778c5..a32c88a182 100644
--- a/include/tcg/tcg-op-common.h
+++ b/include/tcg/tcg-op-common.h
@@ -76,6 +76,7 @@ void tcg_gen_lookup_and_goto_ptr(void);
 
 void tcg_gen_plugin_cb(unsigned from);
 void tcg_gen_plugin_mem_cb(TCGv_i64 addr, unsigned meminfo);
+void tcg_gen_plugin_pc(TCGv_ptr);
 
 /* 32 bit ops */
 
diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index 546eb49c11..087d1b82da 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -199,6 +199,7 @@ DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_EXIT | TCG_OPF_BB_END)
 
 DEF(plugin_cb, 0, 0, 1, TCG_OPF_NOT_PRESENT)
 DEF(plugin_mem_cb, 0, 1, 1, TCG_OPF_NOT_PRESENT)
+DEF(plugin_pc, 1, 0, 0, TCG_OPF_NOT_PRESENT)
 
 /* Replicate ld/st ops for 32 and 64-bit guest addresses. */
 DEF(qemu_ld_a32_i32, 1, 1, 1,
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index eff3728622..b8ca78cbe4 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -322,6 +322,11 @@ void tcg_gen_plugin_mem_cb(TCGv_i64 addr, unsigned meminfo)
 tcg_gen_op2(INDEX_op_plugin_mem_cb, tcgv_i64_arg(addr), meminfo);
 }
 
+void tcg_gen_plugin_pc(TCGv_ptr arg)
+{
+tcg_gen_op1(INDEX_op_plugin_pc, tcgv_ptr_arg(arg));
+}
+
 /* 32 bit ops */
 
 void tcg_gen_discard_i32(TCGv_i32 arg)
diff --git a/tcg/tcg.c b/tcg/tcg.c
index d248c52e96..42e2b53729 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -4701,6 +4701,13 @@ static void tcg_reg_alloc_mov(TCGContext *s, const TCGOp 
*op)
 }
 }
 
+static void tcg_reg_alloc_plugin_pc(TCGContext *s, const TCGOp *op)
+{
+tcg_reg_alloc_do_movi(s, arg_temp(op->args[0]),
+  (uintptr_t)tcg_splitwx_to_rx(s->code_ptr),
+  op->life, output_pref(op, 0));
+}
+
 /*
  * Specialized code generation for INDEX_op_dup_vec.
  */
@@ -6208,6 +6215,9 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, 
uint64_t pc_start)
 case INDEX_op_mov_vec:
 tcg_reg_alloc_mov(s, op);
 break;
+case INDEX_op_plugin_pc:
+tcg_reg_alloc_plugin_pc(s, op);
+break;
 case INDEX_op_dup_vec:
 tcg_reg_alloc_dup(s, op);
 break;
-- 
2.34.1

[PATCH 2/7] accel/tcg: Set CPUState.plugin_ra before all plugin callbacks

2024-04-15 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 include/hw/core/cpu.h  |  1 +
 accel/tcg/plugin-gen.c | 50 +-
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 10cd492aff..f4af37c13d 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -350,6 +350,7 @@ typedef union IcountDecr {
 typedef struct CPUNegativeOffsetState {
 CPUTLB tlb;
 #ifdef CONFIG_PLUGIN
+uintptr_t plugin_ra;
 GArray *plugin_mem_cbs;
 #endif
 IcountDecr icount_decr;
diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
index 36e9134a5d..f96b49cce6 100644
--- a/accel/tcg/plugin-gen.c
+++ b/accel/tcg/plugin-gen.c
@@ -37,6 +37,12 @@ enum plugin_gen_from {
 PLUGIN_GEN_AFTER_TB,
 };
 
+enum plugin_gen_ra {
+GEN_RA_DONE,
+GEN_RA_FROM_TB,
+GEN_RA_FROM_INSN,
+};
+
 /* called before finishing a TB with exit_tb, goto_tb or goto_ptr */
 void plugin_gen_disable_mem_helpers(void)
 {
@@ -151,11 +157,38 @@ static void gen_mem_cb(struct qemu_plugin_dyn_cb *cb,
 tcg_temp_free_i32(cpu_index);
 }
 
-static void inject_cb(struct qemu_plugin_dyn_cb *cb)
+static void inject_ra(enum plugin_gen_ra *gen_ra)
+{
+TCGv_ptr ra;
+
+switch (*gen_ra) {
+case GEN_RA_DONE:
+return;
+case GEN_RA_FROM_TB:
+ra = tcg_constant_ptr(NULL);
+break;
+case GEN_RA_FROM_INSN:
+ra = tcg_temp_ebb_new_ptr();
+tcg_gen_plugin_pc(ra);
+break;
+default:
+g_assert_not_reached();
+}
+
+tcg_gen_st_ptr(ra, tcg_env,
+   offsetof(CPUState, neg.plugin_ra) -
+   offsetof(ArchCPU, env));
+tcg_temp_free_ptr(ra);
+*gen_ra = GEN_RA_DONE;
+}
+
+static void inject_cb(struct qemu_plugin_dyn_cb *cb,
+  enum plugin_gen_ra *gen_ra)
 
 {
 switch (cb->type) {
 case PLUGIN_CB_REGULAR:
+inject_ra(gen_ra);
 gen_udata_cb(cb);
 break;
 case PLUGIN_CB_INLINE:
@@ -167,16 +200,18 @@ static void inject_cb(struct qemu_plugin_dyn_cb *cb)
 }
 
 static void inject_mem_cb(struct qemu_plugin_dyn_cb *cb,
+  enum plugin_gen_ra *gen_ra,
   enum qemu_plugin_mem_rw rw,
   qemu_plugin_meminfo_t meminfo, TCGv_i64 addr)
 {
 if (cb->rw & rw) {
 switch (cb->type) {
 case PLUGIN_CB_MEM_REGULAR:
+inject_ra(gen_ra);
 gen_mem_cb(cb, meminfo, addr);
 break;
 default:
-inject_cb(cb);
+inject_cb(cb, gen_ra);
 break;
 }
 }
@@ -186,6 +221,7 @@ static void plugin_gen_inject(struct qemu_plugin_tb 
*plugin_tb)
 {
 TCGOp *op, *next;
 int insn_idx = -1;
+enum plugin_gen_ra gen_ra;
 
 if (unlikely(qemu_loglevel_mask(LOG_TB_OP_PLUGIN)
  && qemu_log_in_addr_range(plugin_tb->vaddr))) {
@@ -205,10 +241,12 @@ static void plugin_gen_inject(struct qemu_plugin_tb 
*plugin_tb)
  */
 memset(tcg_ctx->free_temps, 0, sizeof(tcg_ctx->free_temps));
 
+gen_ra = GEN_RA_FROM_TB;
 QTAILQ_FOREACH_SAFE(op, &tcg_ctx->ops, link, next) {
 switch (op->opc) {
 case INDEX_op_insn_start:
 insn_idx++;
+gen_ra = GEN_RA_FROM_INSN;
 break;
 
 case INDEX_op_plugin_cb:
@@ -244,7 +282,8 @@ static void plugin_gen_inject(struct qemu_plugin_tb 
*plugin_tb)
 cbs = plugin_tb->cbs;
 for (i = 0, n = (cbs ? cbs->len : 0); i < n; i++) {
 inject_cb(
-&g_array_index(cbs, struct qemu_plugin_dyn_cb, i));
+&g_array_index(cbs, struct qemu_plugin_dyn_cb, i),
+&gen_ra);
 }
 break;
 
@@ -256,7 +295,8 @@ static void plugin_gen_inject(struct qemu_plugin_tb 
*plugin_tb)
 cbs = insn->insn_cbs;
 for (i = 0, n = (cbs ? cbs->len : 0); i < n; i++) {
 inject_cb(
-&g_array_index(cbs, struct qemu_plugin_dyn_cb, i));
+&g_array_index(cbs, struct qemu_plugin_dyn_cb, i),
+&gen_ra);
 }
 break;
 
@@ -288,7 +328,7 @@ static void plugin_gen_inject(struct qemu_plugin_tb 
*plugin_tb)
 cbs = insn->mem_cbs;
 for (i = 0, n = (cbs ? cbs->len : 0); i < n; i++) {
 inject_mem_cb(&g_array_index(cbs, struct qemu_plugin_dyn_cb, 
i),
-  rw, meminfo, addr);
+  &gen_ra, rw, meminfo, addr);
 }
 
 tcg_ctx->emit_before_op = NULL;
-- 
2.34.1

[PATCH 7/7] target/i386: Implement TCGCPUOps for plugin register reads

2024-04-15 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 target/i386/tcg/tcg-cpu.c | 72 ++-
 1 file changed, 56 insertions(+), 16 deletions(-)

diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index cca19cd40e..2370053df2 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -22,9 +22,11 @@
 #include "helper-tcg.h"
 #include "qemu/accel.h"
 #include "hw/core/accel-cpu.h"
-
+#include "gdbstub/helpers.h"
+#include "gdb-internal.h"
 #include "tcg-cpu.h"
 
+
 /* Frob eflags into and out of the CPU temporary format.  */
 
 static void x86_cpu_exec_enter(CPUState *cs)
@@ -61,38 +63,74 @@ static void x86_cpu_synchronize_from_tb(CPUState *cs,
 }
 }
 
-static void x86_restore_state_to_opc(CPUState *cs,
- const TranslationBlock *tb,
- const uint64_t *data)
+static uint64_t eip_from_unwind(CPUX86State *env, const TranslationBlock *tb,
+uint64_t data0)
 {
-X86CPU *cpu = X86_CPU(cs);
-CPUX86State *env = &cpu->env;
-int cc_op = data[1];
 uint64_t new_pc;
 
 if (tb_cflags(tb) & CF_PCREL) {
 /*
- * data[0] in PC-relative TBs is also a linear address, i.e. an 
address with
- * the CS base added, because it is not guaranteed that EIP bits 12 
and higher
- * stay the same across the translation block.  Add the CS base back 
before
- * replacing the low bits, and subtract it below just like for 
!CF_PCREL.
+ * data[0] in PC-relative TBs is also a linear address,
+ * i.e. an address with the CS base added, because it is
+ * not guaranteed that EIP bits 12 and higher stay the
+ * same across the translation block.  Add the CS base
+ * back before replacing the low bits, and subtract it
+ * below just like for !CF_PCREL.
  */
 uint64_t pc = env->eip + tb->cs_base;
-new_pc = (pc & TARGET_PAGE_MASK) | data[0];
+new_pc = (pc & TARGET_PAGE_MASK) | data0;
 } else {
-new_pc = data[0];
+new_pc = data0;
 }
 if (tb->flags & HF_CS64_MASK) {
-env->eip = new_pc;
-} else {
-env->eip = (uint32_t)(new_pc - tb->cs_base);
+return new_pc;
 }
+return (uint32_t)(new_pc - tb->cs_base);
+}
 
+static void x86_restore_state_to_opc(CPUState *cs,
+ const TranslationBlock *tb,
+ const uint64_t *data)
+{
+CPUX86State *env = cpu_env(cs);
+CCOp cc_op;
+
+env->eip = eip_from_unwind(env, tb, data[0]);
+
+cc_op = data[1];
 if (cc_op != CC_OP_DYNAMIC) {
 env->cc_op = cc_op;
 }
 }
 
+static bool x86_plugin_need_unwind_for_reg(CPUState *cs, int reg)
+{
+return reg == IDX_IP_REG || reg == IDX_FLAGS_REG;
+}
+
+static int x86_plugin_unwind_read_reg(CPUState *cs, GByteArray *buf, int reg,
+  const TranslationBlock *tb,
+  const uint64_t *data)
+{
+CPUX86State *env = cpu_env(cs);
+CCOp cc_op;
+
+switch (reg) {
+case IDX_IP_REG:
+return gdb_get_regl(buf, eip_from_unwind(env, tb, data[0]));
+
+case IDX_FLAGS_REG:
+cc_op = data[1];
+if (cc_op == CC_OP_DYNAMIC) {
+cc_op = env->cc_op;
+}
+return gdb_get_reg32(buf, cpu_compute_eflags_ccop(env, cc_op));
+
+default:
+g_assert_not_reached();
+}
+}
+
 #ifndef CONFIG_USER_ONLY
 static bool x86_debug_check_breakpoint(CPUState *cs)
 {
@@ -110,6 +148,8 @@ static const TCGCPUOps x86_tcg_ops = {
 .initialize = tcg_x86_init,
 .synchronize_from_tb = x86_cpu_synchronize_from_tb,
 .restore_state_to_opc = x86_restore_state_to_opc,
+.plugin_need_unwind_for_reg = x86_plugin_need_unwind_for_reg,
+.plugin_unwind_read_reg = x86_plugin_unwind_read_reg,
 .cpu_exec_enter = x86_cpu_exec_enter,
 .cpu_exec_exit = x86_cpu_exec_exit,
 #ifdef CONFIG_USER_ONLY
-- 
2.34.1

[PATCH 5/7] target/i386: Split out gdb-internal.h

2024-04-15 Thread Richard Henderson

Signed-off-by: Richard Henderson 
---
 target/i386/gdb-internal.h | 65 ++
 target/i386/gdbstub.c  |  1 +
 2 files changed, 66 insertions(+)
 create mode 100644 target/i386/gdb-internal.h

diff --git a/target/i386/gdb-internal.h b/target/i386/gdb-internal.h
new file mode 100644
index 0000000000..7cf4c1a656
--- /dev/null
+++ b/target/i386/gdb-internal.h
@@ -0,0 +1,65 @@
+/*
+ * x86 gdb server stub
+ *
+ * Copyright (c) 2003-2005 Fabrice Bellard
+ * Copyright (c) 2013 SUSE LINUX Products GmbH
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef I386_GDB_INTERNAL_H
+#define I386_GDB_INTERNAL_H
+
+/*
+ * Keep these in sync with assignment to
+ * gdb_num_core_regs in target/i386/cpu.c
+ * and with the machine description
+ */
+
+/*
+ * SEG: 6 segments, plus fs_base, gs_base, kernel_gs_base
+ */
+
+/*
+ * general regs ->  8 or 16
+ */
+#define IDX_NB_IP   1
+#define IDX_NB_FLAGS1
+#define IDX_NB_SEG  (6 + 3)
+#define IDX_NB_CTL  6
+#define IDX_NB_FP   16
+/*
+ * fpu regs --> 8 or 16
+ */
+#define IDX_NB_MXCSR1
+/*
+ *  total ----> 8+1+1+9+6+16+8+1=50 or 16+1+1+9+6+16+16+1=66
+ */
+
+#define IDX_IP_REG  CPU_NB_REGS
+#define IDX_FLAGS_REG   (IDX_IP_REG + IDX_NB_IP)
+#define IDX_SEG_REGS(IDX_FLAGS_REG + IDX_NB_FLAGS)
+#define IDX_CTL_REGS(IDX_SEG_REGS + IDX_NB_SEG)
+#define IDX_FP_REGS (IDX_CTL_REGS + IDX_NB_CTL)
+#define IDX_XMM_REGS(IDX_FP_REGS + IDX_NB_FP)
+#define IDX_MXCSR_REG   (IDX_XMM_REGS + CPU_NB_REGS)
+
+#define IDX_CTL_CR0_REG (IDX_CTL_REGS + 0)
+#define IDX_CTL_CR2_REG (IDX_CTL_REGS + 1)
+#define IDX_CTL_CR3_REG (IDX_CTL_REGS + 2)
+#define IDX_CTL_CR4_REG (IDX_CTL_REGS + 3)
+#define IDX_CTL_CR8_REG (IDX_CTL_REGS + 4)
+#define IDX_CTL_EFER_REG(IDX_CTL_REGS + 5)
+
+#endif
diff --git a/target/i386/gdbstub.c b/target/i386/gdbstub.c
index ebb000df6a..9662509b82 100644
--- a/target/i386/gdbstub.c
+++ b/target/i386/gdbstub.c
@@ -20,6 +20,7 @@
 #include "qemu/osdep.h"
 #include "cpu.h"
 #include "include/gdbstub/helpers.h"
+#include "gdb-internal.h"
 
 #ifdef TARGET_X86_64
 static const int gpr_map[16] = {
-- 
2.34.1

[PATCH 0/7] plugins: Use unwind info for special gdb registers

2024-04-15 Thread Richard Henderson

Based-on: 20240404230611.21231-1-richard.hender...@linaro.org
("[PATCH v2 00/21] Rewrite plugin code generation")

This is an attempt to fix
https://gitlab.com/qemu-project/qemu/-/issues/2208
("PC is not updated for each instruction in TCG plugins")

I have only updated target/i386 so far, but basically all targets
need updating for the new callbacks.  Extra points to anyone who
sees how to avoid the extra code duplication.  :-)


r~


Richard Henderson (7):
  tcg: Introduce INDEX_op_plugin_pc
  accel/tcg: Set CPUState.plugin_ra before all plugin callbacks
  accel/tcg: Return the TranslationBlock from cpu_unwind_state_data
  plugins: Introduce TCGCPUOps callbacks for mid-tb register reads
  target/i386: Split out gdb-internal.h
  target/i386: Introduce cpu_compute_eflags_ccop
  target/i386: Implement TCGCPUOps for plugin register reads

 include/exec/cpu-common.h |  9 +++--
 include/hw/core/cpu.h |  1 +
 include/hw/core/tcg-cpu-ops.h | 13 +++
 include/tcg/tcg-op-common.h   |  1 +
 include/tcg/tcg-opc.h |  1 +
 target/i386/cpu.h |  2 +
 target/i386/gdb-internal.h| 65 +++
 accel/tcg/plugin-gen.c| 50 +---
 accel/tcg/translate-all.c |  9 +++--
 plugins/api.c | 36 +-
 target/i386/gdbstub.c |  1 +
 target/i386/helper.c  |  6 ++-
 target/i386/tcg/cc_helper.c   | 10 +
 target/i386/tcg/tcg-cpu.c | 72 +++
 tcg/tcg-op.c  |  5 +++
 tcg/tcg.c | 10 +
 16 files changed, 258 insertions(+), 33 deletions(-)
 create mode 100644 target/i386/gdb-internal.h

-- 
2.34.1

[PATCH 4/7] plugins: Introduce TCGCPUOps callbacks for mid-tb register reads

2024-04-15 Thread Richard Henderson

Certain target registers are not updated continuously within
the translation block.  For normal exception handling we use
unwind info to re-generate the correct value when required.
Leverage that same info for reading those registers for plugins.

All targets will need updating for these new callbacks.

Signed-off-by: Richard Henderson 
---
 include/hw/core/tcg-cpu-ops.h | 13 +
 plugins/api.c | 36 +--
 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
index bf8ff8e3ee..e954d83edf 100644
--- a/include/hw/core/tcg-cpu-ops.h
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -49,6 +49,19 @@ struct TCGCPUOps {
 /** @debug_excp_handler: Callback for handling debug exceptions */
 void (*debug_excp_handler)(CPUState *cpu);
 
+/**
+ * @plugin_need_unwind_for_reg:
+ * True if unwind info needed for reading reg.
+ */
+bool (*plugin_need_unwind_for_reg)(CPUState *cpu, int reg);
+/**
+ * @plugin_unwind_read_reg:
+ * Like CPUClass.gdb_read_register, but for registers that require
+ * regeneration using unwind info, like in @restore_state_to_opc.
+ */
+int (*plugin_unwind_read_reg)(CPUState *cpu, GByteArray *buf, int reg,
+  const TranslationBlock *tb,
+  const uint64_t *data);
 #ifdef NEED_CPU_H
 #ifdef CONFIG_USER_ONLY
 /**
diff --git a/plugins/api.c b/plugins/api.c
index 3912c9cc8f..3543647a89 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -40,10 +40,12 @@
 #include "qemu/plugin.h"
 #include "qemu/log.h"
 #include "tcg/tcg.h"
+#include "tcg/insn-start-words.h"
 #include "exec/exec-all.h"
 #include "exec/gdbstub.h"
 #include "exec/ram_addr.h"
 #include "disas/disas.h"
+#include "hw/core/tcg-cpu-ops.h"
 #include "plugin.h"
 #ifndef CONFIG_USER_ONLY
 #include "qemu/plugin-memory.h"
@@ -454,9 +456,39 @@ GArray *qemu_plugin_get_registers(void)
 
 int qemu_plugin_read_register(struct qemu_plugin_register *reg, GByteArray 
*buf)
 {
-g_assert(current_cpu);
+CPUState *cs;
+uintptr_t ra;
+int regno;
 
-return gdb_read_register(current_cpu, buf, GPOINTER_TO_INT(reg));
+assert(current_cpu);
+cs = current_cpu;
+ra = cs->neg.plugin_ra;
+regno = GPOINTER_TO_INT(reg);
+
+/*
+ * When plugin_ra is 0, we have no unwind info.  This will be true for
+ * TB callbacks that happen before any insns of the TB have started.
+ */
+if (ra) {
+const TCGCPUOps *tcg_ops = cs->cc->tcg_ops;
+
+/*
+ * For plugins in the middle of the TB, we may need to locate
+ * and use unwind data to reconstruct a register value.
+ * Usually this is required for the PC, but there may be others.
+ */
+if (tcg_ops->plugin_need_unwind_for_reg &&
+tcg_ops->plugin_need_unwind_for_reg(cs, regno)) {
+uint64_t data[TARGET_INSN_START_WORDS];
+const TranslationBlock *tb;
+
+tb = cpu_unwind_state_data(cs, ra, data);
+assert(tb);
+return tcg_ops->plugin_unwind_read_reg(cs, buf, regno, tb, data);
+}
+}
+
+return gdb_read_register(cs, buf, regno);
 }
 
 struct qemu_plugin_scoreboard *qemu_plugin_scoreboard_new(size_t element_size)
-- 
2.34.1

[PATCH 0/7] plugins: Use unwind info for special gdb registers

2024-04-15 Thread Richard Henderson

Based-on: 20240404230611.21231-1-richard.hender...@linaro.org
("[PATCH v2 00/21] Rewrite plugin code generation")

This is an attempt to fix
https://gitlab.com/qemu-project/qemu/-/issues/2208
("PC is not updated for each instruction in TCG plugins")

I have only updated target/i386 so far, but basically all targets
need updating for the new callbacks.  Extra points to anyone who
sees how to avoid the extra code duplication.  :-)


r~


Richard Henderson (7):
  tcg: Introduce INDEX_op_plugin_pc
  accel/tcg: Set CPUState.plugin_ra before all plugin callbacks
  accel/tcg: Return the TranslationBlock from cpu_unwind_state_data
  plugins: Introduce TCGCPUOps callbacks for mid-tb register reads
  target/i386: Split out gdb-internal.h
  target/i386: Introduce cpu_compute_eflags_ccop
  target/i386: Implement TCGCPUOps for plugin register reads

 include/exec/cpu-common.h |  9 +++--
 include/hw/core/cpu.h |  1 +
 include/hw/core/tcg-cpu-ops.h | 13 +++
 include/tcg/tcg-op-common.h   |  1 +
 include/tcg/tcg-opc.h |  1 +
 target/i386/cpu.h |  2 +
 target/i386/gdb-internal.h| 65 +++
 accel/tcg/plugin-gen.c| 50 +---
 accel/tcg/translate-all.c |  9 +++--
 plugins/api.c | 36 +-
 target/i386/gdbstub.c |  1 +
 target/i386/helper.c  |  6 ++-
 target/i386/tcg/cc_helper.c   | 10 +
 target/i386/tcg/tcg-cpu.c | 72 +++
 tcg/tcg-op.c  |  5 +++
 tcg/tcg.c | 10 +
 16 files changed, 258 insertions(+), 33 deletions(-)
 create mode 100644 target/i386/gdb-internal.h

-- 
2.34.1

Re: [PATCH v9 13/20] virtio-net: Return an error when vhost cannot enable RSS

2024-04-15 Thread Jason Wang

On Mon, Apr 15, 2024 at 10:05 PM Yuri Benditovich
 wrote:
>
> On Wed, Apr 3, 2024 at 2:11 PM Akihiko Odaki  wrote:
> >
> > vhost requires eBPF for RSS. When eBPF is not available, virtio-net
> > implicitly disables RSS even if the user explicitly requests it. Return
> > an error instead of implicitly disabling RSS if RSS is requested but not
> > available.
> >
> > Signed-off-by: Akihiko Odaki 
> > ---
> >  hw/net/virtio-net.c | 97 
> > ++---
> >  1 file changed, 48 insertions(+), 49 deletions(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index 61b49e335dea..3d53eba88cfc 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -793,9 +793,6 @@ static uint64_t virtio_net_get_features(VirtIODevice 
> > *vdev, uint64_t features,
> >  return features;
> >  }
> >
> > -if (!ebpf_rss_is_loaded(>ebpf_rss)) {
> > -virtio_clear_feature(, VIRTIO_NET_F_RSS);
> > -}
> >  features = vhost_net_get_features(get_vhost_net(nc->peer), features);
> >  vdev->backend_features = features;
> >
> > @@ -3591,6 +3588,50 @@ static bool 
> > failover_hide_primary_device(DeviceListener *listener,
> >  return qatomic_read(>failover_primary_hidden);
> >  }
> >
> > +static void virtio_net_device_unrealize(DeviceState *dev)
> > +{
> > +VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > +VirtIONet *n = VIRTIO_NET(dev);
> > +int i, max_queue_pairs;
> > +
> > +if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> > +virtio_net_unload_ebpf(n);
> > +}
> > +
> > +/* This will stop vhost backend if appropriate. */
> > +virtio_net_set_status(vdev, 0);
> > +
> > +g_free(n->netclient_name);
> > +n->netclient_name = NULL;
> > +g_free(n->netclient_type);
> > +n->netclient_type = NULL;
> > +
> > +g_free(n->mac_table.macs);
> > +g_free(n->vlans);
> > +
> > +if (n->failover) {
> > +qobject_unref(n->primary_opts);
> > +device_listener_unregister(>primary_listener);
> > +migration_remove_notifier(>migration_state);
> > +} else {
> > +assert(n->primary_opts == NULL);
> > +}
> > +
> > +max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
> > +for (i = 0; i < max_queue_pairs; i++) {
> > +virtio_net_del_queue(n, i);
> > +}
> > +/* delete also control vq */
> > +virtio_del_queue(vdev, max_queue_pairs * 2);
> > +qemu_announce_timer_del(>announce_timer, false);
> > +g_free(n->vqs);
> > +qemu_del_nic(n->nic);
> > +virtio_net_rsc_cleanup(n);
> > +g_free(n->rss_data.indirections_table);
> > +net_rx_pkt_uninit(n->rx_pkt);
> > +virtio_cleanup(vdev);
> > +}
> > +
> >  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> >  {
> >  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > @@ -3760,53 +3801,11 @@ static void virtio_net_device_realize(DeviceState 
> > *dev, Error **errp)
> >
> >  net_rx_pkt_init(>rx_pkt);
> >
> > -if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> > -virtio_net_load_ebpf(n);
> > -}
> > -}
> > -
> > -static void virtio_net_device_unrealize(DeviceState *dev)
> > -{
> > -VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> > -VirtIONet *n = VIRTIO_NET(dev);
> > -int i, max_queue_pairs;
> > -
> > -if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> > -virtio_net_unload_ebpf(n);
> > +if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS) &&
> > +!virtio_net_load_ebpf(n) && get_vhost_net(nc->peer)) {
> > +virtio_net_device_unrealize(dev);
> > +error_setg(errp, "Can't load eBPF RSS for vhost");
> >  }
>
> As I already mentioned, I think this is an extremely bad idea to
> fail to run qemu due to such a reason as the absence of one feature.
> What I suggest is:
> 1. Redefine rss as tri-state (off|auto|on)
> 2. Fail to run only if rss is on and not available via ebpf
> 3. On auto - silently drop it

"Auto" might be problematic for migration compatibility, which makes it
hard for management layers like libvirt to use. The reason is that there's
no way for libvirt to know whether it is supported by the device or not.

Thanks

> 4. The same with 'hash' option - it is not compatible with vhost (at
> least at the moment)
> 5. Reformat the patch as it is hard to review it due to replacing
> entire procedures, i.e. one patch with replacing without changes,
> another one - with real changes.
> If this is hard to review only for me - please ignore that.
>
> > -
> > -/* This will stop vhost backend if appropriate. */
> > -virtio_net_set_status(vdev, 0);
> > -
> > -g_free(n->netclient_name);
> > -n->netclient_name = NULL;
> > -g_free(n->netclient_type);
> > -n->netclient_type = NULL;
> > -
> > -g_free(n->mac_table.macs);
> > -g_free(n->vlans);
> > -
> > -if (n->failover) {
> > -qobject_unref(n->primary_opts);
> > -

RE: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

2024-04-15 Thread Duan, Zhenzhong

Hi Cédric,

>-Original Message-
>From: Cédric Le Goater 
>Subject: Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device
>
>On 4/8/24 10:12, Zhenzhong Duan wrote:
>> HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
>> container backend.
>>
>> It includes a link to VFIODevice.
>>
>> Suggested-by: Eric Auger 
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/hw/vfio/vfio-common.h | 11 +++
>>   hw/vfio/container.c   | 11 ++-
>>   2 files changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>> index b9da6c08ef..f30772f534 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -31,6 +31,7 @@
>>   #endif
>>   #include "sysemu/sysemu.h"
>>   #include "hw/vfio/vfio-container-base.h"
>> +#include "sysemu/host_iommu_device.h"
>>
>>   #define VFIO_MSG_PREFIX "vfio %s: "
>>
>> @@ -147,6 +148,16 @@ typedef struct VFIOGroup {
>>   bool ram_block_discard_allowed;
>>   } VFIOGroup;
>>
>> +#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-
>vfio"
>
>I would prefer to keep the prefix TYPE_HOST_IOMMU_DEVICE.

Will do.

>
>> +OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
>> +
>> +/* Abstraction of VFIO legacy host IOMMU device */
>> +struct HIODLegacyVFIO {
>
>same here

Should I do the same for all the HostIOMMUDevice and HostIOMMUDeviceClass 
sub-structures?

The reason I used the 'HIOD' abbreviation is that some function names become
extremely long and exceed 80 characters. E.g.:

@@ -1148,9 +1148,9 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
 vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
 };

-static int hiod_legacy_vfio_get_host_iommu_info(HostIOMMUDevice *hiod,
-void *data, uint32_t len,
-Error **errp)
+static int host_iommu_device_legacy_vfio_get_host_iommu_info(HostIOMMUDevice 
*hiod,
+ void *data, 
uint32_t len,
+ Error **errp)
 {
 VFIODevice *vbasedev = HIOD_LEGACY_VFIO(hiod)->vdev;
 /* iova_ranges is a sorted list */
@@ -1173,7 +1173,7 @@ static void hiod_legacy_vfio_class_init(ObjectClass *oc, 
void *data)
 {
 HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);

-hioc->get_host_iommu_info = hiod_legacy_vfio_get_host_iommu_info;
+hioc->get_host_iommu_info = 
host_iommu_device_legacy_vfio_get_host_iommu_info;
 };

I didn't find another way to make it meet the 80-character limit. Any
suggestions on this?

>
>> +/*< private >*/
>> +HostIOMMUDevice parent;
>> +VFIODevice *vdev;
>
>It seems to me that the back pointer should be on the container instead.
>Looks more correct conceptually.

Yes, that makes sense for legacy VFIO, as iova_ranges, pgsizes etc are all 
saved in bcontainer.

>
>
>> +};
>> +
>>   typedef struct VFIODMABuf {
>>   QemuDmaBuf buf;
>>   uint32_t pos_x, pos_y, pos_updates;
>> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
>> index 77bdec276e..44018ef085 100644
>> --- a/hw/vfio/container.c
>> +++ b/hw/vfio/container.c
>> @@ -1143,12 +1143,21 @@ static void
>vfio_iommu_legacy_class_init(ObjectClass *klass, void *data)
>>   vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
>>   };
>>
>> +static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
>> +{
>> +};
>
>It is preferable to introduce routines only when they are actually useful.
>Please drop the .class_init definition.

Sure.

Thanks
Zhenzhong

>
>Thanks,
>
>C.
>
>
>> +
>>   static const TypeInfo types[] = {
>>   {
>>   .name = TYPE_VFIO_IOMMU_LEGACY,
>>   .parent = TYPE_VFIO_IOMMU,
>>   .class_init = vfio_iommu_legacy_class_init,
>> -},
>> +}, {
>> +.name = TYPE_HIOD_LEGACY_VFIO,
>> +.parent = TYPE_HOST_IOMMU_DEVICE,
>> +.instance_size = sizeof(HIODLegacyVFIO),
>> +.class_init = hiod_legacy_vfio_class_init,
>> +}
>>   };
>>
>>   DEFINE_TYPES(types)

Re: [PATCH v8] virtio-pci: fix use of a released vector

2024-04-15 Thread Jason Wang

On Mon, Apr 15, 2024 at 6:41 PM Cindy Lu  wrote:
>
> On Mon, Apr 15, 2024 at 5:34 PM Michael S. Tsirkin  wrote:
> >
> > From: Cindy Lu 
> >
> > During the booting process of the non-standard image, the behavior of the
> > called function in qemu is as follows:
> >
> > 1. vhost_net_stop() was triggered by the guest image. This will call the
> > function virtio_pci_set_guest_notifiers() with assign=false;
> > virtio_pci_set_guest_notifiers() will release the irqfd for vector 0
> >
> > 2. virtio_reset() was triggered, this will set configure vector to 
> > VIRTIO_NO_VECTOR
> >
> > 3. vhost_net_start() was called (at this time, the configure vector is
> > still VIRTIO_NO_VECTOR) and then called virtio_pci_set_guest_notifiers() with
> > assign=true, so the irqfd for vector 0 is still not "init" during this
> > process
> >
> > 4. The system continues to boot and sets the vector back to 0. After that
> > msix_fire_vector_notifier() was triggered to unmask vector 0 and hit
> > the crash
> >
> > To fix the issue, we need to support changing the vector after 
> > VIRTIO_CONFIG_S_DRIVER_OK is set.
> >
> > (gdb) bt
> > 0  __pthread_kill_implementation (threadid=, 
> > signo=signo@entry=6, no_tid=no_tid@entry=0)
> > at pthread_kill.c:44
> > 1  0x7fc87148ec53 in __pthread_kill_internal (signo=6, 
> > threadid=) at pthread_kill.c:78
> > 2  0x7fc87143e956 in __GI_raise (sig=sig@entry=6) at 
> > ../sysdeps/posix/raise.c:26
> > 3  0x7fc8714287f4 in __GI_abort () at abort.c:79
> > 4  0x7fc87142871b in __assert_fail_base
> > (fmt=0x7fc8715bbde0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", 
> > assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d 
> > "../accel/kvm/kvm-all.c", line=1837, function=) at 
> > assert.c:92
> > 5  0x7fc871437536 in __GI___assert_fail
> > (assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d 
> > "../accel/kvm/kvm-all.c", line=1837, function=0x5606413f06f0 
> > <__PRETTY_FUNCTION__.19> "kvm_irqchip_commit_routes") at assert.c:101
> > 6  0x560640f884b5 in kvm_irqchip_commit_routes (s=0x560642cae1f0) at 
> > ../accel/kvm/kvm-all.c:1837
> > 7  0x560640c98f8e in virtio_pci_one_vector_unmask
> > (proxy=0x560643c65f00, queue_no=4294967295, vector=0, msg=..., 
> > n=0x560643c6e4c8)
> > at ../hw/virtio/virtio-pci.c:1005
> > 8  0x560640c99201 in virtio_pci_vector_unmask (dev=0x560643c65f00, 
> > vector=0, msg=...)
> > at ../hw/virtio/virtio-pci.c:1070
> > 9  0x560640bc402e in msix_fire_vector_notifier (dev=0x560643c65f00, 
> > vector=0, is_masked=false)
> > at ../hw/pci/msix.c:120
> > 10 0x560640bc40f1 in msix_handle_mask_update (dev=0x560643c65f00, 
> > vector=0, was_masked=true)
> > at ../hw/pci/msix.c:140
> > 11 0x560640bc4503 in msix_table_mmio_write (opaque=0x560643c65f00, 
> > addr=12, val=0, size=4)
> > at ../hw/pci/msix.c:231
> > 12 0x560640f26d83 in memory_region_write_accessor
> > (mr=0x560643c66540, addr=12, value=0x7fc86b7bc628, size=4, shift=0, 
> > mask=4294967295, attrs=...)
> > at ../system/memory.c:497
> > 13 0x560640f270a6 in access_with_adjusted_size
> >
> >  (addr=12, value=0x7fc86b7bc628, size=4, access_size_min=1, 
> > access_size_max=4, access_fn=0x560640f26c8d , 
> > mr=0x560643c66540, attrs=...) at ../system/memory.c:573
> > 14 0x560640f2a2b5 in memory_region_dispatch_write (mr=0x560643c66540, 
> > addr=12, data=0, op=MO_32, attrs=...)
> > at ../system/memory.c:1521
> > 15 0x560640f37bac in flatview_write_continue
> > (fv=0x7fc65805e0b0, addr=4273803276, attrs=..., ptr=0x7fc871e9c028, 
> > len=4, addr1=12, l=4, mr=0x560643c66540)
> > at ../system/physmem.c:2714
> > 16 0x560640f37d0f in flatview_write
> > (fv=0x7fc65805e0b0, addr=4273803276, attrs=..., buf=0x7fc871e9c028, 
> > len=4) at ../system/physmem.c:2756
> > 17 0x560640f380bf in address_space_write
> > (as=0x560642161ae0 , addr=4273803276, attrs=..., 
> > buf=0x7fc871e9c028, len=4)
> > at ../system/physmem.c:2863
> > 18 0x560640f3812c in address_space_rw
> > (as=0x560642161ae0 , addr=4273803276, attrs=..., 
> > buf=0x7fc871e9c028, len=4, is_write=true) at ../system/physmem.c:2873
> > --Type  for more, q to quit, c to continue without paging--
> > 19 0x560640f8aa55 in kvm_cpu_exec (cpu=0x560642f205e0) at 
> > ../accel/kvm/kvm-all.c:2915
> > 20 0x560640f8d731 in kvm_vcpu_thread_fn (arg=0x560642f205e0) at 
> > ../accel/kvm/kvm-accel-ops.c:51
> > 21 0x5606411949f4 in qemu_thread_start (args=0x560642f292b0) at 
> > ../util/qemu-thread-posix.c:541
> > 22 0x7fc87148cdcd in start_thread (arg=) at 
> > pthread_create.c:442
> > 23 0x7fc871512630 in clone3 () at 
> > ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
> > (gdb)
> >
> > MST: coding style and typo fixups
> >
> > Fixes: f9a09ca3ea ("vhost: add support for configure interrupt")
> > Cc: qemu-sta...@nongnu.org
> > Signed-off-by: Cindy Lu 
> > Message-Id: <20240412062750.475180-1-l...@redhat.com>
> >

Re: [PATCH 1/2] target/riscv: prioritize pmp errors in raise_mmu_exception()

2024-04-15 Thread Joseph Chan

Digging more in  Priv-v1.12/riscv-privileged-20211203.pdf

 :
Page 82,  Section 4.3.2 Virtual Address Translation Process.

The spec actually mentions an address translation algorithm. Step 2
mentions the exception code should be " access-fault exception
corresponding to the original access type". i.e. 1, 5, 7.  All other steps
should use " page-fault exception corresponding to the original access
type". i.e. 12, 13, 15.

Regards,
Joseph Chan

On Mon, Apr 15, 2024 at 11:50 AM Joseph Chan  wrote:

> FYI
>
> Priv-v1.12/riscv-privileged-20211203.pdf
> 
> defines exception priorities on
> Page 40, Table 3.7
> Page 130, Table 8.7
>
> There is a sentence under Table 3.7:
> "When a virtual address is translated into a physical address, the address
> translation algorithm
> determines what specific exception may be raised."
>
>
> The spec does not require any implementation to report Exception Code 12
> over 1, or 13, 15 over 5, 7. On the other hand, the phrases "During instruction
> address translation:" and "With physical address for instruction:" give me
> the impression that when the implementation can distinguish between these
> situations, then reporting 12 , 13, 15 instead of 1, 5, 7 will provide a
> fine-grained reason for why things were broken.
>
> Regards,
> Joseph Chan
>
>
> On Sat, Apr 13, 2024 at 3:59 AM Alexei Filippov <
> alexei.filip...@syntacore.com> wrote:
>
>> From: Daniel Henrique Barboza 
>>
>> raise_mmu_exception(), as is today, is prioritizing guest page faults by
>> checking first if virt_enabled && !first_stage, and then considering the
>> regular inst/load/store faults.
>>
>> There's no mention in the spec about guest page fault being a higher
>> priority than PMP faults. In fact, privileged spec section 3.7.1 says:
>>
>> "Attempting to fetch an instruction from a PMP region that does not have
>> execute permissions raises an instruction access-fault exception.
>> Attempting to execute a load or load-reserved instruction which accesses
>> a physical address within a PMP region without read permissions raises a
>> load access-fault exception. Attempting to execute a store,
>> store-conditional, or AMO instruction which accesses a physical address
>> within a PMP region without write permissions raises a store
>> access-fault exception."
>>
>> So, in fact, we're doing it wrong - PMP faults should always be thrown,
>> regardless of also being a first or second stage fault.
>>
>> The way riscv_cpu_tlb_fill() and get_physical_address() work is
>> adequate: a TRANSLATE_PMP_FAIL error is immediately reported and
>> reflected in the 'pmp_violation' flag. What we need is to change
>> raise_mmu_exception() to prioritize it.
>>
>> Reported-by: Joseph Chan 
>> Fixes: 82d53adfbb ("target/riscv/cpu_helper.c: Invalid exception on MMU
>> translation stage")
>> Signed-off-by: Daniel Henrique Barboza 
>> ---
>>  target/riscv/cpu_helper.c | 22 --
>>  1 file changed, 12 insertions(+), 10 deletions(-)
>>
>> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
>> index bc70ab5abc..196166f8dd 100644
>> --- a/target/riscv/cpu_helper.c
>> +++ b/target/riscv/cpu_helper.c
>> @@ -1203,28 +1203,30 @@ static void raise_mmu_exception(CPURISCVState
>> *env, target_ulong address,
>>
>>  switch (access_type) {
>>  case MMU_INST_FETCH:
>> -if (env->virt_enabled && !first_stage) {
>> +if (pmp_violation) {
>> +cs->exception_index = RISCV_EXCP_INST_ACCESS_FAULT;
>> +} else if (env->virt_enabled && !first_stage) {
>>  cs->exception_index = RISCV_EXCP_INST_GUEST_PAGE_FAULT;
>>  } else {
>> -cs->exception_index = pmp_violation ?
>> -RISCV_EXCP_INST_ACCESS_FAULT :
>> RISCV_EXCP_INST_PAGE_FAULT;
>> +cs->exception_index = RISCV_EXCP_INST_PAGE_FAULT;
>>  }
>>  break;
>>  case MMU_DATA_LOAD:
>> -if (two_stage && !first_stage) {
>> +if (pmp_violation) {
>> +cs->exception_index = RISCV_EXCP_LOAD_ACCESS_FAULT;
>> +} else if (two_stage && !first_stage) {
>>  cs->exception_index = RISCV_EXCP_LOAD_GUEST_ACCESS_FAULT;
>>  } else {
>> -cs->exception_index = pmp_violation ?
>> -RISCV_EXCP_LOAD_ACCESS_FAULT :
>> RISCV_EXCP_LOAD_PAGE_FAULT;
>> +cs->exception_index = RISCV_EXCP_LOAD_PAGE_FAULT;
>>  }
>>  break;
>>  case MMU_DATA_STORE:
>> -if (two_stage && !first_stage) {
>> +if (pmp_violation) {
>> +cs->exception_index = RISCV_EXCP_STORE_AMO_ACCESS_FAULT;
>> +} else if (two_stage && !first_stage) {
>>  cs->exception_index =
>> RISCV_EXCP_STORE_GUEST_AMO_ACCESS_FAULT;
>>  } else {
>> -cs->exception_index =

Re: [PATCH for-9.0] ppc440_pcix: Do not expose a bridge device on PCI bus

2024-04-15 Thread Nicholas Piggin

On Wed Apr 10, 2024 at 9:03 PM AEST, BALATON Zoltan wrote:
> On Wed, 10 Apr 2024, Nicholas Piggin wrote:
> > On Wed Apr 10, 2024 at 9:55 AM AEST, BALATON Zoltan wrote:
> >> Real 460EX SoC apparently does not expose a bridge device and having
> >> it appear on PCI bus confuses an AmigaOS file system driver that uses
> >> this to detect which machine it is running on. Since values written
> >> here by firmware are never read, just ignore these writes and drop the
> >> bridge device.
> >>
> >> Signed-off-by: BALATON Zoltan 
> >> ---
> >> This is only used by sam460ex and this fixes an issue with AmigaOS on
> >> this machine so I'd like this to be merged for 9.0 please.
> >
> > Is it a regression? Does it have a fixes: or resolves: tag?
> >
> > Unless we broke it in this cycle, I would be inclined to wait,
> > and we can ask to put it in stable.
>
> It's not something that broke in this cycle but since this does not affect 
> anything else than sam460ex I think it's OK to change this for 9.0. The 
> changes to 440 tlb in this cycle made sam460ex more useful to run AmigaOS 
> and this fixes the file system driver on it so it would make 9.0 really 
> usable. Otherwise people would have to wait longer until August or install 
> a stable update. Since this has low chance to break anything (tested with 
> AmigaOS and Linux and MorphOS does not boot due to do_io changes anyway) I 
> don't think we have to wait with this.

Hey, travelling / at a conference / on vacation for the next couple of
weeks.

It's just a bit late for hard freeze IMO, since we didn't break it
before the prior release or a bad security / crash bug. Will put it in
9.1.

Thanks,
Nick

Re: [RFC v3 0/6] virtio,vhost: Add VIRTIO_F_IN_ORDER support

2024-04-15 Thread Lei Yang

QE tested this series with packed=on/off, in_order=true and vhost=off
under regression tests; everything works fine.

Tested-by: Lei Yang 

On Mon, Apr 8, 2024 at 11:34 PM Jonah Palmer  wrote:
>
> The goal of these patches is to add support to a variety of virtio and
> vhost devices for the VIRTIO_F_IN_ORDER transport feature. This feature
> indicates that all buffers are used by the device in the same order in
> which they were made available by the driver.
>
> These patches attempt to implement a generalized, non-device-specific
> solution to support this feature.
>
> The core feature behind this solution is a buffer mechanism in the form
> of a VirtQueue's used_elems VirtQueueElement array. This allows devices
> who always use buffers in-order by default to have a minimal overhead
> impact. Devices that may not always use buffers in-order likely will
> experience a performance hit. How large that performance hit is will
> depend on how frequent elements are completed out-of-order.
>
> A VirtQueue whose device uses this feature will use its used_elems
> VirtQueueElement array to hold used VirtQueueElements. The index that
> used elements are placed in used_elems is the same index on the
> used/descriptor ring that would satisfy the in-order requirement. In
> other words, used elements are placed in their in-order locations on
> used_elems and are only written to the used/descriptor ring once the
> elements on used_elems are able to continue their expected order.
>
> To differentiate between a "used" and "unused" element on the used_elems
> array (a "used" element being an element that has returned from
> processing and an "unused" element being an element that has not yet
> been processed), we added a boolean 'filled' member to the
> VirtQueueElement struct. This flag is set to true when the element comes
> back from processing (virtqueue_ordered_fill) and then set back to false
> once it's been written to the used/descriptor ring
> (virtqueue_ordered_flush).
>
> ---
> v3: Add elements to used_elems during virtqueue_split/packed_pop
> Replace current_seq_idx usage with vq->last_avail_idx
> Remove used_seq_idx, leverage used_idx and last_avail_idx for
> searching used_elems
> Remove seq_idx in VirtQueueElement
> Add boolean to VirtQueueElement to signal element status
> Add virtqueue_ordered_fill/flush functions for ordering
>
> v2: Use a VirtQueue's used_elems array as a buffer mechanism
>
> v1: Implement custom GLib GHashTable as a buffer mechanism
>
> Jonah Palmer (6):
>   virtio: Add bool to VirtQueueElement
>   virtio: virtqueue_pop - VIRTIO_F_IN_ORDER support
>   virtio: virtqueue_ordered_fill - VIRTIO_F_IN_ORDER support
>   virtio: virtqueue_ordered_flush - VIRTIO_F_IN_ORDER support
>   vhost,vhost-user: Add VIRTIO_F_IN_ORDER to vhost feature bits
>   virtio: Add VIRTIO_F_IN_ORDER property definition
>
>  hw/block/vhost-user-blk.c|   1 +
>  hw/net/vhost_net.c   |   2 +
>  hw/scsi/vhost-scsi.c |   1 +
>  hw/scsi/vhost-user-scsi.c|   1 +
>  hw/virtio/vhost-user-fs.c|   1 +
>  hw/virtio/vhost-user-vsock.c |   1 +
>  hw/virtio/virtio.c   | 118 ++-
>  include/hw/virtio/virtio.h   |   5 +-
>  net/vhost-vdpa.c |   1 +
>  9 files changed, 127 insertions(+), 4 deletions(-)
>
> --
> 2.39.3
>

[RFC 1/1] hw/nvme: add atomic write support

2024-04-15 Thread Alan Adamson

Forces writes to be atomic based on new atomic write parameters.

New NVMe QEMU Parameters (See NVMe Specification for details):
atomic.dn (default off) - Set the value of Disable Normal.
atomic.awun=UINT16 (default: 0)
atomic.awupf=UINT16 (default: 0)
atomic.acwu=UINT16 (default: 0)

Signed-off-by: Alan Adamson 
---
 hw/nvme/ctrl.c | 147 -
 hw/nvme/nvme.h |  17 ++
 2 files changed, 163 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 127c3d238346..5d19965122d0 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -40,6 +40,9 @@
  *  sriov_vi_flexible= \
  *  sriov_max_vi_per_vf= \
  *  sriov_max_vq_per_vf= \
+ *  atomic.awun, \
+ *  atomic.awupf, \
+ *  atomic.acwu, \
  *  subsys=
  *  -device nvme-ns,drive=,bus=,nsid=,\
  *  zoned=, \
@@ -254,6 +257,7 @@ static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_ERROR_RECOVERY]   = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
+[NVME_WRITE_ATOMICITY]  = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
 [NVME_TIMESTAMP]= NVME_FEAT_CAP_CHANGE,
 [NVME_HOST_BEHAVIOR_SUPPORT]= NVME_FEAT_CAP_CHANGE,
@@ -6071,6 +6075,9 @@ defaults:
 }
 goto out;
 
+break;
+case NVME_WRITE_ATOMICITY:
+result = n->dn;
 break;
 default:
 result = nvme_feature_default[fid];
@@ -6154,6 +6161,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 uint8_t save = NVME_SETFEAT_SAVE(dw10);
 uint16_t status;
 int i;
+NvmeIdCtrl *id = >id_ctrl;
+NvmeAtomic *atomic = >atomic;
 
 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
 
@@ -6306,6 +6315,21 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
NvmeRequest *req)
 return NVME_CMD_SEQ_ERROR | NVME_DNR;
 case NVME_FDP_EVENTS:
 return nvme_set_feature_fdp_events(n, ns, req);
+case NVME_WRITE_ATOMICITY:
+
+/*
+ * Since NAWUN and NAWUPF are not currently supported, this
+ * has no effect.
+ */
+n->dn = 0x1 & dw11;
+
+if (n->dn) {
+atomic->atomic_max_write_size = id->awupf + 1;
+} else {
+atomic->atomic_max_write_size = id->awun + 1;
+}
+
+break;
 default:
 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
 }
@@ -7003,16 +7027,76 @@ static void nvme_update_sq_tail(NvmeSQueue *sq)
 trace_pci_nvme_update_sq_tail(sq->sqid, sq->tail);
 }
 
+#define NVME_ATOMIC_NO_START0
+#define NVME_ATOMIC_START_ATOMIC1
+#define NVME_ATOMIC_START_NONATOMIC 2
+
+static int nvme_atomic_write_check(NvmeCtrl *n, NvmeCmd *cmd, NvmeAtomic 
*atomic)
+{
+NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
+uint64_t slba = le64_to_cpu(rw->slba);
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb);
+uint64_t elba = slba + nlb;
+bool cmd_atomic_wr = 1;
+int i;
+
+if ((cmd->opcode == NVME_CMD_READ) || ((cmd->opcode == NVME_CMD_WRITE) &&
+((rw->nlb + 1) > atomic->atomic_max_write_size))) {
+cmd_atomic_wr = 0;
+}
+
+for (i = 1; i < n->params.max_ioqpairs + 1; i++) {
+NvmeSQueue *sq;
+NvmeRequest *r;
+NvmeRwCmd *req_rw;
+uint64_t req_slba;
+uint32_t req_nlb;
+uint64_t req_elba;
+
+sq = n->sq[i];
+if (!sq) {
+break;
+}
+
+QTAILQ_FOREACH(r, >out_req_list, entry) {
+req_rw = (NvmeRwCmd *)>cmd;
+
+if (cmd->nsid == r->ns->params.nsid) {
+req_slba = le64_to_cpu(req_rw->slba);
+req_nlb = (uint32_t)le16_to_cpu(req_rw->nlb);
+req_elba = req_slba + req_nlb;
+
+if (cmd_atomic_wr) {
+if ((elba >= req_slba) && (slba <= req_elba)) {
+return NVME_ATOMIC_NO_START;
+}
+} else {
+if (r->atomic_write && ((elba >= req_slba) &&
+(slba <= req_elba))) {
+return NVME_ATOMIC_NO_START;
+}
+}
+}
+}
+}
+if (cmd_atomic_wr) {
+return NVME_ATOMIC_START_ATOMIC;
+}
+return NVME_ATOMIC_START_NONATOMIC;
+}
+
 static void nvme_process_sq(void *opaque)
 {
 NvmeSQueue *sq = opaque;
 NvmeCtrl *n = sq->ctrl;
 NvmeCQueue *cq = n->cq[sq->cqid];
-
+NvmeAtomic *atomic = >atomic;
 uint16_t status;
 hwaddr addr;
 NvmeCmd cmd;
 NvmeRequest *req;
+int ret;
+bool set_atomic = 0;
 
 if (n->dbbuf_enabled) {
 nvme_update_sq_tail(sq);
@@ -7026,6 +7110,23 @@ static void

[RFC 0/1] hw/nvme: add atomic write support

2024-04-15 Thread Alan Adamson

Since there is discussion in the Linux NVMe Driver community to add NVMe Atomic 
Write
support, it would be desirable to test it with qemu nvme emulation.
 
Initially, this RFC will focus on supporting NVMe controller atomic write 
parameters (AWUN,
AWUPF, and ACWU) but will be extended to support Namespace parameters (NAWUN, 
NAWUPF
and NACWU).
 
Atomic Write Parameters for NVMe QEMU
-
New NVMe QEMU Parameters (See NVMe Specification for details):
atomic.dn (default off) - Set the value of Disable Normal.
atomic.awun=UINT16 (default: 0)
atomic.awupf=UINT16 (default: 0)
atomic.acwu=UINT16 (default: 0)
 
qemu command line example:
qemu-system-x86_64 -cpu host --enable-kvm -smp cpus=4 -no-reboot -m 
8192M -drive file=./disk.img,if=ide \
-boot c -device e1000,netdev=net0,mac=DE:CC:CC:EF:99:88 -netdev 
tap,id=net0 \
-device 
nvme,id=nvme-ctrl-0,serial=nvme-1,atomic.dn=off,atomic.awun=63,atomic.awupf=63,atomic.acwu=0
 \
-drive file=./nvme.img,if=none,id=nvm-1 -device 
nvme-ns,drive=nvm-1,bus=nvme-ctrl-0 nvme-ns,drive=nvm-1,bus=nvme-ctrl-0
 
Making Writes Atomic:
-
- Prior to a command being pulled off the SQ and executed, a check is made to 
see if it
  conflicts "atomically" with a currently executing command.
- All currently executing commands on the same namespace, across all SQs need 
to be checked.
- If an atomic conflict is detected, the command is not started and remains on 
the queue.
 
Testing
---
NVMe QEMU Parameters used: 
atomic.dn=off,atomic.awun=63,atomic.awupf=63,atomic.acwu=0
 
# nvme id-ctrl /dev/nvme0 | grep awun
awun  : 63
# nvme id-ctrl /dev/nvme0 | grep awupf
awupf : 63
# nvme id-ctrl /dev/nvme0 | grep acwu
acwu  : 0  <-- Since qemu-nvme doesn't support Compare and Write, this is 
always zero
# nvme get-feature /dev/nvme0  -f 0xa
get-feature:0x0a (Write Atomicity Normal), Current value:
#
 
# fio --filename=/dev/nvme0n1 --direct=1 --rw=randwrite --bs=32k --iodepth=256 
--name=iops --numjobs=50 --verify=crc64 --verify_fatal=1 --ioengine=libaio
 
When executed without atomic write support, eventually the following error will 
be
observed:
 
crc64: verify failed at file /dev/nvme0n1 offset 857669632, length 32768
(requested block: offset=857669632, length=32768, flags=88)
Expected CRC: 9c87d3539dafdca0
Received CRC: d521f7ea3b69d2ee
 
When executed with atomic write support, this error no longer happens.
 
Questions
-
AWUN vs AWUPF - Does the nvme emulation need to treat these differently? 
Currently the
larger of the two will be used as the max atomic write size.
 
Future Work
---
- Namespace support (NAWUN, NAWUPF and NACWU)
- Namespace Boundary support (NABSN, NABO, and NABSPF)
- Atomic Compare and Write Unit (ACWU)

Alan Adamson (1):
  nvme: add atomic write support

 hw/nvme/ctrl.c | 147 -
 hw/nvme/nvme.h |  17 ++
 2 files changed, 163 insertions(+), 1 deletion(-)

-- 
2.39.3

[PATCH v5 1/3] ui/console: Introduce dpy_gl_qemu_dmabuf_get_..() helpers

2024-04-15 Thread dongwon . kim

From: Dongwon Kim 

This commit introduces dpy_gl_qemu_dmabuf_get_... helpers to extract
specific fields from the QemuDmaBuf struct. It also updates all instances
where fields within the QemuDmaBuf struct are directly accessed, replacing
them with calls to these new helper functions.

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 include/ui/console.h|  17 +
 hw/display/vhost-user-gpu.c |   6 +-
 hw/display/virtio-gpu-udmabuf.c |   7 +-
 hw/vfio/display.c   |  15 +++--
 ui/console.c| 116 +++-
 ui/dbus-console.c   |   9 ++-
 ui/dbus-listener.c  |  43 +++-
 ui/egl-headless.c   |  23 +--
 ui/egl-helpers.c|  47 +++--
 ui/gtk-egl.c|  48 -
 ui/gtk-gl-area.c|  37 ++
 ui/gtk.c|   6 +-
 ui/spice-display.c  |  50 --
 13 files changed, 316 insertions(+), 108 deletions(-)

diff --git a/include/ui/console.h b/include/ui/console.h
index 0bc7a00ac0..6292943a82 100644
--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -358,6 +358,23 @@ void dpy_gl_cursor_dmabuf(QemuConsole *con, QemuDmaBuf 
*dmabuf,
   bool have_hot, uint32_t hot_x, uint32_t hot_y);
 void dpy_gl_cursor_position(QemuConsole *con,
 uint32_t pos_x, uint32_t pos_y);
+
+int32_t dpy_gl_qemu_dmabuf_get_fd(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_width(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_height(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_stride(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_fourcc(QemuDmaBuf *dmabuf);
+uint64_t dpy_gl_qemu_dmabuf_get_modifier(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_texture(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_x(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_y(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_backing_width(QemuDmaBuf *dmabuf);
+uint32_t dpy_gl_qemu_dmabuf_get_backing_height(QemuDmaBuf *dmabuf);
+bool dpy_gl_qemu_dmabuf_get_y0_top(QemuDmaBuf *dmabuf);
+void *dpy_gl_qemu_dmabuf_get_sync(QemuDmaBuf *dmabuf);
+int32_t dpy_gl_qemu_dmabuf_get_fence_fd(QemuDmaBuf *dmabuf);
+bool dpy_gl_qemu_dmabuf_get_allow_fences(QemuDmaBuf *dmabuf);
+bool dpy_gl_qemu_dmabuf_get_draw_submitted(QemuDmaBuf *dmabuf);
 void dpy_gl_release_dmabuf(QemuConsole *con,
QemuDmaBuf *dmabuf);
 void dpy_gl_update(QemuConsole *con,
diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 709c8a02a1..87dcfbca10 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -249,6 +249,7 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 case VHOST_USER_GPU_DMABUF_SCANOUT: {
 VhostUserGpuDMABUFScanout *m = >payload.dmabuf_scanout;
 int fd = qemu_chr_fe_get_msgfd(>vhost_chr);
+int old_fd;
 QemuDmaBuf *dmabuf;
 
 if (m->scanout_id >= g->parent_obj.conf.max_outputs) {
@@ -262,8 +263,9 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 g->parent_obj.enable = 1;
 con = g->parent_obj.scanout[m->scanout_id].con;
 dmabuf = >dmabuf[m->scanout_id];
-if (dmabuf->fd >= 0) {
-close(dmabuf->fd);
+old_fd = dpy_gl_qemu_dmabuf_get_fd(dmabuf);
+if (old_fd >= 0) {
+close(old_fd);
 dmabuf->fd = -1;
 }
 dpy_gl_release_dmabuf(con, dmabuf);
diff --git a/hw/display/virtio-gpu-udmabuf.c b/hw/display/virtio-gpu-udmabuf.c
index d51184d658..e3f358b575 100644
--- a/hw/display/virtio-gpu-udmabuf.c
+++ b/hw/display/virtio-gpu-udmabuf.c
@@ -206,6 +206,7 @@ int virtio_gpu_update_dmabuf(VirtIOGPU *g,
 {
 struct virtio_gpu_scanout *scanout = >parent_obj.scanout[scanout_id];
 VGPUDMABuf *new_primary, *old_primary = NULL;
+uint32_t width, height;
 
 new_primary = virtio_gpu_create_dmabuf(g, scanout_id, res, fb, r);
 if (!new_primary) {
@@ -216,10 +217,10 @@ int virtio_gpu_update_dmabuf(VirtIOGPU *g,
 old_primary = g->dmabuf.primary[scanout_id];
 }
 
+width = dpy_gl_qemu_dmabuf_get_width(_primary->buf);
+height = dpy_gl_qemu_dmabuf_get_height(_primary->buf);
 g->dmabuf.primary[scanout_id] = new_primary;
-qemu_console_resize(scanout->con,
-new_primary->buf.width,
-new_primary->buf.height);
+qemu_console_resize(scanout->con, width, height);
 dpy_gl_scanout_dmabuf(scanout->con, _primary->buf);
 
 if (old_primary) {
diff --git a/hw/vfio/display.c b/hw/vfio/display.c
index 1aa440c663..f9c39cbd51 100644
--- a/hw/vfio/display.c
+++ b/hw/vfio/display.c
@@ -259,9 +259,13 @@ static VFIODMABuf *vfio_display_get_dmabuf(VFIOPCIDevice 
*vdev,
 
 static void

[PATCH v5 3/3] ui/console: Introduce dpy_gl_qemu_dmabuf_new() and free() helpers

2024-04-15 Thread dongwon . kim

From: Dongwon Kim 

This commit introduces utility functions for the creation and deallocation
of QemuDmaBuf instances. Additionally, it updates all relevant sections
of the codebase to utilize these new utility functions.

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 include/hw/vfio/vfio-common.h   |  2 +-
 include/hw/virtio/virtio-gpu.h  |  4 ++--
 include/ui/console.h|  8 +++-
 hw/display/vhost-user-gpu.c | 32 +--
 hw/display/virtio-gpu-udmabuf.c | 24 +--
 hw/vfio/display.c   | 26 -
 ui/console.c| 34 +
 ui/dbus-listener.c  | 28 ---
 8 files changed, 95 insertions(+), 63 deletions(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..d66e27db02 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -148,7 +148,7 @@ typedef struct VFIOGroup {
 } VFIOGroup;
 
 typedef struct VFIODMABuf {
-QemuDmaBuf buf;
+QemuDmaBuf *buf;
 uint32_t pos_x, pos_y, pos_updates;
 uint32_t hot_x, hot_y, hot_updates;
 int dmabuf_id;
diff --git a/include/hw/virtio/virtio-gpu.h b/include/hw/virtio/virtio-gpu.h
index ed44cdad6b..56d6e821bf 100644
--- a/include/hw/virtio/virtio-gpu.h
+++ b/include/hw/virtio/virtio-gpu.h
@@ -169,7 +169,7 @@ struct VirtIOGPUBaseClass {
 DEFINE_PROP_UINT32("yres", _state, _conf.yres, 800)
 
 typedef struct VGPUDMABuf {
-QemuDmaBuf buf;
+QemuDmaBuf *buf;
 uint32_t scanout_id;
 QTAILQ_ENTRY(VGPUDMABuf) next;
 } VGPUDMABuf;
@@ -238,7 +238,7 @@ struct VhostUserGPU {
 VhostUserBackend *vhost;
 int vhost_gpu_fd; /* closed by the chardev */
 CharBackend vhost_chr;
-QemuDmaBuf dmabuf[VIRTIO_GPU_MAX_SCANOUTS];
+QemuDmaBuf *dmabuf[VIRTIO_GPU_MAX_SCANOUTS];
 bool backend_blocked;
 };
 
diff --git a/include/ui/console.h b/include/ui/console.h
index 3d9d8b9fce..6d7c03b7c5 100644
--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -358,7 +358,13 @@ void dpy_gl_cursor_dmabuf(QemuConsole *con, QemuDmaBuf 
*dmabuf,
   bool have_hot, uint32_t hot_x, uint32_t hot_y);
 void dpy_gl_cursor_position(QemuConsole *con,
 uint32_t pos_x, uint32_t pos_y);
-
+QemuDmaBuf *dpy_gl_qemu_dmabuf_new(uint32_t width, uint32_t height,
+   uint32_t stride, uint32_t x,
+   uint32_t y, uint32_t backing_width,
+   uint32_t backing_height, uint32_t fourcc,
+   uint64_t modifier, uint32_t dmabuf_fd,
+   bool allow_fences, bool y0_top);
+void dpy_gl_qemu_dmabuf_free(QemuDmaBuf *dmabuf);
 int32_t dpy_gl_qemu_dmabuf_get_fd(QemuDmaBuf *dmabuf);
 uint32_t dpy_gl_qemu_dmabuf_get_width(QemuDmaBuf *dmabuf);
 uint32_t dpy_gl_qemu_dmabuf_get_height(QemuDmaBuf *dmabuf);
diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 87dcfbca10..4d8461e94a 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -250,6 +250,7 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 VhostUserGpuDMABUFScanout *m = >payload.dmabuf_scanout;
 int fd = qemu_chr_fe_get_msgfd(>vhost_chr);
 int old_fd;
+uint64_t modifier = 0;
 QemuDmaBuf *dmabuf;
 
 if (m->scanout_id >= g->parent_obj.conf.max_outputs) {
@@ -262,31 +263,34 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 
 g->parent_obj.enable = 1;
 con = g->parent_obj.scanout[m->scanout_id].con;
-dmabuf = >dmabuf[m->scanout_id];
-old_fd = dpy_gl_qemu_dmabuf_get_fd(dmabuf);
-if (old_fd >= 0) {
-close(old_fd);
-dmabuf->fd = -1;
+dmabuf = g->dmabuf[m->scanout_id];
+if (dmabuf) {
+old_fd = dpy_gl_qemu_dmabuf_get_fd(dmabuf);
+if (old_fd >= 0) {
+close(old_fd);
+dpy_gl_qemu_dmabuf_set_fd(dmabuf, -1);
+}
 }
 dpy_gl_release_dmabuf(con, dmabuf);
+g_clear_pointer(, dpy_gl_qemu_dmabuf_free);
 if (fd == -1) {
 dpy_gl_scanout_disable(con);
 break;
 }
-*dmabuf = (QemuDmaBuf) {
-.fd = fd,
-.width = m->fd_width,
-.height = m->fd_height,
-.stride = m->fd_stride,
-.fourcc = m->fd_drm_fourcc,
-.y0_top = m->fd_flags & VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP,
-};
+
 if (msg->request == VHOST_USER_GPU_DMABUF_SCANOUT2) {
 VhostUserGpuDMABUFScanout2 *m2 = >payload.dmabuf_scanout2;
-dmabuf->modifier = m2->modifier;
+modifier = m2->modifier;
 }
 
+dmabuf =

[PATCH v5 0/3] ui/console: Private QemuDmaBuf struct

2024-04-15 Thread dongwon . kim

From: Dongwon Kim 

This series introduces privacy enhancements to the QemuDmaBuf struct
and its contained data to bolster security. It accomplishes this by
introducing helper functions for allocating, deallocating, and
accessing individual fields within the struct and replacing all direct
references to individual fields in the struct with methods using helpers
throughout the codebase. 

This change was made based on a suggestion from Marc-André Lureau
 

(Resubmitting the same patch series with this new cover letter)

Dongwon Kim (3):
  ui/console: Introduce dpy_gl_qemu_dmabuf_get_..() helpers
  ui/console: Introduce dpy_gl_qemu_dmabuf_set_..() helpers
  ui/console: Introduce dpy_gl_qemu_dmabuf_new() and free() helpers

 include/hw/vfio/vfio-common.h   |   2 +-
 include/hw/virtio/virtio-gpu.h  |   4 +-
 include/ui/console.h|  28 +
 hw/display/vhost-user-gpu.c |  32 +++---
 hw/display/virtio-gpu-udmabuf.c |  27 ++---
 hw/vfio/display.c   |  35 ---
 ui/console.c| 180 +++-
 ui/dbus-console.c   |   9 +-
 ui/dbus-listener.c  |  71 +++--
 ui/egl-headless.c   |  23 ++--
 ui/egl-helpers.c|  59 ++-
 ui/gtk-egl.c|  52 +
 ui/gtk-gl-area.c|  41 +---
 ui/gtk.c|   8 +-
 ui/spice-display.c  |  50 +
 15 files changed, 449 insertions(+), 172 deletions(-)

-- 
2.34.1

[PATCH v5 2/3] ui/console: Introduce dpy_gl_qemu_dmabuf_set_..() helpers

2024-04-15 Thread dongwon . kim

From: Dongwon Kim 

To enhance security in accessing the QemuDmaBuf struct, new helper
functions for setting specific fields within the struct were introduced.
And all occurrences where these fields were previously set directly
have been updated to utilize these helper functions.

Suggested-by: Marc-André Lureau 
Cc: Philippe Mathieu-Daudé 
Cc: Vivek Kasireddy 
Signed-off-by: Dongwon Kim 
---
 include/ui/console.h |  5 +
 ui/console.c | 30 ++
 ui/egl-helpers.c | 16 +---
 ui/gtk-egl.c |  4 ++--
 ui/gtk-gl-area.c |  4 ++--
 ui/gtk.c |  2 +-
 6 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/include/ui/console.h b/include/ui/console.h
index 6292943a82..3d9d8b9fce 100644
--- a/include/ui/console.h
+++ b/include/ui/console.h
@@ -375,6 +375,11 @@ void *dpy_gl_qemu_dmabuf_get_sync(QemuDmaBuf *dmabuf);
 int32_t dpy_gl_qemu_dmabuf_get_fence_fd(QemuDmaBuf *dmabuf);
 bool dpy_gl_qemu_dmabuf_get_allow_fences(QemuDmaBuf *dmabuf);
 bool dpy_gl_qemu_dmabuf_get_draw_submitted(QemuDmaBuf *dmabuf);
+void dpy_gl_qemu_dmabuf_set_texture(QemuDmaBuf *dmabuf, uint32_t texture);
+void dpy_gl_qemu_dmabuf_set_fence_fd(QemuDmaBuf *dmabuf, int32_t fence_fd);
+void dpy_gl_qemu_dmabuf_set_sync(QemuDmaBuf *dmabuf, void *sync);
+void dpy_gl_qemu_dmabuf_set_draw_submitted(QemuDmaBuf *dmabuf, bool 
draw_submitted);
+void dpy_gl_qemu_dmabuf_set_fd(QemuDmaBuf *dmabuf, int32_t fd);
 void dpy_gl_release_dmabuf(QemuConsole *con,
QemuDmaBuf *dmabuf);
 void dpy_gl_update(QemuConsole *con,
diff --git a/ui/console.c b/ui/console.c
index 5d5635f783..d4ca9e6e0f 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -1244,6 +1244,36 @@ bool dpy_gl_qemu_dmabuf_get_draw_submitted(QemuDmaBuf 
*dmabuf)
 return dmabuf->draw_submitted;
 }
 
+void dpy_gl_qemu_dmabuf_set_texture(QemuDmaBuf *dmabuf, uint32_t texture)
+{
+assert(dmabuf != NULL);
+dmabuf->texture = texture;
+}
+
+void dpy_gl_qemu_dmabuf_set_fence_fd(QemuDmaBuf *dmabuf, int32_t fence_fd)
+{
+assert(dmabuf != NULL);
+dmabuf->fence_fd = fence_fd;
+}
+
+void dpy_gl_qemu_dmabuf_set_sync(QemuDmaBuf *dmabuf, void *sync)
+{
+assert(dmabuf != NULL);
+dmabuf->sync = sync;
+}
+
+void dpy_gl_qemu_dmabuf_set_draw_submitted(QemuDmaBuf *dmabuf, bool 
draw_submitted)
+{
+assert(dmabuf != NULL);
+dmabuf->draw_submitted = draw_submitted;
+}
+
+void dpy_gl_qemu_dmabuf_set_fd(QemuDmaBuf *dmabuf, int32_t fd)
+{
+assert(dmabuf != NULL);
+dmabuf->fd = fd;
+}
+
 void dpy_gl_release_dmabuf(QemuConsole *con,
   QemuDmaBuf *dmabuf)
 {
diff --git a/ui/egl-helpers.c b/ui/egl-helpers.c
index 86d64c68ce..c71a2878c2 100644
--- a/ui/egl-helpers.c
+++ b/ui/egl-helpers.c
@@ -348,8 +348,8 @@ void egl_dmabuf_import_texture(QemuDmaBuf *dmabuf)
 return;
 }
 
-glGenTextures(1, >texture);
-texture = dpy_gl_qemu_dmabuf_get_texture(dmabuf);
+glGenTextures(1, );
+dpy_gl_qemu_dmabuf_set_texture(dmabuf, texture);
 glBindTexture(GL_TEXTURE_2D, texture);
 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
 glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
@@ -368,7 +368,7 @@ void egl_dmabuf_release_texture(QemuDmaBuf *dmabuf)
 }
 
 glDeleteTextures(1, );
-dmabuf->texture = 0;
+dpy_gl_qemu_dmabuf_set_texture(dmabuf, 0);
 }
 
 void egl_dmabuf_create_sync(QemuDmaBuf *dmabuf)
@@ -382,7 +382,7 @@ void egl_dmabuf_create_sync(QemuDmaBuf *dmabuf)
 sync = eglCreateSyncKHR(qemu_egl_display,
 EGL_SYNC_NATIVE_FENCE_ANDROID, NULL);
 if (sync != EGL_NO_SYNC_KHR) {
-dmabuf->sync = sync;
+dpy_gl_qemu_dmabuf_set_sync(dmabuf, sync);
 }
 }
 }
@@ -390,12 +390,14 @@ void egl_dmabuf_create_sync(QemuDmaBuf *dmabuf)
 void egl_dmabuf_create_fence(QemuDmaBuf *dmabuf)
 {
 void *sync = dpy_gl_qemu_dmabuf_get_sync(dmabuf);
+int fence_fd;
 
 if (sync) {
-dmabuf->fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
-  sync);
+fence_fd = eglDupNativeFenceFDANDROID(qemu_egl_display,
+  sync);
+dpy_gl_qemu_dmabuf_set_fence_fd(dmabuf, fence_fd);
 eglDestroySyncKHR(qemu_egl_display, sync);
-dmabuf->sync = NULL;
+dpy_gl_qemu_dmabuf_set_sync(dmabuf, NULL);
 }
 }
 
diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
index c9469af9ed..7494a34d7c 100644
--- a/ui/gtk-egl.c
+++ b/ui/gtk-egl.c
@@ -87,7 +87,7 @@ void gd_egl_draw(VirtualConsole *vc)
 if (!dpy_gl_qemu_dmabuf_get_draw_submitted(dmabuf)) {
 return;
 } else {
-dmabuf->draw_submitted = false;
+dpy_gl_qemu_dmabuf_set_draw_submitted(dmabuf, false);
 }
 }
 #endif
@@ -381,7 +381,7 @@ void gd_egl_flush(DisplayChangeListener *dcl,
 if

RE: [PATCH] vhost-user-gpu: fix import of DMABUF

2024-04-15 Thread Kim, Dongwon

Hi Marc-André,

> -Original Message-
> From: marcandre.lur...@redhat.com 
> Sent: Monday, April 15, 2024 4:16 AM
> To: qemu-devel@nongnu.org
> Cc: Kim, Dongwon ; dbas...@redhat.com; Marc-
> André Lureau ; Michael S. Tsirkin
> ; Gerd Hoffmann 
> Subject: [PATCH] vhost-user-gpu: fix import of DMABUF
> 
> From: Marc-André Lureau 
> 
> When using vhost-user-gpu with GL, qemu -display gtk doesn't show output and
> prints: qemu: eglCreateImageKHR failed
> 
> Since commit 9ac06df8b ("virtio-gpu-udmabuf: correct naming of QemuDmaBuf
> size properties"), egl_dmabuf_import_texture() uses backing_{width,height} for
> the texture dimension.
> 
> Fixes: commit 9ac06df8b ("virtio-gpu-udmabuf: correct naming of QemuDmaBuf
> size properties")
> Signed-off-by: Marc-André Lureau 
> ---
>  hw/display/vhost-user-gpu.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c index
> 709c8a02a1..baffb1c2d4 100644
> --- a/hw/display/vhost-user-gpu.c
> +++ b/hw/display/vhost-user-gpu.c
> @@ -273,8 +273,8 @@ vhost_user_gpu_handle_display(VhostUserGPU *g,
> VhostUserGpuMsg *msg)
>  }
>  *dmabuf = (QemuDmaBuf) {
>  .fd = fd,
> -.width = m->fd_width,
> -.height = m->fd_height,
[Kim, Dongwon]  I think we could just keep the .width/.height settings here, 
although nothing will go wrong in either case. Did you have any specific reason 
for leaving these uninitialized?

> +.backing_width = m->fd_width,
> +.backing_height = m->fd_height,
>  .stride = m->fd_stride,
>  .fourcc = m->fd_drm_fourcc,
>  .y0_top = m->fd_flags & VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP,
> --
> 2.41.0.28.gd7d8841f67

Re: [PATCH v6 11/12] hw/cxl/cxl-mailbox-utils: Add superset extent release mailbox support

2024-04-15 Thread fan

On Fri, Apr 05, 2024 at 09:57:18AM +, Jørgen Hansen wrote:
> On 3/25/24 20:02, nifan@gmail.com wrote:
> > From: Fan Ni 
> > 
> > With the change, we extend the extent release mailbox command processing
> > to allow more flexible release. As long as the DPA range of the extent to
> > release is covered by accepted extent(s) in the device, the release can be
> > performed.
> > 
> > Signed-off-by: Fan Ni 
> > ---
> >   hw/cxl/cxl-mailbox-utils.c | 41 ++
> >   1 file changed, 24 insertions(+), 17 deletions(-)
> > 
> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> > index a0d2239176..3b7949c364 100644
> > --- a/hw/cxl/cxl-mailbox-utils.c
> > +++ b/hw/cxl/cxl-mailbox-utils.c
> > @@ -1674,6 +1674,12 @@ static CXLRetCode 
> > cxl_dc_extent_release_dry_run(CXLType3Dev *ct3d,
> >   dpa = in->updated_entries[i].start_dpa;
> >   len = in->updated_entries[i].len;
> > 
> > +/* Check if the DPA range is not fully backed with valid extents */
> > +if (!ct3_test_region_block_backed(ct3d, dpa, len)) {
> > +ret = CXL_MBOX_INVALID_PA;
> > +goto free_and_exit;
> > +}
> 
> In cxl_dcd_add_dyn_cap_rsp_dry_run, the opposite check (all 0's in the 
> bitmap) could be used instead of looping through the full extent list 
> (and this also makes my previous comment about reusing the bitmap from 
> cxl_detect_malformed_extent_list irrelevant).

For adding, we need to make sure the incoming extents have no overlaps
with accepted extents, which means that if any bit of the range is not 0, it
returns an error. We cannot use !ct3_test_region_block_backed for this
purpose, as it returns true only when all bits are 1, not when any bit is 1.

For that purpose, we would need a function like
ct3_test_region_block_all_cleared or ct3_test_region_block_non_backed.
We do not have that in the current code.
Checking the bitmap is more efficient, but it introduces more
changes, so I will leave it as it is until there are more concerns.

Fan

> 
> > +/* After this point, extent overflow is the only error can happen 
> > */
> >   while (len > 0) {
> >   QTAILQ_FOREACH(ent, _list, node) {
> >   range_init_nofail(, ent->start_dpa, ent->len);
> > @@ -1713,25 +1719,27 @@ static CXLRetCode 
> > cxl_dc_extent_release_dry_run(CXLType3Dev *ct3d,
> >   goto free_and_exit;
> >   }
> >   } else {
> > -/*
> > - * TODO: we reject the attempt to remove an extent
> > - * that overlaps with multiple extents in the 
> > device
> > - * for now, we will allow it once superset release
> > - * support is added.
> > - */
> > -ret = CXL_MBOX_INVALID_PA;
> > -goto free_and_exit;
> > +len1 = dpa - ent_start_dpa;
> > +len2 = 0;
> > +len_done = ent_len - len1 - len2;
> 
> You don't need len2 in the else statement.
> 
> Thanks,
> Jørgen
> 
> > +
> > +cxl_remove_extent_from_extent_list(_list, ent);
> > +cnt_delta--;
> > +if (len1) {
> > +cxl_insert_extent_to_extent_list(_list,
> > + ent_start_dpa,
> > + len1, NULL, 
> > 0);
> > +cnt_delta++;
> > +}
> >   }
> > 
> >   len -= len_done;
> > -/* len == 0 here until superset release is added */
> > +if (len) {
> > +dpa = ent_start_dpa + ent_len;
> > +}
> >   break;
> >   }
> >   }
> > -if (len) {
> > -ret = CXL_MBOX_INVALID_PA;
> > -goto free_and_exit;
> > -}
> >   }
> >   }
> >   free_and_exit:
> > @@ -1819,10 +1827,9 @@ static CXLRetCode cmd_dcd_release_dyn_cap(const 
> > struct cxl_cmd *cmd,
> >   }
> > 
> >   len -= len_done;
> > -/*
> > - * len will always be 0 until superset release is add.
> > - * TODO: superset release will be added.
> > - */
> > +if (len > 0) {
> > +dpa = ent_start_dpa + ent_len;
> > +}
> >   break;
> >   }
> >   }
> > --
> > 2.43.0
> >

Re: [PATCH v6 09/12] hw/cxl/events: Add qmp interfaces to add/release dynamic capacity extents

2024-04-15 Thread fan



>From ce75be83e915fbc4dd6e489f976665b81174002b Mon Sep 17 00:00:00 2001
From: Fan Ni 
Date: Tue, 20 Feb 2024 09:48:31 -0800
Subject: [PATCH 09/13] hw/cxl/events: Add qmp interfaces to add/release
 dynamic capacity extents

To simulate FM functionalities for initiating Dynamic Capacity Add
(Opcode 5604h) and Dynamic Capacity Release (Opcode 5605h) as in CXL spec
r3.1 7.6.7.6.5 and 7.6.7.6.6, we implemented two QMP interfaces to issue
add/release dynamic capacity extents requests.

With the change, we allow releasing an extent only when its DPA range
is contained by a single accepted extent in the device. That is to say,
extent superset release is not supported yet.

1. Add dynamic capacity extents:

For example, the command to add two continuous extents (each 128MiB long)
to region 0 (starting at DPA offset 0) looks like below:

{ "execute": "qmp_capabilities" }

{ "execute": "cxl-add-dynamic-capacity",
  "arguments": {
  "path": "/machine/peripheral/cxl-dcd0",
  "hid": 0,
  "selection-policy": 2,
  "region-id": 0,
  "tag": "",
  "extents": [
  {
  "offset": 0,
  "len": 134217728
  },
  {
  "offset": 134217728,
  "len": 134217728
  }
  ]
  }
}

2. Release dynamic capacity extents:

For example, the command to release an extent of size 128MiB from region 0
(DPA offset 128MiB) looks like below:

{ "execute": "cxl-release-dynamic-capacity",
  "arguments": {
  "path": "/machine/peripheral/cxl-dcd0",
  "hid": 0,
  "flags": 1,
  "region-id": 0,
  "tag": "",
  "extents": [
  {
  "offset": 134217728,
  "len": 134217728
  }
  ]
  }
}

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  |  65 ++--
 hw/mem/cxl_type3.c  | 310 +++-
 hw/mem/cxl_type3_stubs.c|  20 +++
 include/hw/cxl/cxl_device.h |  22 +++
 include/hw/cxl/cxl_events.h |  18 +++
 qapi/cxl.json   |  69 
 6 files changed, 491 insertions(+), 13 deletions(-)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index cd9092b6bf..839ae836a1 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -1405,7 +1405,7 @@ static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(const 
struct cxl_cmd *cmd,
  * Check whether any bit between addr[nr, nr+size) is set,
  * return true if any bit is set, otherwise return false
  */
-static bool test_any_bits_set(const unsigned long *addr, unsigned long nr,
+bool test_any_bits_set(const unsigned long *addr, unsigned long nr,
   unsigned long size)
 {
 unsigned long res = find_next_bit(addr, size + nr, nr);
@@ -1444,7 +1444,7 @@ CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, 
uint64_t dpa, uint64_t len)
 return NULL;
 }
 
-static void cxl_insert_extent_to_extent_list(CXLDCExtentList *list,
+void cxl_insert_extent_to_extent_list(CXLDCExtentList *list,
  uint64_t dpa,
  uint64_t len,
  uint8_t *tag,
@@ -1470,6 +1470,44 @@ void cxl_remove_extent_from_extent_list(CXLDCExtentList 
*list,
 g_free(extent);
 }
 
+/*
+ * Add a new extent to the extent "group" if group exists;
+ * otherwise, create a new group
+ * Return value: return the group where the extent is inserted.
+ */
+CXLDCExtentGroup *cxl_insert_extent_to_extent_group(CXLDCExtentGroup *group,
+uint64_t dpa,
+uint64_t len,
+uint8_t *tag,
+uint16_t shared_seq)
+{
+if (!group) {
+group = g_new0(CXLDCExtentGroup, 1);
+QTAILQ_INIT(>list);
+}
+cxl_insert_extent_to_extent_list(>list, dpa, len,
+ tag, shared_seq);
+return group;
+}
+
+void cxl_extent_group_list_insert_tail(CXLDCExtentGroupList *list,
+   CXLDCExtentGroup *group)
+{
+QTAILQ_INSERT_TAIL(list, group, node);
+}
+
+void cxl_extent_group_list_delete_front(CXLDCExtentGroupList *list)
+{
+CXLDCExtent *ent, *ent_next;
+CXLDCExtentGroup *group = QTAILQ_FIRST(list);
+
+QTAILQ_REMOVE(list, group, node);
+QTAILQ_FOREACH_SAFE(ent, >list, node, ent_next) {
+cxl_remove_extent_from_extent_list(>list, ent);
+}
+g_free(group);
+}
+
 /*
  * CXL r3.1 Table 8-168: Add Dynamic Capacity Response Input Payload
  * CXL r3.1 Table 8-170: Release Dynamic Capacity Input Payload
@@ -1541,6 +1579,7 @@ static CXLRetCode 
cxl_dcd_add_dyn_cap_rsp_dry_run(CXLType3Dev *ct3d,
 {
 uint32_t i;
 CXLDCExtent *ent;
+CXLDCExtentGroup *ext_group;
 uint64_t dpa, len;
 Range range1, range2;
 
@@ -1551,9 +1590,13 @@ static CXLRetCode 
cxl_dcd_add_dyn_cap_rsp_dry_run(CXLType3Dev *ct3d,

Re: Questions about "QEMU gives wrong MPIDR value for Arm CPU types with MT=1" (gitlab issue #1608)

2024-04-15 Thread Dorjoy Chowdhury

On Mon, Apr 15, 2024 at 5:35 PM Peter Maydell  wrote:
>
> On Sat, 13 Apr 2024 at 20:59, Dorjoy Chowdhury  wrote:
> >
> > Hi,
> > Hope everyone is doing well. I was looking at "Bite Sized" tagged QEMU
> > issues in gitlab to see if there is anything that makes sense for me
> > as a first time contributor. I see this issue "QEMU gives wrong MPIDR
> > value for Arm CPU types with MT=1" (gitlab URL:
> > https://gitlab.com/qemu-project/qemu/-/issues/1608 ).
> >
> > From the bug ticket description, it is very clear that I will need to
> > add a bool member variable in the "AarchCPU" struct which is in
> > "target/arm/cpu.h" file. I believe the next logical change is to set
> > this member variable to "true" in the corresponding arm cpu "initfn"
> > functions (a55, a76, neoverse_n1) which are in "target/arm/cpu64.c"
> > file. I have a few questions about the following steps as I am looking
> > through the code.
> >
> > 1. I believe I will need to update the "arm_build_mp_affinity"
> > function in "target/arm/cpu.c" file to now also take in a bool
> > parameter that will indicate if the function should instead put the
> > "core index" in the "aff1" bits instead of the existing logic of
> > "aff0" bits and the cluster id in the "aff2" bits instead of the
> > existing logic of "aff1" bits. But I see this function being invoked
> > from 3 other files: "hw/arm/sbsa-ref.c", "hw/arm/virt.c",
> > "hw/arm/npcm7xx.c". Should the function calls in these files always
> > have that corresponding argument set to "false"?
>
> This bit of the codebase has got a bit more complicated since
> I wrote up the bug report. I will look into this and get back
> to you, but my suspicion is that these calls must return the
> same value that the actual CPU MPIDR affinity values have,
> because these values are going to end up in the DTB and ACPI
> tables, and the OS will want them to match up with MPIDRs.
>

Sounds great. Let me know. It sounds like then it could make sense to
change the "arm_build_mp_affinity" function signature in file
"target/arm/cpu.c" file to be like this:

uint64_t arm_build_mp_affinity(bool has_smt, int idx, uint8_t clustersz)

I think in all the files where it is invoked it should be possible to
know the SMT status of the cpu using ARMCPU(qemu_get_cpu(cpu)) or
similar. Let me know what you think.

> > 2. As per the bug ticket description, I will also need to update the
> > "mpidr_read_val" function in the "target/arm/helper.c" file to only
> > set the MT bit (24th) to 1 if the member variable is true. I think
> > there is nothing else to be done in this function apart from checking
> > and then setting the MT bit. Is my assumption correct?
>
> Yes, that's right.
>
> > I think doing the above steps should fix the bug and probably we don't
> > need anything else. It would be great if someone can help me answer
> > the questions or any suggestion would be great if my assumptions are
> > wrong. Thanks.
>
> The other thing we need to do is check the TRM (technical reference
> manual) for the CPUs that were added since I filed that bug in
> April 2023, to see if they need to have the flag set or not. The
> ones we need to check are:
>  * cortex-a710
>  * neoverse-n2
>  * neoverse-v1
>

Good point. I have now looked at the TRMs of the a710, neoverse-n2,
neoverse-v2 and they are similar to the ones mentioned in the gitlab
bug ticket i.e., MT bit should be 1, Aff0 should be 0, Aff1 should be
core index, Aff2 should be cluster id.

> thanks
> -- PMM

Let me know what you think. If everything sounds alright, I will try
and post a patch.

Thanks and regards,
Dorjoy.

Re: [PATCH 1/2] target/riscv: prioritize pmp errors in raise_mmu_exception()

2024-04-15 Thread Joseph Chan

FYI

Priv-v1.12/riscv-privileged-20211203.pdf

defines exception priorities on
Page 40, Table 3.7
Page 130, Table 8.7

There is a sentence under Table 3.7:
"When a virtual address is translated into a physical address, the address
translation algorithm
determines what specific exception may be raised."


The spec does not require any implementation to report Exception Code 12
over 1; 13, 15 over 5, 7. On the other hand, the phrases "During instruction
address translation:" and "With physical address for instruction:" give me
the impression that when the implementation can distinguish between these
situations, then reporting 12 , 13, 15 instead of 1, 5, 7 will provide a
fine-grained reason for why things were broken.

Regards,
Joseph Chan


On Sat, Apr 13, 2024 at 3:59 AM Alexei Filippov <
alexei.filip...@syntacore.com> wrote:

> From: Daniel Henrique Barboza 
>
> raise_mmu_exception(), as is today, is prioritizing guest page faults by
> checking first if virt_enabled && !first_stage, and then considering the
> regular inst/load/store faults.
>
> There's no mention in the spec about guest page fault being a higher
> priority than PMP faults. In fact, privileged spec section 3.7.1 says:
>
> "Attempting to fetch an instruction from a PMP region that does not have
> execute permissions raises an instruction access-fault exception.
> Attempting to execute a load or load-reserved instruction which accesses
> a physical address within a PMP region without read permissions raises a
> load access-fault exception. Attempting to execute a store,
> store-conditional, or AMO instruction which accesses a physical address
> within a PMP region without write permissions raises a store
> access-fault exception."
>
> So, in fact, we're doing it wrong - PMP faults should always be thrown,
> regardless of also being a first or second stage fault.
>
> The way riscv_cpu_tlb_fill() and get_physical_address() work is
> adequate: a TRANSLATE_PMP_FAIL error is immediately reported and
> reflected in the 'pmp_violation' flag. What we need is to change
> raise_mmu_exception() to prioritize it.
>
> Reported-by: Joseph Chan 
> Fixes: 82d53adfbb ("target/riscv/cpu_helper.c: Invalid exception on MMU
> translation stage")
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  target/riscv/cpu_helper.c | 22 --
>  1 file changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
> index bc70ab5abc..196166f8dd 100644
> --- a/target/riscv/cpu_helper.c
> +++ b/target/riscv/cpu_helper.c
> @@ -1203,28 +1203,30 @@ static void raise_mmu_exception(CPURISCVState
> *env, target_ulong address,
>
>  switch (access_type) {
>  case MMU_INST_FETCH:
> -if (env->virt_enabled && !first_stage) {
> +if (pmp_violation) {
> +cs->exception_index = RISCV_EXCP_INST_ACCESS_FAULT;
> +} else if (env->virt_enabled && !first_stage) {
>  cs->exception_index = RISCV_EXCP_INST_GUEST_PAGE_FAULT;
>  } else {
> -cs->exception_index = pmp_violation ?
> -RISCV_EXCP_INST_ACCESS_FAULT : RISCV_EXCP_INST_PAGE_FAULT;
> +cs->exception_index = RISCV_EXCP_INST_PAGE_FAULT;
>  }
>  break;
>  case MMU_DATA_LOAD:
> -if (two_stage && !first_stage) {
> +if (pmp_violation) {
> +cs->exception_index = RISCV_EXCP_LOAD_ACCESS_FAULT;
> +} else if (two_stage && !first_stage) {
>  cs->exception_index = RISCV_EXCP_LOAD_GUEST_ACCESS_FAULT;
>  } else {
> -cs->exception_index = pmp_violation ?
> -RISCV_EXCP_LOAD_ACCESS_FAULT : RISCV_EXCP_LOAD_PAGE_FAULT;
> +cs->exception_index = RISCV_EXCP_LOAD_PAGE_FAULT;
>  }
>  break;
>  case MMU_DATA_STORE:
> -if (two_stage && !first_stage) {
> +if (pmp_violation) {
> +cs->exception_index = RISCV_EXCP_STORE_AMO_ACCESS_FAULT;
> +} else if (two_stage && !first_stage) {
>  cs->exception_index = RISCV_EXCP_STORE_GUEST_AMO_ACCESS_FAULT;
>  } else {
> -cs->exception_index = pmp_violation ?
> -RISCV_EXCP_STORE_AMO_ACCESS_FAULT :
> -RISCV_EXCP_STORE_PAGE_FAULT;
> +cs->exception_index = RISCV_EXCP_STORE_PAGE_FAULT;
>  }
>  break;
>  default:
> --
> 2.34.1
>
>

Re: Intention to work on GSoC project

2024-04-15 Thread Sahil

Hi,

Thank you for your reply.

On Monday, April 15, 2024 2:27:36 PM IST Eugenio Perez Martin wrote:
> [...]
> > I have one question though. One of the options (use case 1 in [1])
> > 
> > given to the "qemu-kvm" command is:
> > > -device virtio-net-pci,netdev=vhost-vdpa0,bus=pcie.0,addr=0x7\
> > > ,disable-modern=off,page-per-vq=on
> > 
> > This gives an error:
> > > Bus "pcie.0" not found
> > 
> > Does pcie refer to PCI Express? Changing this to pci.0 works.
> 
> Yes, you don't need to mess with pcie stuff so this solution is
> totally valid. I think we need to change that part in the tutorial.
> 

Understood.

> > I read through the "device buses" section in QEMU's user
> > documentation [5], but I have still not understood this.
> > 
> > "ls /sys/bus/pci/devices/* | grep vdpa" does not give any results.
> > Replacing pci with pci_express doesn't give any results either. How
> > does one know which pci bus the vdpa device is connected to?
> > I have gone through the "vDPA bus drivers" section of the "vDPA
> > kernel framework" article [6] but I haven't managed to find an
> > answer yet. Am I missing something here?
> 
> You cannot see the vDPA device from the guest. From the guest POV is a
> regular virtio over PCI bus.
>
> From the host, vdpa_sim is not a PCI device either, so you cannot see
> under /sys/bus. Do you have a vdpa* entry under
> /sys/bus/vdpa/devices/?
>

After re-reading the linked articles, I think I have got some more
clarity. One confusion was related to the difference between vdpa
and vhost-vdpa.

So far what I have understood is that L0 acts as the host and L1
acts as the guest in this setup. I understand that the guest can't
see the vDPA device.

I now also understand that vdpa_sim is not a PCI device. I am also
under the impression that vdpa refers to the vdpa bus while
vhost-vdpa is the device. Is my understanding correct?

After running the commands in the blog [1], I see that there's a
vhost-vdpa-0 device under /dev.

I also have an entry "vdpa0" under /sys/bus/vdpa/devices/ which
is a symlink to /sys/devices/vdpa0. There's a dir "vhost-vdpa-0"
under "/sys/devices/vdpa0". Hypothetically, if vhost-vdpa-0 had
been a PCI device, then it would have been present under
/sys/bus/pci/devices, right?

Another source of confusion was the pci.0 option passed to the
qemu-kvm command. But I have understood this as well now:
"-device virtio-net-pci" is a pci device.

> > There's one more thing. In "use case 1" of "Running traffic with
> > vhost_vdpa in Guest" [1], running "modprobe pktgen" in the L1 VM
> > 
> > gives an error:
> > > module pktgen couldn't be found in /lib/modules/6.5.6-300.fc39.x86_64.
> > 
> > The kernel version is 6.5.6-300.fc39.x86_64. I haven't tried building
> > pktgen manually in L1. I'll try that and will check if vdpa_sim works
> > as expected after that.
> 
> Did you install kernel-modules-internal?

I just realized I had the wrong version of kernel-modules-internal
installed. It works after installing the right version.

Thanks,
Sahil

[1] 
https://www.redhat.com/en/blog/hands-vdpa-what-do-you-do-when-you-aint-got-hardware-part-1

[PATCH] qemu-options: Deprecate "-runas" and introduce "-run-with user=..." instead

2024-04-15 Thread Thomas Huth

The old "-runas" option has the disadvantage that it is not visible
in the QAPI schema, so it is not available via the normal introspection
mechanisms. We've recently introduced the "-run-with" option for exactly
this purpose, which is meant to handle the options that affect the
runtime behavior. Thus let's introduce a "user=..." parameter here now
and deprecate the old "-runas" option.

Signed-off-by: Thomas Huth 
---
 docs/about/deprecated.rst |  6 ++
 system/vl.c   | 15 +++
 qemu-options.hx   | 11 +--
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/docs/about/deprecated.rst b/docs/about/deprecated.rst
index 7b548519b5..6b932961bc 100644
--- a/docs/about/deprecated.rst
+++ b/docs/about/deprecated.rst
@@ -61,6 +61,12 @@ configurations (e.g. -smp drawers=1,books=1,clusters=1 for 
x86 PC machine) is
 marked deprecated since 9.0, users have to ensure that all the topology members
 described with -smp are supported by the target machine.
 
+``-runas`` (since 9.1)
+--
+
+Use ``-run-with user=..`` instead.
+
+
 User-mode emulator command line arguments
 -
 
diff --git a/system/vl.c b/system/vl.c
index c644222982..a8e979 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -770,6 +770,10 @@ static QemuOptsList qemu_run_with_opts = {
 .name = "chroot",
 .type = QEMU_OPT_STRING,
 },
+{
+.name = "user",
+.type = QEMU_OPT_STRING,
+},
 { /* end of list */ }
 },
 };
@@ -3583,6 +3587,7 @@ void qemu_init(int argc, char **argv)
 break;
 #if defined(CONFIG_POSIX)
 case QEMU_OPTION_runas:
+warn_report("-runas is deprecated, use '-run-with user=...' 
instead");
 if (!os_set_runas(optarg)) {
 error_report("User \"%s\" doesn't exist"
  " (and is not <uid>:<gid>)",
@@ -3609,6 +3614,16 @@ void qemu_init(int argc, char **argv)
 if (str) {
 os_set_chroot(str);
 }
+str = qemu_opt_get(opts, "user");
+if (str) {
+if (!os_set_runas(str)) {
+error_report("User \"%s\" doesn't exist"
+ " (and is not <uid>:<gid>)",
+ optarg);
+exit(1);
+}
+}
+
 break;
 }
 #endif /* CONFIG_POSIX */
diff --git a/qemu-options.hx b/qemu-options.hx
index 8ce85d4559..50912c7dab 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4824,7 +4824,8 @@ DEF("runas", HAS_ARG, QEMU_OPTION_runas, \
 SRST
 ``-runas user``
 Immediately before starting guest execution, drop root privileges,
-switching to the specified user.
+switching to the specified user. This option is deprecated, use
+``-run-with user=...`` instead.
 ERST
 
 DEF("prom-env", HAS_ARG, QEMU_OPTION_prom_env,
@@ -4993,7 +4994,9 @@ DEF("run-with", HAS_ARG, QEMU_OPTION_run_with,
 "-run-with [async-teardown=on|off][,chroot=dir]\n"
 "Set miscellaneous QEMU process lifecycle options:\n"
 "async-teardown=on enables asynchronous teardown (Linux 
only)\n"
-"chroot=dir chroot to dir just before starting the VM\n",
+"chroot=dir chroot to dir just before starting the VM\n"
+"user=username switch to the specified user before 
starting the VM\n"
+"user=uid:gid dito, but use specified user-ID and group-ID 
instead\n",
 QEMU_ARCH_ALL)
 SRST
 ``-run-with [async-teardown=on|off][,chroot=dir]``
@@ -5013,6 +5016,10 @@ SRST
 ``chroot=dir`` can be used for doing a chroot to the specified directory
 immediately before starting the guest execution. This is especially useful
 in combination with -runas.
+
+``user=username`` or ``user=uid:gid`` can be used to drop root privileges
+by switching to the specified user (via username) or user and group
+(via uid:gid) immediately before starting guest execution.
 ERST
 #endif
 
-- 
2.44.0

Re: [PATCH v2] target/i386: Give IRQs a chance when resetting HF_INHIBIT_IRQ_MASK

2024-04-15 Thread Philippe Mathieu-Daudé


On 15/4/24 11:32, Paolo Bonzini wrote:

On Mon, Apr 15, 2024 at 8:50 AM Ruihan Li  wrote:


When emulated with QEMU, interrupts will never come in the following
loop. However, if the NOP instruction is uncommented, interrupts will
fire as normal.

 loop:
 cli
 call do_sti
 jmp loop

 do_sti:
 sti
 # nop
 ret

This behavior is different from that of a real processor. For example,
if KVM is enabled, interrupts will always fire regardless of whether the
NOP instruction is commented or not. Also, the Intel Software Developer
Manual states that after the STI instruction is executed, the interrupt
inhibit should end as soon as the next instruction (e.g., the RET
instruction if the NOP instruction is commented) is executed.


Thanks, interesting bug!

What do you think about writing this:


  /* If several instructions disable interrupts, only the first does it.  */
  if (inhibit && !(s->flags & HF_INHIBIT_IRQ_MASK)) {
  gen_set_hflag(s, HF_INHIBIT_IRQ_MASK);
-} else {
+inhibit_reset = false;
+} else if (!inhibit && (s->flags & HF_INHIBIT_IRQ_MASK)) {
  gen_reset_hflag(s, HF_INHIBIT_IRQ_MASK);
+inhibit_reset = true;
+} else {
+inhibit_reset = false;
  }


in a slightly simpler manner:

 inhibit_reset = false;
 if (s->flags & HF_INHIBIT_IRQ_MASK) {
 gen_reset_hflag(s, HF_INHIBIT_IRQ_MASK);
 inhibit_reset = true;
 } else if (inhibit) {
 gen_set_hflag(s, HF_INHIBIT_IRQ_MASK);
 }

No need to submit v3, I can do the change myself when applying.


Cc: qemu-sta...@nongnu.org

Re: [PATCH v6 08/12] hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release dynamic capacity response

2024-04-15 Thread fan

On Thu, Apr 04, 2024 at 01:32:23PM +, Jørgen Hansen wrote:
> On 3/25/24 20:02, nifan@gmail.com wrote:
> > From: Fan Ni 
> > 
> > Per CXL spec 3.1, two mailbox commands are implemented:
> > Add Dynamic Capacity Response (Opcode 4802h) 8.2.9.9.9.3, and
> > Release Dynamic Capacity (Opcode 4803h) 8.2.9.9.9.4.
> > 
> > For the process of the above two commands, we use two-pass approach.
> > Pass 1: Check whether the input payload is valid or not; if not, skip
> >  Pass 2 and return mailbox process error.
> > Pass 2: Do the real work--add or release extents, respectively.
> > 
> > Signed-off-by: Fan Ni 
> > ---
> >   hw/cxl/cxl-mailbox-utils.c  | 433 +++-
> >   hw/mem/cxl_type3.c  |  11 +
> >   include/hw/cxl/cxl_device.h |   4 +
> >   3 files changed, 444 insertions(+), 4 deletions(-)
> > 
> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> > index 30ef46a036..a9eca516c8 100644
> > --- a/hw/cxl/cxl-mailbox-utils.c
> > +++ b/hw/cxl/cxl-mailbox-utils.c
> > @@ -19,6 +19,7 @@
> >   #include "qemu/units.h"
> >   #include "qemu/uuid.h"
> >   #include "sysemu/hostmem.h"
> > +#include "qemu/range.h"
> > 
> >   #define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
> >   #define CXL_DC_EVENT_LOG_SIZE 8
> > @@ -85,6 +86,8 @@ enum {
> >   DCD_CONFIG  = 0x48,
> >   #define GET_DC_CONFIG  0x0
> >   #define GET_DYN_CAP_EXT_LIST   0x1
> > +#define ADD_DYN_CAP_RSP0x2
> > +#define RELEASE_DYN_CAP0x3
> >   PHYSICAL_SWITCH = 0x51,
> >   #define IDENTIFY_SWITCH_DEVICE  0x0
> >   #define GET_PHYSICAL_PORT_STATE 0x1
> > @@ -1400,6 +1403,422 @@ static CXLRetCode 
> > cmd_dcd_get_dyn_cap_ext_list(const struct cxl_cmd *cmd,
> >   return CXL_MBOX_SUCCESS;
> >   }
> > 
> > +/*
> > + * Check whether any bit between addr[nr, nr+size) is set,
> > + * return true if any bit is set, otherwise return false
> > + */
> > +static bool test_any_bits_set(const unsigned long *addr, unsigned long nr,
> > +  unsigned long size)
> > +{
> > +unsigned long res = find_next_bit(addr, size + nr, nr);
> > +
> > +return res < nr + size;
> > +}
> > +
> > +CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t 
> > len)
> > +{
> > +int i;
> > +CXLDCRegion *region = &ct3d->dc.regions[0];
> > +
> > +if (dpa < region->base ||
> > +dpa >= region->base + ct3d->dc.total_capacity) {
> > +return NULL;
> > +}
> > +
> > +/*
> > + * CXL r3.1 section 9.13.3: Dynamic Capacity Device (DCD)
> > + *
> > + * Regions are used in increasing-DPA order, with Region 0 being used 
> > for
> > + * the lowest DPA of Dynamic Capacity and Region 7 for the highest DPA.
> > + * So check from the last region to find where the dpa belongs. 
> > Extents that
> > + * cross multiple regions are not allowed.
> > + */
> > +for (i = ct3d->dc.num_regions - 1; i >= 0; i--) {
> > +region = &ct3d->dc.regions[i];
> > +if (dpa >= region->base) {
> > +if (dpa + len > region->base + region->len) {
> > +return NULL;
> > +}
> > +return region;
> > +}
> > +}
> > +
> > +return NULL;
> > +}
> > +
> > +static void cxl_insert_extent_to_extent_list(CXLDCExtentList *list,
> > + uint64_t dpa,
> > + uint64_t len,
> > + uint8_t *tag,
> > + uint16_t shared_seq)
> > +{
> > +CXLDCExtent *extent;
> > +
> > +extent = g_new0(CXLDCExtent, 1);
> > +extent->start_dpa = dpa;
> > +extent->len = len;
> > +if (tag) {
> > +memcpy(extent->tag, tag, 0x10);
> > +}
> > +extent->shared_seq = shared_seq;
> > +
> > +QTAILQ_INSERT_TAIL(list, extent, node);
> > +}
> > +
> > +void cxl_remove_extent_from_extent_list(CXLDCExtentList *list,
> > +CXLDCExtent *extent)
> > +{
> > +QTAILQ_REMOVE(list, extent, node);
> > +g_free(extent);
> > +}
> > +
> > +/*
> > + * CXL r3.1 Table 8-168: Add Dynamic Capacity Response Input Payload
> > + * CXL r3.1 Table 8-170: Release Dynamic Capacity Input Payload
> > + */
> > +typedef struct CXLUpdateDCExtentListInPl {
> > +uint32_t num_entries_updated;
> > +uint8_t flags;
> > +uint8_t rsvd[3];
> > +/* CXL r3.1 Table 8-169: Updated Extent */
> > +struct {
> > +uint64_t start_dpa;
> > +uint64_t len;
> > +uint8_t rsvd[8];
> > +} QEMU_PACKED updated_entries[];
> > +} QEMU_PACKED CXLUpdateDCExtentListInPl;
> > +
> > +/*
> > + * For the extents in the extent list to operate, check whether they are 
> > valid
> > + * 1. The extent should be in the range of a valid DC region;
> > + * 2. The extent should not cross multiple regions;
> > + * 3. The start DPA and

Re: [PATCH v6 08/12] hw/cxl/cxl-mailbox-utils: Add mailbox commands to support add/release dynamic capacity response

2024-04-15 Thread fan



From 4b9695299d3d4b22f83666f8ab79099ec9f9817f Mon Sep 17 00:00:00 2001
From: Fan Ni 
Date: Tue, 20 Feb 2024 09:48:30 -0800
Subject: [PATCH 08/13] hw/cxl/cxl-mailbox-utils: Add mailbox commands to
 support add/release dynamic capacity response

Per CXL spec 3.1, two mailbox commands are implemented:
Add Dynamic Capacity Response (Opcode 4802h) 8.2.9.9.9.3, and
Release Dynamic Capacity (Opcode 4803h) 8.2.9.9.9.4.

For the process of the above two commands, we use two-pass approach.
Pass 1: Check whether the input payload is valid or not; if not, skip
Pass 2 and return mailbox process error.
Pass 2: Do the real work--add or release extents, respectively.

Signed-off-by: Fan Ni 
---
 hw/cxl/cxl-mailbox-utils.c  | 396 
 hw/mem/cxl_type3.c  |  11 +
 include/hw/cxl/cxl_device.h |   4 +
 3 files changed, 411 insertions(+)

diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
index 1915959015..cd9092b6bf 100644
--- a/hw/cxl/cxl-mailbox-utils.c
+++ b/hw/cxl/cxl-mailbox-utils.c
@@ -19,6 +19,7 @@
 #include "qemu/units.h"
 #include "qemu/uuid.h"
 #include "sysemu/hostmem.h"
+#include "qemu/range.h"
 
 #define CXL_CAPACITY_MULTIPLIER   (256 * MiB)
 #define CXL_DC_EVENT_LOG_SIZE 8
@@ -85,6 +86,8 @@ enum {
 DCD_CONFIG  = 0x48,
 #define GET_DC_CONFIG  0x0
 #define GET_DYN_CAP_EXT_LIST   0x1
+#define ADD_DYN_CAP_RSP0x2
+#define RELEASE_DYN_CAP0x3
 PHYSICAL_SWITCH = 0x51,
 #define IDENTIFY_SWITCH_DEVICE  0x0
 #define GET_PHYSICAL_PORT_STATE 0x1
@@ -1398,6 +1401,393 @@ static CXLRetCode cmd_dcd_get_dyn_cap_ext_list(const 
struct cxl_cmd *cmd,
 return CXL_MBOX_SUCCESS;
 }
 
+/*
+ * Check whether any bit between addr[nr, nr+size) is set,
+ * return true if any bit is set, otherwise return false
+ */
+static bool test_any_bits_set(const unsigned long *addr, unsigned long nr,
+  unsigned long size)
+{
+unsigned long res = find_next_bit(addr, size + nr, nr);
+
+return res < nr + size;
+}
+
+CXLDCRegion *cxl_find_dc_region(CXLType3Dev *ct3d, uint64_t dpa, uint64_t len)
+{
+int i;
+CXLDCRegion *region = &ct3d->dc.regions[0];
+
+if (dpa < region->base ||
+dpa >= region->base + ct3d->dc.total_capacity) {
+return NULL;
+}
+
+/*
+ * CXL r3.1 section 9.13.3: Dynamic Capacity Device (DCD)
+ *
+ * Regions are used in increasing-DPA order, with Region 0 being used for
+ * the lowest DPA of Dynamic Capacity and Region 7 for the highest DPA.
+ * So check from the last region to find where the dpa belongs. Extents 
that
+ * cross multiple regions are not allowed.
+ */
+for (i = ct3d->dc.num_regions - 1; i >= 0; i--) {
+region = &ct3d->dc.regions[i];
+if (dpa >= region->base) {
+if (dpa + len > region->base + region->len) {
+return NULL;
+}
+return region;
+}
+}
+
+return NULL;
+}
+
+static void cxl_insert_extent_to_extent_list(CXLDCExtentList *list,
+ uint64_t dpa,
+ uint64_t len,
+ uint8_t *tag,
+ uint16_t shared_seq)
+{
+CXLDCExtent *extent;
+
+extent = g_new0(CXLDCExtent, 1);
+extent->start_dpa = dpa;
+extent->len = len;
+if (tag) {
+memcpy(extent->tag, tag, 0x10);
+}
+extent->shared_seq = shared_seq;
+
+QTAILQ_INSERT_TAIL(list, extent, node);
+}
+
+void cxl_remove_extent_from_extent_list(CXLDCExtentList *list,
+CXLDCExtent *extent)
+{
+QTAILQ_REMOVE(list, extent, node);
+g_free(extent);
+}
+
+/*
+ * CXL r3.1 Table 8-168: Add Dynamic Capacity Response Input Payload
+ * CXL r3.1 Table 8-170: Release Dynamic Capacity Input Payload
+ */
+typedef struct CXLUpdateDCExtentListInPl {
+uint32_t num_entries_updated;
+uint8_t flags;
+uint8_t rsvd[3];
+/* CXL r3.1 Table 8-169: Updated Extent */
+struct {
+uint64_t start_dpa;
+uint64_t len;
+uint8_t rsvd[8];
+} QEMU_PACKED updated_entries[];
+} QEMU_PACKED CXLUpdateDCExtentListInPl;
+
+/*
+ * For the extents in the extent list to operate, check whether they are valid
+ * 1. The extent should be in the range of a valid DC region;
+ * 2. The extent should not cross multiple regions;
+ * 3. The start DPA and the length of the extent should align with the block
+ * size of the region;
+ * 4. The address range of multiple extents in the list should not overlap.
+ */
+static CXLRetCode cxl_detect_malformed_extent_list(CXLType3Dev *ct3d,
+const CXLUpdateDCExtentListInPl *in)
+{
+uint64_t min_block_size = UINT64_MAX;
+CXLDCRegion *region;
+CXLDCRegion *lastregion = &ct3d->dc.regions[ct3d->dc.num_regions - 1];
+g_autofree unsigned long *blk_bitmap =

Re: [PATCH v6 10/12] hw/mem/cxl_type3: Add dpa range validation for accesses to DC regions

2024-04-15 Thread fan

On Fri, Apr 12, 2024 at 06:54:42PM -0400, Gregory Price wrote:
> On Mon, Mar 25, 2024 at 12:02:28PM -0700, nifan@gmail.com wrote:
> > From: Fan Ni 
> > 
> > All dpa ranges in the DC regions are invalid to access until an extent
> > covering the range has been added. Add a bitmap for each region to
> > record whether a DC block in the region has been backed by DC extent.
> > For the bitmap, a bit in the bitmap represents a DC block. When a DC
> > extent is added, all the bits of the blocks in the extent will be set,
> > which will be cleared when the extent is released.
> > 
> > Reviewed-by: Jonathan Cameron 
> > Signed-off-by: Fan Ni 
> > ---
> >  hw/cxl/cxl-mailbox-utils.c  |  6 +++
> >  hw/mem/cxl_type3.c  | 76 +
> >  include/hw/cxl/cxl_device.h |  7 
> >  3 files changed, 89 insertions(+)
> > 
> > diff --git a/hw/cxl/cxl-mailbox-utils.c b/hw/cxl/cxl-mailbox-utils.c
> > index 7094e007b9..a0d2239176 100644
> > --- a/hw/cxl/cxl-mailbox-utils.c
> > +++ b/hw/cxl/cxl-mailbox-utils.c
> > @@ -1620,6 +1620,7 @@ static CXLRetCode cmd_dcd_add_dyn_cap_rsp(const 
> > struct cxl_cmd *cmd,
> >  
> >  cxl_insert_extent_to_extent_list(extent_list, dpa, len, NULL, 0);
> >  ct3d->dc.total_extent_count += 1;
> > +ct3_set_region_block_backed(ct3d, dpa, len);
> >  
> >  ent = QTAILQ_FIRST(&ct3d->dc.extents_pending);
> >  cxl_remove_extent_from_extent_list(&ct3d->dc.extents_pending, ent);
> 
> while looking at the MHD code, we had decided to "reserve" the blocks in
> the bitmap in the call to `qmp_cxl_process_dynamic_capacity` in order to
> prevent a potential double-allocation (basically we need to sanity check
> that two hosts aren't reserving the region PRIOR to the host being
> notified).
> 
> I did not see any checks in the `qmp_cxl_process_dynamic_capacity` path
> to prevent pending extents from being double-allocated.  Is this an
> explicit choice?
> 
> I can see, for example, why you may want to allow the following in the
> pending list: [Add X, Remove X, Add X].  I just want to know if this is
> intentional or not. If not, you may consider adding a pending check
> during the sanity check phase of `qmp_cxl_process_dynamic_capacity`
> 
> ~Gregory

First, for remove request, pending list is not involved. See cxl r3.1,
9.13.3.3. Pending basically means "pending to add". 
So for the above example, in the pending list, you can see [Add x, add x] if the
event is not processed in time.
Second, from the spec, I cannot find any text saying we cannot issue
another add extent X if it is still pending.
From the kernel side, if the first one is accepted, the second one will
get rejected, and there is no issue there.
If the first is rejected for some reason, the second one can get
accepted or rejected and do not need to worry about the first one.


Fan

Re: [PATCH v8] arm/kvm: Enable support for KVM_ARM_VCPU_PMU_V3_FILTER

2024-04-15 Thread Daniel P . Berrangé

On Tue, Apr 02, 2024 at 03:01:50PM +0200, Kevin Wolf wrote:
> Am 29.03.2024 um 04:45 hat Shaoqin Huang geschrieben:
> > Hi Daniel,
> > 
> > On 3/25/24 16:55, Daniel P. Berrangé wrote:
> > > On Mon, Mar 25, 2024 at 01:35:58PM +0800, Shaoqin Huang wrote:
> > > > Hi Daniel,
> > > > 
> > > > Thanks for your reviewing. I see your comments in the v7.
> > > > 
> > > > I have some doubts about what you said about the QAPI. Do you want me to
> > > > convert the current design into the QAPI parsing like the
> > > > IOThreadVirtQueueMapping? And we need to add new json definition in the
> > > > qapi/ directory?
> > 
> > I have defined the QAPI for kvm-pmu-filter like below:

> > @@ -2439,6 +2441,7 @@ static Property arm_cpu_properties[] = {
> >  mp_affinity, ARM64_AFFINITY_INVALID),
> >  DEFINE_PROP_INT32("node-id", ARMCPU, node_id, CPU_UNSET_NUMA_NODE_ID),
> >  DEFINE_PROP_INT32("core-count", ARMCPU, core_count, -1),
> > +DEFINE_PROP_KVM_PMU_FILTER("kvm-pmu-filter", ARMCPU, kvm_pmu_filter),
> >  DEFINE_PROP_END_OF_LIST()
> >  };
> > 
> > And I guess I can use the new json format input like below:
> > 
> > qemu-system-aarch64 \
> > -cpu host, '{"filter": [{"action": "a", "start": 0x10, "end": "0x11"}]}'
> > 
> > But it doesn't work. It seems like because the -cpu option doesn't
> > support json format parameter.
> > 
> > Maybe I'm wrong. So I want to double check with if the -cpu option
> > support json format nowadays?
> 
> As far as I can see, -cpu doesn't support JSON yet. But even if it did,
> your command line would be invalid because the 'host,' part isn't JSON.
> 
> > If the -cpu option doesn't support json format, how I can use the QAPI
> > for kvm-pmu-filter property?
> 
> This would probably mean QAPIfying all CPUs first, which sounds like a
> major effort.

I wonder if we can do a half-way house where we parse the JSON and
turn it into regular QemuOpts internally, and then just use QAPI
parsing for the filter property. IOW, publicly give the illusion
that -cpu has been QAPI-ified, but without actually doing the hard
part yet. The idea being to avoid inventing a new cli syntax that
has no analogue to QAPI.
With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v9] arm/kvm: Enable support for KVM_ARM_VCPU_PMU_V3_FILTER

2024-04-15 Thread Daniel P . Berrangé

On Mon, Apr 08, 2024 at 10:49:40PM -0400, Shaoqin Huang wrote:
> The KVM_ARM_VCPU_PMU_V3_FILTER provides the ability to let the VMM decide
> which PMU events are provided to the guest. Add a new option
> `kvm-pmu-filter` as -cpu sub-option to set the PMU Event Filtering.
> Without the filter, all PMU events are exposed from host to guest by
> default. The usage of the new sub-option can be found from the updated
> document (docs/system/arm/cpu-features.rst).
> 
> Here is an example which shows how to use the PMU Event Filtering, when
> we launch a guest by use kvm, add such command line:
> 
>   # qemu-system-aarch64 \
> -accel kvm \
> -cpu host,kvm-pmu-filter="D:0x11-0x11"

I'm still against implementing this one-off custom parsed syntax
for kvm-pmu-filter values. Once this syntax exists, we're locked
into back-compatibility for multiple releases, and it will make
a conversion to QAPI/JSON harder.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 09/13] block/gluster: Use URI parsing code from glib

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 09:40:11AM -0500, Eric Blake wrote:
> On Fri, Apr 12, 2024 at 03:24:11PM +0200, Thomas Huth wrote:
> > Since version 2.66, glib has useful URI parsing functions, too.
> > Use those instead of the QEMU-internal ones to be finally able
> > to get rid of the latter.
> > 
> > Signed-off-by: Thomas Huth 
> > ---
> >  block/gluster.c | 71 -
> >  1 file changed, 35 insertions(+), 36 deletions(-)
> > 
> > diff --git a/block/gluster.c b/block/gluster.c
> > index cc74af06dc..1c9505f8bb 100644
> > --- a/block/gluster.c
> > +++ b/block/gluster.c
> > @@ -17,7 +17,6 @@
> >  #include "qapi/error.h"
> >  #include "qapi/qmp/qdict.h"
> >  #include "qapi/qmp/qerror.h"
> > -#include "qemu/uri.h"
> >  #include "qemu/error-report.h"
> >  #include "qemu/module.h"
> >  #include "qemu/option.h"
> > @@ -289,9 +288,9 @@ static void glfs_clear_preopened(glfs_t *fs)
> >  }
> >  }
> >  
> > -static int parse_volume_options(BlockdevOptionsGluster *gconf, char *path)
> > +static int parse_volume_options(BlockdevOptionsGluster *gconf, const char 
> > *path)
> 
> Is it worth mentioning in the commit message that this includes a
> const-correctness tweak?
> 
> > @@ -364,57 +363,57 @@ static int 
> > qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
> >  QAPI_LIST_PREPEND(gconf->server, gsconf);
> >  
> >  /* transport */
> > -if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
> > +uri_scheme = g_uri_get_scheme(uri);
> > +if (!uri_scheme || !strcmp(uri_scheme, "gluster")) {
> 
> Pre-existing, but per RFC 3986, we should probably be using strcasecmp
> for scheme comparisons (I'm not sure if g_uri_parse guarantees a
> lower-case return, even when the user passed in upper case).  That can
> be a separate patch.

Docs say it is lowercase:

  https://developer-old.gnome.org/glib/stable/glib-URI-Functions.html

  "on return, contains the scheme (converted to lowercase), or NULL."

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 09/13] block/gluster: Use URI parsing code from glib

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:11PM +0200, Thomas Huth wrote:
> Since version 2.66, glib has useful URI parsing functions, too.
> Use those instead of the QEMU-internal ones to be finally able
> to get rid of the latter.
> 
> Signed-off-by: Thomas Huth 
> ---
>  block/gluster.c | 71 -
>  1 file changed, 35 insertions(+), 36 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 08/13] Remove glib compatibility code that is not required anymore

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:10PM +0200, Thomas Huth wrote:
> Now that we bumped the minimum glib version to 2.66, we can drop
> the old code.
> 
> Suggested-by: Paolo Bonzini 
> Signed-off-by: Thomas Huth 
> ---
>  qga/commands-posix-ssh.c |  8 
>  util/error-report.c  | 10 --
>  2 files changed, 18 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 07/13] Bump minimum glib version to v2.66

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:09PM +0200, Thomas Huth wrote:
> Now that we dropped support for CentOS 8 and Ubuntu 20.04, we can
> look into bumping the glib version to a new minimum for further
> clean-ups. According to repology.org, available versions are:
> 
>  CentOS Stream 9:   2.66.7
>  Debian 11: 2.66.8
>  Fedora 38: 2.74.1
>  Freebsd:   2.78.4
>  Homebrew:  2.80.0
>  Openbsd:   2.78.4
>  OpenSuse leap 15.5:2.70.5
>  pkgsrc_current:2.78.4
>  Ubuntu 22.04:  2.72.1
> 
> Thus it should be safe to bump the minimum glib version to 2.66 now.
> Version 2.66 comes with new functions for URI parsing which will
> allow further clean-ups in the following patches.
> 
> Signed-off-by: Thomas Huth 
> ---
>  meson.build  | 16 +---
>  include/glib-compat.h| 27 ++-
>  qga/commands-posix-ssh.c |  4 ++--
>  3 files changed, 5 insertions(+), 42 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 06/13] ci: move external build environment setups to CentOS Stream 9

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:08PM +0200, Thomas Huth wrote:
> From: Paolo Bonzini 
> 
> RHEL 9 (and thus also the derivatives) are available since two years
> now, so according to QEMU's support policy, we can drop the active
> support for the previous major version 8 now.
> 
> Thus upgrade our CentOS Stream build environment playbooks to major
> version 9 now.
> 
> Signed-off-by: Paolo Bonzini 
> Reviewed-by: Thomas Huth 
> Message-ID: <20240412103708.27650-1-pbonz...@redhat.com>
> Signed-off-by: Thomas Huth 
> ---
>  .../stream/{8 => 9}/build-environment.yml | 31 ++---
>  .../stream/{8 => 9}/x86_64/configure  |  4 +-
>  .../stream/{8 => 9}/x86_64/test-avocado   |  0
>  scripts/ci/setup/build-environment.yml| 44 +++
>  4 files changed, 34 insertions(+), 45 deletions(-)
>  rename scripts/ci/org.centos/stream/{8 => 9}/build-environment.yml (75%)
>  rename scripts/ci/org.centos/stream/{8 => 9}/x86_64/configure (98%)
>  rename scripts/ci/org.centos/stream/{8 => 9}/x86_64/test-avocado (100%)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 05/13] .travis.yml: Update the jobs to Ubuntu 22.04

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:07PM +0200, Thomas Huth wrote:
> According to our support policy, we'll soon drop our official support
> for Ubuntu 20.04 ("Focal Fossa") in QEMU. Thus we should update the
> Travis jobs now to a newer release (Ubuntu 22.04 - "Jammy Jellyfish")
> for future testing. Since all jobs are using this release now, we
> can drop the entries from the individual jobs and use the global
> setting again.
> 
> Signed-off-by: Thomas Huth 
> ---
>  .travis.yml | 13 +++--
>  1 file changed, 3 insertions(+), 10 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 04/13] tests: Update our CI to use CentOS Stream 9 instead of 8

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:06PM +0200, Thomas Huth wrote:
> RHEL 9 (and thus also the derivatives) are available since two years
> now, so according to QEMU's support policy, we can drop the active
> support for the previous major version 8 now.
> Thus upgrade our CentOS Stream container to major version 9 now.
> 
> Signed-off-by: Thomas Huth 
> ---
>  .gitlab-ci.d/buildtest.yml| 16 -
>  .gitlab-ci.d/container-core.yml   |  4 +--
>  .../{centos8.docker => centos9.docker}| 34 +++
>  tests/lcitool/mappings.yml| 20 ---
>  tests/lcitool/refresh |  2 +-
>  tests/vm/centos   |  4 +--
>  6 files changed, 26 insertions(+), 54 deletions(-)
>  rename tests/docker/dockerfiles/{centos8.docker => centos9.docker} (82%)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 03/13] tests/docker/dockerfiles: Run lcitool-refresh after the lcitool update

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:05PM +0200, Thomas Huth wrote:
> This update adds the removing of the EXTERNALLY-MANAGED marker files
> that has been added to the lcitool recently.

For those who don't know, python now commonly blocks the ability to
run 'pip install' outside of a venv. This generally makes sense for
a precious installation environment. Our containers are disposable
though, so a venv has no benefit. Removing the 'EXTERNALLY-MANAGED'
allows the historical arbitrary use of 'pip' outside a venv.
lcitool just does this unconditionally given the containers are
not precious.

> 
> Signed-off-by: Thomas Huth 
> ---
>  tests/docker/dockerfiles/alpine.docker| 3 ++-
>  tests/docker/dockerfiles/centos8.docker   | 1 +
>  tests/docker/dockerfiles/debian-amd64-cross.docker| 3 ++-
>  tests/docker/dockerfiles/debian-arm64-cross.docker| 3 ++-
>  tests/docker/dockerfiles/debian-armel-cross.docker| 3 ++-
>  tests/docker/dockerfiles/debian-armhf-cross.docker| 3 ++-
>  tests/docker/dockerfiles/debian-i686-cross.docker | 3 ++-
>  tests/docker/dockerfiles/debian-mips64el-cross.docker | 3 ++-
>  tests/docker/dockerfiles/debian-mipsel-cross.docker   | 3 ++-
>  tests/docker/dockerfiles/debian-ppc64el-cross.docker  | 3 ++-
>  tests/docker/dockerfiles/debian-riscv64-cross.docker  | 3 ++-
>  tests/docker/dockerfiles/debian-s390x-cross.docker| 3 ++-
>  tests/docker/dockerfiles/debian.docker| 1 +
>  tests/docker/dockerfiles/fedora-win64-cross.docker| 3 ++-
>  tests/docker/dockerfiles/fedora.docker| 1 +
>  tests/docker/dockerfiles/opensuse-leap.docker | 1 +
>  tests/docker/dockerfiles/ubuntu2204.docker| 1 +
>  17 files changed, 29 insertions(+), 12 deletions(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 02/13] tests/lcitool/libvirt-ci: Update to the latest master branch

2024-04-15 Thread Daniel P . Berrangé

On Fri, Apr 12, 2024 at 03:24:04PM +0200, Thomas Huth wrote:
> We need the latest fixes for the lcitool to be able to properly
> update our CentOS docker file to CentOS Stream 9.
> 
> Signed-off-by: Thomas Huth 
> ---
>  tests/lcitool/libvirt-ci | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Daniel P. Berrangé 


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 0/2] Improvements for switches in hw/cpu/Kconfig

2024-04-15 Thread Richard Henderson


On 4/14/24 23:56, Thomas Huth wrote:

Thomas Huth (2):
   hw: Fix problem with the A*MPCORE switches in the Kconfig files
   hw: Add a Kconfig switch for the TYPE_CPU_CLUSTER device


Reviewed-by: Richard Henderson 

r~

Re: [PATCH] target/sparc: resolve ASI_USERTXT correctly

2024-04-15 Thread Richard Henderson


On 4/14/24 15:48, M Bazz wrote:

I noticed that cpu_mmu_index() would have returned MMU_USER_IDX
if the supervisor bit hadn't happened to be set (not sure if this
execution path can occur for lda).


No, it cannot.


Note that this check is gone in your patch.


Correct.  Since 'lda' has already checked that supervisor mode has been enabled, the 
translator may jump directly to the desired result of MMU_KERNEL_IDX.



If I understand everything you've taught me, then the following patch would
have also satisfied the permissions issue. Could you confirm this please?
The essential change is the MMU_USER_IDX in the call to make_memop_idx()

diff --git a/target/sparc/ldst_helper.c b/target/sparc/ldst_helper.c
index e581bb42ac..be3c03a3b6 100644
--- a/target/sparc/ldst_helper.c
+++ b/target/sparc/ldst_helper.c
@@ -702,6 +702,24 @@ uint64_t helper_ld_asi(CPUSPARCState *env,
target_ulong addr,
  break;
  }
  break;
+case ASI_USERTXT: /* User code access */
+oi = make_memop_idx(memop, MMU_USER_IDX);
+switch (size) {
+case 1:
+ret = cpu_ldb_code_mmu(env, addr, oi, GETPC());
+break;
+case 2:
+ret = cpu_ldw_code_mmu(env, addr, oi, GETPC());
+break;
+default:
+case 4:
+ret = cpu_ldl_code_mmu(env, addr, oi, GETPC());
+break;
+case 8:
+ret = cpu_ldq_code_mmu(env, addr, oi, GETPC());
+break;
+}
+break;


Correct, that would also work.


r~

Re: Add 'info pg' command to monitor

2024-04-15 Thread Peter Maydell

On Mon, 15 Apr 2024 at 17:09, Don Porter  wrote:
> I am a CS professor (and, newly, a second-time contributor). I have
> been using qemu in my courses for over a decade, especially a course
> that asks students to write major pieces of an OS kernel from starter
> code.
>
> I have some patches, originally from Austin Clements at MIT, that I
> have found useful over the years and that may be useful to others.  It
> would also be nice not to have to build a custom qemu each semester.  I
> have cleared upstreaming these with Austin, the original author.
>
> This patch set adds an 'info pg' command to the monitor, which prints
> a nicer view of the page tables.  A project in my graduate OS course
> involves implementing x86 page table support, and my students have
> found this helpful for debugging.

So, my issue with this is that it's x86 specific, and it adds
yet another monitor command that is doing "show some kind of debug
info related to the guest page tables", along with "info mem"
and "info tlb". Plus it is yet another lump of code that's
walking the guest page tables and interpreting them.

What I'd really like to see is some infrastructure that is
at least somewhat guest-architecture-agnostic, so we can
minimise what a guest architecture needs to implement (and
then make providing that mandatory).

The other thing I'd like to see is perhaps some investigation of
whether there's any way to implement something useful by
using/extending the existing get_phys_page_attrs_debug() and
similar functions, so that you don't have to write one lot
of page-table-walking code for QEMU to use when it's executing
guest code and a separate lot (that's bound to get out of
sync or not support new functionality/changes) that's only
handling these monitor debug commands. There's a lot of
complexity in figuring out things like permissions in a
modern architecture...

thanks
-- PMM

Re: XIVE VFIO kernel resample failure in INTx mode under heavy load

2024-04-15 Thread Timothy Pearson




- Original Message -
> From: "Cédric Le Goater" 
> To: "Alexey Kardashevskiy" , "Alex Williamson" 
> , "Timothy Pearson"
> 
> Cc: "l...@suse.de:PowerPC" , "qemu-devel" 
> , "Frederic Barrat"
> , "npiggin" , "David Gibson" 
> 
> Sent: Thursday, April 21, 2022 1:35:50 AM
> Subject: Re: XIVE VFIO kernel resample failure in INTx mode under heavy load

> On 4/21/22 05:07, Alexey Kardashevskiy wrote:
>> 
>> 
>> On 14/04/2022 22:41, Cédric Le Goater wrote:
>>>
> After re-reading what I just wrote, I am leaning towards disabling use of
> KVM_CAP_IRQFD_RESAMPLE as it seems last worked on POWER8 and never since 
> :)
>
> Did I miss something in the picture (hey Cedric)?

 How about disabling it like this?

 =
 diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
 index 5bfd4aa9e5aa..c999f7b1ab1b 100644
 --- a/hw/ppc/spapr_pci.c
 +++ b/hw/ppc/spapr_pci.c
 @@ -732,7 +732,7 @@ static PCIINTxRoute spapr_route_intx_pin_to_irq(void
 *opaque, int pin)
   SpaprPhbState *sphb = SPAPR_PCI_HOST_BRIDGE(opaque);
   PCIINTxRoute route;

 -    route.mode = PCI_INTX_ENABLED;
 +    route.mode = PCI_INTX_DISABLED;

 =
>>>
>>> I like it.
>> 
>> 
>> The only thing is that this resampling works on POWER8/XICS and
>> removing it there is not great. So far sPAPR PHB was unaware of
>> underlying interrupt controller, or was not it?
> 
> It is. The dynamic change of the underlying irqchip in QEMU and
> in KVM required that for CAS. Of course, plenty is done in the
> back of the devices when this happens, see spapr_irq.
> 
> There are some quirks related to LPM with VIO devices in Linux.
> This is the only case I know about.
> 
> Thanks,
> 
> C.

Unfortunately this remains quite broken, and after a kernel upgrade (including 
the purported fix [1]) and a qemu upgrade we have now completely lost the 
ability to get the card working in the guest with *any* combination of 
parameters.

In guest XIVE mode with irqchip on it passes through a handful of interrupts, 
then dies.  In guest XICS mode we're dropping the majority of the interrupts.  
This is all on POWER9.

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/arch/powerpc/kvm/powerpc.c?id=52882b9c7a761b2b4e44717d6fbd1ed94c601b7f

[PATCH 1/2] monitor: Implement a generic x86 page table iterator

2024-04-15 Thread Don Porter

From: Austin Clements 

This iterator provides a way to traverse 32-bit, PAE, and 64-bit page
tables by abstracting them as n-ary trees.  A struct describes the
full range of x86 page table layouts and the iterator builds on this
to provide a "successor" function for efficiently traversing the page
table tree.

This code is currently unused, but provides the groundwork for the
following "info pg" patch.  It could also be used to unify and
simplify the implementations of "info mem" and "info tlb".

Signed-off-by: Austin Clements 
[geo...@ldpreload.com: Rebased on top of 2.9.0]
Signed-off-by: Geoffrey Thomas 
Signed-off-by: Don Porter 
---
 target/i386/monitor.c | 149 ++
 1 file changed, 149 insertions(+)

diff --git a/target/i386/monitor.c b/target/i386/monitor.c
index 3a281dab02..7924de6520 100644
--- a/target/i386/monitor.c
+++ b/target/i386/monitor.c
@@ -34,6 +34,155 @@
 #include "qapi/qapi-commands-misc-target.h"
 #include "qapi/qapi-commands-misc.h"
 
+/**
+ * PTLayout describes the layout of an x86 page table in enough detail
+ * to fully decode up to a 4-level 64-bit page table tree.
+ */
+typedef struct PTLayout {
+int levels, entsize;
+int entries[4]; /* Entries in each table level */
+int shift[4];   /* VA bit shift at each level */
+bool pse[4];/* Whether PSE bit is valid */
+const char *names[4];
+int vaw, paw;   /* VA and PA width in characters */
+} PTLayout;
+
+/**
+ * PTIter provides a generic way to traverse and decode an x86 page
+ * table tree.
+ */
+typedef struct PTIter {
+const PTLayout *layout;
+bool pse;   /* PSE enabled */
+int level;  /* Current level */
+int i[4];   /* Index at each level */
+hwaddr base[4]; /* Physical base pointer */
+
+uint64_t ent;   /* Current entry */
+bool present, leaf;
+target_ulong va;
+hwaddr pa;
+target_ulong  size;
+} PTIter;
+
+static bool ptiter_succ(PTIter *it);
+
+/**
+ * Initialize a PTIter to point to the first entry of the page table's
+ * top level.  On failure, prints a message to mon and returns false.
+ */
+static bool
+__attribute__ ((unused))
+ptiter_init(Monitor *mon, PTIter *it)
+{
+static const PTLayout l32 = {
+2, 4, {1024, 1024}, {22, 12}, {1, 0}, {"PDE", "PTE"}, 8, 8
+};
+static const PTLayout lpae = {
+3, 8, {4, 512, 512}, {30, 21, 12}, {0, 1, 0},
+{"PDP", "PDE", "PTE"}, 8, 13
+};
+#ifdef TARGET_X86_64
+static const PTLayout l64 = {
+4, 8, {512, 512, 512, 512}, {39, 30, 21, 12}, {0, 1, 1, 0},
+{"PML4", "PDP", "PDE", "PTE"}, 12, 13
+};
+#endif
+CPUArchState *env;
+
+env = mon_get_cpu_env(mon);
+
+if (!(env->cr[0] & CR0_PG_MASK)) {
+monitor_printf(mon, "PG disabled\n");
+return false;
+}
+
+memset(it, 0, sizeof(*it));
+if (env->cr[4] & CR4_PAE_MASK) {
+#ifdef TARGET_X86_64
+if (env->hflags & HF_LMA_MASK) {
+it->layout = &l64;
+it->base[0] = env->cr[3] & 0x3f000ULL;
+} else
+#endif
+{
+it->layout = &lpae;
+it->base[0] = env->cr[3] & ~0x1f;
+}
+it->pse = true;
+} else {
+it->layout = &l32;
+it->base[0] = env->cr[3] & ~0xfff;
+it->pse = (env->cr[4] & CR4_PSE_MASK);
+}
+
+/* Trick ptiter_succ into doing the hard initialization. */
+it->i[0] = -1;
+it->leaf = true;
+ptiter_succ(it);
+return true;
+}
+
+/**
+ * Move a PTIter to the successor of the current entry.  Specifically:
+ * if the iterator points to a leaf, move to its next sibling, or to
+ * the next sibling of a parent if it has no more siblings.  If the
+ * iterator points to a non-leaf, move to its first child.  If there
+ * is no successor, return false.
+ *
+ * Note that the resulting entry may not be marked present, though
+ * non-present entries are always leafs (within a page
+ * table/directory/etc, this will always visit all entries).
+ */
+static bool ptiter_succ(PTIter *it)
+{
+int i, l, entsize;
+uint64_t ent64;
+uint32_t ent32;
+bool large;
+
+if (it->level < 0) {
+return false;
+} else if (!it->leaf) {
+/* Move to this entry's first child */
+it->level++;
+it->base[it->level] = it->pa;
+it->i[it->level] = 0;
+} else {
+/* Move forward and, if we hit the end of this level, up */
+while (++it->i[it->level] == it->layout->entries[it->level]) {
+if (it->level-- == 0) {
+/* We're out of page table */
+return false;
+}
+}
+}
+
+/* Read this entry */
+l = it->level;
+entsize = it->layout->entsize;
+cpu_physical_memory_read(it->base[l] + it->i[l] * entsize,
+ entsize == 4 ? (void *)&ent32 : (void *)&ent64,
+

Add 'info pg' command to monitor

2024-04-15 Thread Don Porter

Hi all,

I am a CS professor (and, newly, a second-time contributor). I have
been using qemu in my courses for over a decade, especially a course
that asks students to write major pieces of an OS kernel from starter
code.

I have some patches, originally from Austin Clements at MIT, that I
have found useful over the years and that may be useful to others.  It
would also be nice not to have to build a custom qemu each semester.  I
have cleared upstreaming these with Austin, the original author.

This patch set adds an 'info pg' command to the monitor, which prints
a nicer view of the page tables.  A project in my graduate OS course
involves implementing x86 page table support, and my students have
found this helpful for debugging.

Thank you in advance for your time,
Don Porter

[PATCH 2/2] monitor: Add an "info pg" command that prints the current page tables

2024-04-15 Thread Don Porter

From: Austin Clements 

The new "info pg" monitor command prints the current page table,
including virtual address ranges, flag bits, and snippets of physical
page numbers.  Completely filled regions of the page table with
compatible flags are "folded", with the result that the complete
output for a freshly booted x86-64 Linux VM can fit in a single
terminal window.  The output looks like this:

VPN range Entry FlagsPhysical page
[7f000-7f000] PML4[0fe] ---DA--UWP
  [7f28c-7f28f]  PDP[0a3] ---DA--UWP
[7f28c4600-7f28c47ff]  PDE[023] ---DA--UWP
  [7f28c4655-7f28c4656]  PTE[055-056] X--D---U-P 007f14-007f15
  [7f28c465b-7f28c465b]  PTE[05b] A--U-P 001cfc
...
[ff800-ff800] PML4[1ff] ---DA--UWP
  [8-b]  PDP[1fe] ---DA---WP
[81000-81dff]  PDE[008-00e] -GSDA---WP 001000-001dff
  [c-f]  PDP[1ff] ---DA--UWP
[ff400-ff5ff]  PDE[1fa] ---DA--UWP
  [ff5fb-ff5fc]  PTE[1fb-1fc] XG-DACT-WP 0fec00 0fee00
[ff600-ff7ff]  PDE[1fb] ---DA--UWP
  [ff600-ff600]  PTE[000] -G-DA--U-P 001467

Signed-off-by: Austin Clements 
[geo...@ldpreload.com: Rebased on top of 2.9.0]
Signed-off-by: Geoffrey Thomas 
Signed-off-by: Don Porter 
---
 hmp-commands-info.hx |  15 +++
 include/monitor/hmp-target.h |   2 +
 target/i386/monitor.c| 179 ++-
 3 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index ad1b1306e3..ae7de74041 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -239,6 +239,21 @@ SRST
 Show the active virtual memory mappings.
 ERST
 
+#if defined(TARGET_I386)
+{
+.name   = "pg",
+.args_type  = "",
+.params = "",
+.help   = "show the page table",
+.cmd= hmp_info_pg,
+},
+#endif
+
+SRST   
|
+  ``info pg``  
|
+Show the active page table.
|
+ERST
+
 {
 .name   = "mtree",
 .args_type  = "flatview:-f,dispatch_tree:-d,owner:-o,disabled:-D",
diff --git a/include/monitor/hmp-target.h b/include/monitor/hmp-target.h
index d78e979f05..68f58d2a31 100644
--- a/include/monitor/hmp-target.h
+++ b/include/monitor/hmp-target.h
@@ -48,6 +48,8 @@ void hmp_info_mem(Monitor *mon, const QDict *qdict);
 void hmp_info_tlb(Monitor *mon, const QDict *qdict);
 void hmp_mce(Monitor *mon, const QDict *qdict);
 void hmp_info_local_apic(Monitor *mon, const QDict *qdict);
+void hmp_info_io_apic(Monitor *mon, const QDict *qdict);
+void hmp_info_pg(Monitor *mon, const QDict *qdict);
 void hmp_info_sev(Monitor *mon, const QDict *qdict);
 void hmp_info_sgx(Monitor *mon, const QDict *qdict);
 void hmp_info_via(Monitor *mon, const QDict *qdict);
diff --git a/target/i386/monitor.c b/target/i386/monitor.c
index 7924de6520..4cf39a4140 100644
--- a/target/i386/monitor.c
+++ b/target/i386/monitor.c
@@ -72,7 +72,6 @@ static bool ptiter_succ(PTIter *it);
  * top level.  On failure, prints a message to mon and returns false.
  */
 static bool
-__attribute__ ((unused))
 ptiter_init(Monitor *mon, PTIter *it)
 {
 static const PTLayout l32 = {
@@ -88,10 +87,16 @@ ptiter_init(Monitor *mon, PTIter *it)
 {"PML4", "PDP", "PDE", "PTE"}, 12, 13
 };
 #endif
+
 CPUArchState *env;
 
 env = mon_get_cpu_env(mon);
 
+if (!env) {
+monitor_printf(mon, "No CPU available\n");
+return false;
+}
+
 if (!(env->cr[0] & CR0_PG_MASK)) {
 monitor_printf(mon, "PG disabled\n");
 return false;
@@ -200,6 +205,178 @@ static hwaddr addr_canonical(CPUArchState *env, hwaddr 
addr)
 return addr;
 }
 
+/**
+ * Return true if the page tree rooted at iter is complete and
+ * compatible with compat.  last will be filled with the last entry at
+ * each level.  If false, does not change iter and last can be filled
+ * with anything; if true, returns with iter at the next entry on the
+ * same level, or the next parent entry if iter is on the last entry
+ * of this level.
+ */
+static bool pg_complete(PTIter *root, const PTIter compat[], PTIter last[])
+{
+PTIter iter = *root;
+
+if ((root->ent & 0xfff) != (compat[root->level].ent & 0xfff)) {
+return false;
+}
+
+last[root->level] = *root;
+ptiter_succ(&iter);
+if (!root->leaf) {
+/* Are all of the direct children of root complete? */
+while (iter.level == root->level + 1) {
+if (!pg_complete(&iter, compat, last)) {
+return false;
+}
+}
+}
+assert(iter.level <= root->level);
+assert(iter.level == root->level ?
+   iter.i[iter.level] == root->i[iter.level] +

Re: [PATCH v2 3/5] intel_iommu: Add a framework to do compatibility check with host IOMMU cap/ecap

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:44, Zhenzhong Duan wrote:

From: Yi Liu 

If check fails, the host side device (either vfio or vdpa device) should not
be passed to guest.

Implementation details for different backends will be in following patches.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
  hw/i386/intel_iommu.c | 35 +++
  1 file changed, 35 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4f84e2e801..a49b587c73 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -35,6 +35,7 @@
  #include "sysemu/kvm.h"
  #include "sysemu/dma.h"
  #include "sysemu/sysemu.h"
+#include "sysemu/iommufd.h"
  #include "hw/i386/apic_internal.h"
  #include "kvm/kvm_i386.h"
  #include "migration/vmstate.h"
@@ -3819,6 +3820,32 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus,
  return vtd_dev_as;
  }
  
+static int vtd_check_legacy_hdev(IntelIOMMUState *s,

+ HostIOMMUDevice *hiod,
+ Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_iommufd_hdev(IntelIOMMUState *s,
+  HostIOMMUDevice *hiod,
+  Error **errp)
+{
+return 0;
+}
+
+static int vtd_check_hdev(IntelIOMMUState *s, VTDHostIOMMUDevice *vtd_hdev,
+  Error **errp)
+{
+HostIOMMUDevice *hiod = vtd_hdev->dev;
+
+if (object_dynamic_cast(OBJECT(hiod), TYPE_HIOD_IOMMUFD)) {
+return vtd_check_iommufd_hdev(s, hiod, errp);
+}
+
+return vtd_check_legacy_hdev(s, hiod, errp);
+}



I think we should be using the .get_host_iommu_info() class handler
instead. Can we refactor the code slightly to avoid this check on
the type ?


Thanks,

C.





+
  static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,
  HostIOMMUDevice *hiod, Error **errp)
  {
@@ -3829,6 +3856,7 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void 
*opaque, int devfn,
  .devfn = devfn,
  };
  struct vtd_as_key *new_key;
+int ret;
  
  assert(hiod);
  
@@ -3848,6 +3876,13 @@ static int vtd_dev_set_iommu_device(PCIBus *bus, void *opaque, int devfn,

  vtd_hdev->iommu_state = s;
  vtd_hdev->dev = hiod;
  
+ret = vtd_check_hdev(s, vtd_hdev, errp);

+if (ret) {
+g_free(vtd_hdev);
+vtd_iommu_unlock(s);
+return ret;
+}
+
  new_key = g_malloc(sizeof(*new_key));
  new_key->bus = bus;
  new_key->devfn = devfn;

[PATCH] tests/avocado: update sunxi kernel from armbian to 6.6.16

2024-04-15 Thread Peter Maydell

The Linux kernel 5.10.16 binary for sunxi has been removed from
apt.armbian.com. This means that the avocado tests for these machines
will be skipped (status CANCEL) if the old binary isn't present in
the avocado cache.

Update to 6.6.16, in the same way we did in commit e384db41d8661
when we moved to 5.10.16 in 2021.

Cc: qemu-sta...@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2284
Signed-off-by: Peter Maydell 
---
At this point in the release cycle I don't think I really want
to put this into 9.0, though I could just about squeeze it in.

cc'ing stable as an FYI -- since the tests fall back to the
CANCEL status this doesn't break CI, so it's not a requirement
to backport to any stable branches. But it would probably be
preferable to get the coverage back on the stable branches so
we can detect if we get something wrong on a backport of a
patch that affects these machines.
---
 tests/avocado/boot_linux_console.py | 70 ++---
 tests/avocado/replay_kernel.py  |  8 ++--
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/tests/avocado/boot_linux_console.py 
b/tests/avocado/boot_linux_console.py
index 989b65111c0..d0ab5aaa83a 100644
--- a/tests/avocado/boot_linux_console.py
+++ b/tests/avocado/boot_linux_console.py
@@ -646,12 +646,12 @@ def test_arm_cubieboard_initrd(self):
 :avocado: tags=accel:tcg
 """
 deb_url = ('https://apt.armbian.com/pool/main/l/'
-   
'linux-5.10.16-sunxi/linux-image-current-sunxi_21.02.2_armhf.deb')
-deb_hash = '9fa84beda245cabf0b4fa84cf6eaa7738ead1da0'
+   
'linux-6.6.16/linux-image-current-sunxi_24.2.1_armhf__6.6.16-Seb3e-D6b4a-P2359-Ce96bHfe66-HK01ba-V014b-B067e-R448a.deb')
+deb_hash = 'f7c3c8c5432f765445dc6e7eab02f3bbe668256b'
 deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash)
 kernel_path = self.extract_from_deb(deb_path,
-'/boot/vmlinuz-5.10.16-sunxi')
-dtb_path = 
'/usr/lib/linux-image-current-sunxi/sun4i-a10-cubieboard.dtb'
+
'/boot/vmlinuz-6.6.16-current-sunxi')
+dtb_path = 
'/usr/lib/linux-image-6.6.16-current-sunxi/sun4i-a10-cubieboard.dtb'
 dtb_path = self.extract_from_deb(deb_path, dtb_path)
 initrd_url = ('https://github.com/groeck/linux-build-test/raw/'
   '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/'
@@ -690,12 +690,12 @@ def test_arm_cubieboard_sata(self):
 :avocado: tags=accel:tcg
 """
 deb_url = ('https://apt.armbian.com/pool/main/l/'
-   
'linux-5.10.16-sunxi/linux-image-current-sunxi_21.02.2_armhf.deb')
-deb_hash = '9fa84beda245cabf0b4fa84cf6eaa7738ead1da0'
+   
'linux-6.6.16/linux-image-current-sunxi_24.2.1_armhf__6.6.16-Seb3e-D6b4a-P2359-Ce96bHfe66-HK01ba-V014b-B067e-R448a.deb')
+deb_hash = 'f7c3c8c5432f765445dc6e7eab02f3bbe668256b'
 deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash)
 kernel_path = self.extract_from_deb(deb_path,
-'/boot/vmlinuz-5.10.16-sunxi')
-dtb_path = 
'/usr/lib/linux-image-current-sunxi/sun4i-a10-cubieboard.dtb'
+
'/boot/vmlinuz-6.6.16-current-sunxi')
+dtb_path = 
'/usr/lib/linux-image-6.6.16-current-sunxi/sun4i-a10-cubieboard.dtb'
 dtb_path = self.extract_from_deb(deb_path, dtb_path)
 rootfs_url = ('https://github.com/groeck/linux-build-test/raw/'
   '2eb0a73b5d5a28df3170c546ddaaa9757e1e0848/rootfs/'
@@ -872,13 +872,13 @@ def test_arm_bpim2u(self):
 :avocado: tags=machine:bpim2u
 :avocado: tags=accel:tcg
 """
-deb_url = ('https://apt.armbian.com/pool/main/l/linux-5.10.16-sunxi/'
-   'linux-image-current-sunxi_21.02.2_armhf.deb')
-deb_hash = '9fa84beda245cabf0b4fa84cf6eaa7738ead1da0'
+deb_url = ('https://apt.armbian.com/pool/main/l/'
+   
'linux-6.6.16/linux-image-current-sunxi_24.2.1_armhf__6.6.16-Seb3e-D6b4a-P2359-Ce96bHfe66-HK01ba-V014b-B067e-R448a.deb')
+deb_hash = 'f7c3c8c5432f765445dc6e7eab02f3bbe668256b'
 deb_path = self.fetch_asset(deb_url, asset_hash=deb_hash)
 kernel_path = self.extract_from_deb(deb_path,
-'/boot/vmlinuz-5.10.16-sunxi')
-dtb_path = ('/usr/lib/linux-image-current-sunxi/'
+
'/boot/vmlinuz-6.6.16-current-sunxi')
+dtb_path = ('/usr/lib/linux-image-6.6.16-current-sunxi/'
 'sun8i-r40-bananapi-m2-ultra.dtb')
 dtb_path = self.extract_from_deb(deb_path, dtb_path)
 
@@ -899,13 +899,13 @@ def test_arm_bpim2u_initrd(self):
 :avocado: tags=accel:tcg
 :avocado: tags=machine:bpim2u
 """
-deb_url =

Re: [PATCH v2 2/6] hw/ppc: SPI controller model - registers implementation

2024-04-15 Thread Cédric Le Goater


Hello Chalapathi

The subject could be rephrased to : "ppc/pnv: Add SPI controller model".

On 4/9/24 19:56, Chalapathi V wrote:

SPI controller device model supports a connection to a single SPI responder.
This provides access to SPI seeproms, TPM, flash device and an ADC controller.

All SPI function control is mapped into the SPI register space to enable full
control by firmware. In this commit SPI configuration component is modelled
which contains all SPI configuration and status registers as well as the hold
registers for data to be sent or having been received.

An existing QEMU SSI framework is used and SSI_BUS is created.

Signed-off-by: Chalapathi V 
---
  include/hw/ppc/pnv_spi_controller.h  |  55 +
  include/hw/ppc/pnv_spi_controller_regs.h | 114 ++


These two files should be under hw/ssi/ and include/hw/ssi/. Please
remove '_controller'.


  include/hw/ppc/pnv_xscom.h   |   3 +
  hw/ppc/pnv_spi_controller.c  | 278 +++
  hw/ppc/Kconfig   |   1 +
  hw/ppc/meson.build   |   1 +
  6 files changed, 452 insertions(+)
  create mode 100644 include/hw/ppc/pnv_spi_controller.h
  create mode 100644 include/hw/ppc/pnv_spi_controller_regs.h
  create mode 100644 hw/ppc/pnv_spi_controller.c

diff --git a/include/hw/ppc/pnv_spi_controller.h 
b/include/hw/ppc/pnv_spi_controller.h
new file mode 100644
index 00..5ec50fb14c
--- /dev/null
+++ b/include/hw/ppc/pnv_spi_controller.h
@@ -0,0 +1,55 @@
+/*
+ * QEMU PowerPC SPI Controller model
+ *
+ * Copyright (c) 2024, IBM Corporation.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ *
+ * This model Supports a connection to a single SPI responder.
+ * Introduced for P10 to provide access to SPI seeproms, TPM, flash device
+ * and an ADC controller.
+ */
+#include "hw/ssi/ssi.h"
+
+#ifndef PPC_PNV_SPI_CONTROLLER_H
+#define PPC_PNV_SPI_CONTROLLER_H
+
+#define TYPE_PNV_SPI_CONTROLLER "pnv-spi-controller"
+#define PNV_SPICONTROLLER(obj) \
+OBJECT_CHECK(PnvSpiController, (obj), TYPE_PNV_SPI_CONTROLLER)


You could use OBJECT_DECLARE_SIMPLE_TYPE ? Anyhow, I would prefer
naming the macro PNV_SPI_CONTROLLER.


+#define SPI_CONTROLLER_REG_SIZE 8
+
+typedef struct SSIBus SSIBus;


why ?


+
+#define TYPE_PNV_SPI_BUS "pnv-spi-bus"
+OBJECT_DECLARE_SIMPLE_TYPE(PnvSPIBus, PNV_SPI_BUS)
+
+typedef struct PnvSPIBus {


I don't think this extra PnvSPIBus model is useful.


+SysBusDevice parent_obj;
+
+SSIBus *ssi_bus;
+qemu_irq *cs_line;


These two attributes could live under PnvSpiController.


+uint32_t id;


and this one would become useless.


+} PnvSPIBus;

+typedef struct PnvSpiController {
+DeviceState parent;
+
+PnvSPIBus   bus;
+MemoryRegionxscom_spic_regs;
+/* SPI controller object number */
+uint32_tspic_num;
+
+/* SPI Controller registers */
+uint64_terror_reg;
+uint64_tcounter_config_reg;
+uint64_tconfig_reg1;
+uint64_tclock_config_reset_control;
+uint64_tmemory_mapping_reg;
+uint64_ttransmit_data_reg;
+uint64_treceive_data_reg;
+uint8_t sequencer_operation_reg[SPI_CONTROLLER_REG_SIZE];
+uint64_tstatus_reg;


You could use an array of uint64_t also.



+} PnvSpiController;
+#endif /* PPC_PNV_SPI_CONTROLLER_H */
diff --git a/include/hw/ppc/pnv_spi_controller_regs.h 
b/include/hw/ppc/pnv_spi_controller_regs.h
new file mode 100644
index 00..6f613aca5e
--- /dev/null
+++ b/include/hw/ppc/pnv_spi_controller_regs.h
@@ -0,0 +1,114 @@
+/*
+ * QEMU PowerPC SPI Controller model
+ *
+ * Copyright (c) 2023, IBM Corporation.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef SPI_CONTROLLER_REGS_H
+#define SPI_CONTROLLER_REGS_H
+
+/* Error Register */
+#define ERROR_REG   0x00
+
+/* counter_config_reg */
+#define COUNTER_CONFIG_REG  0x01
+#define COUNTER_CONFIG_REG_SHIFT_COUNT_N1   PPC_BITMASK(0, 7)
+#define COUNTER_CONFIG_REG_SHIFT_COUNT_N2   PPC_BITMASK(8, 15)
+#define COUNTER_CONFIG_REG_COUNT_COMPARE1   PPC_BITMASK(24, 31)
+#define COUNTER_CONFIG_REG_COUNT_COMPARE2   PPC_BITMASK(32, 39)
+#define COUNTER_CONFIG_REG_N1_COUNT_CONTROL PPC_BITMASK(48, 51)
+#define COUNTER_CONFIG_REG_N2_COUNT_CONTROL PPC_BITMASK(52, 55)
+
+/* config_reg */
+#define CONFIG_REG1 0x02
+
+/* clock_config_reset_control_ecc_enable_reg */
+#define CLOCK_CONFIG_REG0x03
+#define CLOCK_CONFIG_RESET_CONTROL_HARD_RESET   0x0084;
+#define CLOCK_CONFIG_REG_RESET_CONTROL  PPC_BITMASK(24, 27)
+#define CLOCK_CONFIG_REG_ECC_CONTROLPPC_BITMASK(28, 30)
+
+/* memory_mapping_reg */
+#define MEMORY_MAPPING_REG  0x04
+#define MEMORY_MAPPING_REG_MMSPISM_BASE_ADDRPPC_BITMASK(0, 15)
+#define MEMORY_MAPPING_REG_MMSPISM_ADDR_MASKPPC_BITMASK(16, 31)

[PULL 0/2] Misc HW patches for 2024-04-15

2024-04-15 Thread Philippe Mathieu-Daudé

The following changes since commit 824ebb92c39920a65b34a93d1bd462baf0d2d174:

  Merge tag 'pull-sp-20240412' of https://gitlab.com/rth7680/qemu into staging 
(2024-04-13 09:43:46 +0100)

are available in the Git repository at:

  https://github.com/philmd/qemu.git tags/hw-misc-20240415

for you to fetch changes up to 6e4aceba2079e3df42edc89d44f4ee02343bb09e:

  hw/pci-host/ppc440_pcix: Do not expose a bridge device on PCI bus (2024-04-15 
13:07:15 +0200)


Misc HW patch queue

Fixes for hardware used by machines running AmigaOS.



BALATON Zoltan (2):
  hw/isa/vt82c686: Keep track of PIRQ/PINT pins separately
  hw/pci-host/ppc440_pcix: Do not expose a bridge device on PCI bus

 hw/isa/vt82c686.c |  3 ++-
 hw/pci-host/ppc440_pcix.c | 11 ---
 2 files changed, 6 insertions(+), 8 deletions(-)

-- 
2.41.0

[PULL 1/2] hw/isa/vt82c686: Keep track of PIRQ/PINT pins separately

2024-04-15 Thread Philippe Mathieu-Daudé

From: BALATON Zoltan 

Move calculation of mask after the switch which sets the function
number for PIRQ/PINT pins to make sure the state of these pins are
kept track of separately and IRQ is raised if any of them is active.

Cc: qemu-sta...@nongnu.org
Fixes: 7e01bd80c1 hw/isa/vt82c686: Bring back via_isa_set_irq()
Signed-off-by: BALATON Zoltan 
Reviewed-by: Philippe Mathieu-Daudé 
Message-ID: <20240410222543.0ea534e6...@zero.eik.bme.hu>
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/isa/vt82c686.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/isa/vt82c686.c b/hw/isa/vt82c686.c
index aa91942745..8582ac0322 100644
--- a/hw/isa/vt82c686.c
+++ b/hw/isa/vt82c686.c
@@ -658,7 +658,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level)
 ViaISAState *s = VIA_ISA(pci_get_function_0(d));
 uint8_t irq = d->config[PCI_INTERRUPT_LINE], max_irq = 15;
 int f = PCI_FUNC(d->devfn);
-uint16_t mask = BIT(f);
+uint16_t mask;
 
 switch (f) {
 case 0: /* PIRQ/PINT inputs */
@@ -673,6 +673,7 @@ void via_isa_set_irq(PCIDevice *d, int pin, int level)
 }
 
 /* Keep track of the state of all sources */
+mask = BIT(f);
 if (level) {
 s->irq_state[0] |= mask;
 } else {
-- 
2.41.0

[PULL 2/2] hw/pci-host/ppc440_pcix: Do not expose a bridge device on PCI bus

2024-04-15 Thread Philippe Mathieu-Daudé

From: BALATON Zoltan 

Real 460EX SoC apparently does not expose a bridge device and having
it appear on PCI bus confuses an AmigaOS file system driver that uses
this to detect which machine it is running on.

Cc: qemu-sta...@nongnu.org
Signed-off-by: BALATON Zoltan 
Reviewed-by: Philippe Mathieu-Daudé 
Message-ID: <20240411192443.b4d644e6...@zero.eik.bme.hu>
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/pci-host/ppc440_pcix.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/hw/pci-host/ppc440_pcix.c b/hw/pci-host/ppc440_pcix.c
index 1926ae2a27..ef212d99aa 100644
--- a/hw/pci-host/ppc440_pcix.c
+++ b/hw/pci-host/ppc440_pcix.c
@@ -52,7 +52,7 @@ OBJECT_DECLARE_SIMPLE_TYPE(PPC440PCIXState, PPC440_PCIX_HOST)
 struct PPC440PCIXState {
 PCIHostState parent_obj;
 
-PCIDevice *dev;
+uint8_t config[PCI_CONFIG_SPACE_SIZE];
 struct PLBOutMap pom[PPC440_PCIX_NR_POMS];
 struct PLBInMap pim[PPC440_PCIX_NR_PIMS];
 uint32_t sts;
@@ -171,7 +171,7 @@ static void ppc440_pcix_reg_write4(void *opaque, hwaddr 
addr,
 trace_ppc440_pcix_reg_write(addr, val, size);
 switch (addr) {
 case PCI_VENDOR_ID ... PCI_MAX_LAT:
-stl_le_p(s->dev->config + addr, val);
+stl_le_p(s->config + addr, val);
 break;
 
 case PCIX0_POM0LAL:
@@ -302,7 +302,7 @@ static uint64_t ppc440_pcix_reg_read4(void *opaque, hwaddr 
addr,
 
 switch (addr) {
 case PCI_VENDOR_ID ... PCI_MAX_LAT:
-val = ldl_le_p(s->dev->config + addr);
+val = ldl_le_p(s->config + addr);
 break;
 
 case PCIX0_POM0LAL:
@@ -498,10 +498,7 @@ static void ppc440_pcix_realize(DeviceState *dev, Error 
**errp)
 memory_region_init(>iomem, OBJECT(dev), "pci-io", 64 * KiB);
 h->bus = pci_register_root_bus(dev, NULL, ppc440_pcix_set_irq,
  ppc440_pcix_map_irq, >irq, >busmem, >iomem,
- PCI_DEVFN(0, 0), 1, TYPE_PCI_BUS);
-
-s->dev = pci_create_simple(h->bus, PCI_DEVFN(0, 0),
-   TYPE_PPC4xx_HOST_BRIDGE);
+ PCI_DEVFN(1, 0), 1, TYPE_PCI_BUS);
 
 memory_region_init(>bm, OBJECT(s), "bm-ppc440-pcix", UINT64_MAX);
 memory_region_add_subregion(>bm, 0x0, >busmem);
-- 
2.41.0

Re: [PATCH] hw/isa/vt82c686: Keep track of PIRQ/PINT pins separately

2024-04-15 Thread Philippe Mathieu-Daudé


On 11/4/24 00:25, BALATON Zoltan wrote:

Move calculation of mask after the switch which sets the function
number for PIRQ/PINT pins to make sure the state of these pins are
kept track of separately and IRQ is raised if any of them is active.

Fixes: 7e01bd80c1 hw/isa/vt82c686: Bring back via_isa_set_irq()
Signed-off-by: BALATON Zoltan 
---
Preferably for 9.0 if there will be another RC.

  hw/isa/vt82c686.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)


Queued, thanks.

Re: [PATCH] hw/misc/applesmc: Simplify DeviceReset handler

2024-04-15 Thread Philippe Mathieu-Daudé


On 10/4/24 20:08, Philippe Mathieu-Daudé wrote:

Have applesmc_find_key() return a const pointer.
Since the returned buffers are not modified in
applesmc_io_data_write(), it is pointless to
delete and re-add the keys in the DeviceReset
handler. Add them once in DeviceRealize, and
discard them in the DeviceUnrealize handler.

Signed-off-by: Philippe Mathieu-Daudé 
---
As discussed in
https://lore.kernel.org/qemu-devel/6fbcf565-f12c-4196-b6c8-559843c7a...@linaro.org/
---
  hw/misc/applesmc.c | 36 +---
  1 file changed, 21 insertions(+), 15 deletions(-)


Queued, thanks.

Re: [PATCH v2 1/6] hw/ppc: remove SPI responder model

2024-04-15 Thread Cédric Le Goater


On 4/9/24 19:56, Chalapathi V wrote:

-- Empty commit to align the patch numbers between PATCH v1 and PATCH v2.
SPI responder model is removed as pnv spi controller and seeprom is
implemented using QEMU SSI framework.


Please drop this empty patch. Patch numbers do not need to be aligned
between respins of the same patchset.

Thanks,

C.

Re: [PATCH v2] ppc440_pcix: Do not expose a bridge device on PCI bus

2024-04-15 Thread Philippe Mathieu-Daudé


On 14/4/24 15:00, BALATON Zoltan wrote:

On Sat, 13 Apr 2024, Philippe Mathieu-Daudé wrote:

On 11/4/24 21:24, BALATON Zoltan wrote:

Real 460EX SoC apparently does not expose a bridge device and having
it appear on PCI bus confuses an AmigaOS file system driver that uses
this to detect which machine it is running on.

Signed-off-by: BALATON Zoltan 
---
Here's another version that keeps the values and only drops the device
so it's even less likely it could break anything, in case this can be
accepted for 9.0.

  hw/pci-host/ppc440_pcix.c | 11 ---
  1 file changed, 4 insertions(+), 7 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 


Thanks. Nick, could you ack this please so it could be merged if you 
won't send more pull requests? (I'm the maintainer of this file as it's 
only used by sam460ex so maybe an ack is not needed but it could help to 
show you have no problem with it.)


No need, queued, thanks.

[PATCH 6.8 008/172] virtio_net: Do not send RSS key if it is not supported

2024-04-15 Thread Greg Kroah-Hartman

6.8-stable review patch.  If anyone has any objections, please let me know.

--

From: Breno Leitao 

commit 059a49aa2e25c58f90b50151f109dd3c4cdb3a47 upstream.

There is a bug when setting the RSS options in virtio_net that can break
the whole machine, getting the kernel into an infinite loop.

Running the following command in any QEMU virtual machine with virtionet
will reproduce this problem:

# ethtool -X eth0  hfunc toeplitz

This is how the problem happens:

1) ethtool_set_rxfh() calls virtnet_set_rxfh()

2) virtnet_set_rxfh() calls virtnet_commit_rss_command()

3) virtnet_commit_rss_command() populates 4 entries for the rss
scatter-gather

4) Since the command above does not have a key, then the last
scatter-gather entry will be zeroed, since rss_key_size == 0.
sg_buf_size = vi->rss_key_size;

5) This buffer is passed to qemu, but qemu is not happy with a buffer
with zero length, and does the following in virtqueue_map_desc() (QEMU
function):

  if (!sz) {
  virtio_error(vdev, "virtio: zero sized buffers are not allowed");

6) virtio_error() (also a QEMU function) sets the device as broken

vdev->broken = true;

7) QEMU bails out, and does not respond to this crazy kernel.

8) The kernel is waiting for the response to come back (function
virtnet_send_command())

9) The kernel is waiting doing the following :

  while (!virtqueue_get_buf(vi->cvq, &tmp) &&
 !virtqueue_is_broken(vi->cvq))
  cpu_relax();

10) Neither of the functions above returns true, thus the kernel
loops here forever. Keep in mind that virtqueue_is_broken() does
not look at the QEMU `vdev->broken`, so it never realizes that the
virtio device is broken on the QEMU side.

Fix it by not sending RSS commands if the feature is not available in
the device.

Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.")
Cc: sta...@vger.kernel.org
Cc: qemu-devel@nongnu.org
Signed-off-by: Breno Leitao 
Reviewed-by: Heng Qi 
Reviewed-by: Xuan Zhuo 
Signed-off-by: David S. Miller 
Signed-off-by: Greg Kroah-Hartman 
---
 drivers/net/virtio_net.c |   26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3768,6 +3768,7 @@ static int virtnet_set_rxfh(struct net_d
struct netlink_ext_ack *extack)
 {
struct virtnet_info *vi = netdev_priv(dev);
+   bool update = false;
int i;
 
if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
@@ -3775,13 +3776,28 @@ static int virtnet_set_rxfh(struct net_d
return -EOPNOTSUPP;
 
if (rxfh->indir) {
+   if (!vi->has_rss)
+   return -EOPNOTSUPP;
+
for (i = 0; i < vi->rss_indir_table_size; ++i)
vi->ctrl->rss.indirection_table[i] = rxfh->indir[i];
+   update = true;
}
-   if (rxfh->key)
+
+   if (rxfh->key) {
+   /* If either _F_HASH_REPORT or _F_RSS are negotiated, the
+* device provides hash calculation capabilities, that is,
+* hash_key is configured.
+*/
+   if (!vi->has_rss && !vi->has_rss_hash_report)
+   return -EOPNOTSUPP;
+
memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size);
+   update = true;
+   }
 
-   virtnet_commit_rss_command(vi);
+   if (update)
+   virtnet_commit_rss_command(vi);
 
return 0;
 }
@@ -4686,13 +4702,15 @@ static int virtnet_probe(struct virtio_d
if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
vi->has_rss_hash_report = true;
 
-   if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) {
vi->has_rss = true;
 
-   if (vi->has_rss || vi->has_rss_hash_report) {
vi->rss_indir_table_size =
virtio_cread16(vdev, offsetof(struct virtio_net_config,
rss_max_indirection_table_length));
+   }
+
+   if (vi->has_rss || vi->has_rss_hash_report) {
vi->rss_key_size =
virtio_cread8(vdev, offsetof(struct virtio_net_config, 
rss_max_key_size));

Re: [PULL 0/1] virtio: bugfix

2024-04-15 Thread Peter Maydell

On Mon, 15 Apr 2024 at 11:52, Michael S. Tsirkin  wrote:
>
> The following changes since commit e104a960c33b68fedf26dfb7b8e00abab8f2:
>
>   qdev-monitor: fix error message in find_device_state() (2024-04-09 02:31:33 
> -0400)
>
> are available in the Git repository at:
>
>   https://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git tags/for_upstream
>
> for you to fetch changes up to 2ce6cff94df2650c460f809e5ad263f1d22507c0:
>
>   virtio-pci: fix use of a released vector (2024-04-15 06:50:44 -0400)
>
> 
> virtio: bugfix
>
> A last minute fix for a use of a vector after it's released.
>
> Signed-off-by: Michael S. Tsirkin 
>
> 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/9.0
for any user-visible changes.

-- PMM

MAINTAINERS tweak [was: [PATCH for-9.1 0/9] Switch to glib URI parsing code]

2024-04-15 Thread Eric Blake

[Trying Peter Lieven's alternate address...]

On Thu, Mar 28, 2024 at 03:05:57PM +0100, Thomas Huth wrote:
> In the QEMU 9.1 development cycle, we can drop the support for
> Ubuntu 20.04 and CentOS 8 since the following major versions of
> these distributions have been available for 2 years already.

Every time I've replied to any message in this thread, I've gotten a
response:

| +Your message to p...@kamp.de couldn't be delivered.
| kamp.de couldn't confirm that your message was sent from a trusted location.
| eblake  Office 365  pl
| Action Required Recipient
| SPF validation error
| 
| How to Fix It
| Your organization's email admin will have to diagnose and fix your domain's 
email settings. Please forward this message to your
| +email admin.
| 
| 
| 
| More Info for Email Admins
| Status code: 550 5.7.23
| 
| This error occurs when Sender Policy Framework (SPF) validation for the 
sender's domain fails. If you're the sender's email
| +admin, make sure the SPF records for your domain at your domain registrar 
are set up correctly. Office 365 supports only one
| +SPF record (a TXT record that defines SPF) for your domain. Include the 
following domain name: spf.protection.outlook.com. If
| +you have a hybrid configuration (some mailboxes in the cloud, and some 
mailboxes on premises) or if you're an Exchange Online
| +Protection standalone customer, add the outbound IP address of your 
on-premises servers to the TXT record.

Red Hat IT says that it is unlikely to be Red Hat's SPF settings, and
suspects that it is instead something caused by whatever Peter is
using to bounce mail from his alias Peter Lieven  to his
Office 365 account.  As I appear to be unable to contact Peter (even
my use of direct email, bypassing the list, and using a personal
account instead of my Red Hat email) about this issue, I'm wondering
if Peter is still an active contributor to the project.

But while typing this email, to see if RBD, iSCSI, and NFS need a new
entry in MAINTAINERS, I did a search through the list archives, where
the last email I found from Peter was
https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg00574.html,
which asked to update MAINTAINERS to his new address, and that has not
made it in so far...

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

Re: [PATCH v9 13/20] virtio-net: Return an error when vhost cannot enable RSS

2024-04-15 Thread Yuri Benditovich

On Wed, Apr 3, 2024 at 2:11 PM Akihiko Odaki  wrote:
>
> vhost requires eBPF for RSS. When eBPF is not available, virtio-net
> implicitly disables RSS even if the user explicitly requests it. Return
> an error instead of implicitly disabling RSS if RSS is requested but not
> available.
>
> Signed-off-by: Akihiko Odaki 
> ---
>  hw/net/virtio-net.c | 97 
> ++---
>  1 file changed, 48 insertions(+), 49 deletions(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 61b49e335dea..3d53eba88cfc 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -793,9 +793,6 @@ static uint64_t virtio_net_get_features(VirtIODevice 
> *vdev, uint64_t features,
>  return features;
>  }
>
> -if (!ebpf_rss_is_loaded(>ebpf_rss)) {
> -virtio_clear_feature(, VIRTIO_NET_F_RSS);
> -}
>  features = vhost_net_get_features(get_vhost_net(nc->peer), features);
>  vdev->backend_features = features;
>
> @@ -3591,6 +3588,50 @@ static bool 
> failover_hide_primary_device(DeviceListener *listener,
>  return qatomic_read(>failover_primary_hidden);
>  }
>
> +static void virtio_net_device_unrealize(DeviceState *dev)
> +{
> +VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> +VirtIONet *n = VIRTIO_NET(dev);
> +int i, max_queue_pairs;
> +
> +if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> +virtio_net_unload_ebpf(n);
> +}
> +
> +/* This will stop vhost backend if appropriate. */
> +virtio_net_set_status(vdev, 0);
> +
> +g_free(n->netclient_name);
> +n->netclient_name = NULL;
> +g_free(n->netclient_type);
> +n->netclient_type = NULL;
> +
> +g_free(n->mac_table.macs);
> +g_free(n->vlans);
> +
> +if (n->failover) {
> +qobject_unref(n->primary_opts);
> +device_listener_unregister(>primary_listener);
> +migration_remove_notifier(>migration_state);
> +} else {
> +assert(n->primary_opts == NULL);
> +}
> +
> +max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
> +for (i = 0; i < max_queue_pairs; i++) {
> +virtio_net_del_queue(n, i);
> +}
> +/* delete also control vq */
> +virtio_del_queue(vdev, max_queue_pairs * 2);
> +qemu_announce_timer_del(>announce_timer, false);
> +g_free(n->vqs);
> +qemu_del_nic(n->nic);
> +virtio_net_rsc_cleanup(n);
> +g_free(n->rss_data.indirections_table);
> +net_rx_pkt_uninit(n->rx_pkt);
> +virtio_cleanup(vdev);
> +}
> +
>  static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>  {
>  VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> @@ -3760,53 +3801,11 @@ static void virtio_net_device_realize(DeviceState 
> *dev, Error **errp)
>
>  net_rx_pkt_init(>rx_pkt);
>
> -if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> -virtio_net_load_ebpf(n);
> -}
> -}
> -
> -static void virtio_net_device_unrealize(DeviceState *dev)
> -{
> -VirtIODevice *vdev = VIRTIO_DEVICE(dev);
> -VirtIONet *n = VIRTIO_NET(dev);
> -int i, max_queue_pairs;
> -
> -if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
> -virtio_net_unload_ebpf(n);
> +if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS) &&
> +!virtio_net_load_ebpf(n) && get_vhost_net(nc->peer)) {
> +virtio_net_device_unrealize(dev);
> +error_setg(errp, "Can't load eBPF RSS for vhost");
>  }

As I already mentioned, I think it is an extremely bad idea to
fail to run QEMU for a reason such as the absence of one feature.
What I suggest is:
1. Redefine rss as tri-state (off|auto|on)
2. Fail to run only if rss is on and not available via ebpf
3. On auto - silently drop it
4. The same with 'hash' option - it is not compatible with vhost (at
least at the moment)
5. Reformat the patch as it is hard to review it due to replacing
entire procedures, i.e. one patch with replacing without changes,
another one - with real changes.
If this is hard to review only for me - please ignore that.

> -
> -/* This will stop vhost backend if appropriate. */
> -virtio_net_set_status(vdev, 0);
> -
> -g_free(n->netclient_name);
> -n->netclient_name = NULL;
> -g_free(n->netclient_type);
> -n->netclient_type = NULL;
> -
> -g_free(n->mac_table.macs);
> -g_free(n->vlans);
> -
> -if (n->failover) {
> -qobject_unref(n->primary_opts);
> -device_listener_unregister(>primary_listener);
> -migration_remove_notifier(>migration_state);
> -} else {
> -assert(n->primary_opts == NULL);
> -}
> -
> -max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
> -for (i = 0; i < max_queue_pairs; i++) {
> -virtio_net_del_queue(n, i);
> -}
> -/* delete also control vq */
> -virtio_del_queue(vdev, max_queue_pairs * 2);
> -qemu_announce_timer_del(>announce_timer, false);
> -g_free(n->vqs);
> -qemu_del_nic(n->nic);
> -

Re: [PATCH v4] target/riscv/kvm/kvm-cpu.c: kvm_riscv_handle_sbi() fail with vendor-specific SBI

2024-04-15 Thread Andrew Jones

On Sat, Apr 13, 2024 at 02:25:26PM +0300, Alexei Filippov wrote:
> kvm_riscv_handle_sbi() may return not supported return code to not trigger
> qemu abort with vendor-specific sbi.
> 
> Added SBI related return code's defines.
> 
> Signed-off-by: Alexei Filippov 
> Fixes: 4eb47125 ("target/riscv: Handle KVM_EXIT_RISCV_SBI exit")
> ---
> 
> Changes since v3:
> -Clear Reviewed-by tags
>  target/riscv/kvm/kvm-cpu.c | 13 +
>  target/riscv/sbi_ecall_interface.h | 12 
>  2 files changed, 17 insertions(+), 8 deletions(-)
>

Reviewed-by: Andrew Jones

Re: [PATCH 02/12] hw/vfio/pci: Replace sprintf() by g_strdup_printf()

2024-04-15 Thread Cédric Le Goater


On 4/12/24 17:25, Alex Williamson wrote:

On Wed, 10 Apr 2024 18:06:03 +0200
Philippe Mathieu-Daudé  wrote:


sprintf() is deprecated on Darwin since macOS 13.0 / XCode 14.1,
resulting in a painful developer experience. Use g_strdup_printf()
instead.


Isn't this code only compiled for Linux hosts?  


It is not.


Maybe still a valid change, but the rationale seems irrelevant.


I agree the commit log should be rephrased.

There is also a v2 doing a different change :

  https://lore.kernel.org/qemu-devel/20240411101550.99392-1-phi...@linaro.org/

This is a bit confusing.

Thanks,

C.

Re: secure boot & direct kernel load (was: Re: [PATCH] x86/loader: only patch linux kernels)

2024-04-15 Thread Daniel P . Berrangé

On Mon, Apr 15, 2024 at 03:30:32PM +0200, Gerd Hoffmann wrote:
>   Hi,
> 
> > > Options I see:
> > > 
> > >   (a) Stop using direct kernel boot, let virt-install & other tools
> > >   create vfat boot media with shim+kernel+initrd instead.
> > > 
> > >   (b) Enroll the distro signing keys in the efi variable store, so
> > >   booting the kernel without shim.efi works.
> > > 
> > >   (c) Add support for loading shim to qemu (and ovmf), for example
> > >   with a new '-shim' command line option which stores shim.efi
> > >   in some new fw_cfg file.
> > 
> > > The problem with this is that now virt-install has to actually
> > > find the correct shim.efi binary. It is already somewhat hard
> > > to find a suitable kernel+initrd binary, and AFAIK, the places
> > where we get these binaries don't have shim.efi alongside.
> > 
> > eg for RHEL/Fedora we grab kernel+initrd from the pxeboot dir:
> > 
> >   
> > https://fedora.mirrorservice.org/fedora/linux/development/rawhide/Everything/x86_64/os/images/pxeboot/
> 
> shim is 
> https://fedora.mirrorservice.org/fedora/linux/development/rawhide/Everything/x86_64/os/EFI/BOOT/BOOTX64.EFI
> 
> > In various forums we have discussed adding the secureboot
> > certs to the libosinfo database, so that we can have a
> > customized EFI varstore with minimized certs, even for the
> > ISO / HDD boot scenario.
> 
> Well.  It's not that easy unfortunately.  At least the "minimized certs"
> part.  shim often is signed with the microsoft keys only, so you can't
> drop that without rendering the install.iso unbootable.
> 
> Only adding the distro certs without removing the microsoft certs works
> of course.

In that scenario libosinfo would report that the given OS
requires both the microsoft & $distro certs to be
enrolled.

Only if shim were signed by the $distro certs, would
libosinfo omit reporting the microsoft certs.

Basically libosinfo would have to report whatever set
of 'n' certs are required to make boot work.


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v2 10/10] vfio: Pass HostIOMMUDevice to vIOMMU

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

With HostIOMMUDevice passed, vIOMMU can check compatibility with host
IOMMU, call into IOMMUFD specific methods, etc.

Originally-by: Yi Liu 
Signed-off-by: Nicolin Chen 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 


LGTM, waiting for v3.


Thanks,

C.





---
  hw/vfio/pci.c | 20 +++-
  1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 64780d1b79..224501a86e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3111,11 +3111,17 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  
  vfio_bars_register(vdev);
  
-ret = vfio_add_capabilities(vdev, errp);

+ret = pci_device_set_iommu_device(pdev, vbasedev->hiod, errp);
  if (ret) {
+error_prepend(errp, "Failed to set iommu_device: ");
  goto out_teardown;
  }
  
+ret = vfio_add_capabilities(vdev, errp);

+if (ret) {
+goto out_unset_idev;
+}
+
  if (vdev->vga) {
  vfio_vga_quirk_setup(vdev);
  }
@@ -3132,7 +3138,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  error_setg(errp,
 "cannot support IGD OpRegion feature on hotplugged "
 "device");
-goto out_teardown;
+goto out_unset_idev;
  }
  
  ret = vfio_get_dev_region_info(vbasedev,

@@ -3141,13 +3147,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  if (ret) {
  error_setg_errno(errp, -ret,
   "does not support requested IGD OpRegion 
feature");
-goto out_teardown;
+goto out_unset_idev;
  }
  
  ret = vfio_pci_igd_opregion_init(vdev, opregion, errp);

  g_free(opregion);
  if (ret) {
-goto out_teardown;
+goto out_unset_idev;
  }
  }
  
@@ -3233,6 +3239,8 @@ out_deregister:

  if (vdev->intx.mmap_timer) {
  timer_free(vdev->intx.mmap_timer);
  }
+out_unset_idev:
+pci_device_unset_iommu_device(pdev);
  out_teardown:
  vfio_teardown_msi(vdev);
  vfio_bars_exit(vdev);
@@ -3261,6 +3269,7 @@ static void vfio_instance_finalize(Object *obj)
  static void vfio_exitfn(PCIDevice *pdev)
  {
  VFIOPCIDevice *vdev = VFIO_PCI(pdev);
+VFIODevice *vbasedev = >vbasedev;
  
  vfio_unregister_req_notifier(vdev);

  vfio_unregister_err_notifier(vdev);
@@ -3275,7 +3284,8 @@ static void vfio_exitfn(PCIDevice *pdev)
  vfio_teardown_msi(vdev);
  vfio_pci_disable_rp_atomics(vdev);
  vfio_bars_exit(vdev);
-vfio_migration_exit(>vbasedev);
+vfio_migration_exit(vbasedev);
+pci_device_unset_iommu_device(pdev);
  }
  
  static void vfio_pci_reset(DeviceState *dev)

Re: [PATCH v2 09/10] hw/pci: Introduce pci_device_set/unset_iommu_device()

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

From: Yi Liu 

This adds pci_device_set/unset_iommu_device() to set/unset
HostIOMMUDevice for a given PCI device. Caller of set
should fail if set operation fails.

Extract out pci_device_get_iommu_bus_devfn() to facilitate


I would separate this change in a prereq patch.


Thanks,

C.



implementation of pci_device_set/unset_iommu_device().

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Nicolin Chen 
Signed-off-by: Zhenzhong Duan 
---
  include/hw/pci/pci.h | 40 ++-
  hw/pci/pci.c | 75 ++--
  2 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index eaa3fc99d8..4ae7fe6f3f 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -3,6 +3,7 @@
  
  #include "exec/memory.h"

  #include "sysemu/dma.h"
+#include "sysemu/host_iommu_device.h"
  
  /* PCI includes legacy ISA access.  */

  #include "hw/isa/isa.h"
@@ -383,10 +384,47 @@ typedef struct PCIIOMMUOps {
   *
   * @devfn: device and function number
   */
-   AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+AddressSpace * (*get_address_space)(PCIBus *bus, void *opaque, int devfn);
+/**
+ * @set_iommu_device: attach a HostIOMMUDevice to a vIOMMU
+ *
+ * Optional callback, if not implemented in vIOMMU, then vIOMMU can't
+ * retrieve host information from the associated HostIOMMUDevice.
+ *
+ * Return true if HostIOMMUDevice is attached, or else return false
+ * with errp set.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ *
+ * @dev: the data structure representing host IOMMU device.
+ *
+ * @errp: pass an Error out only when return false
+ *
+ */
+int (*set_iommu_device)(PCIBus *bus, void *opaque, int devfn,
+HostIOMMUDevice *dev, Error **errp);
+/**
+ * @unset_iommu_device: detach a HostIOMMUDevice from a vIOMMU
+ *
+ * Optional callback.
+ *
+ * @bus: the #PCIBus of the PCI device.
+ *
+ * @opaque: the data passed to pci_setup_iommu().
+ *
+ * @devfn: device and function number of the PCI device.
+ */
+void (*unset_iommu_device)(PCIBus *bus, void *opaque, int devfn);
  } PCIIOMMUOps;
  
  AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);

+int pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod,
+Error **errp);
+void pci_device_unset_iommu_device(PCIDevice *dev);
  
  /**

   * pci_setup_iommu: Initialize specific IOMMU handlers for a PCIBus
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e7a39cb203..8ece617673 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2648,11 +2648,27 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
  }
  }
  
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)

+/*
+ * Get IOMMU root bus, aliased bus and devfn of a PCI device
+ *
+ * IOMMU root bus is needed by all call sites to call into iommu_ops.
+ * For call sites which don't need aliased BDF, passing NULL to
+ * aliased_[bus/devfn] is allowed.
+ *
+ * @piommu_bus: return root #PCIBus backed by an IOMMU for the PCI device.
+ *
+ * @aliased_bus: return aliased #PCIBus of the PCI device, optional.
+ *
+ * @aliased_devfn: return aliased devfn of the PCI device, optional.
+ */
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+   PCIBus **piommu_bus,
+   PCIBus **aliased_bus,
+   int *aliased_devfn)
  {
  PCIBus *bus = pci_get_bus(dev);
  PCIBus *iommu_bus = bus;
-uint8_t devfn = dev->devfn;
+int devfn = dev->devfn;
  
  while (iommu_bus && !iommu_bus->iommu_ops && iommu_bus->parent_dev) {

  PCIBus *parent_bus = pci_get_bus(iommu_bus->parent_dev);
@@ -2693,13 +2709,66 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
  
  iommu_bus = parent_bus;

  }
-if (!pci_bus_bypass_iommu(bus) && iommu_bus->iommu_ops) {
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+assert(iommu_bus);
+
+if (pci_bus_bypass_iommu(bus) || !iommu_bus->iommu_ops) {
+iommu_bus = NULL;
+}
+
+*piommu_bus = iommu_bus;
+
+if (aliased_bus) {
+*aliased_bus = bus;
+}
+
+if (aliased_devfn) {
+*aliased_devfn = devfn;
+}
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+PCIBus *iommu_bus;
+int devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn);
+if (iommu_bus) {
  return iommu_bus->iommu_ops->get_address_space(bus,
   iommu_bus->iommu_opaque, devfn);
  }
  return

Re: secure boot & direct kernel load (was: Re: [PATCH] x86/loader: only patch linux kernels)

2024-04-15 Thread Gerd Hoffmann

  Hi,

> > Options I see:
> > 
> >   (a) Stop using direct kernel boot, let virt-install & other tools
> >   create vfat boot media with shim+kernel+initrd instead.
> > 
> >   (b) Enroll the distro signing keys in the efi variable store, so
> >   booting the kernel without shim.efi works.
> > 
> >   (c) Add support for loading shim to qemu (and ovmf), for example
> >   with a new '-shim' command line option which stores shim.efi
> >   in some new fw_cfg file.
> 
> The problem with this is that now virt-install  has to actually
> find the correct a shim.efi binary. It is already somewhat hard
> to find a suitable kerenl+initrd binary, and AFAIK, the places
> where we get these binaries don't have shim.efi alongside.
> 
> eg for RHEL/Fedora we grab kernel+initrd from the pxeboot dir:
> 
>   
> https://fedora.mirrorservice.org/fedora/linux/development/rawhide/Everything/x86_64/os/images/pxeboot/

shim is 
https://fedora.mirrorservice.org/fedora/linux/development/rawhide/Everything/x86_64/os/EFI/BOOT/BOOTX64.EFI

> In various forums we have discussed adding the secureboot
> certs to the libosinfo database, so that we can have a
> customized EFI varstore with minimized certs, even for the
> ISO / HDD boot scenario.

Well.  It's not that easy unfortunately.  At least the "minimized certs"
part.  shim often is signed with the microsoft keys only, so you can't
drop that without rendering the install.iso unbootable.

Only adding the distro certs without removing the microsoft certs works
of course.

> If we do that, then (b) is trivial for direct kernel boot too.

Yep.

> (b) kills all birds with the same stone :-)

See above.  I'd love this being true but it is not.

> > (b) + (c) both require a fix for the patching issue.  The options
> > I see here are:
> > 
> >   (A) Move the patching from qemu to the linuxboot option rom.
> >   Strictly speaking it belongs there anyway.  It doesn't look
> >   that easy though, for qemu it is easier to gather all
> >   information needed ...
> > 
> >   (B) Provide both patched and unpatched setup header, so the
> >   guest can choose what it needs.
> > 
> >   (C) When implementing (c) above we can piggyback on the -shim
> >   switch and skip patching in case it is present.
> > 
> >   (D) Add a flag to skip the patching.
> > 
> > Comments?  Other/better ideas?
> 
> I guess (b) + (D) is probably my preference.

I prefer (B) over (D) because that doesn't require a new config option
(which probably needs support in libvirt and possibly higher up in the
management stack too ...).

Patch series implementing (B) and the -shim switch:
https://lore.kernel.org/qemu-devel/20240411094830.1337658-1-kra...@redhat.com/

Using -shim is optional, so it's up to virt-install whether it wants to go
for (b) or (c).

take care,
  Gerd

Re: [PATCH v2 08/10] vfio: Create host IOMMU device instance

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

Create host IOMMU device instance and initialize it based on backend.

Signed-off-by: Zhenzhong Duan 
---
  include/hw/vfio/vfio-common.h | 1 +
  hw/vfio/container.c   | 5 +
  hw/vfio/iommufd.c | 8 
  3 files changed, 14 insertions(+)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index d382b12ec1..4fbba85018 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -126,6 +126,7 @@ typedef struct VFIODevice {
  OnOffAuto pre_copy_dirty_page_tracking;
  bool dirty_pages_supported;
  bool dirty_tracking;
+HostIOMMUDevice *hiod;
  int devid;
  IOMMUFDBackend *iommufd;
  } VFIODevice;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index ba0ad4a41b..fc0c027501 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -915,6 +915,7 @@ static int vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
  VFIODevice *vbasedev_iter;
  VFIOGroup *group;
  VFIOContainerBase *bcontainer;
+HIODLegacyVFIO *hiod_vfio;


s/hiod_vfio/hiod/ please. Same below.


Thanks,

C.




  int ret;
  
  if (groupid < 0) {

@@ -945,6 +946,9 @@ static int vfio_legacy_attach_device(const char *name, 
VFIODevice *vbasedev,
  vbasedev->bcontainer = bcontainer;
  QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
  QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
+hiod_vfio = HIOD_LEGACY_VFIO(object_new(TYPE_HIOD_LEGACY_VFIO));
+hiod_vfio->vdev = vbasedev;
+vbasedev->hiod = HOST_IOMMU_DEVICE(hiod_vfio);
  
  return ret;

  }
@@ -959,6 +963,7 @@ static void vfio_legacy_detach_device(VFIODevice *vbasedev)
  trace_vfio_detach_device(vbasedev->name, group->groupid);
  vfio_put_base_device(vbasedev);
  vfio_put_group(group);
+object_unref(vbasedev->hiod);
  }
  
  static int vfio_legacy_pci_hot_reset(VFIODevice *vbasedev, bool single)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 115b9f8e7f..b6d058339b 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -308,6 +308,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice 
*vbasedev,
  VFIOIOMMUFDContainer *container;
  VFIOAddressSpace *space;
  struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+HIODIOMMUFDVFIO *hiod_vfio;
  int ret, devfd;
  uint32_t ioas_id;
  Error *err = NULL;
@@ -431,6 +432,12 @@ found_container:
  QLIST_INSERT_HEAD(&bcontainer->device_list, vbasedev, container_next);
  QLIST_INSERT_HEAD(&vfio_device_list, vbasedev, global_next);
  
+hiod_vfio = HIOD_IOMMUFD_VFIO(object_new(TYPE_HIOD_IOMMUFD_VFIO));

+hiod_iommufd_init(HIOD_IOMMUFD(hiod_vfio), vbasedev->iommufd,
+  vbasedev->devid);
+hiod_vfio->vdev = vbasedev;
+vbasedev->hiod = HOST_IOMMU_DEVICE(hiod_vfio);
+
  trace_iommufd_cdev_device_info(vbasedev->name, devfd, vbasedev->num_irqs,
 vbasedev->num_regions, vbasedev->flags);
  return 0;
@@ -468,6 +475,7 @@ static void iommufd_cdev_detach(VFIODevice *vbasedev)
  iommufd_cdev_detach_container(vbasedev, container);
  iommufd_cdev_container_destroy(container);
  vfio_put_address_space(space);
+object_unref(vbasedev->hiod);
  
  iommufd_cdev_unbind_and_disconnect(vbasedev);

  close(vbasedev->fd);

Re: [PATCH v2 07/10] backends/iommufd: Implement get_host_iommu_info() callback

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

It calls iommufd_backend_get_device_info() to get host IOMMU
related information.

Define a common structure HIOD_IOMMUFD_INFO to describe the info
returned from kernel. Currently only vtd, but easy to add arm smmu
when kernel supports.


I think you can merge the previous patch and this one.
 


Signed-off-by: Zhenzhong Duan 
---
  include/sysemu/iommufd.h |  7 +++
  backends/iommufd.c   | 17 +
  2 files changed, 24 insertions(+)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index fa1a866237..44ec1335b2 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h


I just noticed that include/sysemu/iommufd.h lacks a header.  Could you fix
that please ?


@@ -39,6 +39,13 @@ int iommufd_backend_get_device_info(IOMMUFDBackend *be, 
uint32_t devid,
  enum iommu_hw_info_type *type,
  void *data, uint32_t len, Error **errp);
  
+typedef struct HIOD_IOMMUFD_INFO {


Please use CamelCase names.


Thanks,

C.



+enum iommu_hw_info_type type;
+union {
+struct iommu_hw_info_vtd vtd;
+} data;
+} HIOD_IOMMUFD_INFO;
+
  #define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"
  OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
  
diff --git a/backends/iommufd.c b/backends/iommufd.c

index 559affa9ec..1e9c469e65 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -240,8 +240,25 @@ void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend 
*iommufd,
  idev->devid = devid;
  }
  
+static int hiod_iommufd_get_host_iommu_info(HostIOMMUDevice *hiod,

+void *data, uint32_t len,
+Error **errp)
+{
+HIODIOMMUFD *idev = HIOD_IOMMUFD(hiod);
+HIOD_IOMMUFD_INFO *info = data;
+
+assert(sizeof(HIOD_IOMMUFD_INFO) <= len);
+
+return iommufd_backend_get_device_info(idev->iommufd, idev->devid,
+   &info->type, &info->data,
+   sizeof(info->data), errp);
+}
+
  static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
  {
+HostIOMMUDeviceClass *hiodc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hiodc->get_host_iommu_info = hiod_iommufd_get_host_iommu_info;
  }
  
  static const TypeInfo types[] = {

Re: [PATCH v2 06/10] backends/iommufd: Introduce helper function iommufd_backend_get_device_info()

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

Introduce a helper function iommufd_backend_get_device_info() to get
host IOMMU related information through iommufd uAPI.

Signed-off-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
  include/sysemu/iommufd.h |  4 
  backends/iommufd.c   | 23 ++-
  2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 71c53cbb45..fa1a866237 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
  #include "qom/object.h"
  #include "exec/hwaddr.h"
  #include "exec/cpu-common.h"
+#include <linux/iommufd.h>
  #include "sysemu/host_iommu_device.h"
  
  #define TYPE_IOMMUFD_BACKEND "iommufd"

@@ -34,6 +35,9 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
  ram_addr_t size, void *vaddr, bool readonly);
  int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, ram_addr_t size);
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,
+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp);
  
  #define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"

  OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
diff --git a/backends/iommufd.c b/backends/iommufd.c
index ef8b3a808b..559affa9ec 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -20,7 +20,6 @@
  #include "monitor/monitor.h"
  #include "trace.h"
  #include <sys/ioctl.h>
-#include <linux/iommufd.h>
  
  static void iommufd_backend_init(Object *obj)

  {
@@ -212,6 +211,28 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
  return ret;
  }
  
+int iommufd_backend_get_device_info(IOMMUFDBackend *be, uint32_t devid,

+enum iommu_hw_info_type *type,
+void *data, uint32_t len, Error **errp)
+{
+struct iommu_hw_info info = {
+.size = sizeof(info),
+.dev_id = devid,
+.data_len = len,
+.data_uptr = (uintptr_t)data,
+};
+int ret;
+
+ret = ioctl(be->fd, IOMMU_GET_HW_INFO, &info);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to get hardware info");
+} else {
+*type = info.out_data_type;


type should not be NULL.

+}
+
+return ret;
+}
+
  void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
 uint32_t devid)
  {

Re: [PATCH v2 05/10] vfio: Implement get_host_iommu_info() callback

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

Utilize iova_ranges to calculate host IOMMU address width and
package it in HIOD_LEGACY_INFO for vIOMMU usage.

HIOD_LEGACY_INFO will be used by both VFIO and VDPA so declare
it in host_iommu_device.h.

Signed-off-by: Zhenzhong Duan 
---
  include/sysemu/host_iommu_device.h | 10 ++
  hw/vfio/container.c| 24 
  2 files changed, 34 insertions(+)

diff --git a/include/sysemu/host_iommu_device.h 
b/include/sysemu/host_iommu_device.h
index 22ccbe3a5d..beb8be8231 100644
--- a/include/sysemu/host_iommu_device.h
+++ b/include/sysemu/host_iommu_device.h
@@ -16,4 +16,14 @@ struct HostIOMMUDeviceClass {
  int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data, uint32_t 
len,
 Error **errp);
  };
+
+/*
+ * Define the format of host IOMMU related info that current VFIO
+ * or VDPA can privode to vIOMMU.
+ *
+ * @aw_bits: Host IOMMU address width. 0xff if no limitation.
+ */
+typedef struct HIOD_LEGACY_INFO {


Please use CamelCase names.


+uint8_t aw_bits;
+} HIOD_LEGACY_INFO;
  #endif
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 44018ef085..ba0ad4a41b 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,8 +1143,32 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
  vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
  };
  
+static int hiod_legacy_vfio_get_host_iommu_info(HostIOMMUDevice *hiod,

+void *data, uint32_t len,
+Error **errp)
+{
+VFIODevice *vbasedev = HIOD_LEGACY_VFIO(hiod)->vdev;
+/* iova_ranges is a sorted list */
+GList *l = g_list_last(vbasedev->bcontainer->iova_ranges);
+HIOD_LEGACY_INFO *info = data;
+
+assert(sizeof(HIOD_LEGACY_INFO) <= len);
+
+if (l) {
+Range *range = l->data;
+info->aw_bits = find_last_bit(&range->upb, BITS_PER_LONG) + 1;


There is a comment in range.h saying:

/*
 * Do not access members directly, use the functions!

Please introduce a new helper.


Thanks,

C.




+} else {
+info->aw_bits = 0xff;
+}
+
+return 0;
+}
+
  static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)
  {
+HostIOMMUDeviceClass *hioc = HOST_IOMMU_DEVICE_CLASS(oc);
+
+hioc->get_host_iommu_info = hiod_legacy_vfio_get_host_iommu_info;
  };
  
  static const TypeInfo types[] = {

Re: [PATCH v2 03/10] backends/iommufd: Introduce abstract HIODIOMMUFD device

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

HIODIOMMUFD represents a host IOMMU device under iommufd backend.

Currently it includes only the public iommufd handle and device id,
which could be used to get hw IOMMU information.

When nested translation is supported in future, vIOMMU is going
to have iommufd related operations like attaching/detaching hwpt,
So IOMMUFDDevice interface will be further extended at that time.

VFIO and VDPA device have different way of attaching/detaching hwpt.
So HIODIOMMUFD is still an abstract class which will be inherited by
VFIO and VDPA device.

Introduce a helper hiod_iommufd_init() to initialize HIODIOMMUFD
device.

Suggested-by: Cédric Le Goater 
Originally-by: Yi Liu 
Signed-off-by: Yi Sun 
Signed-off-by: Zhenzhong Duan 
---
  include/sysemu/iommufd.h | 22 +++
  backends/iommufd.c   | 47 ++--
  2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 9af27ebd6c..71c53cbb45 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -4,6 +4,7 @@
  #include "qom/object.h"
  #include "exec/hwaddr.h"
  #include "exec/cpu-common.h"
+#include "sysemu/host_iommu_device.h"
  
  #define TYPE_IOMMUFD_BACKEND "iommufd"

  OBJECT_DECLARE_TYPE(IOMMUFDBackend, IOMMUFDBackendClass, IOMMUFD_BACKEND)
@@ -33,4 +34,25 @@ int iommufd_backend_map_dma(IOMMUFDBackend *be, uint32_t 
ioas_id, hwaddr iova,
  ram_addr_t size, void *vaddr, bool readonly);
  int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t ioas_id,
hwaddr iova, ram_addr_t size);
+
+#define TYPE_HIOD_IOMMUFD TYPE_HOST_IOMMU_DEVICE "-iommufd"


Please keep TYPE_HOST_IOMMU_DEVICE


+OBJECT_DECLARE_TYPE(HIODIOMMUFD, HIODIOMMUFDClass, HIOD_IOMMUFD)
+
+struct HIODIOMMUFD {
+/*< private >*/
+HostIOMMUDevice parent;
+void *opaque;
+
+/*< public >*/
+IOMMUFDBackend *iommufd;
+uint32_t devid;
+};
+
+struct HIODIOMMUFDClass {
+/*< private >*/
+HostIOMMUDeviceClass parent_class;
+};


This new class doesn't seem useful. Do you have plans for handlers ?


+
+void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
+   uint32_t devid);
  #endif
diff --git a/backends/iommufd.c b/backends/iommufd.c
index 62a79fa6b0..ef8b3a808b 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -212,23 +212,38 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, 
uint32_t ioas_id,
  return ret;
  }
  
-static const TypeInfo iommufd_backend_info = {

-.name = TYPE_IOMMUFD_BACKEND,
-.parent = TYPE_OBJECT,
-.instance_size = sizeof(IOMMUFDBackend),
-.instance_init = iommufd_backend_init,
-.instance_finalize = iommufd_backend_finalize,
-.class_size = sizeof(IOMMUFDBackendClass),
-.class_init = iommufd_backend_class_init,
-.interfaces = (InterfaceInfo[]) {
-{ TYPE_USER_CREATABLE },
-{ }
-}
-};
+void hiod_iommufd_init(HIODIOMMUFD *idev, IOMMUFDBackend *iommufd,
+   uint32_t devid)
+{
+idev->iommufd = iommufd;
+idev->devid = devid;
+}


This routine doesn't seem useful. I wonder if we shouldn't introduce
properties. I'm not sure this is useful either.



-static void register_types(void)
+static void hiod_iommufd_class_init(ObjectClass *oc, void *data)
  {
-type_register_static(&iommufd_backend_info);
  }
  
-type_init(register_types);

+static const TypeInfo types[] = {
+{
+.name = TYPE_IOMMUFD_BACKEND,
+.parent = TYPE_OBJECT,
+.instance_size = sizeof(IOMMUFDBackend),
+.instance_init = iommufd_backend_init,
+.instance_finalize = iommufd_backend_finalize,
+.class_size = sizeof(IOMMUFDBackendClass),
+.class_init = iommufd_backend_class_init,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_USER_CREATABLE },
+{ }
+}
+}, {
+.name = TYPE_HIOD_IOMMUFD,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HIODIOMMUFD),
+.class_size = sizeof(HIODIOMMUFDClass),
+.class_init = hiod_iommufd_class_init,
+.abstract = true,
+}
+};
+
+DEFINE_TYPES(types)

Re: [PATCH 08/20] target/arm: Rename arm_cpu_mp_affinity

2024-04-15 Thread Peter Maydell

On Thu, 18 Jan 2024 at 20:07, Philippe Mathieu-Daudé  wrote:
>
> From: Richard Henderson 
>
> Rename to arm_build_mp_affinity.  This frees up the name for
> other usage, and emphasizes that the cpu object is not involved.
>
> Signed-off-by: Richard Henderson 
> Signed-off-by: Philippe Mathieu-Daudé 

Hi; I just had cause to re-look at this commit. The problem
here is that to determine the right MP affinity value the
CPU *should* be involved. This is because for some Arm
CPUs the MPIDR MT bit is 0, and the CPU number is in Aff0,
but for others the MPIDR MT bit is 1, and the CPU number is
in Aff1, and Aff0 is 0. We don't currently model the latter
CPUs correctly, treating everything as MT=0, but we should.
So really arm_build_mp_affinity() ought to take a CPU
argument (which is awkward because we don't always have one
to hand at the callsite)...

thanks
-- PMM

Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

2024-04-15 Thread Cédric Le Goater


On 4/8/24 10:12, Zhenzhong Duan wrote:

HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
container backend.

It includes a link to VFIODevice.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
  include/hw/vfio/vfio-common.h | 11 +++
  hw/vfio/container.c   | 11 ++-
  2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index b9da6c08ef..f30772f534 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
  #endif
  #include "sysemu/sysemu.h"
  #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"
  
  #define VFIO_MSG_PREFIX "vfio %s: "
  
@@ -147,6 +148,16 @@ typedef struct VFIOGroup {

  bool ram_block_discard_allowed;
  } VFIOGroup;
  
+#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-vfio"


I would prefer to keep the prefix TYPE_HOST_IOMMU_DEVICE.


+OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
+
+/* Abstraction of VFIO legacy host IOMMU device */
+struct HIODLegacyVFIO {


same here


+/*< private >*/
+HostIOMMUDevice parent;
+VFIODevice *vdev;


It seems to me that the back pointer should be on the container instead.
Looks more correct conceptually.



+};
+
  typedef struct VFIODMABuf {
  QemuDmaBuf buf;
  uint32_t pos_x, pos_y, pos_updates;
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 77bdec276e..44018ef085 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -1143,12 +1143,21 @@ static void vfio_iommu_legacy_class_init(ObjectClass 
*klass, void *data)
  vioc->pci_hot_reset = vfio_legacy_pci_hot_reset;
  };
  
+static void hiod_legacy_vfio_class_init(ObjectClass *oc, void *data)

+{
+};


It is preferable to introduce routines when they are actually useful.
Please drop the .class_init definition.

Thanks,

C.



+
  static const TypeInfo types[] = {
  {
  .name = TYPE_VFIO_IOMMU_LEGACY,
  .parent = TYPE_VFIO_IOMMU,
  .class_init = vfio_iommu_legacy_class_init,
-},
+}, {
+.name = TYPE_HIOD_LEGACY_VFIO,
+.parent = TYPE_HOST_IOMMU_DEVICE,
+.instance_size = sizeof(HIODLegacyVFIO),
+.class_init = hiod_legacy_vfio_class_init,
+}
  };
  
  DEFINE_TYPES(types)

RE: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

2024-04-15 Thread Duan, Zhenzhong



>-Original Message-
>From: Philippe Mathieu-Daudé 
>Subject: Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device
>
>On 15/4/24 12:10, Duan, Zhenzhong wrote:
>> Hi Philippe,
>>
>>> -Original Message-
>>> From: Philippe Mathieu-Daudé 
>>> Sent: Monday, April 15, 2024 5:20 PM
>>> To: Duan, Zhenzhong ; qemu-
>>> de...@nongnu.org
>>> Cc: alex.william...@redhat.com; c...@redhat.com;
>eric.au...@redhat.com;
>>> pet...@redhat.com; jasow...@redhat.com; m...@redhat.com;
>>> j...@nvidia.com; nicol...@nvidia.com; joao.m.mart...@oracle.com; Tian,
>>> Kevin ; Liu, Yi L ; Peng, Chao P
>>> 
>>> Subject: Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device
>>>
>>> On 8/4/24 10:12, Zhenzhong Duan wrote:
 HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
 container backend.

 It includes a link to VFIODevice.

 Suggested-by: Eric Auger 
 Suggested-by: Cédric Le Goater 
 Signed-off-by: Zhenzhong Duan 
 ---
include/hw/vfio/vfio-common.h | 11 +++
hw/vfio/container.c   | 11 ++-
2 files changed, 21 insertions(+), 1 deletion(-)

 diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>>> common.h
 index b9da6c08ef..f30772f534 100644
 --- a/include/hw/vfio/vfio-common.h
 +++ b/include/hw/vfio/vfio-common.h
 @@ -31,6 +31,7 @@
#endif
#include "sysemu/sysemu.h"
#include "hw/vfio/vfio-container-base.h"
 +#include "sysemu/host_iommu_device.h"

#define VFIO_MSG_PREFIX "vfio %s: "

 @@ -147,6 +148,16 @@ typedef struct VFIOGroup {
bool ram_block_discard_allowed;
} VFIOGroup;

 +#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-
>legacy-
>>> vfio"
 +OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
 +
 +/* Abstraction of VFIO legacy host IOMMU device */
 +struct HIODLegacyVFIO {
 +/*< private >*/
>>>
>>> Please drop this comment.
>>
>> Will do. But may I ask the rules when to use that comment and when not?
>
>Sure, see
>https://www.qemu.org/docs/master/devel/style.html#qemu-object-model-
>declarations

Learned, thanks Philippe.

BRs.
Zhenzhong

Re: secure boot & direct kernel load (was: Re: [PATCH] x86/loader: only patch linux kernels)

2024-04-15 Thread Daniel P . Berrangé

On Wed, Apr 10, 2024 at 12:35:13PM +0200, Gerd Hoffmann wrote:
> On Wed, Apr 10, 2024 at 03:26:29AM -0400, Michael S. Tsirkin wrote:
> > On Wed, Apr 10, 2024 at 09:21:26AM +0200, Gerd Hoffmann wrote:
> > > If the binary loaded via -kernel is *not* a linux kernel (in which
> > > case protocol == 0), do not patch the linux kernel header fields.
> > > 
> > > It's (a) pointless and (b) might break binaries by random patching
> > > and (c) changes the binary hash which in turn breaks secure boot
> > > verification.
> > > 
> > > Background: OVMF happily loads and runs not only linux kernels but
> > > any efi binary via direct kernel boot.
> > > 
> > > Note: Breaking the secure boot verification is a problem for linux
> > > kernels too, but fixed that is left for another day ...
> > 
> > Um we kind of care about Linux ;)
> > 
> > What's the plan?  I suspect we should just add a command line flag
> > to skip patching? And once we do that, it seems safer to just
> > always rely on the flag?
> 
> Well, there are more problems to solve here than just the patching.  So
> lets have a look at the bigger picture before discussion the details ...
> 
> [ Cc'ing Daniel + Cole ]
> 
> Current state of affairs is that OVMF supports two ways to boot a linux
> kernel:
> 
>  (1) Just load it as EFI binary and boot via linux kernel EFI stub,
>  which is the modern way to load a linux kernel (which is why you
>  can boot not only linux kernels but any efi binary).
> 
>  (2) Use the old EFI handover protocol.  Which is the RHEL-6 era way to
>  boot a linux kernel on EFI.
> 
> For method (1) secure boot verification must pass.  For (2) not.  So if
> you try to use direct kernel boot with secure boot enabled OVMF will
> first try (1), which will fail, then go fallback to (2).
> 
> The reason for the failure is not only the patching, but also the fact
> that the linux kernel is typically verified by shim.efi (and the distro
> keys compiled into the binary) instead of the firmware.
> 
> Going though (2) is not ideal for multiple reasons, so we need some
> strategy how we'll go handle direct kernel load with uefi and secure
> boot in a way that (1) works.
> 
> Options I see:
> 
>   (a) Stop using direct kernel boot, let virt-install & other tools
>   create vfat boot media with shim+kernel+initrd instead.
> 
>   (b) Enroll the distro signing keys in the efi variable store, so
>   booting the kernel without shim.efi works.
> 
>   (c) Add support for loading shim to qemu (and ovmf), for example
>   with a new '-shim' command line option which stores shim.efi
>   in some new fw_cfg file.

The problem with this is that now virt-install has to actually
find the correct shim.efi binary. It is already somewhat hard
to find a suitable kernel+initrd binary, and AFAIK, the places
where we get these binaries don't have shim.efi alongside.

eg for RHEL/Fedora we grab kernel+initrd from the pxeboot dir:

  
https://fedora.mirrorservice.org/fedora/linux/development/rawhide/Everything/x86_64/os/images/pxeboot/

This same problem will affect both options (a) and (c).

In various forums we have discussed adding the secureboot
certs to the libosinfo database, so that we can have a
customized EFI varstore with minimized certs, even for the
ISO / HDD boot scenario. If we do that, then (b) is trivial
for direct kernel boot too. (b) kills all birds with the
same stone :-)

> 
> (b) + (c) both require a fix for the patching issue.  The options
> I see here are:
> 
>   (A) Move the patching from qemu to the linuxboot option rom.
>   Strictly speaking it belongs there anyway.  It doesn't look
>   that easy though, for qemu it is easier to gather all
>   information needed ...
> 
>   (B) Provide both patched and unpatched setup header, so the
>   guest can choose what it needs.
> 
>   (C) When implementing (c) above we can piggyback on the -shim
>   switch and skip patching in case it is present.
> 
>   (D) Add a flag to skip the patching.
> 
> Comments?  Other/better ideas?

I guess (b) + (D) is probably my preference.


With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: Qemu for TC377

2024-04-15 Thread Bastian Koppelmann

Hi Sameer,
On Sun, Apr 14, 2024 at 06:15:56PM +0200, Philippe Mathieu-Daudé wrote:
> Hi Sameer,
> 
> On 13/4/24 14:52, Sameer Kalliadan Poyil wrote:
> > Hello All,
> > I see that Latest qemu supports for tricore TC277 and TC377
> > image.png
> > But when I downloaded source code and checked for TC377 related file , I
> > didn't find anything
> > 
> > I want to run RTOS/bare metal code on TC377 . could you please let me
> > know how to start qemu on TC377 ?
> > Here is the latest version of qemu i have , I didn't download 9.0
> 
> $ qemu-system-tricore -cpu help
> Available CPUs:
>   tc1796
>   tc1797
>   tc27x
>   tc37x
> $
> 
> Try 'qemu-system-tricore -machine KIT_AURIX_TC277_TRB -cpu tc37x',
> this should start a TC377 SoC on an AURIX board (~KIT_A2G_TC377_TRB).

This is the closest you will get to TC377 board. 

I'm not sure if QEMU is the best choice for you, if you want to run an RTOS, as
qemu-system-tricore is lacking:

- peripherals like SCU, SystemTimer that are a bare minimum to run a RTOS

- Simulation of time: When your RTOS runs periodic tasks you might get wrong
  results, as QEMU does not simulate time accurately. The real CPU would
  see time pass differently than QEMU. We make a best guess using the wall time.

I think for now Infineon's TSIM is a better choice, as it does not lack the
points above. However it has significantly less performance compared to QEMU.

If you are only interested in running bare metal software, check out my
'boot_to_main' test [1]. The Makefile [2] shows you how to build it using
tricore-gcc [3] and how to run it in QEMU. Also tricore-gdb [4] might be
interesting for you.

If you have further questions, feel free to ask me.

Cheers,
Bastian

[1] 
https://gitlab.com/qemu-project/qemu/-/blob/master/tests/tcg/tricore/c/test_boot_to_main.c?ref_type=heads
[2] 
https://gitlab.com/qemu-project/qemu/-/blob/master/tests/tcg/tricore/Makefile.softmmu-target?ref_type=heads
[3] https://github.com/bkoppelmann/package_494
[4] https://github.com/volumit/gdb-tricore

Re: Questions about "QEMU gives wrong MPIDR value for Arm CPU types with MT=1" (gitlab issue #1608)

2024-04-15 Thread Peter Maydell

On Sat, 13 Apr 2024 at 20:59, Dorjoy Chowdhury  wrote:
>
> Hi,
> Hope everyone is doing well. I was looking at "Bite Sized" tagged QEMU
> issues in gitlab to see if there is anything that makes sense for me
> as a first time contributor. I see this issue "QEMU gives wrong MPIDR
> value for Arm CPU types with MT=1" (gitlab URL:
> https://gitlab.com/qemu-project/qemu/-/issues/1608 ).
>
> From the bug ticket description, it is very clear that I will need to
> add a bool member variable in the "AarchCPU" struct which is in
> "target/arm/cpu.h" file. I believe the next logical change is to set
> this member variable to "true" in the corresponding arm cpu "initfn"
> functions (a55, a76, neoverse_n1) which are in "target/arm/cpu64.c"
> file. I have a few questions about the following steps as I am looking
> through the code.
>
> 1. I believe I will need to update the "arm_build_mp_affinity"
> function in "target/arm/cpu.c" file to now also take in a bool
> parameter that will indicate if the function should instead put the
> "core index" in the "aff1" bits instead of the existing logic of
> "aff0" bits and the cluster id in the "aff2" bits instead of the
> existing logic of "aff1" bits. But I see this function being invoked
> from 3 other files: "hw/arm/sbsa-ref.c", "hw/arm/virt.c",
> "hw/arm/npcm7xx.c". Should the function calls in these files always
> have that corresponding argument set to "false"?

This bit of the codebase has got a bit more complicated since
I wrote up the bug report. I will look into this and get back
to you, but my suspicion is that these calls must return the
same value that the actual CPU MPIDR affinity values have,
because these values are going to end up in the DTB and ACPI
tables, and the OS will want them to match up with MPIDRs.

> 2. As per the bug ticket description, I will also need to update the
> "mpidr_read_val" function in the "target/arm/helper.c" file to only
> set the MT bit (24th) to 1 if the member variable is true. I think
> there is nothing else to be done in this function apart from checking
> and then setting the MT bit. Is my assumption correct?

Yes, that's right.

> I think doing the above steps should fix the bug and probably we don't
> need anything else. It would be great if someone can help me answer
> the questions or any suggestion would be great if my assumptions are
> wrong. Thanks.

The other thing we need to do is check the TRM (technical reference
manual) for the CPUs that were added since I filed that bug in
April 2023, to see if they need to have the flag set or not. The
ones we need to check are:
 * cortex-a710
 * neoverse-n2
 * neoverse-v1

thanks
-- PMM

Re: Hermetic virtio-vsock in QEMU

2024-04-15 Thread Daniel P . Berrangé

On Wed, Apr 03, 2024 at 02:30:33PM -0700, Roman Kiryanov wrote:
> Hi Peter, Alex and QEMU,
> 
> I work in Android Studio Emulator and we use virtio-vsock to emulate
> devices (e.g. sensors) which live in the Emulator binary. We need to run on
> Windows and in environments without CONFIG_VHOST_VSOCK, that is why we
> cannot use vhost-vsock and invented our implementation. I tried to grep the
> QEMU8 sources and I believe virtio-vsock is not available there.
> 
> Do you think it is a good idea to implement virtio-vsock in QEMU (e.g. to
> run on Windows)? If the answer is yes, could you please point where I could
> start to build an upstreamable solution (not Android Emulator specific)? It
> is not clear to me how we should make the device available for clients
> (sensors, modem, adb, etc) in a generic way.

This issue is proposing the idea of exposing VSOCK using AF_UNIX as the
host backend, in a manner that's compatible with that used by firecracker
and cloud-hypervisor:

  https://gitlab.com/qemu-project/qemu/-/issues/2095

Recent versions of Windows support AF_UNIX these days, so hopefully that
would be satisfactory as an approach ?

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

[PATCH] vhost-user-gpu: fix import of DMABUF

2024-04-15 Thread marcandre . lureau

From: Marc-André Lureau 

When using vhost-user-gpu with GL, qemu -display gtk doesn't show output
and prints: qemu: eglCreateImageKHR failed

Since commit 9ac06df8b ("virtio-gpu-udmabuf: correct naming of
QemuDmaBuf size properties"), egl_dmabuf_import_texture() uses
backing_{width,height} for the texture dimension.

Fixes: commit 9ac06df8b ("virtio-gpu-udmabuf: correct naming of QemuDmaBuf size 
properties")
Signed-off-by: Marc-André Lureau 
---
 hw/display/vhost-user-gpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 709c8a02a1..baffb1c2d4 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -273,8 +273,8 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
VhostUserGpuMsg *msg)
 }
 *dmabuf = (QemuDmaBuf) {
 .fd = fd,
-.width = m->fd_width,
-.height = m->fd_height,
+.backing_width = m->fd_width,
+.backing_height = m->fd_height,
 .stride = m->fd_stride,
 .fourcc = m->fd_drm_fourcc,
 .y0_top = m->fd_flags & VIRTIO_GPU_RESOURCE_FLAG_Y_0_TOP,
-- 
2.41.0.28.gd7d8841f67

[PATCH] vl: fix "type is NULL" in -vga help

2024-04-15 Thread marcandre . lureau

From: Marc-André Lureau 

Don't pass NULL to module_object_class_by_name().

Signed-off-by: Marc-André Lureau 
---
 system/vl.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/system/vl.c b/system/vl.c
index c644222982..23e1cb016f 100644
--- a/system/vl.c
+++ b/system/vl.c
@@ -992,9 +992,16 @@ static bool vga_interface_available(VGAInterfaceType t)
 const VGAInterfaceInfo *ti = _interfaces[t];
 
 assert(t < VGA_TYPE_MAX);
-return !ti->class_names[0] ||
-   module_object_class_by_name(ti->class_names[0]) ||
-   module_object_class_by_name(ti->class_names[1]);
+
+if (!ti->class_names[0] || 
module_object_class_by_name(ti->class_names[0])) {
+return true;
+}
+
+if (ti->class_names[1] && module_object_class_by_name(ti->class_names[1])) 
{
+return true;
+}
+
+return false;
 }
 
 static const char *
-- 
2.41.0.28.gd7d8841f67

Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

2024-04-15 Thread Philippe Mathieu-Daudé


On 15/4/24 12:10, Duan, Zhenzhong wrote:

Hi Philippe,


-Original Message-
From: Philippe Mathieu-Daudé 
Sent: Monday, April 15, 2024 5:20 PM
To: Duan, Zhenzhong ; qemu-
de...@nongnu.org
Cc: alex.william...@redhat.com; c...@redhat.com; eric.au...@redhat.com;
pet...@redhat.com; jasow...@redhat.com; m...@redhat.com;
j...@nvidia.com; nicol...@nvidia.com; joao.m.mart...@oracle.com; Tian,
Kevin ; Liu, Yi L ; Peng, Chao P

Subject: Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

On 8/4/24 10:12, Zhenzhong Duan wrote:

HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
container backend.

It includes a link to VFIODevice.

Suggested-by: Eric Auger 
Suggested-by: Cédric Le Goater 
Signed-off-by: Zhenzhong Duan 
---
   include/hw/vfio/vfio-common.h | 11 +++
   hw/vfio/container.c   | 11 ++-
   2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-

common.h

index b9da6c08ef..f30772f534 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -31,6 +31,7 @@
   #endif
   #include "sysemu/sysemu.h"
   #include "hw/vfio/vfio-container-base.h"
+#include "sysemu/host_iommu_device.h"

   #define VFIO_MSG_PREFIX "vfio %s: "

@@ -147,6 +148,16 @@ typedef struct VFIOGroup {
   bool ram_block_discard_allowed;
   } VFIOGroup;

+#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-

vfio"

+OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
+
+/* Abstraction of VFIO legacy host IOMMU device */
+struct HIODLegacyVFIO {
+/*< private >*/


Please drop this comment.


Will do. But may I ask the rules when to use that comment and when not?


Sure, see 
https://www.qemu.org/docs/master/devel/style.html#qemu-object-model-declarations



I see some QOM use that comment to mark private vs. public, for example:

struct AccelState {
 /*< private >*/
 Object parent_obj;


This is old style which might be cleaned some day...


};

typedef struct AccelClass {
 /*< private >*/
 ObjectClass parent_class;
 /*< public >*/




+HostIOMMUDevice parent;


Please name 'parent_obj'.


Will do.


Thanks,

Phil.



Thanks
Zhenzhong




+VFIODevice *vdev;
+};

Re: [PATCH v2] target/i386: Give IRQs a chance when resetting HF_INHIBIT_IRQ_MASK

2024-04-15 Thread Ruihan Li

Hi Paolo,

On Mon, Apr 15, 2024 at 11:32:51AM +0200, Paolo Bonzini wrote:
> What do you think about writing this:
> 
> >  /* If several instructions disable interrupts, only the first does it. 
> >  */
> >  if (inhibit && !(s->flags & HF_INHIBIT_IRQ_MASK)) {
> >  gen_set_hflag(s, HF_INHIBIT_IRQ_MASK);
> > -} else {
> > +inhibit_reset = false;
> > +} else if (!inhibit && (s->flags & HF_INHIBIT_IRQ_MASK)) {
> >  gen_reset_hflag(s, HF_INHIBIT_IRQ_MASK);
> > +inhibit_reset = true;
> > +} else {
> > +inhibit_reset = false;
> >  }
> 
> in a slightly simpler manner:
> 
> inhibit_reset = false;
> if (s->flags & HF_INHIBIT_IRQ_MASK) {
> gen_reset_hflag(s, HF_INHIBIT_IRQ_MASK);
> inhibit_reset = true;
> } else if (inhibit) {
> gen_set_hflag(s, HF_INHIBIT_IRQ_MASK);
> }

Yes, I agree with you that your changes look a bit clearer. I have
tested your changes and verified that they fix the reported bug.

> No need to submit v3, I can do the change myself when applying.

Thank you for your review. Feel free to do that.

Thanks,
Ruihan Li

[PULL 1/1] virtio-pci: fix use of a released vector

2024-04-15 Thread Michael S. Tsirkin

From: Cindy Lu 

During the booting process of the non-standard image, the behavior of the
called function in qemu is as follows:

1. vhost_net_stop() was triggered by guest image. This will call the function
virtio_pci_set_guest_notifiers() with assign=false;
virtio_pci_set_guest_notifiers() will release the irqfd for vector 0

2. virtio_reset() was triggered; this will set the configure vector to
VIRTIO_NO_VECTOR

3. vhost_net_start() was called (at this time, the configure vector is
still VIRTIO_NO_VECTOR) and then called virtio_pci_set_guest_notifiers() with
assign=true, so the irqfd for vector 0 is still not initialized during this process

4. The system continues to boot and sets the vector back to 0. After that,
msix_fire_vector_notifier() was triggered to unmask vector 0 and hit the
crash

To fix the issue, we need to support changing the vector after 
VIRTIO_CONFIG_S_DRIVER_OK is set.

(gdb) bt
0  __pthread_kill_implementation (threadid=<optimized out>,
signo=signo@entry=6, no_tid=no_tid@entry=0)
at pthread_kill.c:44
1  0x7fc87148ec53 in __pthread_kill_internal (signo=6, threadid=<optimized out>) at pthread_kill.c:78
2  0x7fc87143e956 in __GI_raise (sig=sig@entry=6) at 
../sysdeps/posix/raise.c:26
3  0x7fc8714287f4 in __GI_abort () at abort.c:79
4  0x7fc87142871b in __assert_fail_base
(fmt=0x7fc8715bbde0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", 
assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d 
"../accel/kvm/kvm-all.c", line=1837, function=) at assert.c:92
5  0x7fc871437536 in __GI___assert_fail
(assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d 
"../accel/kvm/kvm-all.c", line=1837, function=0x5606413f06f0 
<__PRETTY_FUNCTION__.19> "kvm_irqchip_commit_routes") at assert.c:101
6  0x560640f884b5 in kvm_irqchip_commit_routes (s=0x560642cae1f0) at 
../accel/kvm/kvm-all.c:1837
7  0x560640c98f8e in virtio_pci_one_vector_unmask
(proxy=0x560643c65f00, queue_no=4294967295, vector=0, msg=..., 
n=0x560643c6e4c8)
at ../hw/virtio/virtio-pci.c:1005
8  0x560640c99201 in virtio_pci_vector_unmask (dev=0x560643c65f00, 
vector=0, msg=...)
at ../hw/virtio/virtio-pci.c:1070
9  0x560640bc402e in msix_fire_vector_notifier (dev=0x560643c65f00, 
vector=0, is_masked=false)
at ../hw/pci/msix.c:120
10 0x560640bc40f1 in msix_handle_mask_update (dev=0x560643c65f00, vector=0, 
was_masked=true)
at ../hw/pci/msix.c:140
11 0x560640bc4503 in msix_table_mmio_write (opaque=0x560643c65f00, addr=12, 
val=0, size=4)
at ../hw/pci/msix.c:231
12 0x560640f26d83 in memory_region_write_accessor
(mr=0x560643c66540, addr=12, value=0x7fc86b7bc628, size=4, shift=0, 
mask=4294967295, attrs=...)
at ../system/memory.c:497
13 0x560640f270a6 in access_with_adjusted_size

 (addr=12, value=0x7fc86b7bc628, size=4, access_size_min=1, 
access_size_max=4, access_fn=0x560640f26c8d , 
mr=0x560643c66540, attrs=...) at ../system/memory.c:573
14 0x560640f2a2b5 in memory_region_dispatch_write (mr=0x560643c66540, 
addr=12, data=0, op=MO_32, attrs=...)
at ../system/memory.c:1521
15 0x560640f37bac in flatview_write_continue
(fv=0x7fc65805e0b0, addr=4273803276, attrs=..., ptr=0x7fc871e9c028, len=4, 
addr1=12, l=4, mr=0x560643c66540)
at ../system/physmem.c:2714
16 0x560640f37d0f in flatview_write
(fv=0x7fc65805e0b0, addr=4273803276, attrs=..., buf=0x7fc871e9c028, len=4) 
at ../system/physmem.c:2756
17 0x560640f380bf in address_space_write
(as=0x560642161ae0 , addr=4273803276, attrs=..., 
buf=0x7fc871e9c028, len=4)
at ../system/physmem.c:2863
18 0x560640f3812c in address_space_rw
(as=0x560642161ae0 , addr=4273803276, attrs=..., 
buf=0x7fc871e9c028, len=4, is_write=true) at ../system/physmem.c:2873
--Type <RET> for more, q to quit, c to continue without paging--
19 0x560640f8aa55 in kvm_cpu_exec (cpu=0x560642f205e0) at 
../accel/kvm/kvm-all.c:2915
20 0x560640f8d731 in kvm_vcpu_thread_fn (arg=0x560642f205e0) at 
../accel/kvm/kvm-accel-ops.c:51
21 0x5606411949f4 in qemu_thread_start (args=0x560642f292b0) at 
../util/qemu-thread-posix.c:541
22 0x7fc87148cdcd in start_thread (arg=) at 
pthread_create.c:442
23 0x7fc871512630 in clone3 () at 
../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
(gdb)

MST: coding style and typo fixups

Fixes: f9a09ca3ea ("vhost: add support for configure interrupt")
Cc: qemu-sta...@nongnu.org
Signed-off-by: Cindy Lu 
Message-ID: 
<2321ade5f601367efe7380c04e3f61379c59b48f.1713173550.git@redhat.com>
Cc: Lei Yang 
Cc: Jason Wang 
Signed-off-by: Michael S. Tsirkin 
Tested-by: Cindy Lu 
---
 hw/virtio/virtio-pci.c | 37 +++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index cb6940fc0e..cb159fd078 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1424,6 +1424,38 @@ static int virtio_pci_add_mem_cap(VirtIOPCIProxy *proxy,
 return offset;
 }
 
+static void

[PULL 0/1] virtio: bugfix

2024-04-15 Thread Michael S. Tsirkin

The following changes since commit e104a960c33b68fedf26dfb7b8e00abab8f2:

  qdev-monitor: fix error message in find_device_state() (2024-04-09 02:31:33 
-0400)

are available in the Git repository at:

  https://git.kernel.org/pub/scm/virt/kvm/mst/qemu.git tags/for_upstream

for you to fetch changes up to 2ce6cff94df2650c460f809e5ad263f1d22507c0:

  virtio-pci: fix use of a released vector (2024-04-15 06:50:44 -0400)


virtio: bugfix

A last minute fix for a use of a vector after it's released.

Signed-off-by: Michael S. Tsirkin 


Cindy Lu (1):
  virtio-pci: fix use of a released vector

 hw/virtio/virtio-pci.c | 37 +++--
 1 file changed, 35 insertions(+), 2 deletions(-)

Re: Discrepancy between mmap call on DPDK/libvduse and rust vm-memory crate

2024-04-15 Thread Eugenio Perez Martin

On Sun, Apr 14, 2024 at 11:02 AM Michael S. Tsirkin  wrote:
>
> On Fri, Apr 12, 2024 at 12:15:40PM +0200, Eugenio Perez Martin wrote:
> > Hi!
> >
> > I'm building a bridge to expose vhost-user devices through VDUSE. The
> > code is still immature but I'm able to forward packets using
> > dpdk-l2fwd through VDUSE to VM. I'm now developing exposing virtiofsd,
> > but I've hit an error I'd like to discuss.
> >
> > VDUSE devices can get all the memory regions the driver is using by
> > VDUSE_IOTLB_GET_FD ioctl. It returns a file descriptor with a memory
> > region associated that can be mapped with mmap, and an information
> > entry about the map it contains:
> > * Start and end addresses from the driver POV
> > * Offset within the mmaped region of these start and end
> > * Device permissions over that region.
> >
> > [start=0xc3000][last=0xe7fff][offset=0xc3000][perm=1]
> >
> > Now when I try to map it, it is impossible for the userspace device to
> > call mmap with any offset different than 0.
>
> How exactly did you allocate memory? hugetlbfs?
>

Yes, that was definitely the cause, thank you very much!

> > So the "straightforward"
> > mmap with size = entry.last-entry.start and offset = entry.offset does
> > not work. I don't know if this is a limitation of Linux or VDUSE.
> >
> > Checking QEMU's
> > subprojects/libvduse/libvduse.c:vduse_iova_add_region() I see it
> > handles the offset by adding it up to the size, instead of using it
> > directly as a parameter in the mmap:
> >
> > void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0);
>
>
> CC Xie Yongji who wrote this code, too.
>

Thanks!

>
> > I can replicate it on the bridge for sure.
> >
> > Now I send the VhostUserMemoryRegion to the vhost-user application.
> > The struct has these members:
> > struct VhostUserMemoryRegion {
> > uint64_t guest_phys_addr;
> > uint64_t memory_size;
> > uint64_t userspace_addr;
> > uint64_t mmap_offset;
> > };
> >
> > So I can send the offset to the vhost-user device. I can check that
> > dpdk-l2fwd uses the same trick of adding offset to the size of the
> > mapping region [1], at
> > lib/vhost/vhost_user.c:vhost_user_mmap_region():
> >
> > mmap_size = region->size + mmap_offset;
> > mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
> > MAP_SHARED | populate, region->fd, 0);
> >
> > So mmap is called with offset == 0 and everybody is happy.
> >
> > Now I'm moving to virtiofsd, and vm-memory crate in particular. And it
> > performs the mmap without the size += offset trick, at
> > MmapRegionBuilder:build() [2].
> >
> > I can try to apply the offset + size trick in my bridge but I don't
> > think it is the right solution. At first glance, the right solution is
> > to mmap with the offset as the vm-memory crate does. But having libvduse and
> > DPDK apply the same trick sounds to me like it is a known limitation /
> > workaround I don't know about. What is the history of this? Can VDUSE
> > problem (if any) be solved? Am I missing something?
> >
> > Thanks!
> >
> > [1] 
> > https://github.com/DPDK/dpdk/blob/e2e546ab5bf5e024986ccb5310ab43982f3bb40c/lib/vhost/vhost_user.c#L1305
> > [2] https://github.com/rust-vmm/vm-memory/blob/main/src/mmap_unix.rs#L128
>

Re: [PATCH v8] virtio-pci: fix use of a released vector

2024-04-15 Thread Cindy Lu

On Mon, Apr 15, 2024 at 5:34 PM Michael S. Tsirkin  wrote:
>
> From: Cindy Lu 
>
> During the booting process of the non-standard image, the behavior of the
> called function in qemu is as follows:
>
> 1. vhost_net_stop() was triggered by guest image. This will call the function
> virtio_pci_set_guest_notifiers() with assign=false;
> virtio_pci_set_guest_notifiers() will release the irqfd for vector 0
>
> 2. virtio_reset() was triggered, this will set configure vector to 
> VIRTIO_NO_VECTOR
>
> 3. vhost_net_start() was called (at this time, the configure vector is
> still VIRTIO_NO_VECTOR) and then called virtio_pci_set_guest_notifiers() with
> assign=true, so the irqfd for vector 0 is still not initialized during this process
>
> 4. The system continues to boot and sets the vector back to 0. After that
> msix_fire_vector_notifier() was triggered to unmask the vector 0 and  meet 
> the crash
>
> To fix the issue, we need to support changing the vector after 
> VIRTIO_CONFIG_S_DRIVER_OK is set.
>
> (gdb) bt
> 0  __pthread_kill_implementation (threadid=, 
> signo=signo@entry=6, no_tid=no_tid@entry=0)
> at pthread_kill.c:44
> 1  0x7fc87148ec53 in __pthread_kill_internal (signo=6, 
> threadid=) at pthread_kill.c:78
> 2  0x7fc87143e956 in __GI_raise (sig=sig@entry=6) at 
> ../sysdeps/posix/raise.c:26
> 3  0x7fc8714287f4 in __GI_abort () at abort.c:79
> 4  0x7fc87142871b in __assert_fail_base
> (fmt=0x7fc8715bbde0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", 
> assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d 
> "../accel/kvm/kvm-all.c", line=1837, function=) at assert.c:92
> 5  0x7fc871437536 in __GI___assert_fail
> (assertion=0x5606413efd53 "ret == 0", file=0x5606413ef87d 
> "../accel/kvm/kvm-all.c", line=1837, function=0x5606413f06f0 
> <__PRETTY_FUNCTION__.19> "kvm_irqchip_commit_routes") at assert.c:101
> 6  0x560640f884b5 in kvm_irqchip_commit_routes (s=0x560642cae1f0) at 
> ../accel/kvm/kvm-all.c:1837
> 7  0x560640c98f8e in virtio_pci_one_vector_unmask
> (proxy=0x560643c65f00, queue_no=4294967295, vector=0, msg=..., 
> n=0x560643c6e4c8)
> at ../hw/virtio/virtio-pci.c:1005
> 8  0x560640c99201 in virtio_pci_vector_unmask (dev=0x560643c65f00, 
> vector=0, msg=...)
> at ../hw/virtio/virtio-pci.c:1070
> 9  0x560640bc402e in msix_fire_vector_notifier (dev=0x560643c65f00, 
> vector=0, is_masked=false)
> at ../hw/pci/msix.c:120
> 10 0x560640bc40f1 in msix_handle_mask_update (dev=0x560643c65f00, 
> vector=0, was_masked=true)
> at ../hw/pci/msix.c:140
> 11 0x560640bc4503 in msix_table_mmio_write (opaque=0x560643c65f00, 
> addr=12, val=0, size=4)
> at ../hw/pci/msix.c:231
> 12 0x560640f26d83 in memory_region_write_accessor
> (mr=0x560643c66540, addr=12, value=0x7fc86b7bc628, size=4, shift=0, 
> mask=4294967295, attrs=...)
> at ../system/memory.c:497
> 13 0x560640f270a6 in access_with_adjusted_size
>
>  (addr=12, value=0x7fc86b7bc628, size=4, access_size_min=1, 
> access_size_max=4, access_fn=0x560640f26c8d , 
> mr=0x560643c66540, attrs=...) at ../system/memory.c:573
> 14 0x560640f2a2b5 in memory_region_dispatch_write (mr=0x560643c66540, 
> addr=12, data=0, op=MO_32, attrs=...)
> at ../system/memory.c:1521
> 15 0x560640f37bac in flatview_write_continue
> (fv=0x7fc65805e0b0, addr=4273803276, attrs=..., ptr=0x7fc871e9c028, 
> len=4, addr1=12, l=4, mr=0x560643c66540)
> at ../system/physmem.c:2714
> 16 0x560640f37d0f in flatview_write
> (fv=0x7fc65805e0b0, addr=4273803276, attrs=..., buf=0x7fc871e9c028, 
> len=4) at ../system/physmem.c:2756
> 17 0x560640f380bf in address_space_write
> (as=0x560642161ae0 , addr=4273803276, attrs=..., 
> buf=0x7fc871e9c028, len=4)
> at ../system/physmem.c:2863
> 18 0x560640f3812c in address_space_rw
> (as=0x560642161ae0 , addr=4273803276, attrs=..., 
> buf=0x7fc871e9c028, len=4, is_write=true) at ../system/physmem.c:2873
> --Type <RET> for more, q to quit, c to continue without paging--
> 19 0x560640f8aa55 in kvm_cpu_exec (cpu=0x560642f205e0) at 
> ../accel/kvm/kvm-all.c:2915
> 20 0x560640f8d731 in kvm_vcpu_thread_fn (arg=0x560642f205e0) at 
> ../accel/kvm/kvm-accel-ops.c:51
> 21 0x5606411949f4 in qemu_thread_start (args=0x560642f292b0) at 
> ../util/qemu-thread-posix.c:541
> 22 0x7fc87148cdcd in start_thread (arg=) at 
> pthread_create.c:442
> 23 0x7fc871512630 in clone3 () at 
> ../sysdeps/unix/sysv/linux/x86_64/clone3.S:81
> (gdb)
>
> MST: coding style and typo fixups
>
> Fixes: f9a09ca3ea ("vhost: add support for configure interrupt")
> Cc: qemu-sta...@nongnu.org
> Signed-off-by: Cindy Lu 
> Message-Id: <20240412062750.475180-1-l...@redhat.com>
> Reviewed-by: Michael S. Tsirkin 
> Signed-off-by: Michael S. Tsirkin 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  hw/virtio/virtio-pci.c | 37 +++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
>
> v7->v8:
> more cleanups, suggested by Philip

RE: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device

2024-04-15 Thread Duan, Zhenzhong

Hi Philippe,

>-Original Message-
>From: Philippe Mathieu-Daudé 
>Sent: Monday, April 15, 2024 5:20 PM
>To: Duan, Zhenzhong ; qemu-
>de...@nongnu.org
>Cc: alex.william...@redhat.com; c...@redhat.com; eric.au...@redhat.com;
>pet...@redhat.com; jasow...@redhat.com; m...@redhat.com;
>j...@nvidia.com; nicol...@nvidia.com; joao.m.mart...@oracle.com; Tian,
>Kevin ; Liu, Yi L ; Peng, Chao P
>
>Subject: Re: [PATCH v2 02/10] vfio: Introduce HIODLegacyVFIO device
>
>On 8/4/24 10:12, Zhenzhong Duan wrote:
>> HIODLegacyVFIO represents a host IOMMU device under VFIO legacy
>> container backend.
>>
>> It includes a link to VFIODevice.
>>
>> Suggested-by: Eric Auger 
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   include/hw/vfio/vfio-common.h | 11 +++
>>   hw/vfio/container.c   | 11 ++-
>>   2 files changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-
>common.h
>> index b9da6c08ef..f30772f534 100644
>> --- a/include/hw/vfio/vfio-common.h
>> +++ b/include/hw/vfio/vfio-common.h
>> @@ -31,6 +31,7 @@
>>   #endif
>>   #include "sysemu/sysemu.h"
>>   #include "hw/vfio/vfio-container-base.h"
>> +#include "sysemu/host_iommu_device.h"
>>
>>   #define VFIO_MSG_PREFIX "vfio %s: "
>>
>> @@ -147,6 +148,16 @@ typedef struct VFIOGroup {
>>   bool ram_block_discard_allowed;
>>   } VFIOGroup;
>>
>> +#define TYPE_HIOD_LEGACY_VFIO TYPE_HOST_IOMMU_DEVICE "-legacy-
>vfio"
>> +OBJECT_DECLARE_SIMPLE_TYPE(HIODLegacyVFIO, HIOD_LEGACY_VFIO)
>> +
>> +/* Abstraction of VFIO legacy host IOMMU device */
>> +struct HIODLegacyVFIO {
>> +/*< private >*/
>
>Please drop this comment.

Will do. But may I ask the rules when to use that comment and when not?
I see some QOM use that comment to mark private vs. public, for example:

struct AccelState {
/*< private >*/
Object parent_obj;
};

typedef struct AccelClass {
/*< private >*/
ObjectClass parent_class;
/*< public >*/

>
>> +HostIOMMUDevice parent;
>
>Please name 'parent_obj'.

Will do.

Thanks
Zhenzhong

>
>> +VFIODevice *vdev;
>> +};

Re: [PATCH v5 1/3] ui/console: Introduce dpy_gl_qemu_dmabuf_get_..() helpers

2024-04-15 Thread Marc-André Lureau

Hi

On Fri, Apr 12, 2024 at 7:57 AM  wrote:
>
> From: Dongwon Kim 
>

For patchew to handle your series, you need a cover letter. See:
https://www.qemu.org/docs/master/devel/submitting-a-patch.html

> This commit introduces dpy_gl_qemu_dmabuf_get_... helpers to extract
> specific fields from the QemuDmaBuf struct. It also updates all instances
> where fields within the QemuDmaBuf struct are directly accessed, replacing
> them with calls to these new helper functions.
>
> Suggested-by: Marc-André Lureau 
> Cc: Philippe Mathieu-Daudé 
> Cc: Vivek Kasireddy 
> Signed-off-by: Dongwon Kim 
> ---
>  include/ui/console.h|  17 +
>  hw/display/vhost-user-gpu.c |   6 +-
>  hw/display/virtio-gpu-udmabuf.c |   7 +-
>  hw/vfio/display.c   |  15 +++--
>  ui/console.c| 116 +++-
>  ui/dbus-console.c   |   9 ++-
>  ui/dbus-listener.c  |  43 +++-
>  ui/egl-headless.c   |  23 +--
>  ui/egl-helpers.c|  47 +++--
>  ui/gtk-egl.c|  48 -
>  ui/gtk-gl-area.c|  37 ++
>  ui/gtk.c|   6 +-
>  ui/spice-display.c  |  50 --
>  13 files changed, 316 insertions(+), 108 deletions(-)
>
> diff --git a/include/ui/console.h b/include/ui/console.h
> index 0bc7a00ac0..6292943a82 100644
> --- a/include/ui/console.h
> +++ b/include/ui/console.h
> @@ -358,6 +358,23 @@ void dpy_gl_cursor_dmabuf(QemuConsole *con, QemuDmaBuf 
> *dmabuf,
>bool have_hot, uint32_t hot_x, uint32_t hot_y);
>  void dpy_gl_cursor_position(QemuConsole *con,
>  uint32_t pos_x, uint32_t pos_y);
> +
> +int32_t dpy_gl_qemu_dmabuf_get_fd(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_width(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_height(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_stride(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_fourcc(QemuDmaBuf *dmabuf);
> +uint64_t dpy_gl_qemu_dmabuf_get_modifier(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_texture(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_x(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_y(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_backing_width(QemuDmaBuf *dmabuf);
> +uint32_t dpy_gl_qemu_dmabuf_get_backing_height(QemuDmaBuf *dmabuf);
> +bool dpy_gl_qemu_dmabuf_get_y0_top(QemuDmaBuf *dmabuf);
> +void *dpy_gl_qemu_dmabuf_get_sync(QemuDmaBuf *dmabuf);
> +int32_t dpy_gl_qemu_dmabuf_get_fence_fd(QemuDmaBuf *dmabuf);
> +bool dpy_gl_qemu_dmabuf_get_allow_fences(QemuDmaBuf *dmabuf);
> +bool dpy_gl_qemu_dmabuf_get_draw_submitted(QemuDmaBuf *dmabuf);


I don't think it's necessary to have getters for individual fields,
you could have made a few "get_info(, , )" instead.
But now that you did it, let's keep it that way.


>  void dpy_gl_release_dmabuf(QemuConsole *con,
> QemuDmaBuf *dmabuf);
>  void dpy_gl_update(QemuConsole *con,
> diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
> index 709c8a02a1..87dcfbca10 100644
> --- a/hw/display/vhost-user-gpu.c
> +++ b/hw/display/vhost-user-gpu.c
> @@ -249,6 +249,7 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
> VhostUserGpuMsg *msg)
>  case VHOST_USER_GPU_DMABUF_SCANOUT: {
>  VhostUserGpuDMABUFScanout *m = >payload.dmabuf_scanout;
>  int fd = qemu_chr_fe_get_msgfd(>vhost_chr);
> +int old_fd;
>  QemuDmaBuf *dmabuf;
>
>  if (m->scanout_id >= g->parent_obj.conf.max_outputs) {
> @@ -262,8 +263,9 @@ vhost_user_gpu_handle_display(VhostUserGPU *g, 
> VhostUserGpuMsg *msg)
>  g->parent_obj.enable = 1;
>  con = g->parent_obj.scanout[m->scanout_id].con;
>  dmabuf = >dmabuf[m->scanout_id];
> -if (dmabuf->fd >= 0) {
> -close(dmabuf->fd);
> +old_fd = dpy_gl_qemu_dmabuf_get_fd(dmabuf);
> +if (old_fd >= 0) {
> +close(old_fd);
>  dmabuf->fd = -1;


>  }
>  dpy_gl_release_dmabuf(con, dmabuf);
> diff --git a/hw/display/virtio-gpu-udmabuf.c b/hw/display/virtio-gpu-udmabuf.c
> index d51184d658..e3f358b575 100644
> --- a/hw/display/virtio-gpu-udmabuf.c
> +++ b/hw/display/virtio-gpu-udmabuf.c
> @@ -206,6 +206,7 @@ int virtio_gpu_update_dmabuf(VirtIOGPU *g,
>  {
>  struct virtio_gpu_scanout *scanout = >parent_obj.scanout[scanout_id];
>  VGPUDMABuf *new_primary, *old_primary = NULL;
> +uint32_t width, height;
>
>  new_primary = virtio_gpu_create_dmabuf(g, scanout_id, res, fb, r);
>  if (!new_primary) {
> @@ -216,10 +217,10 @@ int virtio_gpu_update_dmabuf(VirtIOGPU *g,
>  old_primary = g->dmabuf.primary[scanout_id];
>  }
>
> +width = dpy_gl_qemu_dmabuf_get_width(_primary->buf);
> +height = dpy_gl_qemu_dmabuf_get_height(_primary->buf);
>  g->dmabuf.primary[scanout_id] =

Re: [PATCH v2] hw/i386/acpi: Set PCAT_COMPAT bit only when pic is not disabled

2024-04-15 Thread Michael S. Tsirkin

On Mon, Apr 15, 2024 at 05:51:56AM -0400, Michael S. Tsirkin wrote:
> On Mon, Apr 15, 2024 at 11:44:50AM +0200, Paolo Bonzini wrote:
> > Queued, thanks.
> > 
> > Paolo
> 
> I actually had it queued too - it's not 9.0 material though.
> If you queue it don't forget to tweak the commit log manually.

oh and then you can add my

Reviewed-by: Michael S. Tsirkin 


> -- 
> MST

Re: [PATCH v7 07/10] virtio-gpu: Handle resource blob commands

2024-04-15 Thread Akihiko Odaki


On 2024/04/15 17:49, Dmitry Osipenko wrote:

On 4/15/24 11:13, Akihiko Odaki wrote:

On 2024/04/15 17:03, Dmitry Osipenko wrote:

Hello,

On 4/13/24 14:57, Akihiko Odaki wrote:
...

+static void
+virtio_gpu_virgl_unmap_resource_blob(VirtIOGPU *g,
+ struct
virtio_gpu_simple_resource *res)
+{
+    VirtIOGPUBase *b = VIRTIO_GPU_BASE(g);
+
+    if (!res->mr) {
+    return;
+    }
+
+    memory_region_set_enabled(res->mr, false);
+    memory_region_del_subregion(>hostmem, res->mr);
+
+    /* memory region owns res->mr object and frees it when mr is
released */
+    res->mr = NULL;
+
+    virgl_renderer_resource_unmap(res->resource_id);


Hi,

First, thanks for keeping working on this.

This patch has some changes since the previous version, but it is still
vulnerable to the race condition pointed out. The memory region is
asynchronously unmapped from the guest address space, but the backing
memory on the host address space is unmapped synchronously before that.
This results in use-after-free. The whole unmapping operation needs to
be implemented in an asynchronous manner.


Thanks for the clarification! I missed this point from the previous
discussion.

Could you please clarify what you mean by the "asynchronous manner"?
Virglrenderer API works only in the virtio-gpu-gl context, it can't be
accessed from other places.

The memory_region_del_subregion() should remove the region as long as
nobody references it, shouldn't it? On Linux guest nobody should reference
hostmem regions besides virtio-gpu device on the unmap, don't know about
other guests.

We can consider it the guest's fault if the MR lives on after the deletion, and
in that case exit QEMU with a noisy error message or leak the resource. WDYT?



We need to be prepared for a faulty guest for reliability and security
as they are common goals of virtualization, and it is nice to have them
after all.

You need to call virgl_renderer_resource_unmap() after the MR actually
gets freed. The virtio-gpu-gl context is just a context with BQL so it
is fine to call virgl functions in most places.


Do you have example of a legit use-case where hostmem MR could outlive
resource mapping?


MR outliving after memory_region_del_subregion() is not a use-case, but 
a situation that occurs due to the limitation of the memory subsystem. 
It is not easy to answer how often such a situation happens.




Turning it into an error condition is much more reasonable than
worrying about an edge case that nobody cares about, which can't be tested
easily and is not trivial to support, IMO.

I'm not sure what you mean by turning into an error condition. I doubt 
it's possible to emit errors when someone touches an unmapped region.


Reproducing this issue is not easy, as is often the case for
use-after-free bugs, but fixing it is not that complicated in my opinion
since you already have an implementation which asynchronously unmaps the
region in v6. Here are my suggestions to fix the problems in v6:


- Remove ref member in virgl_gpu_resource, vres_get_ref(), 
vres_put_ref(), and virgl_resource_unmap().


- Change virtio_gpu_virgl_process_cmd(), 
virgl_cmd_resource_unmap_blob(), and virgl_cmd_resource_unref() to 
return a bool, which tells if the command was processed or suspended.


- In virtio_gpu_process_cmdq(), break if the command was suspended.

- In virgl_resource_blob_async_unmap(), call virtio_gpu_gl_block(g, false).

- In virgl_cmd_resource_unmap_blob() and virgl_cmd_resource_unref(), 
call memory_region_del_subregion() and virtio_gpu_gl_block(g, true), and 
tell that the command was suspended if the reference counter of 
MemoryRegion > 0. Free and unmap the MR otherwise.


Regards,
Akihiko Odaki

Re: [PATCH v4 1/5] hw/display : Add device DM163

2024-04-15 Thread Philippe Mathieu-Daudé


Hi Inès,

On 14/4/24 15:05, Inès Varhol wrote:

This device implements the IM120417002 colors shield v1.1 for Arduino
(which relies on the DM163 8x3-channel led driving logic) and features
a simple display of an 8x8 RGB matrix. The columns of the matrix are
driven by the DM163 and the rows are driven externally.

Acked-by: Alistair Francis 
Signed-off-by: Arnaud Minier 
Signed-off-by: Inès Varhol 
---
  docs/system/arm/b-l475e-iot01a.rst |   3 +-
  include/hw/display/dm163.h |  58 +
  hw/display/dm163.c | 333 +
  hw/display/Kconfig |   3 +
  hw/display/meson.build |   1 +
  hw/display/trace-events|  14 ++
  6 files changed, 411 insertions(+), 1 deletion(-)
  create mode 100644 include/hw/display/dm163.h
  create mode 100644 hw/display/dm163.c




diff --git a/include/hw/display/dm163.h b/include/hw/display/dm163.h
new file mode 100644
index 00..00d0504640
--- /dev/null
+++ b/include/hw/display/dm163.h
@@ -0,0 +1,58 @@
+/*
+ * QEMU DM163 8x3-channel constant current led driver
+ * driving columns of associated 8x8 RGB matrix.
+ *
+ * Copyright (C) 2024 Samuel Tardieu 
+ * Copyright (C) 2024 Arnaud Minier 
+ * Copyright (C) 2024 Inès Varhol 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef HW_DISPLAY_DM163_H
+#define HW_DISPLAY_DM163_H
+
+#include "qom/object.h"
+#include "hw/qdev-core.h"
+
+#define TYPE_DM163 "dm163"
+OBJECT_DECLARE_SIMPLE_TYPE(DM163State, DM163);
+
+#define DM163_NUM_LEDS 24
+#define RGB_MATRIX_NUM_ROWS 8
+#define RGB_MATRIX_NUM_COLS (DM163_NUM_LEDS / 3)


Maybe better as:

  #define DM163_NUM_LEDS (RGB_MATRIX_NUM_COLS * RGB_MATRIX_NUM_ROWS)


+#define COLOR_BUFFER_SIZE RGB_MATRIX_NUM_ROWS


It could simplify the code to define it here directly as:

  /* The last row is filled with 0 (turned off row) */
  #define COLOR_BUFFER_SIZE (RGB_MATRIX_NUM_ROWS + 1)


+
+typedef struct DM163State {
+DeviceState parent_obj;
+
+/* DM163 driver */
+uint64_t bank0_shift_register[3];
+uint64_t bank1_shift_register[3];
+uint16_t latched_outputs[DM163_NUM_LEDS];
+uint16_t outputs[DM163_NUM_LEDS];
+qemu_irq sout;
+
+uint8_t sin;
+uint8_t dck;
+uint8_t rst_b;
+uint8_t lat_b;
+uint8_t selbk;
+uint8_t en_b;
+
+/* IM120417002 colors shield */
+uint8_t activated_rows;
+
+/* 8x8 RGB matrix */
+QemuConsole *console;
+uint8_t redraw;
+/* Rows currently being displayed on the matrix. */
+/* The last row is filled with 0 (turned off row) */
+uint32_t buffer[COLOR_BUFFER_SIZE + 1][RGB_MATRIX_NUM_COLS];
+uint8_t last_buffer_idx;
+uint8_t buffer_idx_of_row[RGB_MATRIX_NUM_ROWS];
+/* Used to simulate retinal persistence of rows */
+uint8_t age_of_row[RGB_MATRIX_NUM_ROWS];


Maybe "row_persistence_delay"?


+} DM163State;
+
+#endif /* HW_DISPLAY_DM163_H */




+static void dm163_dck_gpio_handler(void *opaque, int line, int new_state)
+{
+DM163State *s = DM163(opaque);


GPIO handlers are initialized in dm163_realize() where we know @dev
is already a DM163State:

  static void dm163_realize(DeviceState *dev, Error **errp)
  {
  DM163State *s = DM163(dev);
  ^
  qdev_init_gpio_in(dev, dm163_rows_gpio_handler, 8);

So here (and in the other handlers) you can avoid the QOM cast macro
and directly use:

  DM163State *s = opaque;


+
+if (new_state && !s->dck) {
+/*
+ * On raising dck, sample selbk to get the bank to use, and
+ * sample sin for the bit to enter into the bank shift buffer.
+ */
+uint64_t *sb =
+s->selbk ? s->bank1_shift_register : s->bank0_shift_register;
+/* Output the outgoing bit on sout */
+const bool sout = (s->selbk ? sb[2] & MAKE_64BIT_MASK(63, 1) :
+   sb[2] & MAKE_64BIT_MASK(15, 1)) != 0;
+qemu_set_irq(s->sout, sout);
+/* Enter sin into the shift buffer */
+sb[2] = (sb[2] << 1) | ((sb[1] >> 63) & 1);
+sb[1] = (sb[1] << 1) | ((sb[0] >> 63) & 1);
+sb[0] = (sb[0] << 1) | s->sin;
+}
+
+s->dck = new_state;
+trace_dm163_dck(new_state);
+}




+static void dm163_en_b_gpio_handler(void *opaque, int line, int new_state)
+{
+DM163State *s = DM163(opaque);
+
+s->en_b = new_state;
+dm163_propagate_outputs(s);
+trace_dm163_en_b(new_state);
+}
+
+static inline uint8_t dm163_bank0(const DM163State *s, uint8_t led)


No need to force the compiler to inline these methods.


+{
+/*
+ * Bank 0 uses 6 bits per led, so a value may be stored across
+ * two uint64_t entries.
+ */
+const uint8_t low_bit = 6 * led;
+const uint8_t low_word = low_bit / 64;
+const uint8_t high_word = (low_bit + 5) / 64;
+const uint8_t low_shift = low_bit % 64;
+
+if (low_word == high_word) {
+/* Simple case: the value belongs to one entry. */
+return (s->bank0_shift_register[low_word] &
+

RE: [PATCH v2 01/10] backends: Introduce abstract HostIOMMUDevice

2024-04-15 Thread Duan, Zhenzhong



>-----Original Message-----
>From: Philippe Mathieu-Daudé 
>Subject: Re: [PATCH v2 01/10] backends: Introduce abstract
>HostIOMMUDevice
>
>Hi Zhenzhong,
>
>On 8/4/24 10:12, Zhenzhong Duan wrote:
>> Introduce HostIOMMUDevice as an abstraction of host IOMMU device.
>>
>> get_host_iommu_info() is used to get host IOMMU info, different
>> backends can have different implementations and result format.
>>
>> Introduce a macro CONFIG_HOST_IOMMU_DEVICE to define the usage
>> for VFIO, and VDPA in the future.
>>
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>> ---
>>   MAINTAINERS|  2 ++
>>   include/sysemu/host_iommu_device.h | 19 +++
>>   backends/host_iommu_device.c   | 19 +++
>>   backends/Kconfig   |  5 +
>>   backends/meson.build   |  1 +
>>   5 files changed, 46 insertions(+)
>>   create mode 100644 include/sysemu/host_iommu_device.h
>>   create mode 100644 backends/host_iommu_device.c
>
>
>> diff --git a/include/sysemu/host_iommu_device.h
>b/include/sysemu/host_iommu_device.h
>> new file mode 100644
>> index 00..22ccbe3a5d
>> --- /dev/null
>> +++ b/include/sysemu/host_iommu_device.h
>> @@ -0,0 +1,19 @@
>> +#ifndef HOST_IOMMU_DEVICE_H
>> +#define HOST_IOMMU_DEVICE_H
>> +
>> +#include "qom/object.h"
>> +
>> +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
>> +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass,
>HOST_IOMMU_DEVICE)
>> +
>> +struct HostIOMMUDevice {
>> +Object parent;
>> +};
>> +
>> +struct HostIOMMUDeviceClass {
>> +ObjectClass parent_class;
>> +
>> +int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data,
>uint32_t len,
>> +   Error **errp);
>
>Please document this new method (in particular return value and @data).
>
>Since @len is sizeof(data), can we use the size_t type?

Sure, will do.

Thanks
Zhenzhong

RE: [PATCH v2 01/10] backends: Introduce abstract HostIOMMUDevice

2024-04-15 Thread Duan, Zhenzhong



>-----Original Message-----
>From: Cédric Le Goater 
>Subject: Re: [PATCH v2 01/10] backends: Introduce abstract
>HostIOMMUDevice
>
>On 4/8/24 10:12, Zhenzhong Duan wrote:
>> Introduce HostIOMMUDevice as an abstraction of host IOMMU device.
>>
>> get_host_iommu_info() is used to get host IOMMU info, different
>> backends can have different implementations and result format.
>>
>> Introduce a macro CONFIG_HOST_IOMMU_DEVICE to define the usage
>> for VFIO, and VDPA in the future.
>>
>> Suggested-by: Cédric Le Goater 
>> Signed-off-by: Zhenzhong Duan 
>
>LGTM,
>
>> ---
>>   MAINTAINERS|  2 ++
>>   include/sysemu/host_iommu_device.h | 19 +++
>>   backends/host_iommu_device.c   | 19 +++
>>   backends/Kconfig   |  5 +
>>   backends/meson.build   |  1 +
>>   5 files changed, 46 insertions(+)
>>   create mode 100644 include/sysemu/host_iommu_device.h
>>   create mode 100644 backends/host_iommu_device.c
>>
>> diff --git a/MAINTAINERS b/MAINTAINERS
>> index e71183eef9..22f71cbe02 100644
>> --- a/MAINTAINERS
>> +++ b/MAINTAINERS
>> @@ -2202,6 +2202,8 @@ M: Zhenzhong Duan
>
>>   S: Supported
>>   F: backends/iommufd.c
>>   F: include/sysemu/iommufd.h
>> +F: backends/host_iommu_device.c
>> +F: include/sysemu/host_iommu_device.h
>>   F: include/qemu/chardev_open.h
>>   F: util/chardev_open.c
>>   F: docs/devel/vfio-iommufd.rst
>> diff --git a/include/sysemu/host_iommu_device.h
>b/include/sysemu/host_iommu_device.h
>> new file mode 100644
>> index 00..22ccbe3a5d
>> --- /dev/null
>> +++ b/include/sysemu/host_iommu_device.h
>> @@ -0,0 +1,19 @@
>> +#ifndef HOST_IOMMU_DEVICE_H
>> +#define HOST_IOMMU_DEVICE_H
>> +
>> +#include "qom/object.h"
>> +
>> +#define TYPE_HOST_IOMMU_DEVICE "host-iommu-device"
>> +OBJECT_DECLARE_TYPE(HostIOMMUDevice, HostIOMMUDeviceClass,
>HOST_IOMMU_DEVICE)
>> +
>> +struct HostIOMMUDevice {
>> +Object parent;
>> +};
>> +
>> +struct HostIOMMUDeviceClass {
>> +ObjectClass parent_class;
>
>Could you please document the struct and its handlers ? This is more for
>the future reader to understand the VFIO concepts than for the generated
>docs. Anyhow, it could be useful for the docs also. Overall, the QEMU VFIO
>subsystem suffers from a lack of documentation and we should try to
>improve that in the next cycle.

Sure, I will document the struct and its handlers in v3.

Thanks
Zhenzhong

>
>Thanks,
>
>C.
>
>
>
>> +int (*get_host_iommu_info)(HostIOMMUDevice *hiod, void *data,
>uint32_t len,
>> +   Error **errp);
>> +};
>> +#endif
>> diff --git a/backends/host_iommu_device.c
>b/backends/host_iommu_device.c
>> new file mode 100644
>> index 00..6cb6007d8c
>> --- /dev/null
>> +++ b/backends/host_iommu_device.c
>> @@ -0,0 +1,19 @@
>> +#include "qemu/osdep.h"
>> +#include "sysemu/host_iommu_device.h"
>> +
>> +OBJECT_DEFINE_ABSTRACT_TYPE(HostIOMMUDevice,
>> +host_iommu_device,
>> +HOST_IOMMU_DEVICE,
>> +OBJECT)
>> +
>> +static void host_iommu_device_class_init(ObjectClass *oc, void *data)
>> +{
>> +}
>> +
>> +static void host_iommu_device_init(Object *obj)
>> +{
>> +}
>> +
>> +static void host_iommu_device_finalize(Object *obj)
>> +{
>> +}
>> diff --git a/backends/Kconfig b/backends/Kconfig
>> index 2cb23f62fa..34ab29e994 100644
>> --- a/backends/Kconfig
>> +++ b/backends/Kconfig
>> @@ -3,3 +3,8 @@ source tpm/Kconfig
>>   config IOMMUFD
>>   bool
>>   depends on VFIO
>> +
>> +config HOST_IOMMU_DEVICE
>> +bool
>> +default y
>> +depends on VFIO
>> diff --git a/backends/meson.build b/backends/meson.build
>> index 8b2b111497..2e975d641e 100644
>> --- a/backends/meson.build
>> +++ b/backends/meson.build
>> @@ -25,6 +25,7 @@ if have_vhost_user
>>   endif
>>   system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true: files('cryptodev-
>vhost.c'))
>>   system_ss.add(when: 'CONFIG_IOMMUFD', if_true: files('iommufd.c'))
>> +system_ss.add(when: 'CONFIG_HOST_IOMMU_DEVICE', if_true:
>files('host_iommu_device.c'))
>>   if have_vhost_user_crypto
>> system_ss.add(when: 'CONFIG_VIRTIO_CRYPTO', if_true:
>files('cryptodev-vhost-user.c'))
>>   endif

1 2 >

1 - 100 of 136 matches

Mail list logo