Re: [PATCH v2] virtio-blk: support per-device queue depth

2021-02-18 Thread Jason Wang


On 2021/2/19 11:06 AM, Joseph Qi wrote:

Hi Michael,
Could you take this now?



It has been merged:

https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/commit/?h=vhost&id=8c6daa79e527a05adb6f5d42cbed2fe36c6b3e83

Thanks




Thanks,
Joseph

On 2/1/21 10:17 AM, Joseph Qi wrote:

Ping...

On 1/26/21 7:13 PM, Stefan Hajnoczi wrote:

On Fri, Jan 22, 2021 at 05:21:46PM +0800, Joseph Qi wrote:

The module parameter 'virtblk_queue_depth' was first introduced for
testing/benchmarking purposes, as described in commit fc4324b4597c
("virtio-blk: base queue-depth on virtqueue ringsize or module param").
Currently, 'virtblk_queue_depth' is also used as a saved value for the
first probed device.
Since we can have different virtio-blk devices with different
capabilities, we need to support a per-device queue depth instead of a
per-module one. So, by default, use the number of free virtqueue
elements if the module parameter 'virtblk_queue_depth' is not set.

Signed-off-by: Joseph Qi 
Acked-by: Jason Wang 
---
  drivers/block/virtio_blk.c | 11 +++
  1 file changed, 7 insertions(+), 4 deletions(-)
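
For reference, the per-device default amounts to logic along these
lines in virtblk_probe(). This is a minimal sketch of the idea, not the
patch body (which isn't quoted here); the helper name is illustrative:

/*
 * Sketch: fall back to the virtqueue's free-element count when the
 * module parameter is unset, so each device gets a queue depth that
 * matches its own ring size.
 */
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);

static unsigned int virtblk_default_queue_depth(struct virtio_blk *vblk)
{
	unsigned int queue_depth;

	if (virtblk_queue_depth)	/* global module override */
		return virtblk_queue_depth;

	/* Per-device default: size the queue from the device's own vq. */
	queue_depth = vblk->vqs[0].vq->num_free;

	/* Without indirect descriptors, one request needs two entries. */
	if (!virtio_has_feature(vblk->vdev, VIRTIO_RING_F_INDIRECT_DESC))
		queue_depth /= 2;

	return queue_depth;
}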



Reviewed-by: Stefan Hajnoczi 




Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-18 Thread Jason Wang


On 2021/2/18 8:43 PM, Si-Wei Liu wrote:



On 2/17/2021 8:44 PM, Jason Wang wrote:


On 2021/2/10 4:59 PM, Si-Wei Liu wrote:



On 2/9/2021 7:53 PM, Jason Wang wrote:


On 2021/2/10 10:30 AM, Si-Wei Liu wrote:



On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 2:12 PM, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 6:04 PM, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 2:37 PM, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 7:07 AM, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources are
destroyed and then re-created again with the new memory map. In such
case, we need to restore the hardware available and used indices. The
driver failed to restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

 drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
 	u64 device_addr;
 	u64 driver_addr;
 	u16 avail_index;
+	u16 used_index;
 	bool ready;
 	struct vdpa_callback cb;
 	bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
 	u32 virtq_id;
 	struct mlx5_vdpa_net *ndev;
 	u16 avail_idx;
+	u16 used_idx;
 	int fw_state;
 
 	/* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 
 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
 		 get_features_12_3(ndev->mvdev.actual_features));
 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m
 struct mlx5_virtq_attr {
 	u8 state;
 	u16 available_index;
+	u16 used_index;
 };
 
 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
 	memset(attr, 0, sizeof(*attr));
 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
+	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
 	kfree(out);
 	return 0;
 
@@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
 	}
 }
 
+static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+		ndev->vqs[i].avail_idx = 0;
+		ndev->vqs[i].used_idx = 0;
+	}
+}
+
 /* TODO: cross-endian support */
 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
 {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
 		return err;
 
 	ri->avail_index = attr.available_index;
+	ri->used_index = attr.used_index;
 	ri->ready = mvq->ready;
 	ri->num_ent = mvq->num_ent;
 	ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)
 			continue;
 
 		mvq->avail_idx = ri->avail_index;
+		mvq->used_idx = ri->used_index;
 		mvq->ready = ri->ready;
 		mvq->num_ent = ri->num_ent;
 		mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
 	if (!status) {
 		mlx5_vdpa_info(mvdev, "performing device reset\n");
 		teardown_driver(ndev);
+		clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns with the
other state cleanups floating around at the same place. However, the
thing is get_vq_state() is supposed to be called right after to get
sync'ed with the latest internal avail_index from device while vq is
stopped. The index was saved in
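
For context, here is a rough sketch of the get_vq_state() path being
discussed: a simplified reconstruction that assumes the mlx5 vdpa
structures from the quoted patch, with locking and error paths trimmed:

/*
 * Sketch: surface the latest available index while the vq is stopped.
 * If the hardware object is already destroyed, report the index saved
 * by the driver at suspension time.
 */
static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx,
				  struct vdpa_vq_state *state)
{
	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
	struct mlx5_virtq_attr attr;
	int err;

	if (!mvq->initialized) {
		state->avail_index = mvq->avail_idx;
		return 0;
	}

	err = query_virtqueue(ndev, mvq, &attr);
	if (err)
		return err;

	state->avail_index = attr.available_index;
	return 0;
}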

Re: [PATCH 2/3] x86/sev-es: Check if regs->sp is trusted before adjusting #VC IST stack

2021-02-18 Thread Andy Lutomirski
On Thu, Feb 18, 2021 at 11:21 AM Joerg Roedel  wrote:
>
> On Thu, Feb 18, 2021 at 09:49:06AM -0800, Andy Lutomirski wrote:
> > I don't understand what this means.  The whole entry mechanism on x86
> > is structured so that we call a C function *and return from that C
> > function without longjmp-like magic* with the sole exception of
> > unwind_stack_do_exit().  This means that you can match up enters and
> > exits, and that unwind_stack_do_exit() needs to unwind correctly.  In
> > the former case, it's normal C and we can use normal local variables.
> > In the latter case, we know exactly what state we're trying to restore
> > and we can restore it directly without any linked lists or similar.
>
> Okay, the unwinder will likely get confused by this logic.
>
> > What do you have in mind that requires a linked list?
>
> Cases when there are multiple IST vectors besides NMI that can hit while
> the #VC handler is still on its own IST stack. #MCE comes to mind, but
> that is broken anyway. At some point #VC itself will be one of them, but
> when that happens the code will kill the machine.
> This leaves #HV in the list, and I am not sure how that is going to be
> handled yet. I think the goal is that the #HV handler is not allowed to
> cause any #VC exception. In that case the linked-list logic will not be
> needed.

Can you give me an example, even artificial, in which the linked-list
logic is useful?

>
> > > I don't see how this would break, can you elaborate?
> > >
> > > What I think happens is:
> > >
> > > SYSCALL gap (RSP is from userspace and untrusted)
> > >
> > > -> #VC - Handler on #VC IST stack detects that it interrupted
> > >the SYSCALL gap and switches to the task stack.
> > >
> >
> > Can you point me to exactly what code you're referring to?  I spent a
> > while digging through the code and macro tangle and I can't find this.
>
> See the entry code in arch/x86/entry/entry_64.S, macro idtentry_vc. It
> creates the assembly code for the handler. At some point it calls
> vc_switch_off_ist(), which is a C function in arch/x86/kernel/traps.c.
> This function tries to find a new stack for the #VC handler.
>
> The first thing it does is checking whether the exception came from the
> SYSCALL gap and just uses the task stack in that case.
>
> Then it will check for other kernel stacks which are safe to switch
> to. If that fails it uses the fall-back stack (VC2), which will direct
> the handler to a separate function which, for now, just calls panic().
> Not safe are the entry or unknown stacks.

Can you explain your reasoning in considering the entry stack unsafe?
It's 4k bytes these days.

You forgot about entry_SYSCALL_compat.

Your 8-byte alignment is confusing to me.  In valid kernel code, SP
should be 8-byte-aligned already, and, if you're trying to match
architectural behavior, the CPU aligns to 16 bytes.

We're not robust against #VC, NMI in the #VC prologue before the magic
stack switch, and a new #VC in the NMI prologue.  Nor do we appear to
have any detection of the case where #VC nests directly inside its own
prologue.  Or did I miss something else here?

If we get NMI and get #VC in the NMI *asm*, the #VC magic stack switch
looks like it will merrily run itself in the NMI special-stack-layout
section, and that sounds really quite bad.

>
> The function then copies pt_regs and returns the new stack pointer to
> assembly code, which then writes it to %RSP.
>
> > Unless AMD is more magic than I realize, the MOV SS bug^Wfeature means
> > that #DB is *not* always called in safe places.
>
> You are right, forgot about this. The MOV SS bug can very well
> trigger a #VC(#DB) exception from the syscall gap.
>
> > > And with SNP we need to be able to at least detect a malicious HV so we
> > > can reliably kill the guest. Otherwise the HV could potentially take
> > > control over the guest's execution flow and make it reveal its secrets.
> >
> > True.  But is the rest of the machinery to be secure against EFLAGS.IF
> > violations and such in place yet?
>
> Not sure what you mean by EFLAGS.IF violations, probably enabling IRQs
> while in the #VC handler? The #VC handler _must_ _not_ enable IRQs
> anywhere in its call-path. If that ever happens it is a bug.
>

I mean that, IIRC, a malicious hypervisor can inject inappropriate
vectors at inappropriate times if the #HV mechanism isn't enabled.
For example, it could inject a page fault or an interrupt in a context
in which we have the wrong GSBASE loaded.

But the #DB issue makes this moot.  We have to use IST unless we turn
off SCE.  But I admit I'm leaning toward turning off SCE until we have
a solution that seems convincingly robust.


Re: [PATCH 2/3] x86/sev-es: Check if regs->sp is trusted before adjusting #VC IST stack

2021-02-18 Thread Joerg Roedel
On Thu, Feb 18, 2021 at 09:49:06AM -0800, Andy Lutomirski wrote:
> I don't understand what this means.  The whole entry mechanism on x86
> is structured so that we call a C function *and return from that C
> function without longjmp-like magic* with the sole exception of
> unwind_stack_do_exit().  This means that you can match up enters and
> exits, and that unwind_stack_do_exit() needs to unwind correctly.  In
> the former case, it's normal C and we can use normal local variables.
> In the latter case, we know exactly what state we're trying to restore
> and we can restore it directly without any linked lists or similar.

Okay, the unwinder will likely get confused by this logic.

> What do you have in mind that requires a linked list?

Cases when there are multiple IST vectors besides NMI that can hit while
the #VC handler is still on its own IST stack. #MCE comes to mind, but
that is broken anyway. At some point #VC itself will be one of them, but
when that happens the code will kill the machine.
This leaves #HV in the list, and I am not sure how that is going to be
handled yet. I think the goal is that the #HV handler is not allowed to
cause any #VC exception. In that case the linked-list logic will not be
needed.

> > I don't see how this would break, can you elaborate?
> >
> > What I think happens is:
> >
> > SYSCALL gap (RSP is from userspace and untrusted)
> >
> > -> #VC - Handler on #VC IST stack detects that it interrupted
> >the SYSCALL gap and switches to the task stack.
> >
> 
> Can you point me to exactly what code you're referring to?  I spent a
> while digging through the code and macro tangle and I can't find this.

See the entry code in arch/x86/entry/entry_64.S, macro idtentry_vc. It
creates the assembly code for the handler. At some point it calls
vc_switch_off_ist(), which is a C function in arch/x86/kernel/traps.c.
This function tries to find a new stack for the #VC handler.

The first thing it does is checking whether the exception came from the
SYSCALL gap and just uses the task stack in that case.

Then it will check for other kernel stacks which are safe to switch
to. If that fails it uses the fall-back stack (VC2), which will direct
the handler to a separate function which, for now, just calls panic().
Not safe are the entry or unknown stacks.

The function then copies pt_regs and returns the new stack pointer to
assembly code, which then writes it to %RSP.
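
A rough sketch of that stack-selection logic, condensed from
vc_switch_off_ist() in arch/x86/kernel/traps.c (a sketch, not
verbatim):

/* Pick a safe stack for the #VC handler; simplified. */
asmlinkage __visible noinstr
struct pt_regs *vc_switch_off_ist(struct pt_regs *regs)
{
	struct stack_info info;
	struct pt_regs *regs_ret;
	unsigned long sp;

	/* SYSCALL gap: RSP came from user space, use the task stack.
	 * (ip_within_syscall_gap() is the helper added by this series.) */
	if (ip_within_syscall_gap(regs)) {
		sp = this_cpu_read(cpu_current_top_of_stack);
		goto sync;
	}

	/*
	 * Otherwise keep the interrupted stack if it is a known-safe
	 * kernel stack. Entry and unknown stacks are not safe; fall
	 * back to the VC2 stack, whose handler just panics.
	 */
	sp = regs->sp;
	if (!get_stack_info_noinstr((void *)sp, current, &info) ||
	    info.type == STACK_TYPE_ENTRY ||
	    info.type >= STACK_TYPE_EXCEPTION_LAST)
		sp = __this_cpu_ist_top_va(VC2);

sync:
	/* Copy pt_regs to the new stack; asm then writes the result to %RSP. */
	sp = ALIGN_DOWN(sp, 8) - sizeof(*regs_ret);
	regs_ret = (struct pt_regs *)sp;
	*regs_ret = *regs;

	return regs_ret;
}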

> Unless AMD is more magic than I realize, the MOV SS bug^Wfeature means
> that #DB is *not* always called in safe places.

You are right, forgot about this. The MOV SS bug can very well
trigger a #VC(#DB) exception from the syscall gap.

> > And with SNP we need to be able to at least detect a malicious HV so we
> > can reliably kill the guest. Otherwise the HV could potentially take
> > control over the guest's execution flow and make it reveal its secrets.
> 
> True.  But is the rest of the machinery to be secure against EFLAGS.IF
> violations and such in place yet?

Not sure what you mean by EFLAGS.IF violations, probably enabling IRQs
while in the #VC handler? The #VC handler _must_ _not_ enable IRQs
anywhere in its call-path. If that ever happens it is a bug.

Regards,

Joerg


Re: [PATCH 2/3] x86/sev-es: Check if regs->sp is trusted before adjusting #VC IST stack

2021-02-18 Thread Andy Lutomirski
On Thu, Feb 18, 2021 at 3:25 AM Joerg Roedel  wrote:
>
> Hi Andy,
>
> On Wed, Feb 17, 2021 at 10:09:46AM -0800, Andy Lutomirski wrote:
> > Can you get rid of the linked list hack while you're at it?  This code
> > is unnecessarily convoluted right now, and it seems to be just asking
> > for weird bugs.  Just stash the old value in a local variable, please.
>
> Yeah, the linked list is not really necessary right now, because of the
> way nested NMI handling works and given that these functions are only
> used in the NMI handler right now.
> The whole #VC handling code was written with future requirements in
> mind, like what is needed when debugging registers get virtualized and
> #HV gets enabled.
> Until it's clear whether __sev_es_ist_enter/exit() is needed in any of
> these paths, I'd like to keep the linked list for now. It is more
> complicated but allows nesting.

I don't understand what this means.  The whole entry mechanism on x86
is structured so that we call a C function *and return from that C
function without longjmp-like magic* with the sole exception of
unwind_stack_do_exit().  This means that you can match up enters and
exits, and that unwind_stack_do_exit() needs to unwind correctly.  In
the former case, it's normal C and we can use normal local variables.
In the latter case, we know exactly what state we're trying to restore
and we can restore it directly without any linked lists or similar.

What do you have in mind that requires a linked list?

>
> > Meanwhile, I'm pretty sure I can break this whole scheme if the
> > hypervisor is messing with us.  As a trivial example, the sequence
> > SYSCALL gap -> #VC -> NMI -> #VC will go quite poorly.
>
> I don't see how this would break, can you elaborate?
>
> What I think happens is:
>
> SYSCALL gap (RSP is from userspace and untrusted)
>
> -> #VC - Handler on #VC IST stack detects that it interrupted
>the SYSCALL gap and switches to the task stack.
>

Can you point me to exactly what code you're referring to?  I spent a
while digging through the code and macro tangle and I can't find this.

>
> -> NMI - Now running on NMI IST stack. Depending on whether the
>stack switch in the #VC handler already happened, the #VC IST
>entry is adjusted so that a subsequent #VC will not overwrite
> >the interrupted handler's stack frame.
>
> -> #VC - Handler runs on the adjusted #VC IST stack and switches
>itself back to the NMI IST stack. This is safe wrt. nested
> >NMIs as long as nested NMIs themselves are safe.
>
> As a rule of thumb, think of the #VC handler as trying to be a non-IST
> handler by switching itself to the interrupted stack or the task stack.
> If it detects that this is not possible (which can't happen right now,
> but with SNP), it will kill the guest.

I will try to think of this, but it's hard, since I can't find the code :)

I found this comment:

 * With the current implementation it is always possible to switch to a safe
 * stack because #VC exceptions only happen at known places, like intercepted
 * instructions or accesses to MMIO areas/IO ports. They can also happen with
 * code instrumentation when the hypervisor intercepts #DB, but the critical
 * paths are forbidden to be instrumented, so #DB exceptions currently also
 * only happen in safe places.

Unless AMD is more magic than I realize, the MOV SS bug^Wfeature means
that #DB is *not* always called in safe places.

But I *think* the code you're talking about is this:

/*
 * If the entry is from userspace, switch stacks and treat it as
 * a normal entry.
 */
testb	$3, CS-ORIG_RAX(%rsp)
jnz	.Lfrom_usermode_switch_stack_\@

which does not run on #VC from kernel code.

> It needs to be IST, even without SNP, because #DB is IST too. When the
> hypervisor intercepts #DB then any #DB exception will be turned into
> #VC, so #VC needs to be handled anywhere a #DB can happen.

Eww.

>
> And with SNP we need to be able to at least detect a malicious HV so we
> can reliably kill the guest. Otherwise the HV could potentially take
> control over the guest's execution flow and make it reveal its secrets.

True.  But is the rest of the machinery to be secure against EFLAGS.IF
violations and such in place yet?

>
> Regards,
>
> Joerg


Re: [PATCH] net: check if protocol extracted by virtio_net_hdr_set_proto is correct

2021-02-18 Thread Willem de Bruijn
On Thu, Feb 18, 2021 at 10:01 AM Balazs Nemeth  wrote:
>
> For gso packets, virtio_net_hdr_set_proto sets the protocol (if it isn't
> set) based on the type in the virtio net hdr, but the skb could contain
> anything since it could come from packet_snd through a raw socket. If
> there is a mismatch between what virtio_net_hdr_set_proto sets and
> the actual protocol, then the skb could be handled incorrectly later
> on by gso.
>
> The network header of gso packets starts at 14 bytes, but a specially
> crafted packet could fool the call to skb_flow_dissect_flow_keys_basic
> as the network header offset in the skb could be incorrect.
> Consequently, EINVAL is not returned.
>
> There are even packets that can cause an infinite loop. For example, a
> packet with ethernet type ETH_P_MPLS_UC (which is unnoticed by
> virtio_net_hdr_to_skb) that is sent to a geneve interface will be
> handled by geneve_build_skb. In turn, it calls
> udp_tunnel_handle_offloads which then calls skb_reset_inner_headers.
> After that, the packet gets passed to mpls_gso_segment. That function
> calculates the mpls header length by taking the difference between
> network_header and inner_network_header. Since the two are equal
> (due to the earlier call to skb_reset_inner_headers), it will calculate
> a header of length 0, and it will not pull any headers. Then, it will
> call skb_mac_gso_segment which will again call mpls_gso_segment, etc...
> This leads to the infinite loop.
>
> For that reason, address the root cause of the issue: don't blindly
> trust the information provided by the virtio net header. Instead,
> check if the protocol in the packet actually matches the protocol set by
> virtio_net_hdr_set_proto.
>
> Fixes: 9274124f023b ("net: stricter validation of untrusted gso packets")
> Signed-off-by: Balazs Nemeth 
> ---
>  include/linux/virtio_net.h | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
> index e8a924eeea3d..cf2c53563f22 100644
> --- a/include/linux/virtio_net.h
> +++ b/include/linux/virtio_net.h
> @@ -79,8 +79,13 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
> if (gso_type && skb->network_header) {
> struct flow_keys_basic keys;
>
> -   if (!skb->protocol)
> +   if (!skb->protocol) {
> +   const struct ethhdr *eth = skb_eth_hdr(skb);
> +

Unfortunately, we cannot assume that the device type is ARPHRD_ETHER.

The underlying approach is sound: packets that have a gso type set in
the virtio_net_hdr have to be IP packets.

> virtio_net_hdr_set_proto(skb, hdr);
> +   if (skb->protocol != eth->h_proto)
> +   return -EINVAL;
> +   }
>  retry:
> if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
>   NULL, 0, 0, 0,
> --
> 2.29.2
>
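
One direction that avoids assuming an Ethernet device is to derive the
expected protocol from the device's header_ops and compare it. A sketch
of that idea (dev_parse_header_protocol() is an existing helper; the
final fix may well look different):

	if (!skb->protocol) {
		__be16 protocol = dev_parse_header_protocol(skb);

		/* Derive skb->protocol from the virtio gso type ... */
		virtio_net_hdr_set_proto(skb, hdr);
		/* ... and reject the packet on a mismatch. */
		if (protocol && protocol != skb->protocol)
			return -EINVAL;
	}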


Re: Kernel panic with vhost-vdpa

2021-02-18 Thread Gautam Dawar
Hi Jason,

Thanks for your response.

On Thu, 18 Feb 2021 at 14:18, Jason Wang  wrote:

> Hi Gautam:
> On 2021/2/15 9:01 PM, Gautam Dawar wrote:
>
> Hi Jason/Michael,
>
>
>
> I observed a kernel panic while testing vhost-vdpa with Xilinx adapters.
> Here are the details for your review:
>
>
>
> Problem statement:
>
> When qemu with vhost-vdpa netdevice is run for the first time, it works
> well. But after the VM is powered off, next qemu run causes kernel panic
> due to a NULL pointer dereference in irq_bypass_register_producer().
>
>
>
> Root cause analysis:
>
> When the VM is powered off, vhost_dev_stop() is invoked which in turn
> calls vhost_vdpa_reset_device() causing the irq_bypass producers to be
> unregistered.
>
>
>
> On the next run, when qemu opens the vhost device, the vhost_vdpa_open()
> file operation calls vhost_dev_init(). Here, call_ctx->producer memory is
> cleared in vhost_vring_call_reset().
>
>
>
> Further, when the virtqueues are initialized by vhost_virtqueue_init(),
> vhost_vdpa_setup_vq_irq() again registers the irq_bypass producer for each
> virtqueue. As the node member of struct irq_bypass_producer is also
> initialized to zero, traversal on the producers list causes crash due to
> NULL pointer dereference.
>
>
> Thanks a lot for reporting this issue.
>
>
>
>
> Fix details:
>
>
>
> I think that this issue can be fixed by invoking vhost_vdpa_setup_vq_irq()
> only when vhost_vdpa_set_status() includes VIRTIO_CONFIG_S_DRIVER_OK in the
> new status value. This way, there won’t be any stale nodes in the
> irqbypass module’s producers list, which are reset in
> vhost_vring_call_reset().
>
>
>
> Patch:
>
>
>
> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index
> 62a9bb0efc55..fdad94e2fbf9 100644
>
> --- a/drivers/vhost/vdpa.c
>
> +++ b/drivers/vhost/vdpa.c
>
> @@ -409,7 +409,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa
> *v, unsigned int cmd,
>
> cb.private = NULL;
>
> }
>
> ops->set_vq_cb(vdpa, idx, &cb);
>
> -   vhost_vdpa_setup_vq_irq(v, idx);
>
> break;
>
>
>
> case VHOST_SET_VRING_NUM:
>
>
>
> We can also track this issue in Bugzilla ticket 211711
> (https://bugzilla.kernel.org/show_bug.cgi?id=211711) and the complete
> patch is attached with this email.
>
>
> So vhost supports removing or switching the eventfd through
> vhost_vdpa_vring_ioctl(). If userspace wants to switch to another
> eventfd, we should re-do the register and unregister.
>
GD>> This makes sense. I missed the use case where userspace may want to
switch to a different eventfd.

> I think we need to deal with this issue in another way. Can we check
> whether or not the producer is initialized before?
>
> Thanks
>
GD>> Initialization path is fine but the actual problem lies in the
clean-up part.
I think the following check is the cause of this issue:

static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
	...
	if (vq->call_ctx.producer.irq)
		irq_bypass_unregister_producer(&vq->call_ctx.producer);

The above if condition will prevent the de-initialization of the producer
nodes corresponding to irq 0, but irq_bypass_unregister_producer() should
be called for all valid irq values, including zero.

Accordingly, following patch is required to fix this issue:

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 62a9bb0efc55..d1c3a33c6239 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -849,7 +849,7 @@ static void vhost_vdpa_clean_irq(struct vhost_vdpa *v)
 
 	for (i = 0; i < v->nvqs; i++) {
 		vq = &v->vqs[i];
-		if (vq->call_ctx.producer.irq)
+		if (vq->call_ctx.producer.irq >= 0)
 			irq_bypass_unregister_producer(&vq->call_ctx.producer);
 	}
 }

The revised patch (bug211711_fix.patch) is also attached with this email.
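
A complementary option would be to make the reset path itself
unambiguous, so that "never registered" and "irq 0" cannot be confused.
A hypothetical sketch (the field layout follows drivers/vhost/vhost.c;
the sentinel is an assumption, not part of the attached patch):

/*
 * Hypothetical variant: mark the producer as unregistered with a
 * sentinel irq value right after clearing it.
 */
static void vhost_vring_call_reset(struct vhost_vring_call *call_ctx)
{
	call_ctx->ctx = NULL;
	memset(&call_ctx->producer, 0x0, sizeof(struct irq_bypass_producer));
	call_ctx->producer.irq = -1;	/* assumption: -1 means "not registered" */
}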

>
>
> Regards,
>
> Gautam Dawar
>
>



Re: [PATCH v2 08/11] drm/qxl: fix monitors object vmap

2021-02-18 Thread Thomas Zimmermann



Am 17.02.21 um 13:32 schrieb Gerd Hoffmann:

Use the correct vmap variant.  We don't hold a reservation here,
so we can't use the _locked variant.  We can drop the pin because
qxl_bo_vmap will do that for us.

Signed-off-by: Gerd Hoffmann 


Acked-by: Thomas Zimmermann 

I simply forgot to ack this patch.


---
 drivers/gpu/drm/qxl/qxl_display.c | 7 ++-----
  1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c
index bfcc93089a94..f106da917863 100644
--- a/drivers/gpu/drm/qxl/qxl_display.c
+++ b/drivers/gpu/drm/qxl/qxl_display.c
@@ -1159,12 +1159,10 @@ int qxl_create_monitors_object(struct qxl_device *qdev)
 	}
 	qdev->monitors_config_bo = gem_to_qxl_bo(gobj);
 
-	ret = qxl_bo_pin(qdev->monitors_config_bo);
+	ret = qxl_bo_vmap(qdev->monitors_config_bo, &map);
 	if (ret)
 		return ret;
 
-	qxl_bo_vmap_locked(qdev->monitors_config_bo, &map);
-
 	qdev->monitors_config = qdev->monitors_config_bo->kptr;
 	qdev->ram_header->monitors_config =
 		qxl_bo_physical_address(qdev, qdev->monitors_config_bo, 0);
@@ -1189,8 +1187,7 @@ int qxl_destroy_monitors_object(struct qxl_device *qdev)
 	qdev->monitors_config = NULL;
 	qdev->ram_header->monitors_config = 0;
 
-	qxl_bo_vunmap_locked(qdev->monitors_config_bo);
-	ret = qxl_bo_unpin(qdev->monitors_config_bo);
+	ret = qxl_bo_vunmap(qdev->monitors_config_bo);
 	if (ret)
 		return ret;
 



--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer




Re: [PATCH v2 10/11] drm/qxl: rework cursor plane

2021-02-18 Thread Gerd Hoffmann
  Hi,

> > Well.  I suspect I could easily spend a month cleaning up and partly
> > redesigning the qxl driver (specifically qxl_draw.c + qxl_image.c).
> > 
> > I'm not sure I'll find the time to actually do that anytime soon.
> > I have plenty of other stuff on my TODO list, and given that the
> > world is transitioning to virtio-gpu the priority for qxl isn't
> > that high.
> 
> Well, in that case:
> 
> Acked-by: Thomas Zimmermann 
> 
> for patches 9 and 10. Having the vmap calls fixed is at least worth it.

Thanks.  Any chance you can ack 8/11 too (only patch missing an ack).

take care,
  Gerd



Re: [PATCH v2 10/11] drm/qxl: rework cursor plane

2021-02-18 Thread Thomas Zimmermann

Hi

Am 18.02.21 um 12:50 schrieb Gerd Hoffmann:

   Hi,


I'm still trying to wrap my head around the qxl cursor code.

Getting vmap out of the commit tail is good, but I feel like this isn't
going in the right direction overall.

In ast, these helper functions were only good when converting the driver to
atomic modesetting. So I removed them in the latest patchset and did all the
updates in the plane helpers directly.


I see the helper functions more as a way to get some structure into the
code flow.  The callbacks are easier to read if they just call helper
functions for stuff which needs more than a handful lines of code
(patch 9/11 exists for the same reason).

The helpers also make it easier to move work from one callback to another,
but that is just a useful side-effect.

I had considered making that two separate patches, one factoring out code
into functions and one moving the calls.  Turned out to not be that easy
though, because the old qxl_cursor_atomic_update() code was a rather
hairy mix of qxl_create_cursor() + qxl_primary_apply_cursor() +
qxl_primary_move_cursor().


For cursor_bo itself, it seems to be transitional state that is only used
during the plane update and crtc update. It should probably be stored in a
plane-state structure.

Some of the primary plane's functions seem to deal with cursor handling.
What's the role of the primary plane in cursor handling?


It's a quirk.  The qxl device will forget the cursor state on
qxl_io_create_primary(), so I have to remember the cursor state
and re-establish it by calling qxl_primary_apply_cursor() again.

So I'm not sure sticking this into plane state would work.  Because of
the quirk this is more than just a handover from prepare to commit.


For now, I suggest to merge patch 1 to 8 and 11; and move the cursor patches
into a new patchset.


I can merge 1-8, but 11 has to wait until the cursor is sorted.
There is a reason why 11 is last in the series ;)


I'd like to hear Daniel's opinion on this. Do you have
further plans here?


Well.  I suspect I could easily spend a month cleaning up and partly
redesigning the qxl driver (specifically qxl_draw.c + qxl_image.c).

I'm not sure I'll find the time to actually do that anytime soon.
I have plenty of other stuff on my TODO list, and given that the
world is transitioning to virtio-gpu the priority for qxl isn't
that high.


Well, in that case:

Acked-by: Thomas Zimmermann 

for patches 9 and 10. Having the vmap calls fixed is at least worth it.

Best regards
Thomas



So, no, I have no short-term plans for qxl beyond fixing pins +
reservations + lockdep.

take care,
   Gerd

___
dri-devel mailing list
dri-de...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel



--
Thomas Zimmermann
Graphics Driver Developer
SUSE Software Solutions Germany GmbH
Maxfeldstr. 5, 90409 Nürnberg, Germany
(HRB 36809, AG Nürnberg)
Geschäftsführer: Felix Imendörffer




Re: [PATCH v1] vdpa/mlx5: Restore the hardware used index after change map

2021-02-18 Thread Si-Wei Liu



On 2/17/2021 8:44 PM, Jason Wang wrote:


On 2021/2/10 4:59 PM, Si-Wei Liu wrote:



On 2/9/2021 7:53 PM, Jason Wang wrote:


On 2021/2/10 10:30 AM, Si-Wei Liu wrote:



On 2/8/2021 10:37 PM, Jason Wang wrote:


On 2021/2/9 2:12 PM, Eli Cohen wrote:

On Tue, Feb 09, 2021 at 11:20:14AM +0800, Jason Wang wrote:

On 2021/2/8 6:04 PM, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 05:04:27PM +0800, Jason Wang wrote:

On 2021/2/8 2:37 PM, Eli Cohen wrote:

On Mon, Feb 08, 2021 at 12:27:18PM +0800, Jason Wang wrote:

On 2021/2/6 7:07 AM, Si-Wei Liu wrote:

On 2/3/2021 11:36 PM, Eli Cohen wrote:
When a change of memory map occurs, the hardware resources are
destroyed and then re-created again with the new memory map. In such
case, we need to restore the hardware available and used indices. The
driver failed to restore the used index which is added here.

Also, since the driver also fails to reset the available and used
indices upon device reset, fix this here to avoid regression caused by
the fact that used index may not be zero upon device reset.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen
---
v0 -> v1:
Clear indices upon device reset

 drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 88dde3455bfd..b5fe6d2ad22f 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -87,6 +87,7 @@ struct mlx5_vq_restore_info {
 	u64 device_addr;
 	u64 driver_addr;
 	u16 avail_index;
+	u16 used_index;
 	bool ready;
 	struct vdpa_callback cb;
 	bool restore;
@@ -121,6 +122,7 @@ struct mlx5_vdpa_virtqueue {
 	u32 virtq_id;
 	struct mlx5_vdpa_net *ndev;
 	u16 avail_idx;
+	u16 used_idx;
 	int fw_state;
 
 	/* keep last in the struct */
@@ -804,6 +806,7 @@ static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtque
 
 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
+	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
 		 get_features_12_3(ndev->mvdev.actual_features));
 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
@@ -1022,6 +1025,7 @@ static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m
 struct mlx5_virtq_attr {
 	u8 state;
 	u16 available_index;
+	u16 used_index;
 };
 
 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
@@ -1052,6 +1056,7 @@ static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueu
 	memset(attr, 0, sizeof(*attr));
 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
+	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
 	kfree(out);
 	return 0;
 
@@ -1535,6 +1540,16 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
 	}
 }
 
+static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
+{
+	int i;
+
+	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
+		ndev->vqs[i].avail_idx = 0;
+		ndev->vqs[i].used_idx = 0;
+	}
+}
+
 /* TODO: cross-endian support */
 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
 {
@@ -1610,6 +1625,7 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu
 		return err;
 
 	ri->avail_index = attr.available_index;
+	ri->used_index = attr.used_index;
 	ri->ready = mvq->ready;
 	ri->num_ent = mvq->num_ent;
 	ri->desc_addr = mvq->desc_addr;
@@ -1654,6 +1670,7 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev)
 			continue;
 
 		mvq->avail_idx = ri->avail_index;
+		mvq->used_idx = ri->used_index;
 		mvq->ready = ri->ready;
 		mvq->num_ent = ri->num_ent;
 		mvq->desc_addr = ri->desc_addr;
@@ -1768,6 +1785,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
 	if (!status) {
 		mlx5_vdpa_info(mvdev, "performing device reset\n");
 		teardown_driver(ndev);
+		clear_virtqueues(ndev);
The clearing looks fine at the first glance, as it aligns with the
other state cleanups floating around at the same place. However, the
thing is get_vq_state() is supposed to be called right after to get
sync'ed with the latest internal avail_index from device while vq is
stopped. The index was saved in the driver software at vq suspension,
but

Re: [PATCH v2 10/11] drm/qxl: rework cursor plane

2021-02-18 Thread Gerd Hoffmann
  Hi,

> I'm still trying to wrap my head around the qxl cursor code.
> 
> Getting vmap out of the commit tail is good, but I feel like this isn't
> going in the right direction overall.
> 
> In ast, these helper functions were only good when converting the driver to
> atomic modesetting. So I removed them in the latest patchset and did all the
> updates in the plane helpers directly.

I see the helper functions more as a way to get some structure into the
code flow.  The callbacks are easier to read if they just call helper
functions for stuff which needs more than a handful lines of code
(patch 9/11 exists for the same reason).

The helpers also make it easier to move work from one callback to another,
but that is just a useful side-effect.

I had considered making that two separate patches, one factoring out code
into functions and one moving the calls.  Turned out to not be that easy
though, because the old qxl_cursor_atomic_update() code was a rather
hairy mix of qxl_create_cursor() + qxl_primary_apply_cursor() +
qxl_primary_move_cursor().

> For cursor_bo itself, it seems to be transitional state that is only used
> during the plane update and crtc update. It should probably be stored in a
> plane-state structure.
> 
> Some of the primary plane's functions seem to deal with cursor handling.
> What's the role of the primary plane in cursor handling?

It's a quirk.  The qxl device will forget the cursor state on
qxl_io_create_primary(), so I have to remember the cursor state
and re-establish it by calling qxl_primary_apply_cursor() again.

So I'm not sure sticking this into plane state would work.  Because of
the quirk this is more than just a handover from prepare to commit.

> For now, I suggest to merge patch 1 to 8 and 11; and move the cursor patches
> into a new patchset.

I can merge 1-8, but 11 has to wait until the cursor is sorted.
There is a reason why 11 is last in the series ;)

> I'd like to hear Daniel's opinion on this. Do you have
> further plans here?

Well.  I suspect I could easily spend a month cleaning up and partly
redesigning the qxl driver (specifically qxl_draw.c + qxl_image.c).

I'm not sure I'll find the time to actually do that anytime soon.
I have plenty of other stuff on my TODO list, and given that the
world is transitioning to virtio-gpu the priority for qxl isn't
that high.

So, no, I have no short-term plans for qxl beyond fixing pins +
reservations + lockdep.

take care,
  Gerd



Re: [PATCH 2/3] x86/sev-es: Check if regs->sp is trusted before adjusting #VC IST stack

2021-02-18 Thread Joerg Roedel
Hi Andy,

On Wed, Feb 17, 2021 at 10:09:46AM -0800, Andy Lutomirski wrote:
> Can you get rid of the linked list hack while you're at it?  This code
> is unnecessarily convoluted right now, and it seems to be just asking
> for weird bugs.  Just stash the old value in a local variable, please.

Yeah, the linked list is not really necessary right now, because of the
way nested NMI handling works and given that these functions are only
used in the NMI handler right now.
The whole #VC handling code was written with future requirements in
mind, like what is needed when debugging registers get virtualized and
#HV gets enabled.
Until it's clear whether __sev_es_ist_enter/exit() is needed in any of
these paths, I'd like to keep the linked list for now. It is more
complicated but allows nesting.

> Meanwhile, I'm pretty sure I can break this whole scheme if the
> hypervisor is messing with us.  As a trivial example, the sequence
> SYSCALL gap -> #VC -> NMI -> #VC will go quite poorly.

I don't see how this would break, can you elaborate?

What I think happens is:

SYSCALL gap (RSP is from userspace and untrusted)

-> #VC - Handler on #VC IST stack detects that it interrupted
   the SYSCALL gap and switches to the task stack.


-> NMI - Now running on NMI IST stack. Depending on whether the
   stack switch in the #VC handler already happened, the #VC IST
   entry is adjusted so that a subsequent #VC will not overwrite
   the interrupted handler's stack frame (see the sketch below).

-> #VC - Handler runs on the adjusted #VC IST stack and switches
   itself back to the NMI IST stack. This is safe wrt. nested
   NMIs as long as nested NMIs themselves are safe.

As a rule of thumb, think of the #VC handler as trying to be a non-IST
handler by switching itself to the interrupted stack or the task stack.
If it detects that this is not possible (which can't happen right now,
but with SNP), it will kill the guest.
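
The #VC IST adjustment mentioned in the NMI step above looks roughly
like this, condensed from __sev_es_ist_enter() in
arch/x86/kernel/sev-es.c (a sketch, not verbatim):

/*
 * On NMI entry, make room on the #VC IST stack so a #VC hitting the
 * NMI handler cannot overwrite a live #VC stack frame.
 */
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
	unsigned long old_ist, new_ist;

	/* Read the current #VC IST entry */
	old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);

	/* Make room on the IST stack */
	if (on_vc_stack(regs->sp))
		new_ist = ALIGN_DOWN(regs->sp, 8);
	else
		new_ist = old_ist - sizeof(old_ist);

	/* Save the old entry below the new top so exit can restore it */
	*(unsigned long *)new_ist = old_ist;

	/* Install the adjusted IST entry */
	this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], new_ist);
}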

Also #VC is currently not safe against #MC, but this is the same as with
NMI and #MC. And more care is needed when SNP gets enabled and #VCs can
happen in the stack switching/stack adjustment code itself. I will
probably just add a check then to kill the guest if an SNP related #VC
comes from noinstr code.

> Is this really better than just turning IST off for #VC and
> documenting that we are not secure against a malicious hypervisor yet?

It needs to be IST, even without SNP, because #DB is IST too. When the
hypervisor intercepts #DB then any #DB exception will be turned into
#VC, so #VC needs to be handled anywhere a #DB can happen.

And with SNP we need to be able to at least detect a malicious HV so we
can reliably kill the guest. Otherwise the HV could potentially take
control over the guest's execution flow and make it reveal its secrets.

Regards,

Joerg


Re: Kernel panic with vhost-vdpa

2021-02-18 Thread Jason Wang

Hi Gautam:

On 2021/2/15 9:01 PM, Gautam Dawar wrote:


Hi Jason/Michael,

I observed a kernel panic while testing vhost-vdpa with Xilinx 
adapters. Here are the details for your review:


Problem statement:

When qemu with vhost-vdpa netdevice is run for the first time, it 
works well. But after the VM is powered off, next qemu run causes 
kernel panic due to a NULL pointer dereference in 
irq_bypass_register_producer().


Root cause analysis:

When the VM is powered off, vhost_dev_stop() is invoked which in turn 
calls vhost_vdpa_reset_device() causing the irq_bypass producers to be 
unregistered.


On the next run, when qemu opens the vhost device, the 
vhost_vdpa_open() file operation calls vhost_dev_init(). Here, 
call_ctx->producer memory is cleared in vhost_vring_call_reset().


Further, when the virtqueues are initialized by 
vhost_virtqueue_init(), vhost_vdpa_setup_vq_irq() again registers the 
irq_bypass producer for each virtqueue. As the node member of struct 
irq_bypass_producer is also initialized to zero, traversal on the 
producers list causes crash due to NULL pointer dereference.




Thanks a lot for reporting this issue.



Fix details:

I think that this issue can be fixed by invoking 
vhost_vdpa_setup_vq_irq() only when vhost_vdpa_set_status() includes 
VIRTIO_CONFIG_S_DRIVER_OK in the new status value. This way, there 
won’t be any stale nodes in the irqbypass  module’s producers list 
which are reset in vhost_vring_call_reset().


Patch:

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 
62a9bb0efc55..fdad94e2fbf9 100644


--- a/drivers/vhost/vdpa.c

+++ b/drivers/vhost/vdpa.c

@@ -409,7 +409,6 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,


cb.private = NULL;

}

ops->set_vq_cb(vdpa, idx, &cb);

- vhost_vdpa_setup_vq_irq(v, idx);

break;

case VHOST_SET_VRING_NUM:

We can also track this issue in Bugzilla ticket 211711
(https://bugzilla.kernel.org/show_bug.cgi?id=211711) and the complete
patch is attached with this email.




So vhost supports removing or switching the eventfd through
vhost_vdpa_vring_ioctl(). If userspace wants to switch to another
eventfd, we should re-do the register and unregister.


I think we need to deal with this issue in another way. Can we check
whether or not the producer is initialized before?


Thanks



Regards,

Gautam Dawar


Re: [PATCH v5 4/8] x86/mm/tlb: Flush remote and local TLBs concurrently

2021-02-18 Thread Christoph Hellwig
Given that the last patch killed the last previously existing user of
on_each_cpu_cond_mask, these are now the only users.

>   if (info->freed_tables) {
> - smp_call_function_many(cpumask, flush_tlb_func,
> -(void *)info, 1);
> + on_each_cpu_cond_mask(NULL, flush_tlb_func, (void *)info, true,
> +   cpumask);

.. 

> + on_each_cpu_cond_mask(NULL, flush_tlb_func, (void *)info, true,
> +   cpumask);

Which means the cond_func is unused, and thus on_each_cpu_cond_mask can
go away entirely in favor of on_each_cpu_cond.
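
For the mask case above, a NULL cond_func makes the conditional call
degenerate to a plain cross-call on the mask. A sketch of the
simplification, using the existing non-conditional API:

	/* Equivalent when cond_func == NULL: call on every CPU in the mask */
	on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);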


Re: [PATCH v2 10/11] drm/qxl: rework cursor plane

2021-02-18 Thread Thomas Zimmermann

Hi

Am 17.02.21 um 13:32 schrieb Gerd Hoffmann:

Add helper functions to create and move the cursor.  Create the
cursor_bo in the prepare_fb callback; in the atomic_commit callback we
only send the update command to the host.


I'm still trying to wrap my head around the qxl cursor code.

Getting vmap out of the commit tail is good, but I feel like this isn't
going in the right direction overall.

In ast, these helper functions were only good when converting the driver
to atomic modesetting. So I removed them in the latest patchset and did
all the updates in the plane helpers directly.

For cursor_bo itself, it seems to be transitional state that is only
used during the plane update and crtc update. It should probably be
stored in a plane-state structure, along the lines of the sketch below.
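
A hypothetical sketch of such a subclass (illustrative only, not
existing code; the state duplicate/destroy hooks are omitted):

/* Carry the cursor bo across the prepare_fb -> atomic_update handover. */
struct qxl_plane_state {
	struct drm_plane_state base;
	struct qxl_bo *cursor_bo;
};

static inline struct qxl_plane_state *
to_qxl_plane_state(struct drm_plane_state *state)
{
	return container_of(state, struct qxl_plane_state, base);
}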


Some of the primary plane's functions seem to deal with cursor handling. 
What's the role of the primary plane in cursor handling?


For now, I suggest merging patches 1 to 8 and 11, and moving the cursor
patches into a new patchset. I'd like to hear Daniel's opinion on this.
Do you have further plans here?


If you absolutely want patches 9 and 10, I'd rubber-stamp an A-b on them.

Best regards
Thomas



Signed-off-by: Gerd Hoffmann 
---
 drivers/gpu/drm/qxl/qxl_display.c | 248 ++++++++++++++++++--------------
 1 file changed, 133 insertions(+), 115 deletions(-)

diff --git a/drivers/gpu/drm/qxl/qxl_display.c 
b/drivers/gpu/drm/qxl/qxl_display.c
index b315d7484e21..4a3d272e8d6c 100644
--- a/drivers/gpu/drm/qxl/qxl_display.c
+++ b/drivers/gpu/drm/qxl/qxl_display.c
@@ -476,12 +476,11 @@ static int qxl_primary_atomic_check(struct drm_plane 
*plane,
return qxl_check_framebuffer(qdev, bo);
  }
  
-static int qxl_primary_apply_cursor(struct drm_plane *plane)

+static int qxl_primary_apply_cursor(struct qxl_device *qdev,
+   struct drm_plane_state *plane_state)
  {
-   struct drm_device *dev = plane->dev;
-   struct qxl_device *qdev = to_qxl(dev);
-   struct drm_framebuffer *fb = plane->state->fb;
-   struct qxl_crtc *qcrtc = to_qxl_crtc(plane->state->crtc);
+   struct drm_framebuffer *fb = plane_state->fb;
+   struct qxl_crtc *qcrtc = to_qxl_crtc(plane_state->crtc);
struct qxl_cursor_cmd *cmd;
struct qxl_release *release;
int ret = 0;
@@ -505,8 +504,8 @@ static int qxl_primary_apply_cursor(struct drm_plane *plane)
  
  	cmd = (struct qxl_cursor_cmd *)qxl_release_map(qdev, release);

cmd->type = QXL_CURSOR_SET;
-   cmd->u.set.position.x = plane->state->crtc_x + fb->hot_x;
-   cmd->u.set.position.y = plane->state->crtc_y + fb->hot_y;
+   cmd->u.set.position.x = plane_state->crtc_x + fb->hot_x;
+   cmd->u.set.position.y = plane_state->crtc_y + fb->hot_y;
  
  	cmd->u.set.shape = qxl_bo_physical_address(qdev, qcrtc->cursor_bo, 0);
  
@@ -523,6 +522,113 @@ static int qxl_primary_apply_cursor(struct drm_plane *plane)

return ret;
  }
  
+static int qxl_primary_move_cursor(struct qxl_device *qdev,

+  struct drm_plane_state *plane_state)
+{
+   struct drm_framebuffer *fb = plane_state->fb;
+   struct qxl_crtc *qcrtc = to_qxl_crtc(plane_state->crtc);
+   struct qxl_cursor_cmd *cmd;
+   struct qxl_release *release;
+   int ret = 0;
+
+   if (!qcrtc->cursor_bo)
+   return 0;
+
+   ret = qxl_alloc_release_reserved(qdev, sizeof(*cmd),
+QXL_RELEASE_CURSOR_CMD,
+&release, NULL);
+   if (ret)
+   return ret;
+
+   ret = qxl_release_reserve_list(release, true);
+   if (ret) {
+   qxl_release_free(qdev, release);
+   return ret;
+   }
+
+   cmd = (struct qxl_cursor_cmd *)qxl_release_map(qdev, release);
+   cmd->type = QXL_CURSOR_MOVE;
+   cmd->u.position.x = plane_state->crtc_x + fb->hot_x;
+   cmd->u.position.y = plane_state->crtc_y + fb->hot_y;
+   qxl_release_unmap(qdev, release, &cmd->release_info);
+
+   qxl_release_fence_buffer_objects(release);
+   qxl_push_cursor_ring_release(qdev, release, QXL_CMD_CURSOR, false);
+   return ret;
+}
+
+static struct qxl_bo *qxl_create_cursor(struct qxl_device *qdev,
+   struct qxl_bo *user_bo,
+   int hot_x, int hot_y)
+{
+   static const u32 size = 64 * 64 * 4;
+   struct qxl_bo *cursor_bo;
+   struct dma_buf_map cursor_map;
+   struct dma_buf_map user_map;
+   struct qxl_cursor cursor;
+   int ret;
+
+   if (!user_bo)
+   return NULL;
+
+   ret = qxl_bo_create(qdev, sizeof(struct qxl_cursor) + size,
+   false, true, QXL_GEM_DOMAIN_VRAM, 1,
+   NULL, &cursor_bo);
+   if (ret)
+   goto err;
+
+   ret = qxl_bo_vmap(cursor_bo, &cursor_map);
+   if (ret)
+   goto err_unref;
+
+   ret = qxl_bo_vmap(user_bo,