Re: [PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Gerd Hoffmann
On Thu, Apr 04, 2019 at 12:58:09PM +1000, David Airlie wrote:
> On Wed, Apr 3, 2019 at 5:23 PM Gerd Hoffmann  wrote:
> >
> > Time to kill some bad sample code people are copying from ;)
> >
> > This is a complete rewrite of the cirrus driver.  The cirrus_mode_set()
> > function is pretty much the only function which is carried over largely
> > unmodified.  Everything else is upside down.
> >
> > It is a single monster patch.  But given that it does some pretty
> > fundamental changes to the driver's workflow and also reduces the code
> > size by roughly 70% I think it'll still be a lot easier to review than a
> > longish baby-step patch series.
> >
> > Changes summary:
> >  - Given the small amount of video memory (4 MB) the cirrus device has
> >the rewritten driver doesn't try to manage buffers there.  Instead
> >it will blit (memcpy) the active framebuffer to video memory.
> 
> Does it get any slower?  With TTM I just wrote it to migrate just the
> frontbuffer in/out of VRAM on modeset; won't we end up with more
> copies now?

I didn't benchmark it, but if you care about performance you should not
be using cirrus anyway ...

For fbcon it probably doesn't make much of a difference, fbcon used a
shadow framebuffer before (for fbdev mmap and dirty tracking).

xorg probably ends up with more copying.

Anything doing display updates with page flips (i.e. wayland) should end
up with less copying, because you have one copy (blit) instead of two
copies (migrate old frontbuffer out of vram, migrate new frontbuffer
into vram) on pageflip.
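
To make the trade-off concrete: the per-pageflip work in the rewritten
driver is essentially a damage-clipped memcpy into the VRAM aperture.
A rough sketch (names and pitch handling are illustrative, not the
actual cirrus.c code):

/* Sketch only: copy one damage rectangle from the shmem shadow
 * framebuffer into the 4 MB VRAM aperture, assuming RGB565
 * (2 bytes/pixel). */
static void blit_rect(void __iomem *vram, unsigned int vram_pitch,
		      const void *shadow, unsigned int fb_pitch,
		      const struct drm_rect *r)
{
	unsigned int y, len = (r->x2 - r->x1) * 2;

	for (y = r->y1; y < r->y2; y++)
		memcpy_toio(vram + y * vram_pitch + r->x1 * 2,
			    shadow + y * fb_pitch + r->x1 * 2, len);
}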

Speaking of wayland:  Seems at least gnome-shell insists on using XR24.

> >  - Only DRM_FORMAT_RGB565 (depth 16) is supported.  The old driver does
> >that too by default.  There was a module parameter which enables 24/32
> >bpp support and disables higher resolutions (due to cirrus hardware
> >constraints).  That parameter wasn't reimplemented.

> This might be the big sticking point, this is a userspace regression
> for a feature that was explicitly added a few years ago, can we really
> get away without it?

Well, I can reintroduce the module option.  I don't see any other
reasonable way to support 32bpp.  If the driver reports XR24 as
supported and also adds the higher resolutions (which work with RG16
only) to the mode list, userspace will of course try the higher
resolutions with XR24 and struggle ...
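
If the option does come back it would presumably look much like the old
one; a minimal sketch (parameter name borrowed from the old driver, the
cutoff logic is an assumption):

#include <linux/module.h>

static int cirrus_bpp = 16;
module_param_named(bpp, cirrus_bpp, int, 0444);
MODULE_PARM_DESC(bpp, "Framebuffer bits per pixel (16 or 32, default 16)");

/* mode_valid() would then reject modes that don't fit into the 4 MB of
 * VRAM at the selected bpp, and XR24 would only be added to the format
 * list when cirrus_bpp >= 24. */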

cheers,
  Gerd



[PATCH 1/1] proof of concept for GPU forwarding

2019-04-03 Thread Lepton Wu
---
 arch/x86/configs/x86_64_defconfig  |5 +
 drivers/char/Makefile  |1 +
 drivers/char/forwarder/Makefile|8 +
 drivers/char/forwarder/forwarder.h |  103 ++
 drivers/char/forwarder/forwarder_drv.c | 2104 
 fs/open.c  |   18 +
 include/uapi/linux/virtwl.h|   64 +
 tools/forward/Makefile |2 +
 tools/forward/README   |   58 +
 tools/forward/qemu.diff| 1117 +
 tools/forward/wayland-proxy-main.c |   58 +
 tools/forward/wayland-proxy.c  |  297 
 12 files changed, 3835 insertions(+)
 create mode 100644 drivers/char/forwarder/Makefile
 create mode 100644 drivers/char/forwarder/forwarder.h
 create mode 100644 drivers/char/forwarder/forwarder_drv.c
 create mode 100644 include/uapi/linux/virtwl.h
 create mode 100644 tools/forward/Makefile
 create mode 100644 tools/forward/README
 create mode 100644 tools/forward/qemu.diff
 create mode 100644 tools/forward/wayland-proxy-main.c
 create mode 100644 tools/forward/wayland-proxy.c

diff --git a/arch/x86/configs/x86_64_defconfig 
b/arch/x86/configs/x86_64_defconfig
index 1d3badfda09e..6c6e55051d5c 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -310,3 +310,8 @@ CONFIG_SECURITY_SELINUX_DISABLE=y
 CONFIG_EFI_STUB=y
 CONFIG_EFI_MIXED=y
 CONFIG_ACPI_BGRT=y
+CONFIG_VIRTIO=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_VSOCKETS=y
+CONFIG_VIRTIO_VSOCKETS=y
+CONFIG_VIRTIO_VSOCKETS_COMMON=y
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fbea7dd12932..af406b6e3e91 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -54,3 +54,4 @@ js-rtc-y = rtc.o
 obj-$(CONFIG_XILLYBUS) += xillybus/
 obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
 obj-$(CONFIG_ADI)  += adi.o
+obj-y  += forwarder/
diff --git a/drivers/char/forwarder/Makefile b/drivers/char/forwarder/Makefile
new file mode 100644
index ..bc452e51494a
--- /dev/null
+++ b/drivers/char/forwarder/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the drm device driver.  This driver provides support for the
+# Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
+
+forwarder-y := forwarder_drv.o
+
+obj-y += forwarder.o
diff --git a/drivers/char/forwarder/forwarder.h 
b/drivers/char/forwarder/forwarder.h
new file mode 100644
index ..4937cebbf7b2
--- /dev/null
+++ b/drivers/char/forwarder/forwarder.h
@@ -0,0 +1,103 @@
+enum {
+   STREAM_MAGIC = 0xbeefc1ea,
+   EVENT_MAGIC,
+   IPC_MAGIC,
+};
+struct pwrite_stream {
+   unsigned int magic;
+   int fd;
+   unsigned int handle;
+   unsigned int offset;
+   unsigned int size;
+};
+
+#define IPC_PAGE_SIZE 32768
+
+#define IPC_COUNT 4
+
+struct ipc {
+  volatile unsigned int seq;
+  unsigned int cmd;
+  union {
+  struct {
+ int arg1;
+ int arg2;
+ int arg3;
+ int pad1;
+ };
+ struct {
+ volatile int64_t ret;
+ int64_t pad2;
+ };
+ struct {
+ int fd;
+ } ioctl;
+ struct {
+ unsigned int pn_count;
+ } hostfd;
+ struct {
+ void* addr;
+ } dmabuf;
+ struct {
+ int fd;
+ unsigned int pn_off;
+ unsigned int pn_count;
+ } mmap;
+ struct {
+ unsigned int pn_off;
+ unsigned int pn_count;
+ } munmap;
+ struct {
+ int fd;
+ int whence;
+ } lseek;
+ struct {
+ int fd;
+ unsigned int len;
+ } fallocate;
+ struct {
+ int fd;
+ unsigned int len;
+ } ftruncate;
+ struct {
+ int fd;
+ uint32_t fdc;
+ uint32_t size;
+ } msg;
+  };
+  char data[0];
+};
+
+#define WL_IOCTL_BASE 'w'
+#define VIRT_WL_MAX 32
+#define WL_IO(nr)  _IO(WL_IOCTL_BASE, nr + VIRT_WL_MAX)
+
+#define WL_CMD_NEW_RENDER_FD WL_IO(0x00)
+#define WL_CMD_NEW_WL_FD WL_IO(0x01)
+#define WL_CMD_NEW_MEM_FD WL_IO(0x02)
+#define WL_CMD_NEW_SYNC_FD WL_IO(0x03)
+#define WL_CMD_RECVMSG WL_IO(0x04)
+#define WL_CMD_SENDMSG WL_IO(0x05)
+#define WL_CMD_MMAP WL_IO(0x06)
+#define WL_CMD_MUNMAP WL_IO(0x07)
+#define WL_CMD_LSEEK WL_IO(0x08)
+#define WL_CMD_CLEAR_COUNTER WL_IO(0x09)
+#define WL_CMD_SHOW_COUNTER WL_IO(0x0A)
+#define WL_CMD_NEW_DMABUF WL_IO(0x0B)
+#define WL_CMD_FALLOCATE WL_IO(0x0C)
+#define WL_CMD_FTRUNCATE WL_IO(0x0D)
+
+#define SW_SYNC_IOC_MAGIC  'W'
+
+struct sw_sync_create_fence_data {
+   unsigned intvalue;
+   charname[32];
+   int fence; /* fd of new fence */
+};
+
+#define 

Proof of concept for GPU forwarding for Linux guest on Linux host.

2019-04-03 Thread Lepton Wu
Hi,

This is a proof of concept of GPU forwarding for Linux guest on Linux host.
I'd like to get comments and suggestions from community before I put more
time on it. To summarize what it is:

1. It's a solution to bring GPU acceleration to Linux VM guests on a Linux host.
It could work with different GPUs, although the current proof of concept only
works with Intel GPUs.

2. The basic idea is: under Linux, most applications use GPU acceleration with
the help of the Mesa library, and Mesa interacts with the kernel GPU driver by
operating on special character device files exported by that driver.  Mesa
opens these device files, and operations on the GPU are done through
ioctl/mmap system calls and regular memory operations.

We write a kernel driver for the guest Linux kernel and let it export the same
interface to user space as the real Linux GPU kernel driver.  So it's an API
proxy between host and VM guest; we proxy the API at the system call level
(see the sketch after this list).

3. Some nasty things were done in the guest kernel as a quick-and-dirty hack so
we don't need to touch user space (wayland/mesa etc.) for now.

4. You can check tools/forward/README for instructions.
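
To illustrate point 2 (proxying at the system call level), here is a rough
sketch of what the guest side of an ioctl forward could look like.  The names
(fwd_request, fwd_call_host) and the framing are assumptions for illustration
only; the real protocol lives in forwarder_drv.c and tools/forward/qemu.diff.

/* Sketch only; not the actual forwarder protocol. */
struct fwd_request {
	unsigned int cmd;	/* ioctl number issued by Mesa */
	unsigned long arg;	/* userspace argument pointer/value */
	long ret;		/* host ioctl() return value */
};

static long fwd_unlocked_ioctl(struct file *filp, unsigned int cmd,
			       unsigned long arg)
{
	struct fwd_request req = { .cmd = cmd, .arg = arg };
	int err;

	/* Marshal the request, send it to the host (e.g. over vsock),
	 * and block until the host has run the real ioctl(). */
	err = fwd_call_host(filp->private_data, &req);
	if (err)
		return err;
	return req.ret;
}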

Thanks!

Lepton Wu (1):
  proof of concept for GPU forwarding

 arch/x86/configs/x86_64_defconfig  |5 +
 drivers/char/Makefile  |1 +
 drivers/char/forwarder/Makefile|8 +
 drivers/char/forwarder/forwarder.h |  103 ++
 drivers/char/forwarder/forwarder_drv.c | 2104 
 fs/open.c  |   18 +
 include/uapi/linux/virtwl.h|   64 +
 tools/forward/Makefile |2 +
 tools/forward/README   |   58 +
 tools/forward/qemu.diff| 1117 +
 tools/forward/wayland-proxy-main.c |   58 +
 tools/forward/wayland-proxy.c  |  297 
 12 files changed, 3835 insertions(+)
 create mode 100644 drivers/char/forwarder/Makefile
 create mode 100644 drivers/char/forwarder/forwarder.h
 create mode 100644 drivers/char/forwarder/forwarder_drv.c
 create mode 100644 include/uapi/linux/virtwl.h
 create mode 100644 tools/forward/Makefile
 create mode 100644 tools/forward/README
 create mode 100644 tools/forward/qemu.diff
 create mode 100644 tools/forward/wayland-proxy-main.c
 create mode 100644 tools/forward/wayland-proxy.c

-- 
2.21.0.392.gf8f6787159e-goog



Re: [PATCH v4 5/5] xfs: disable map_sync for async flush

2019-04-03 Thread Dave Chinner
On Wed, Apr 03, 2019 at 04:10:18PM +0530, Pankaj Gupta wrote:
> Virtio pmem provides an asynchronous host page cache flush
> mechanism.  We don't support 'MAP_SYNC' with virtio pmem
> and xfs.
> 
> Signed-off-by: Pankaj Gupta 
> ---
>  fs/xfs/xfs_file.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 1f2e2845eb76..dced2eb8c91a 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1203,6 +1203,14 @@ xfs_file_mmap(
>   if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
>   return -EOPNOTSUPP;
>  
> + /* We don't support synchronous mappings with DAX files if
> +  * dax_device is not synchronous.
> +  */
> + if (IS_DAX(file_inode(filp)) && !dax_synchronous(
> + xfs_find_daxdev_for_inode(file_inode(filp))) &&
> + (vma->vm_flags & VM_SYNC))
> + return -EOPNOTSUPP;
> +
>   file_accessed(filp);
>   vma->vm_ops = &xfs_file_vm_ops;
>   if (IS_DAX(file_inode(filp)))

All this ad hoc IS_DAX conditional logic is getting pretty nasty.

xfs_file_mmap(
	struct file		*filp,
	struct vm_area_struct	*vma)
{
	struct inode		*inode = file_inode(filp);

	if (vma->vm_flags & VM_SYNC) {
		if (!IS_DAX(inode))
			return -EOPNOTSUPP;
		if (!dax_synchronous(xfs_find_daxdev_for_inode(inode)))
			return -EOPNOTSUPP;
	}

	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vma->vm_flags |= VM_HUGEPAGE;
	return 0;
}


Even better, factor out all the "MAP_SYNC supported" checks into a
helper so that the filesystem code just doesn't have to care about
the details of checking for DAX+MAP_SYNC support.
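
Something like a daxdev_mapping_supported() helper (name and placement are
just a suggestion here, sketch only) would let both ext4 and xfs reduce the
mmap-time checks to a single call:

static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
					    struct dax_device *dax_dev)
{
	if (!(vma->vm_flags & VM_SYNC))
		return true;			/* no MAP_SYNC requested */
	if (!IS_DAX(file_inode(vma->vm_file)))
		return false;			/* MAP_SYNC requires DAX */
	return dax_synchronous(dax_dev);	/* and a synchronous device */
}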

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


CfP VHPC19: HPC Virtualization-Containers: Paper due May 1, 2019 (extended)

2019-04-03 Thread VHPC 19

CALL FOR PAPERS


14th Workshop on Virtualization in High­-Performance Cloud Computing
(VHPC '19) held in conjunction with the International Supercomputing
Conference - High Performance, June 16-20, 2019, Frankfurt, Germany.
(Springer LNCS Proceedings)



Date: June 20, 2019
Workshop URL: http://vhpc.org

Paper Submission Deadline: May 1, 2019 (extended)
Springer LNCS, rolling abstract submission

Abstract/Paper Submission Link: https://edas.info/newPaper.php?c=25685


Call for Papers

Containers and virtualization technologies constitute key enabling
factors for flexible resource management in modern data centers, and
particularly in cloud environments.  Cloud providers need to manage
complex infrastructures in a seamless fashion to support the highly
dynamic and heterogeneous workloads and hosted applications customers
deploy. Similarly, HPC  environments have been increasingly adopting
techniques that enable flexible management of vast computing and
networking resources, close to marginal provisioning cost, which is
unprecedented in the history of scientific and commercial computing.

Various virtualization-containerization technologies contribute to the
overall picture in different ways: machine virtualization, with its
capability to enable consolidation of multiple under­utilized servers
with heterogeneous software and operating systems (OSes), and its
capability to live­-migrate a fully operating virtual machine (VM)
with a very short downtime, enables novel and dynamic ways to manage
physical servers; OS-­level virtualization (i.e., containerization),
with its capability to isolate multiple user­-space environments and
to allow for their co­existence within the same OS kernel, promises to
provide many of the advantages of machine virtualization with high
levels of responsiveness and performance; lastly, unikernels provide
for many virtualization benefits with a minimized OS/library surface.
I/O Virtualization in turn allows physical network interfaces to take
traffic from multiple VMs or containers; network virtualization, with
its capability to create logical network overlays that are independent
of the underlying physical topology is furthermore enabling
virtualization of HPC infrastructures.

Publication

Accepted papers will be published in a Springer LNCS proceedings volume.


Topics of Interest

The VHPC program committee solicits original, high-quality submissions
related to virtualization across the entire software stack with a
special focus on the intersection of HPC, containers-virtualization
and the cloud.

Major Topics:
- HPC on Containers and VMs
- Containerized applications with OS-level virtualization
- Lightweight applications with Unikernels
- HPC-as-a-Service

each major topic encompassing design/architecture, management,
performance management, modeling and configuration/tooling:

Design / Architecture:
- Containers and OS-level virtualization (LXC, Docker, rkt,
  Singularity, Shifter, i.a.)
- Hypervisor support for heterogeneous resources (GPUs, co-processors,
  FPGAs, etc.)
- Hypervisor extensions to mitigate side-channel attacks
  ([micro-]architectural timing attacks, privilege escalation)
- VM & Container trust and security models
- Multi-environment coupling, system software supporting in-situ
  analysis with HPC simulation
- Cloud reliability, fault-tolerance and high-availability
- Energy-efficient and power-aware virtualization
- Containers inside VMs with hypervisor isolation
- Virtualization support for emerging memory technologies
- Lightweight/specialized operating systems in conjunction with
  virtual machines
- Novel unikernels and use cases for virtualized HPC environments
- ARM-based hypervisors, ARM virtualization extensions

Management:
- Container and VM management for HPC and cloud environments
- HPC services integration, services to support HPC
- Service and on-demand scheduling & resource management
- Dedicated workload management with VMs or containers
- Workflow coupling with VMs and containers
- Unikernel, lightweight VM application management
- Environments and tools for operating containerized environments
  (batch, orchestration)
- Novel models for non-HPC workload provisioning on HPC resources

Performance Measurements and Modeling:
- Performance improvements for or driven by unikernels
- Optimizations of virtual machine monitor platforms and hypervisors
- Scalability analysis of VMs and/or containers at large scale
- Performance measurement, modeling and monitoring of
  virtualized/cloud workloads
- Virtualization in supercomputing environments, HPC clusters, HPC in
  the cloud

Configuration / Tooling:
- Tool support for unikernels: configuration/build environments,
  debuggers, profilers
- Job scheduling/control/policy and 

Re: [PATCH net v5] failover: allow name change on IFF_UP slave interfaces

2019-04-03 Thread Stephen Hemminger
On Tue, 2 Apr 2019 22:22:18 -0700
"Samudrala, Sridhar"  wrote:

> On 4/2/2019 8:14 PM, Stephen Hemminger wrote:
> > On Tue, 2 Apr 2019 15:23:29 -0700
> > si-wei liu  wrote:
> >   
> >> On 4/2/2019 2:53 PM, Stephen Hemminger wrote:  
> >>> On Mon,  1 Apr 2019 19:04:53 -0400
> >>> Si-Wei Liu  wrote:
> >>> 
>  +if (dev->flags & IFF_UP &&
>  +likely(!(dev->priv_flags & IFF_FAILOVER_SLAVE)))  
> >>> Why is property limited to failover slave, it would make sense for netvsc
> >>> as well. Why not make it a flag like live address change?  
> >> Well, netvsc today is still taking the delayed approach meaning that it
> >> is incompatible yet with this live name change flag if need be. ;-)
> >>
> >> I thought Sridhar did not like to introduce an additional
> >> IFF_SLAVE_RENAME_OK flag given that failover slave is the only consumer
> >> for the time being. Even though I can get it back, patch is needed for
> >> netvsc to remove the VF takeover delay IMHO.
> >>
> >> Sridhar, what do you think we revive the IFF_SLAVE_RENAME_OK flag which
> >> allows netvsc to be used later on? Or maybe, IFF_LIVE_RENAME_OK for a
> >> better name?
> >>
> >> -Siwei  
> > 
> > I would name it IFF_LIVE_NAME_CHANGE to match IFF_LIVE_ADDR_CHANGE
> > there is no reason its use should be restricted to SLAVE devices.
> >  
> Stephen,
> May be you should consider moving netvsc to use the net_failover driver now?
> 

NO

Why would I waste time doing that when there is a cleaner solution that is
already working across 4 OSes and three versions of five major distributions?


Re: [PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Daniel Stone
On Wed, 3 Apr 2019 at 16:12, Adam Jackson  wrote:
> On Wed, 2019-04-03 at 09:23 +0200, Gerd Hoffmann wrote:
> >  - Only DRM_FORMAT_RGB565 (depth 16) is supported.  The old driver does
> >that too by default.  There was a module parameter which enables 24/32
> >bpp support and disables higher resolutions (due to cirrus hardware
> >constraints).  That parameter wasn't reimplemented.
>
> One slightly annoying aspect of this (well, initially of the patch to
> clamp the default to 16bpp, but this too) is that we only have a way to
> ask the driver which format it prefers, not which ones it supports at
> all. For X's modesetting driver (and yes some of this is because X is
> awful) this creates the following failure mode:
>
> 1: user sets up xorg.conf for depth 24
> 2: user upgrades kernel, reboots
> 3: X driver detects that depth 16 is preferred, but
> 4: X core respects user's xorg.conf and tries depth 24, which
> 5: throws -EINVAL and X won't start.
>
> Possibly X should work around this by transparently setting up a shadow
> framebuffer at the user's requested depth. The problem there is, if 565
> is preferred but 8888 works, you're adding a format-conversion blit in
> the middle for no reason. If I could ask the kernel for the entire list
> of supported formats, I could only set up the shadow if it was
> necessary.

There's already a list of supported formats for each DRM plane, which
you can get via drmModeGetPlane (being careful to enable universal
planes so you can discover the primary plane). The same information is
present in the 'IN_FORMATS' property, which is more difficult to parse
but also tells you about modifiers.

modesetting already pulls all this out (at least in the atomic path)
so we can reason about acceptable modifiers.
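
For completeness, a minimal userspace sketch of that enumeration with libdrm
(error handling omitted):

#include <stdint.h>
#include <stdio.h>
#include <xf86drm.h>
#include <xf86drmMode.h>

static void dump_plane_formats(int fd)
{
	drmSetClientCap(fd, DRM_CLIENT_CAP_UNIVERSAL_PLANES, 1);
	drmModePlaneResPtr res = drmModeGetPlaneResources(fd);

	for (uint32_t i = 0; res && i < res->count_planes; i++) {
		drmModePlanePtr p = drmModeGetPlane(fd, res->planes[i]);

		printf("plane %u:", p->plane_id);
		for (uint32_t j = 0; j < p->count_formats; j++)
			printf(" %.4s", (const char *)&p->formats[j]); /* fourcc */
		printf("\n");
		drmModeFreePlane(p);
	}
	drmModeFreePlaneResources(res);
}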

Cheers,
Daniel


Re: [Qemu-devel] [PATCH v4 2/5] virtio-pmem: Add virtio pmem driver

2019-04-03 Thread Pankaj Gupta


> Subject: Re: [Qemu-devel] [PATCH v4 2/5] virtio-pmem: Add virtio pmem driver
> 
> On Wed, Apr 03, 2019 at 04:10:15PM +0530, Pankaj Gupta wrote:
> > This patch adds virtio-pmem driver for KVM guest.
> > 
> > Guest reads the persistent memory range information from
> > Qemu over VIRTIO and registers it on nvdimm_bus. It also
> > creates a nd_region object with the persistent memory
> > range information so that existing 'nvdimm/pmem' driver
> > can reserve this into system memory map. This way
> > 'virtio-pmem' driver uses existing functionality of pmem
> > driver to register persistent memory compatible for DAX
> > capable filesystems.
> > 
> > This also provides function to perform guest flush over
> > VIRTIO from 'pmem' driver when userspace performs flush
> > on DAX memory range.
> > 
> > Signed-off-by: Pankaj Gupta 
> > ---
> >  drivers/nvdimm/virtio_pmem.c |  84 +
> >  drivers/virtio/Kconfig   |  10 +++
> >  drivers/virtio/Makefile  |   1 +
> >  drivers/virtio/pmem.c| 125 +++
> >  include/linux/virtio_pmem.h  |  60 +++
> >  include/uapi/linux/virtio_ids.h  |   1 +
> >  include/uapi/linux/virtio_pmem.h |  10 +++
> >  7 files changed, 291 insertions(+)
> >  create mode 100644 drivers/nvdimm/virtio_pmem.c
> >  create mode 100644 drivers/virtio/pmem.c
> >  create mode 100644 include/linux/virtio_pmem.h
> >  create mode 100644 include/uapi/linux/virtio_pmem.h
> > 
> > diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
> > new file mode 100644
> > index ..2a1b1ba2c1ff
> > --- /dev/null
> > +++ b/drivers/nvdimm/virtio_pmem.c
> > @@ -0,0 +1,84 @@
> > +// SPDX-License-Identifier: GPL-2.0
> 
> Is this comment style (//) acceptable?

In existing code, I can see the same comment pattern
for the license in some places.

> 
> > +/*
> > + * virtio_pmem.c: Virtio pmem Driver
> > + *
> > + * Discovers persistent memory range information
> > + * from host and provides a virtio based flushing
> > + * interface.
> > + */
> > +#include 
> > +#include "nd.h"
> > +
> > + /* The interrupt handler */
> > +void host_ack(struct virtqueue *vq)
> > +{
> > +   unsigned int len;
> > +   unsigned long flags;
> > +   struct virtio_pmem_request *req, *req_buf;
> > +   struct virtio_pmem *vpmem = vq->vdev->priv;
> > +
> > +   spin_lock_irqsave(&vpmem->pmem_lock, flags);
> > +   while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
> > +   req->done = true;
> > +   wake_up(&req->host_acked);
> > +
> > +   if (!list_empty(&vpmem->req_list)) {
> > +   req_buf = list_first_entry(&vpmem->req_list,
> > +   struct virtio_pmem_request, list);
> > +   list_del(&vpmem->req_list);
> > +   req_buf->wq_buf_avail = true;
> > +   wake_up(&req_buf->wq_buf);
> > +   }
> > +   }
> > +   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +}
> > +EXPORT_SYMBOL_GPL(host_ack);
> > +
> > + /* The request submission function */
> > +int virtio_pmem_flush(struct nd_region *nd_region)
> > +{
> > +   int err;
> > +   unsigned long flags;
> > +   struct scatterlist *sgs[2], sg, ret;
> > +   struct virtio_device *vdev = nd_region->provider_data;
> > +   struct virtio_pmem *vpmem = vdev->priv;
> > +   struct virtio_pmem_request *req;
> > +
> > +   might_sleep();
> 
> [1]
> 
> > +   req = kmalloc(sizeof(*req), GFP_KERNEL);
> > +   if (!req)
> > +   return -ENOMEM;
> > +
> > +   req->done = req->wq_buf_avail = false;
> > +   strcpy(req->name, "FLUSH");
> > +   init_waitqueue_head(&req->host_acked);
> > +   init_waitqueue_head(&req->wq_buf);
> > +   sg_init_one(&sg, req->name, strlen(req->name));
> > +   sgs[0] = &sg;
> > +   sg_init_one(&ret, &req->ret, sizeof(req->ret));
> > +   sgs[1] = &ret;
> > +
> > +   spin_lock_irqsave(&vpmem->pmem_lock, flags);
> > +   err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req, GFP_ATOMIC);
> 
> Is it okay to use GFP_ATOMIC in a might-sleep ([1]) function?

might_sleep() will give us a warning if we are called from a non-sleepable
context.

It is the other way around here, i.e. might_sleep() is not inside the
GFP_ATOMIC section; the allocation under the spinlock uses GFP_ATOMIC
precisely because we cannot sleep there.

> 
> > +   if (err) {
> > +   dev_err(&vdev->dev, "failed to send command to virtio pmem 
> > device\n");
> > +
> > +   list_add_tail(&vpmem->req_list, &req->list);
> > +   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +
> > +   /* When host has read buffer, this completes via host_ack */
> > +   wait_event(req->wq_buf, req->wq_buf_avail);
> > +   spin_lock_irqsave(&vpmem->pmem_lock, flags);
> > +   }
> > +   virtqueue_kick(vpmem->req_vq);
> 
> You probably want to check return value here.

Don't think it will matter in this case?

> 
> > +   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
> > +
> > +   /* When host has read buffer, this completes via host_ack */
> > +   wait_event(req->host_acked, req->done);
> > +   err = req->ret;
> > +   kfree(req);
> > +
> > +   return err;
> > +};
> > +EXPORT_SYMBOL_GPL(virtio_pmem_flush);
> 

Re: [PATCH v4 4/5] ext4: disable map_sync for async flush

2019-04-03 Thread Jan Kara
On Wed 03-04-19 16:10:17, Pankaj Gupta wrote:
> Virtio pmem provides asynchronous host page cache flush
> mechanism. We don't support 'MAP_SYNC' with virtio pmem 
> and ext4. 
> 
> Signed-off-by: Pankaj Gupta 

The patch looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/ext4/file.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 69d65d49837b..86e4bf464320 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -360,8 +360,10 @@ static const struct vm_operations_struct 
> ext4_file_vm_ops = {
>  static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
>  {
>   struct inode *inode = file->f_mapping->host;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> + struct dax_device *dax_dev = sbi->s_daxdev;
>  
> - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
> + if (unlikely(ext4_forced_shutdown(sbi)))
>   return -EIO;
>  
>   /*
> @@ -371,6 +373,13 @@ static int ext4_file_mmap(struct file *file, struct 
> vm_area_struct *vma)
>   if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
>   return -EOPNOTSUPP;
>  
> + /* We don't support synchronous mappings with DAX files if
> +  * dax_device is not synchronous.
> +  */
> + if (IS_DAX(file_inode(file)) && !dax_synchronous(dax_dev)
> + && (vma->vm_flags & VM_SYNC))
> + return -EOPNOTSUPP;
> +
>   file_accessed(file);
>   if (IS_DAX(file_inode(file))) {
>   vma->vm_ops = &ext4_dax_vm_ops;
> -- 
> 2.20.1
> 
-- 
Jan Kara 
SUSE Labs, CR


[PATCH v4 5/5] xfs: disable map_sync for async flush

2019-04-03 Thread Pankaj Gupta
Virtio pmem provides an asynchronous host page cache flush
mechanism.  We don't support 'MAP_SYNC' with virtio pmem
and xfs.

Signed-off-by: Pankaj Gupta 
---
 fs/xfs/xfs_file.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f2e2845eb76..dced2eb8c91a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1203,6 +1203,14 @@ xfs_file_mmap(
if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
 
+   /* We don't support synchronous mappings with DAX files if
+* dax_device is not synchronous.
+*/
+   if (IS_DAX(file_inode(filp)) && !dax_synchronous(
+   xfs_find_daxdev_for_inode(file_inode(filp))) &&
+   (vma->vm_flags & VM_SYNC))
+   return -EOPNOTSUPP;
+
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
-- 
2.20.1



[PATCH v4 4/5] ext4: disable map_sync for async flush

2019-04-03 Thread Pankaj Gupta
Virtio pmem provides an asynchronous host page cache flush
mechanism.  We don't support 'MAP_SYNC' with virtio pmem
and ext4.

Signed-off-by: Pankaj Gupta 
---
 fs/ext4/file.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 69d65d49837b..86e4bf464320 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -360,8 +360,10 @@ static const struct vm_operations_struct ext4_file_vm_ops 
= {
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
struct inode *inode = file->f_mapping->host;
+   struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+   struct dax_device *dax_dev = sbi->s_daxdev;
 
-   if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+   if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
 
/*
@@ -371,6 +373,13 @@ static int ext4_file_mmap(struct file *file, struct 
vm_area_struct *vma)
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
 
+   /* We don't support synchronous mappings with DAX files if
+* dax_device is not synchronous.
+*/
+   if (IS_DAX(file_inode(file)) && !dax_synchronous(dax_dev)
+   && (vma->vm_flags & VM_SYNC))
+   return -EOPNOTSUPP;
+
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
-- 
2.20.1



[PATCH v4 1/5] ibnvdimm: nd_region flush callback support

2019-04-03 Thread Pankaj Gupta
This patch adds functionality to perform a flush from guest
to host over VIRTIO. We register a callback based on the
'nd_region' type: the virtio_pmem driver requires this special
flush function, while the rest of the region types keep the
existing flush function. Errors returned by a host fsync
failure are reported to userspace.

This also handles asynchronous flush requests from the block layer
by creating a child bio and chaining it to the parent bio.
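
(For context, the bio chaining amounts to roughly the following; this is an
illustration of the technique, not the exact hunk in region_devs.c below:)

	/* Illustration only: queue the flush as a child bio so that the
	 * REQ_PREFLUSH parent is not completed before the host flush is done. */
	struct bio *child = bio_alloc(GFP_ATOMIC, 0);

	if (!child)
		return -ENOMEM;
	bio_copy_dev(child, parent_bio);	/* same block device as parent */
	child->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	bio_chain(child, parent_bio);		/* parent completes after child */
	submit_bio(child);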

Signed-off-by: Pankaj Gupta 
---
 drivers/acpi/nfit/core.c |  4 ++--
 drivers/nvdimm/claim.c   |  6 --
 drivers/nvdimm/nd.h  |  1 +
 drivers/nvdimm/pmem.c| 14 -
 drivers/nvdimm/region_devs.c | 38 ++--
 include/linux/libnvdimm.h|  8 +++-
 6 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index 5a389a4f4f65..567017a2190e 100644
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -2434,7 +2434,7 @@ static void write_blk_ctl(struct nfit_blk *nfit_blk, 
unsigned int bw,
offset = to_interleave_offset(offset, mmio);
 
writeq(cmd, mmio->addr.base + offset);
-   nvdimm_flush(nfit_blk->nd_region);
+   nvdimm_flush(nfit_blk->nd_region, NULL, false);
 
if (nfit_blk->dimm_flags & NFIT_BLK_DCR_LATCH)
readq(mmio->addr.base + offset);
@@ -2483,7 +2483,7 @@ static int acpi_nfit_blk_single_io(struct nfit_blk 
*nfit_blk,
}
 
if (rw)
-   nvdimm_flush(nfit_blk->nd_region);
+   nvdimm_flush(nfit_blk->nd_region, NULL, false);
 
rc = read_blk_stat(nfit_blk, lane) ? -EIO : 0;
return rc;
diff --git a/drivers/nvdimm/claim.c b/drivers/nvdimm/claim.c
index fb667bf469c7..a1dfa066786b 100644
--- a/drivers/nvdimm/claim.c
+++ b/drivers/nvdimm/claim.c
@@ -263,7 +263,7 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
unsigned int sz_align = ALIGN(size + (offset & (512 - 1)), 512);
sector_t sector = offset >> 9;
-   int rc = 0;
+   int rc = 0, ret = 0;
 
if (unlikely(!size))
return 0;
@@ -301,7 +301,9 @@ static int nsio_rw_bytes(struct nd_namespace_common *ndns,
}
 
memcpy_flushcache(nsio->addr + offset, buf, size);
-   nvdimm_flush(to_nd_region(ndns->dev.parent));
+   ret = nvdimm_flush(to_nd_region(ndns->dev.parent), NULL, false);
+   if (ret)
+   rc = ret;
 
return rc;
 }
diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h
index a5ac3b240293..916cd6c5451a 100644
--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -159,6 +159,7 @@ struct nd_region {
struct badblocks bb;
struct nd_interleave_set *nd_set;
struct nd_percpu_lane __percpu *lane;
+   int (*flush)(struct nd_region *nd_region);
struct nd_mapping mapping[0];
 };
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index bc2f700feef8..5a5b3ea4d073 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -192,6 +192,7 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, 
struct page *page,
 
 static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
 {
+   int ret = 0;
blk_status_t rc = 0;
bool do_acct;
unsigned long start;
@@ -201,7 +202,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, 
struct bio *bio)
struct nd_region *nd_region = to_region(pmem);
 
if (bio->bi_opf & REQ_PREFLUSH)
-   nvdimm_flush(nd_region);
+   ret = nvdimm_flush(nd_region, bio, true);
 
do_acct = nd_iostat_start(bio, &start);
bio_for_each_segment(bvec, bio, iter) {
@@ -216,7 +217,10 @@ static blk_qc_t pmem_make_request(struct request_queue *q, 
struct bio *bio)
nd_iostat_end(bio, start);
 
if (bio->bi_opf & REQ_FUA)
-   nvdimm_flush(nd_region);
+   ret = nvdimm_flush(nd_region, bio, true);
+
+   if (ret)
+   bio->bi_status = errno_to_blk_status(ret);
 
bio_endio(bio);
return BLK_QC_T_NONE;
@@ -463,13 +467,13 @@ static int pmem_attach_disk(struct device *dev,
disk->bb = &pmem->bb;
 
dax_dev = alloc_dax(pmem, disk->disk_name, _dax_ops);
+
if (!dax_dev) {
put_disk(disk);
return -ENOMEM;
}
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;
-
gendev = disk_to_dev(disk);
gendev->groups = pmem_attribute_groups;
 
@@ -527,14 +531,14 @@ static int nd_pmem_remove(struct device *dev)
sysfs_put(pmem->bb_state);
pmem->bb_state = NULL;
}
-   nvdimm_flush(to_nd_region(dev->parent));
+   nvdimm_flush(to_nd_region(dev->parent), NULL, false);
 
return 0;
 }
 
 static void nd_pmem_shutdown(struct device *dev)
 {
-   

[PATCH v4 3/5] libnvdimm: add dax_dev sync flag

2019-04-03 Thread Pankaj Gupta
This patch adds a 'DAXDEV_SYNC' flag which is set
for nd_regions doing synchronous flush. This is later
used to disable MAP_SYNC functionality in the ext4 and
xfs filesystems for devices that don't support
synchronous flush.

Signed-off-by: Pankaj Gupta 
---
 drivers/dax/bus.c|  2 +-
 drivers/dax/super.c  | 13 -
 drivers/md/dm.c  |  2 +-
 drivers/nvdimm/pmem.c|  3 ++-
 drivers/nvdimm/region_devs.c |  7 +++
 include/linux/dax.h  |  9 +++--
 include/linux/libnvdimm.h|  1 +
 7 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index 2109cfe80219..431bf7d2a7f9 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -388,7 +388,7 @@ struct dev_dax *__devm_create_dev_dax(struct dax_region 
*dax_region, int id,
 * No 'host' or dax_operations since there is no access to this
 * device outside of mmap of the resulting character device.
 */
-   dax_dev = alloc_dax(dev_dax, NULL, NULL);
+   dax_dev = alloc_dax(dev_dax, NULL, NULL, true);
if (!dax_dev)
goto err;
 
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 0a339b85133e..bd6509308d05 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -186,6 +186,8 @@ enum dax_device_flags {
DAXDEV_ALIVE,
/* gate whether dax_flush() calls the low level flush routine */
DAXDEV_WRITE_CACHE,
+   /* flag to check if device supports synchronous flush */
+   DAXDEV_SYNC,
 };
 
 /**
@@ -354,6 +356,12 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev)
 }
 EXPORT_SYMBOL_GPL(dax_write_cache_enabled);
 
+bool dax_synchronous(struct dax_device *dax_dev)
+{
return test_bit(DAXDEV_SYNC, &dax_dev->flags);
+}
+EXPORT_SYMBOL_GPL(dax_synchronous);
+
 bool dax_alive(struct dax_device *dax_dev)
 {
lockdep_assert_held(&dax_srcu);
@@ -511,7 +519,7 @@ static void dax_add_host(struct dax_device *dax_dev, const 
char *host)
 }
 
 struct dax_device *alloc_dax(void *private, const char *__host,
-   const struct dax_operations *ops)
+   const struct dax_operations *ops, bool sync)
 {
struct dax_device *dax_dev;
const char *host;
@@ -534,6 +542,9 @@ struct dax_device *alloc_dax(void *private, const char 
*__host,
dax_add_host(dax_dev, host);
dax_dev->ops = ops;
dax_dev->private = private;
+   if (sync)
set_bit(DAXDEV_SYNC, &dax_dev->flags);
+
return dax_dev;
 
  err_dev:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 68d24056d0b1..534e12ca6329 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1965,7 +1965,7 @@ static struct mapped_device *alloc_dev(int minor)
sprintf(md->disk->disk_name, "dm-%d", minor);
 
if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
-   dax_dev = alloc_dax(md, md->disk->disk_name, _dax_ops);
+   dax_dev = alloc_dax(md, md->disk->disk_name, _dax_ops, true);
if (!dax_dev)
goto bad;
}
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 5a5b3ea4d073..78f71ba0e7cf 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -466,7 +466,8 @@ static int pmem_attach_disk(struct device *dev,
nvdimm_badblocks_populate(nd_region, >bb, _res);
disk->bb = >bb;
 
-   dax_dev = alloc_dax(pmem, disk->disk_name, _dax_ops);
+   dax_dev = alloc_dax(pmem, disk->disk_name, _dax_ops,
+   is_nvdimm_sync(nd_region));
 
if (!dax_dev) {
put_disk(disk);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index fb1041ab32a6..8c7aa047fe2b 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1231,6 +1231,13 @@ int nvdimm_has_cache(struct nd_region *nd_region)
 }
 EXPORT_SYMBOL_GPL(nvdimm_has_cache);
 
+bool is_nvdimm_sync(struct nd_region *nd_region)
+{
+   return is_nd_pmem(&nd_region->dev) &&
+   !test_bit(ND_REGION_ASYNC, &nd_region->flags);
+}
+EXPORT_SYMBOL_GPL(is_nvdimm_sync);
+
 struct conflict_context {
struct nd_region *nd_region;
resource_size_t start, size;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0dd316a74a29..9bdd50d06ef6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -32,18 +32,19 @@ extern struct attribute_group dax_attribute_group;
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
 struct dax_device *alloc_dax(void *private, const char *host,
-   const struct dax_operations *ops);
+   const struct dax_operations *ops, bool sync);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
 void dax_write_cache(struct dax_device *dax_dev, bool wc);
 bool dax_write_cache_enabled(struct dax_device *dax_dev);
+bool dax_synchronous(struct dax_device *dax_dev);
 #else
 static inline struct dax_device 

[PATCH v4 2/5] virtio-pmem: Add virtio pmem driver

2019-04-03 Thread Pankaj Gupta
This patch adds virtio-pmem driver for KVM guest.

Guest reads the persistent memory range information from
Qemu over VIRTIO and registers it on nvdimm_bus. It also
creates a nd_region object with the persistent memory
range information so that existing 'nvdimm/pmem' driver
can reserve this into system memory map. This way
'virtio-pmem' driver uses existing functionality of pmem
driver to register persistent memory compatible for DAX
capable filesystems.

This also provides function to perform guest flush over
VIRTIO from 'pmem' driver when userspace performs flush
on DAX memory range.

Signed-off-by: Pankaj Gupta 
---
 drivers/nvdimm/virtio_pmem.c |  84 +
 drivers/virtio/Kconfig   |  10 +++
 drivers/virtio/Makefile  |   1 +
 drivers/virtio/pmem.c| 125 +++
 include/linux/virtio_pmem.h  |  60 +++
 include/uapi/linux/virtio_ids.h  |   1 +
 include/uapi/linux/virtio_pmem.h |  10 +++
 7 files changed, 291 insertions(+)
 create mode 100644 drivers/nvdimm/virtio_pmem.c
 create mode 100644 drivers/virtio/pmem.c
 create mode 100644 include/linux/virtio_pmem.h
 create mode 100644 include/uapi/linux/virtio_pmem.h

diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
new file mode 100644
index ..2a1b1ba2c1ff
--- /dev/null
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * virtio_pmem.c: Virtio pmem Driver
+ *
+ * Discovers persistent memory range information
+ * from host and provides a virtio based flushing
+ * interface.
+ */
+#include 
+#include "nd.h"
+
+ /* The interrupt handler */
+void host_ack(struct virtqueue *vq)
+{
+   unsigned int len;
+   unsigned long flags;
+   struct virtio_pmem_request *req, *req_buf;
+   struct virtio_pmem *vpmem = vq->vdev->priv;
+
+   spin_lock_irqsave(&vpmem->pmem_lock, flags);
+   while ((req = virtqueue_get_buf(vq, &len)) != NULL) {
+   req->done = true;
+   wake_up(&req->host_acked);
+
+   if (!list_empty(&vpmem->req_list)) {
+   req_buf = list_first_entry(&vpmem->req_list,
+   struct virtio_pmem_request, list);
+   list_del(&vpmem->req_list);
+   req_buf->wq_buf_avail = true;
+   wake_up(&req_buf->wq_buf);
+   }
+   }
+   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+}
+EXPORT_SYMBOL_GPL(host_ack);
+
+ /* The request submission function */
+int virtio_pmem_flush(struct nd_region *nd_region)
+{
+   int err;
+   unsigned long flags;
+   struct scatterlist *sgs[2], sg, ret;
+   struct virtio_device *vdev = nd_region->provider_data;
+   struct virtio_pmem *vpmem = vdev->priv;
+   struct virtio_pmem_request *req;
+
+   might_sleep();
+   req = kmalloc(sizeof(*req), GFP_KERNEL);
+   if (!req)
+   return -ENOMEM;
+
+   req->done = req->wq_buf_avail = false;
+   strcpy(req->name, "FLUSH");
+   init_waitqueue_head(&req->host_acked);
+   init_waitqueue_head(&req->wq_buf);
+   sg_init_one(&sg, req->name, strlen(req->name));
+   sgs[0] = &sg;
+   sg_init_one(&ret, &req->ret, sizeof(req->ret));
+   sgs[1] = &ret;
+
+   spin_lock_irqsave(&vpmem->pmem_lock, flags);
+   err = virtqueue_add_sgs(vpmem->req_vq, sgs, 1, 1, req, GFP_ATOMIC);
+   if (err) {
+   dev_err(&vdev->dev, "failed to send command to virtio pmem device\n");
+
+   list_add_tail(&vpmem->req_list, &req->list);
+   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+   /* When host has read buffer, this completes via host_ack */
+   wait_event(req->wq_buf, req->wq_buf_avail);
+   spin_lock_irqsave(&vpmem->pmem_lock, flags);
+   }
+   virtqueue_kick(vpmem->req_vq);
+   spin_unlock_irqrestore(&vpmem->pmem_lock, flags);
+
+   /* When host has read buffer, this completes via host_ack */
+   wait_event(req->host_acked, req->done);
+   err = req->ret;
+   kfree(req);
+
+   return err;
+};
+EXPORT_SYMBOL_GPL(virtio_pmem_flush);
+MODULE_LICENSE("GPL");
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
index 35897649c24f..9f634a2ed638 100644
--- a/drivers/virtio/Kconfig
+++ b/drivers/virtio/Kconfig
@@ -42,6 +42,16 @@ config VIRTIO_PCI_LEGACY
 
  If unsure, say Y.
 
+config VIRTIO_PMEM
+   tristate "Support for virtio pmem driver"
+   depends on VIRTIO
+   depends on LIBNVDIMM
+   help
+   This driver provides support for virtio based flushing interface
+   for persistent memory range.
+
+   If unsure, say M.
+
 config VIRTIO_BALLOON
tristate "Virtio balloon driver"
depends on VIRTIO
diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile
index 3a2b5c5dcf46..143ce91eabe9 100644
--- a/drivers/virtio/Makefile
+++ b/drivers/virtio/Makefile
@@ -6,3 +6,4 @@ virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o
 

[PATCH v4 0/5] virtio pmem driver

2019-04-03 Thread Pankaj Gupta
 This patch series has the implementation for "virtio pmem".
 "virtio pmem" is fake persistent memory (nvdimm) in the guest
 which allows bypassing the guest page cache. This also
 implements a VIRTIO based asynchronous flush mechanism.
 
 Sharing guest kernel driver in this patchset with the 
 changes suggested in v3. Tested with Qemu side device 
 emulation [6] for virtio-pmem. 

 We have incorporated all the suggestions in V3. Documented 
 the impact of possible page cache side channel attacks with 
 suggested countermeasures.
 
 Details of project idea for 'virtio pmem' flushing interface 
 is shared [3] & [4].

 Implementation is divided into two parts:
 New virtio pmem guest driver and qemu code changes for new 
 virtio pmem paravirtualized device.

1. Guest virtio-pmem kernel driver
-
   - Reads persistent memory range from paravirt device and 
 registers with 'nvdimm_bus'.  
   - 'nvdimm/pmem' driver uses this information to allocate 
 persistent memory region and setup filesystem operations 
 to the allocated memory. 
   - virtio pmem driver implements asynchronous flushing 
 interface to flush from guest to host.

2. Qemu virtio-pmem device
-
   - Creates virtio pmem device and exposes a memory range to 
 KVM guest. 
   - At host side this is file backed memory which acts as 
 persistent memory. 
   - Qemu side flush uses aio thread pool API's and virtio 
 for asynchronous guest multi request handling. 

   David Hildenbrand CCed also posted a modified version[7] of 
   qemu virtio-pmem code based on updated Qemu memory device API. 

 Virtio-pmem security implications and countermeasures:
 -

 In previous posting of kernel driver, there was discussion [9]
 on possible implications of page cache side channel attacks with 
 virtio pmem. After thorough analysis of details of known side 
 channel attacks, below are the suggestions:

 - Depends entirely on how host backing image file is mapped 
   into guest address space. 

 - In the virtio-pmem device emulation, a shared mapping is used by
   default to map the host backing file. It is recommended to use a
   separate backing file on the host side for every guest. This will
   prevent any possibility of executing common code from multiple
   guests and any chance of inferring guest local data based on
   execution time.

 - If the backing file is required to be shared among multiple guests,
   it is recommended not to support host page cache eviction
   commands from the guest driver. This will avoid any possibility
   of inferring guest local data or host data from another guest.

 - Proposed device specification [8] for virtio-pmem device with 
   details of possible security implications and suggested 
   countermeasures for device emulation.

 Virtio-pmem errors handling:
 
  We checked the behaviour of virtio-pmem for the below types of errors,
  and need suggestions on the expected behaviour for handling them:

  - Hardware Errors: Uncorrectable recoverable Errors: 
  a] virtio-pmem: 
- As per current logic if error page belongs to Qemu process, 
  host MCE handler isolates(hwpoison) that page and send SIGBUS. 
  Qemu SIGBUS handler injects exception to KVM guest. 
- KVM guest then isolates the page and send SIGBUS to guest 
  userspace process which has mapped the page. 
  
  b] Existing implementation for ACPI pmem driver: 
- Handles such errors with MCE notifier and creates a list 
  of bad blocks. Read/direct access DAX operation return EIO 
  if accessed memory page fall in bad block list.
- It also starts background scrubbing.
- Similar functionality can be reused in virtio-pmem with MCE 
  notifier but without scrubbing(no ACPI/ARS)? Need inputs to 
  confirm if this behaviour is ok or needs any change?

Changes from PATCH v3: [1]

- Use generic dax_synchronous() helper to check for DAXDEV_SYNC 
  flag - [Dan, Darrick, Jan]
- Add 'is_nvdimm_async' function
- Document page cache side channel attacks implications & 
  countermeasures - [Dave Chinner, Michael]

Changes from PATCH v2: [2]
- Disable MAP_SYNC for ext4 & XFS filesystems - [Dan] 
- Use name 'virtio pmem' in place of 'fake dax' 

Changes from PATCH v1: 
- 0-day build test for build dependency on libnvdimm 

 Changes suggested by - [Dan Williams]
- Split the driver into two parts virtio & pmem  
- Move queuing of async block request to block layer
- Add "sync" parameter in nvdimm_flush function
- Use indirect call for nvdimm_flush
- Don’t move declarations to common global header e.g nd.h
- nvdimm_flush() return 0 or -EIO if it fails
- Teach nsio_rw_bytes() that the flush can fail
- Rename nvdimm_flush() to generic_nvdimm_flush()
- Use 'nd_region->provider_data' for long dereferencing
- Remove virtio_pmem_freeze/restore functions
- Remove BSD license text with SPDX license text


Re: [PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Gerd Hoffmann
  Hi,

> > So I can just remove cirrus_fb_dirty() and hook up
> > drm_atomic_helper_dirtyfb() instead.  Easy ;)
> 
> You can even use drm_gem_fb_create_with_dirty() instead of
> drm_gem_fb_create_with_funcs().

Ah, cool.  /me happily continues removing code lines.
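
(i.e. roughly this in the mode config setup; a sketch, not the final
cirrus.c code:)

#include <drm/drm_atomic_helper.h>
#include <drm/drm_gem_framebuffer_helper.h>

static const struct drm_mode_config_funcs cirrus_mode_config_funcs = {
	.fb_create     = drm_gem_fb_create_with_dirty,
	.atomic_check  = drm_atomic_helper_check,
	.atomic_commit = drm_atomic_helper_commit,
};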

thanks,
  Gerd

PS: incremental fixups are at
https://git.kraxel.org/cgit/linux/log/?h=drm-rewrite-cirrus



Re: [PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Noralf Trønnes



On 03.04.2019 10.53, Gerd Hoffmann wrote:
>>> +struct cirrus_device {
>>> +   struct drm_device  *dev;
>>
>> Why not embed drm_device? It's the latest rage :-)
> 
> Sure, can do that.
> 
>>> +void cirrus_pipe_update(struct drm_simple_display_pipe *pipe,
>>> +   struct drm_plane_state *old_state)
>>> +{
>>> +   struct drm_plane_state *state = pipe->plane.state;
>>> +   struct drm_crtc *crtc = &pipe->crtc;
>>> +   struct drm_rect rect;
>>> +
>>> +   if (drm_atomic_helper_damage_merged(old_state, state, &rect))
>>> +   cirrus_fb_blit_rect(pipe->plane.state->fb, &rect);
>>> +
>>> +   if (crtc->state->event) {
>>> +   spin_lock_irq(&crtc->dev->event_lock);
>>> +   drm_crtc_send_vblank_event(crtc, crtc->state->event);
>>> +   spin_unlock_irq(&crtc->dev->event_lock);
>>> +   crtc->state->event = NULL;
>>> +   }
>>> +}
> 
>>> +static int cirrus_fb_dirty(struct drm_framebuffer *fb,
>>> +  struct drm_file *file_priv,
>>> +  unsigned int flags, unsigned int color,
>>> +  struct drm_clip_rect *clips,
>>> +  unsigned int num_clips)
>>> +{
>>> +   struct cirrus_device *cirrus = fb->dev->dev_private;
>>> +
>>> +   if (cirrus->pipe.plane.state->fb != fb)
>>> +   return 0;
>>> +
>>> +   if (num_clips)
>>> +   cirrus_fb_blit_clips(fb, clips, num_clips);
>>> +   else
>>> +   cirrus_fb_blit_fullscreen(fb);
>>> +   return 0;
>>> +}
>>
>> Why not use the dirty helpers and implement dirty rect support in your
>> main plane update function?
> 
> Dirty rect support is already there, see above.
> 
> So I can just remove cirrus_fb_dirty() and hook up
> drm_atomic_helper_dirtyfb() instead.  Easy ;)
> 

You can even use drm_gem_fb_create_with_dirty() instead of
drm_gem_fb_create_with_funcs().

Noralf.

> cheers,
>   Gerd
> 
> 


Re: [PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Gerd Hoffmann
> > +struct cirrus_device {
> > +   struct drm_device  *dev;
> 
> Why not embed drm_device? It's the latest rage :-)

Sure, can do that.

> > +void cirrus_pipe_update(struct drm_simple_display_pipe *pipe,
> > +   struct drm_plane_state *old_state)
> > +{
> > +   struct drm_plane_state *state = pipe->plane.state;
> > +   struct drm_crtc *crtc = &pipe->crtc;
> > +   struct drm_rect rect;
> > +
> > +   if (drm_atomic_helper_damage_merged(old_state, state, &rect))
> > +   cirrus_fb_blit_rect(pipe->plane.state->fb, &rect);
> > +
> > +   if (crtc->state->event) {
> > +   spin_lock_irq(&crtc->dev->event_lock);
> > +   drm_crtc_send_vblank_event(crtc, crtc->state->event);
> > +   spin_unlock_irq(&crtc->dev->event_lock);
> > +   crtc->state->event = NULL;
> > +   }
> > +}

> > +static int cirrus_fb_dirty(struct drm_framebuffer *fb,
> > +  struct drm_file *file_priv,
> > +  unsigned int flags, unsigned int color,
> > +  struct drm_clip_rect *clips,
> > +  unsigned int num_clips)
> > +{
> > +   struct cirrus_device *cirrus = fb->dev->dev_private;
> > +
> > +   if (cirrus->pipe.plane.state->fb != fb)
> > +   return 0;
> > +
> > +   if (num_clips)
> > +   cirrus_fb_blit_clips(fb, clips, num_clips);
> > +   else
> > +   cirrus_fb_blit_fullscreen(fb);
> > +   return 0;
> > +}
> 
> Why not use the dirty helpers and implement dirty rect support in your
> main plane update function?

Dirty rect support is already there, see above.

So I can just remove cirrus_fb_dirty() and hook up
drm_atomic_helper_dirtyfb() instead.  Easy ;)

cheers,
  Gerd



Re: [PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Daniel Vetter
On Wed, Apr 3, 2019 at 9:23 AM Gerd Hoffmann  wrote:
>
> Time to kill some bad sample code people are copying from ;)
>
> This is a complete rewrite of the cirrus driver.  The cirrus_mode_set()
> function is pretty much the only function which is carried over largely
> unmodified.  Everything else is upside down.
>
> It is a single monster patch.  But given that it does some pretty
> fundamental changes to the driver's workflow and also reduces the code
> size by roughly 70% I think it'll still be a lot easier to review than a
> longish baby-step patch series.
>
> Changes summary:
>  - Given the small amount of video memory (4 MB) the cirrus device has
>the rewritten driver doesn't try to manage buffers there.  Instead
>it will blit (memcpy) the active framebuffer to video memory.
>  - All gem objects are stored in main memory and are managed using the
>new shmem helpers.  ttm is out.
>  - Only DRM_FORMAT_RGB565 (depth 16) is supported.  The old driver does
>that too by default.  There was a module parameter which enables 24/32
>bpp support and disables higher resolutions (due to cirrus hardware
>constraints).  That parameter wasn't reimplemented.
>  - The simple display pipeline is used.
>  - The generic fbdev emulation is used.
>  - It's an atomic driver now.

Sounds all awesome. Some tiny comments below, with those addressed
looks all good and gets my

Acked-by: Daniel Vetter 
-Daniel

> Signed-off-by: Gerd Hoffmann 
> ---
>  drivers/gpu/drm/cirrus/cirrus_drv.h   | 251 ---
>  drivers/gpu/drm/cirrus/cirrus.c   | 602 +
>  drivers/gpu/drm/cirrus/cirrus_drv.c   | 161 ---
>  drivers/gpu/drm/cirrus/cirrus_fbdev.c | 309 -
>  drivers/gpu/drm/cirrus/cirrus_main.c  | 328 --
>  drivers/gpu/drm/cirrus/cirrus_mode.c  | 617 --
>  drivers/gpu/drm/cirrus/cirrus_ttm.c   | 343 --
>  drivers/gpu/drm/cirrus/Kconfig|   2 +-
>  drivers/gpu/drm/cirrus/Makefile   |   3 -
>  9 files changed, 603 insertions(+), 2013 deletions(-)
>  delete mode 100644 drivers/gpu/drm/cirrus/cirrus_drv.h
>  create mode 100644 drivers/gpu/drm/cirrus/cirrus.c
>  delete mode 100644 drivers/gpu/drm/cirrus/cirrus_drv.c
>  delete mode 100644 drivers/gpu/drm/cirrus/cirrus_fbdev.c
>  delete mode 100644 drivers/gpu/drm/cirrus/cirrus_main.c
>  delete mode 100644 drivers/gpu/drm/cirrus/cirrus_mode.c
>  delete mode 100644 drivers/gpu/drm/cirrus/cirrus_ttm.c
>
> diff --git a/drivers/gpu/drm/cirrus/cirrus_drv.h 
> b/drivers/gpu/drm/cirrus/cirrus_drv.h
> deleted file mode 100644
> index 828b150cdb20..
> --- a/drivers/gpu/drm/cirrus/cirrus_drv.h
> +++ /dev/null
> @@ -1,251 +0,0 @@
> -/*
> - * Copyright 2012 Red Hat
> - *
> - * This file is subject to the terms and conditions of the GNU General
> - * Public License version 2. See the file COPYING in the main
> - * directory of this archive for more details.
> - *
> - * Authors: Matthew Garrett
> - *  Dave Airlie
> - */
> -#ifndef __CIRRUS_DRV_H__
> -#define __CIRRUS_DRV_H__
> -
> -#include 
> -
> -#include 
> -#include 
> -
> -#include 
> -#include 
> -#include 
> -#include 
> -#include 
> -
> -#include 
> -
> -#define DRIVER_AUTHOR  "Matthew Garrett"
> -
> -#define DRIVER_NAME"cirrus"
> -#define DRIVER_DESC"qemu Cirrus emulation"
> -#define DRIVER_DATE"20110418"
> -
> -#define DRIVER_MAJOR   1
> -#define DRIVER_MINOR   0
> -#define DRIVER_PATCHLEVEL  0
> -
> -#define CIRRUSFB_CONN_LIMIT 1
> -
> -#define RREG8(reg) ioread8(((void __iomem *)cdev->rmmio) + (reg))
> -#define WREG8(reg, v) iowrite8(v, ((void __iomem *)cdev->rmmio) + (reg))
> -#define RREG32(reg) ioread32(((void __iomem *)cdev->rmmio) + (reg))
> -#define WREG32(reg, v) iowrite32(v, ((void __iomem *)cdev->rmmio) + (reg))
> -
> -#define SEQ_INDEX 4
> -#define SEQ_DATA 5
> -
> -#define WREG_SEQ(reg, v)   \
> -   do {\
> -   WREG8(SEQ_INDEX, reg);  \
> -   WREG8(SEQ_DATA, v); \
> -   } while (0) \
> -
> -#define CRT_INDEX 0x14
> -#define CRT_DATA 0x15
> -
> -#define WREG_CRT(reg, v)   \
> -   do {\
> -   WREG8(CRT_INDEX, reg);  \
> -   WREG8(CRT_DATA, v); \
> -   } while (0) \
> -
> -#define GFX_INDEX 0xe
> -#define GFX_DATA 0xf
> -
> -#define WREG_GFX(reg, v)   \
> -   do {\
> -   WREG8(GFX_INDEX, reg);  \
> -   WREG8(GFX_DATA, v); \
> -   } while (0)  

[PATCH] drm/cirrus: rewrite and modernize driver.

2019-04-03 Thread Gerd Hoffmann
Time to kill some bad sample code people are copying from ;)

This is a complete rewrite of the cirrus driver.  The cirrus_mode_set()
function is pretty much the only function which is carried over largely
unmodified.  Everything else is upside down.

It is a single monster patch.  But given that it does some pretty
fundamental changes to the driver's workflow and also reduces the code
size by roughly 70% I think it'll still be a lot easier to review than a
longish baby-step patch series.

Changes summary:
 - Given the small amount of video memory (4 MB) the cirrus device has
   the rewritten driver doesn't try to manage buffers there.  Instead
   it will blit (memcpy) the active framebuffer to video memory.
 - All gem objects are stored in main memory and are managed using the
   new shmem helpers.  ttm is out.
 - Only DRM_FORMAT_RGB565 (depth 16) is supported.  The old driver does
   that too by default.  There was a module parameter which enables 24/32
   bpp support and disables higher resolutions (due to cirrus hardware
    constraints).  That parameter wasn't reimplemented.
 - The simple display pipeline is used.
 - The generic fbdev emulation is used.
 - It's an atomic driver now.

Signed-off-by: Gerd Hoffmann 
---
 drivers/gpu/drm/cirrus/cirrus_drv.h   | 251 ---
 drivers/gpu/drm/cirrus/cirrus.c   | 602 +
 drivers/gpu/drm/cirrus/cirrus_drv.c   | 161 ---
 drivers/gpu/drm/cirrus/cirrus_fbdev.c | 309 -
 drivers/gpu/drm/cirrus/cirrus_main.c  | 328 --
 drivers/gpu/drm/cirrus/cirrus_mode.c  | 617 --
 drivers/gpu/drm/cirrus/cirrus_ttm.c   | 343 --
 drivers/gpu/drm/cirrus/Kconfig|   2 +-
 drivers/gpu/drm/cirrus/Makefile   |   3 -
 9 files changed, 603 insertions(+), 2013 deletions(-)
 delete mode 100644 drivers/gpu/drm/cirrus/cirrus_drv.h
 create mode 100644 drivers/gpu/drm/cirrus/cirrus.c
 delete mode 100644 drivers/gpu/drm/cirrus/cirrus_drv.c
 delete mode 100644 drivers/gpu/drm/cirrus/cirrus_fbdev.c
 delete mode 100644 drivers/gpu/drm/cirrus/cirrus_main.c
 delete mode 100644 drivers/gpu/drm/cirrus/cirrus_mode.c
 delete mode 100644 drivers/gpu/drm/cirrus/cirrus_ttm.c

diff --git a/drivers/gpu/drm/cirrus/cirrus_drv.h 
b/drivers/gpu/drm/cirrus/cirrus_drv.h
deleted file mode 100644
index 828b150cdb20..
--- a/drivers/gpu/drm/cirrus/cirrus_drv.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright 2012 Red Hat
- *
- * This file is subject to the terms and conditions of the GNU General
- * Public License version 2. See the file COPYING in the main
- * directory of this archive for more details.
- *
- * Authors: Matthew Garrett
- *  Dave Airlie
- */
-#ifndef __CIRRUS_DRV_H__
-#define __CIRRUS_DRV_H__
-
-#include 
-
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-
-#define DRIVER_AUTHOR  "Matthew Garrett"
-
-#define DRIVER_NAME"cirrus"
-#define DRIVER_DESC"qemu Cirrus emulation"
-#define DRIVER_DATE"20110418"
-
-#define DRIVER_MAJOR   1
-#define DRIVER_MINOR   0
-#define DRIVER_PATCHLEVEL  0
-
-#define CIRRUSFB_CONN_LIMIT 1
-
-#define RREG8(reg) ioread8(((void __iomem *)cdev->rmmio) + (reg))
-#define WREG8(reg, v) iowrite8(v, ((void __iomem *)cdev->rmmio) + (reg))
-#define RREG32(reg) ioread32(((void __iomem *)cdev->rmmio) + (reg))
-#define WREG32(reg, v) iowrite32(v, ((void __iomem *)cdev->rmmio) + (reg))
-
-#define SEQ_INDEX 4
-#define SEQ_DATA 5
-
-#define WREG_SEQ(reg, v)   \
-   do {\
-   WREG8(SEQ_INDEX, reg);  \
-   WREG8(SEQ_DATA, v); \
-   } while (0) \
-
-#define CRT_INDEX 0x14
-#define CRT_DATA 0x15
-
-#define WREG_CRT(reg, v)   \
-   do {\
-   WREG8(CRT_INDEX, reg);  \
-   WREG8(CRT_DATA, v); \
-   } while (0) \
-
-#define GFX_INDEX 0xe
-#define GFX_DATA 0xf
-
-#define WREG_GFX(reg, v)   \
-   do {\
-   WREG8(GFX_INDEX, reg);  \
-   WREG8(GFX_DATA, v); \
-   } while (0) \
-
-/*
- * Cirrus has a "hidden" DAC register that can be accessed by writing to
- * the pixel mask register to reset the state, then reading from the register
- * four times. The next write will then pass to the DAC
- */
-#define VGA_DAC_MASK 0x6
-
-#define WREG_HDR(v)\
-   do {