Re: [Intel-gfx] [PATCH] drm/i915: Perform object clflushing asynchronously

2016-11-04 Thread Chris Wilson
On Fri, Nov 04, 2016 at 08:03:57PM +, Chris Wilson wrote:
> Flushing the cachelines for an object is slow; it can take as much as
> 100ms for a large framebuffer. We currently do this under the struct_mutex
> BKL on execution or on pageflip. But now, with the ability to add fences
> to obj->resv for both flips and execbuf (and we naturally wait on the
> fence before CPU access), we can move the clflush operation to a
> workqueue and signal a fence on completion, thereby doing the work
> asynchronously and not blocking the driver or its clients.
> 
> Suggested-by: Akash Goel 
> Signed-off-by: Chris Wilson 
> Cc: Akash Goel 

Needs a bit more work to restrict the async operations. In the end, I
think only the explicit paths towards execbuf / flip should opt in, as
the majority of callers want synchronous behaviour (pread/pwrite/
set-domain). This idea came up in a discussion on whether we needed
create2 for an early clflush, or whether we could exploit set-domain for
the same functionality. Now we can do the clflush asynchronously from
create, but we must do it synchronously in set-domain (albeit now it can
be done outside of the struct_mutex).
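
Concretely, the split at the call sites would look something like this
(an illustrative sketch only, reusing the flags from the patch below):

	/* execbuf/flip: opt in to the async path; the fence added to
	 * obj->resv orders the flush against later GPU and CPU access.
	 */
	i915_gem_clflush_object(obj, 0);

	/* set-domain (and pread/pwrite): force the flush to complete
	 * before returning to userspace.
	 */
	i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);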
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


[Intel-gfx] [PATCH] drm/i915: Perform object clflushing asynchronously

2016-11-04 Thread Chris Wilson
Flushing the cachelines for an object is slow; it can take as much as
100ms for a large framebuffer. We currently do this under the struct_mutex
BKL on execution or on pageflip. But now, with the ability to add fences
to obj->resv for both flips and execbuf (and we naturally wait on the
fence before CPU access), we can move the clflush operation to a
workqueue and signal a fence on completion, thereby doing the work
asynchronously and not blocking the driver or its clients.

Suggested-by: Akash Goel 
Signed-off-by: Chris Wilson 
Cc: Akash Goel 
---
 drivers/gpu/drm/i915/Makefile              |   1 +
 drivers/gpu/drm/i915/i915_drv.h            |   8 +-
 drivers/gpu/drm/i915/i915_gem.c            |  60 +++--
 drivers/gpu/drm/i915/i915_gem_clflush.c    | 138 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   6 +-
 drivers/gpu/drm/i915/intel_display.c       |  57 ++--
 6 files changed, 190 insertions(+), 80 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_gem_clflush.c
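
For reference, the core of the new i915_gem_clflush.c reduces to roughly
the following shape. This is a minimal sketch, assuming the dma-fence and
reservation-object API of the time; drm_clflush_sg(), obj->mm.pages and
obj->resv come from the patch and commit message, while the struct, ops
and function names here are illustrative:

#include <linux/dma-fence.h>
#include <linux/reservation.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <drm/drm_cache.h>

struct clflush {
	struct dma_fence dma;	/* first, so dma_fence_free() frees the lot */
	spinlock_t lock;
	struct work_struct work;
	struct drm_i915_gem_object *obj;
};

static const char *clflush_get_driver_name(struct dma_fence *fence)
{
	return "i915";
}

static const char *clflush_get_timeline_name(struct dma_fence *fence)
{
	return "clflush";
}

static bool clflush_enable_signaling(struct dma_fence *fence)
{
	return true;	/* the worker always signals */
}

static const struct dma_fence_ops clflush_ops = {
	.get_driver_name = clflush_get_driver_name,
	.get_timeline_name = clflush_get_timeline_name,
	.enable_signaling = clflush_enable_signaling,
	.wait = dma_fence_default_wait,
};

static void clflush_work(struct work_struct *work)
{
	struct clflush *clflush = container_of(work, typeof(*clflush), work);

	/* The expensive part, now off the struct_mutex: flush every
	 * cacheline backing the object's pages.
	 */
	drm_clflush_sg(clflush->obj->mm.pages);

	/* Wake anyone waiting on obj->resv and drop our reference; the
	 * reservation object keeps its own until the fence is replaced.
	 */
	dma_fence_signal(&clflush->dma);
	dma_fence_put(&clflush->dma);
}

/* Caller holds the reservation lock on obj->resv (and must keep a
 * reference on obj alive until the worker has run).
 */
static int clflush_async(struct drm_i915_gem_object *obj)
{
	struct clflush *clflush;

	clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
	if (!clflush)
		return -ENOMEM;

	spin_lock_init(&clflush->lock);
	dma_fence_init(&clflush->dma, &clflush_ops, &clflush->lock, 0, 0);
	clflush->obj = obj;
	INIT_WORK(&clflush->work, clflush_work);

	/* Publish the fence before kicking the worker so that both GPU
	 * submission and CPU waits serialise against the pending flush.
	 */
	reservation_object_add_excl_fence(obj->resv, &clflush->dma);

	schedule_work(&clflush->work);
	return 0;
}

A synchronous caller (I915_CLFLUSH_SYNC) can then simply wait on the
fence it has just installed in obj->resv before returning.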

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 0857e5035f4d..6afd402e440b 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -29,6 +29,7 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o
 # GEM code
 i915-y += i915_cmd_parser.o \
  i915_gem_batch_pool.o \
+ i915_gem_clflush.o \
  i915_gem_context.o \
  i915_gem_dmabuf.o \
  i915_gem_evict.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2754d5de76af..c80044267333 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3410,7 +3410,13 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
 
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
-bool i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);
+
+void i915_gem_clflush_init(struct drm_i915_private *i915);
+int i915_gem_clflush_object(struct drm_i915_gem_object *obj,
+   unsigned int flags);
+#define I915_CLFLUSH_FORCE BIT(0)
+#define I915_CLFLUSH_SYNC BIT(1)
+
 int __must_check i915_gem_init(struct drm_device *dev);
 int __must_check i915_gem_init_hw(struct drm_device *dev);
 void i915_gem_init_swizzling(struct drm_device *dev);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cffe60237b6a..524f72774537 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -230,7 +230,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj)
obj->mm.dirty = false;
 
if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
-   i915_gem_clflush_object(obj, false);
+   i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 
obj->base.read_domains = I915_GEM_DOMAIN_CPU;
obj->base.write_domain = I915_GEM_DOMAIN_CPU;
@@ -1570,6 +1570,11 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 
mutex_unlock(&dev->struct_mutex);
 
+   if (err == 0)
+   err = i915_gem_object_wait(obj,
+  I915_WAIT_INTERRUPTIBLE,
+  MAX_SCHEDULE_TIMEOUT,
+  NULL);
if (write_domain != 0)
intel_fb_obj_invalidate(obj, write_origin(obj, write_domain));
 
@@ -3236,44 +3241,6 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
return ret;
 }
 
-bool
-i915_gem_clflush_object(struct drm_i915_gem_object *obj,
-   bool force)
-{
-   /* If we don't have a page list set up, then we're not pinned
-* to GPU, and we can ignore the cache flush because it'll happen
-* again at bind time.
-*/
-   if (!obj->mm.pages)
-   return false;
-
-   /*
-* Stolen memory is always coherent with the GPU as it is explicitly
-* marked as wc by the system, or the system is cache-coherent.
-*/
-   if (obj->stolen || obj->phys_handle)
-   return false;
-
-   /* If the GPU is snooping the contents of the CPU cache,
-* we do not need to manually clear the CPU cache lines.  However,
-* the caches are only snooped when the render cache is
-* flushed/invalidated.  As we always have to emit invalidations
-* and flushes when moving into and out of the RENDER domain, correct
-* snooping behaviour occurs naturally as the result of our domain
-* tracking.
-*/
-   if (!force && cpu_cache_is_coherent(obj->base.dev, obj->cache_level)) {
-   obj->cache_dirty = true;
-   return false;
-   }
-
-   trace_i915_gem_object_clflush(obj);
-   drm_clflush_sg(obj->mm.pages);
-   obj->cache_dirty = false;
-
-   return true;
-}
-
 /** Flushes the GTT wr