vmalloc BOs give us cached reads, so no need to prefetch in that case.
Prefetching gives a ~20% speedup on a cma buffer using the mi0283qt
driver on a Raspberry Pi 1.

Signed-off-by: Noralf Trønnes <nor...@tronnes.org>
---
 drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c | 54 ++++++++++++++------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
index ee9a8f305b26..bca905213cdd 100644
--- a/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
+++ b/drivers/gpu/drm/tinydrm/core/tinydrm-helpers.c
@@ -15,6 +15,8 @@
 #include <linux/swab.h>
 
 #include <drm/drmP.h>
+#include <drm/drm_gem.h>
+#include <drm/drm_gem_framebuffer_helper.h>
 #include <drm/tinydrm/tinydrm.h>
 #include <drm/tinydrm/tinydrm-helpers.h>
 
@@ -115,22 +117,25 @@ void tinydrm_swab16(u16 *dst, void *vaddr, struct drm_framebuffer *fb,
                    struct drm_clip_rect *clip)
 {
        size_t len = (clip->x2 - clip->x1) * sizeof(u16);
+       u16 *src, *buf = NULL;
        unsigned int x, y;
-       u16 *src, *buf;
 
        /*
-        * The cma memory is write-combined so reads are uncached.
-        * Speed up by fetching one line at a time.
+        * Imported buffers are likely to be write-combined with uncached
+        * reads. Speed up by fetching one line at a time.
+        * prefetch_range() was tried, but didn't give any noticeable speedup
+        * on the Raspberry Pi 1.
         */
-       buf = kmalloc(len, GFP_KERNEL);
-       if (!buf)
-               return;
+       if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+               buf = kmalloc(len, GFP_KERNEL);
 
        for (y = clip->y1; y < clip->y2; y++) {
                src = vaddr + (y * fb->pitches[0]);
                src += clip->x1;
-               memcpy(buf, src, len);
-               src = buf;
+               if (buf) {
+                       memcpy(buf, src, len);
+                       src = buf;
+               }
                for (x = clip->x1; x < clip->x2; x++)
                        *dst++ = swab16(*src++);
        }
@@ -155,19 +160,21 @@ void tinydrm_xrgb8888_to_rgb565(u16 *dst, void *vaddr,
                                struct drm_clip_rect *clip, bool swap)
 {
        size_t len = (clip->x2 - clip->x1) * sizeof(u32);
+       u32 *src, *buf = NULL;
        unsigned int x, y;
-       u32 *src, *buf;
        u16 val16;
 
-       buf = kmalloc(len, GFP_KERNEL);
-       if (!buf)
-               return;
+       /* See tinydrm_swab16() for an explanation */
+       if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+               buf = kmalloc(len, GFP_KERNEL);
 
        for (y = clip->y1; y < clip->y2; y++) {
                src = vaddr + (y * fb->pitches[0]);
                src += clip->x1;
-               memcpy(buf, src, len);
-               src = buf;
+               if (buf) {
+                       memcpy(buf, src, len);
+                       src = buf;
+               }
                for (x = clip->x1; x < clip->x2; x++) {
                        val16 = ((*src & 0x00F80000) >> 8) |
                                ((*src & 0x0000FC00) >> 5) |
@@ -205,24 +212,23 @@ void tinydrm_xrgb8888_to_gray8(u8 *dst, void *vaddr, struct drm_framebuffer *fb,
 {
        unsigned int len = (clip->x2 - clip->x1) * sizeof(u32);
        unsigned int x, y;
-       void *buf;
+       void *buf = NULL;
        u32 *src;
 
        if (WARN_ON(fb->format->format != DRM_FORMAT_XRGB8888))
                return;
-       /*
-        * The cma memory is write-combined so reads are uncached.
-        * Speed up by fetching one line at a time.
-        */
-       buf = kmalloc(len, GFP_KERNEL);
-       if (!buf)
-               return;
+
+       /* See tinydrm_swab16() for an explanation */
+       if (drm_gem_fb_get_obj(fb, 0)->import_attach)
+               buf = kmalloc(len, GFP_KERNEL);
 
        for (y = clip->y1; y < clip->y2; y++) {
                src = vaddr + (y * fb->pitches[0]);
                src += clip->x1;
-               memcpy(buf, src, len);
-               src = buf;
+               if (buf) {
+                       memcpy(buf, src, len);
+                       src = buf;
+               }
                for (x = clip->x1; x < clip->x2; x++) {
                        u8 r = (*src & 0x00ff0000) >> 16;
                        u8 g = (*src & 0x0000ff00) >> 8;
-- 
2.14.2

_______________________________________________
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Reply via email to