On Wed, Jul 8, 2015 at 2:07 PM, Chris Wilson <ch...@chris-wilson.co.uk> wrote:
> On Wed, Jul 08, 2015 at 02:00:02PM -0700, Matt Turner wrote:
>> By keeping a pointer to the next available location, we reduce the
>> number of memory accesses needed to write to the batchbuffer.
>>
>> A net ~7k reduction of .text size, 7.5k of which is from the change to
>> intel_batchbuffer_emit_dword().
>>
>>    text     data      bss      dec      hex  filename
>> 4943740   195152    26192  5165084   4ed01c  i965_dri.so before
>> 4936804   195152    26192  5158148   4eb504  i965_dri.so after
>>
>> Combined with the previous patch, improves performance of Synmark
>> OglBatch7 by 4.05914% +/- 1.49373% (n=270) on Haswell.
>> ---
>> Full disclosure: when testing on an IVB desktop, I measured a
>> regression in the same benchmark of -4.19005% +/- 1.15188% (n=30).
>> I don't have any explanation.
>
> The problem is that it seems to generate worse code with multiple
> adjacent emit_dwords. I have seen similar regressions when doing the
> same batch[index] to *batch++ elsewhere.
> -Chris

That is in conflict with the data I've provided. In fact, I started by
noticing that if I added intel_batchbuffer_emit_dword* functions that
took multiple dwords that there was a reduction in .text size, so it's
something that I've considered.

The two attached patches demonstrate that the batch[index++] pattern
generates larger (worse) code than *batch++. 1.patch

   text   data    bss    dec    hex filename
4936804 195152  26192 5158148 4eb504 mesa-release/lib/i965_dri.so
after 1.patch (no change)
4936884 195152  26192 5158228 4eb554 mesa-release/lib/i965_dri.so after 2.patch
From 03256a2898bf74b56a35f8fcc130f8fee5902b59 Mon Sep 17 00:00:00 2001
From: Matt Turner <matts...@gmail.com>
Date: Wed, 8 Jul 2015 15:11:07 -0700
Subject: [PATCH 1/2] 1

---
 src/mesa/drivers/dri/i965/gen7_blorp.cpp      | 17 +++++++++--------
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 24 ++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index abace6d..74e0763 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -520,14 +520,15 @@ gen7_blorp_emit_ps_config(struct brw_context *brw,
    }
 
    BEGIN_BATCH(8);
-   OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
-   OUT_BATCH(params->use_wm_prog ? prog_offset : 0);
-   OUT_BATCH(dw2);
-   OUT_BATCH(0);
-   OUT_BATCH(dw4);
-   OUT_BATCH(dw5);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
+   intel_batchbuffer_emit_dword8(brw,
+                                 _3DSTATE_PS << 16 | (8 - 2),
+                                 params->use_wm_prog ? prog_offset : 0,
+                                 dw2,
+                                 0,
+                                 dw4,
+                                 dw5,
+                                 0,
+                                 0);
    ADVANCE_BATCH();
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index a12387b..a5d8f0b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -101,6 +101,30 @@ intel_batchbuffer_emit_dword(struct brw_context *brw, GLuint dword)
 }
 
 static inline void
+intel_batchbuffer_emit_dword8(struct brw_context *brw, GLuint dword1,
+                                                       GLuint dword2,
+                                                       GLuint dword3,
+                                                       GLuint dword4,
+                                                       GLuint dword5,
+                                                       GLuint dword6,
+                                                       GLuint dword7,
+                                                       GLuint dword8)
+{
+#ifdef DEBUG
+   assert(intel_batchbuffer_space(brw) >= 4);
+#endif
+   *brw->batch.map_next++ = dword1;
+   *brw->batch.map_next++ = dword2;
+   *brw->batch.map_next++ = dword3;
+   *brw->batch.map_next++ = dword4;
+   *brw->batch.map_next++ = dword5;
+   *brw->batch.map_next++ = dword6;
+   *brw->batch.map_next++ = dword7;
+   *brw->batch.map_next++ = dword8;
+   assert(brw->batch.ring != UNKNOWN_RING);
+}
+
+static inline void
 intel_batchbuffer_emit_float(struct brw_context *brw, float f)
 {
    intel_batchbuffer_emit_dword(brw, float_as_int(f));
-- 
2.3.6

From 08c78f95dfbaa39303d25c3348ba1fa99b04b31b Mon Sep 17 00:00:00 2001
From: Matt Turner <matts...@gmail.com>
Date: Wed, 8 Jul 2015 15:30:48 -0700
Subject: [PATCH 2/2] 2

---
 src/mesa/drivers/dri/i965/brw_context.h       |  2 +-
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index c7ad35e..c2f6658 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -875,7 +875,7 @@ struct intel_batchbuffer {
 #ifdef DEBUG
    uint16_t emit, total;
 #endif
-   uint16_t reserved_space;
+   uint16_t used, reserved_space;
    uint32_t *map_next;
    uint32_t *map;
    uint32_t *cpu_map;
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index a5d8f0b..282820a 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -113,6 +113,16 @@ intel_batchbuffer_emit_dword8(struct brw_context *brw, GLuint dword1,
 #ifdef DEBUG
    assert(intel_batchbuffer_space(brw) >= 4);
 #endif
+#if 1
+   brw->batch.map[brw->batch.used++] = dword1;
+   brw->batch.map[brw->batch.used++] = dword2;
+   brw->batch.map[brw->batch.used++] = dword3;
+   brw->batch.map[brw->batch.used++] = dword4;
+   brw->batch.map[brw->batch.used++] = dword5;
+   brw->batch.map[brw->batch.used++] = dword6;
+   brw->batch.map[brw->batch.used++] = dword7;
+   brw->batch.map[brw->batch.used++] = dword8;
+#else
    *brw->batch.map_next++ = dword1;
    *brw->batch.map_next++ = dword2;
    *brw->batch.map_next++ = dword3;
@@ -121,6 +131,7 @@ intel_batchbuffer_emit_dword8(struct brw_context *brw, GLuint dword1,
    *brw->batch.map_next++ = dword6;
    *brw->batch.map_next++ = dword7;
    *brw->batch.map_next++ = dword8;
+#endif
    assert(brw->batch.ring != UNKNOWN_RING);
 }
 
-- 
2.3.6

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to