[Mesa-dev] [PATCH] [V2] i965: Context aware user space EU control through application

2018-08-13 Thread aravindan . muthukumar
From: "Muthukumar, Aravindan" 

 This patch lets the user/application declare the maximum GPU load
 a context is expected to generate. When that is known in advance,
 the RPCS can be programmed accordingly.
 The solution requires changes across i915, drm and mesa
 (it is not limited to the kernel).

 Here, the application passes gpu_load_type = {high, medium, low}
 when the context is created. The default is 'High'; applications
 roughly know whether they are going to use the entire GPU. A typical
 use case for 'Low' is an idle screen or minor mouse movements. Users
 can look up what high/medium/low means on their platform and then
 program contexts accordingly. Here, gpu_load_type translates directly
 to the number of shader cores/EUs a particular GPU has.

 V2 : changes for setting the loadtype through setparam.

 Signed-off-by: Aravindan Muthukumar 
 Signed-off-by: Kedar J Karanje 
 Signed-off-by: Praveen Diwakar 
 Signed-off-by: Yogesh Marathe 
---
 include/drm-uapi/i915_drm.h |  1 +
 src/mesa/drivers/dri/i965/brw_bufmgr.c  | 20 
 src/mesa/drivers/dri/i965/brw_bufmgr.h  |  4 
 src/mesa/drivers/dri/i965/brw_context.c |  8 +++-
 4 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
index 16e452a..737e78e 100644
--- a/include/drm-uapi/i915_drm.h
+++ b/include/drm-uapi/i915_drm.h
@@ -1453,6 +1453,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE0x4
 #define I915_CONTEXT_PARAM_BANNABLE0x5
 #define I915_CONTEXT_PARAM_PRIORITY0x6
+#define I915_CONTEXT_PARAM_LOAD_TYPE0x7
 #define   I915_CONTEXT_MAX_USER_PRIORITY   1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY0
 #define   I915_CONTEXT_MIN_USER_PRIORITY   -1023 /* inclusive */
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 8ba915b..8dd4bab 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1332,6 +1332,26 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
return create.ctx_id;
 }
 
+/* DYNAMIC EU CONTROL */
+int
+brw_hw_context_load_type(struct brw_bufmgr *bufmgr,
+uint32_t ctx_id,
+int load_type)
+{
+   struct drm_i915_gem_context_param p = {
+   .ctx_id = ctx_id,
+   .param  = I915_CONTEXT_PARAM_LOAD_TYPE,
+   .value = load_type,
+   };
+   int err;
+
+   err = 0;
+   if(drmIoctl(bufmgr->fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, ))
+   err = -errno;
+
+   return err;
+}
+
 int
 brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
 uint32_t ctx_id,
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h 
b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 68f5e0c..9e9419b 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -313,6 +313,10 @@ int brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns);
 
 uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 
+int brw_hw_context_load_type(struct brw_bufmgr *bufmgr,
+ uint32_t ctx_id,
+ int load_type);
+
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
 uint32_t ctx_id,
 int priority);
diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index 01a3e16..2ef21b6 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -888,7 +888,8 @@ brwCreateContext(gl_api api,
 
if (ctx_config->attribute_mask &
~(__DRIVER_CONTEXT_ATTRIB_RESET_STRATEGY |
- __DRIVER_CONTEXT_ATTRIB_PRIORITY)) {
+ __DRIVER_CONTEXT_ATTRIB_PRIORITY |
+ __DRIVER_CONTEXT_ATTRIB_LOAD_TYPE)) {
   *dri_ctx_error = __DRI_CTX_ERROR_UNKNOWN_ATTRIBUTE;
   return false;
}
@@ -1005,6 +1006,11 @@ brwCreateContext(gl_api api,
  return false;
   }
 
+  if(ctx_config->attribute_mask & __DRIVER_CONTEXT_ATTRIB_LOAD_TYPE) {
+ brw_hw_context_load_type(brw->bufmgr,
+  brw->hw_ctx,ctx_config->load_type);
+  }
+
   int hw_priority = GEN_CONTEXT_MEDIUM_PRIORITY;
   if (ctx_config->attribute_mask & __DRIVER_CONTEXT_ATTRIB_PRIORITY) {
  switch (ctx_config->priority) {
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/2] i965: Context aware user space EU control through application

2018-07-20 Thread aravindan . muthukumar
From: "Muthukumar, Aravindan" 

 This patch lets the user/application declare the maximum GPU load
 a context is expected to generate. When that is known in advance,
 the RPCS can be programmed accordingly.
 The solution requires changes across i915, drm and mesa
 (it is not limited to the kernel).

 Here, the application passes gpu_load_type = {high, medium, low}
 when the context is created. The default is 'High'; applications
 roughly know whether they are going to use the entire GPU. A typical
 use case for 'Low' is an idle screen or minor mouse movements. Users
 can look up what high/medium/low means on their platform and then
 program contexts accordingly. Here, gpu_load_type translates directly
 to the number of shader cores/EUs a particular GPU has.

 Signed-off-by: Aravindan Muthukumar 
 Signed-off-by: Kedar J Karanje 
 Signed-off-by: Praveen Diwakar 
 Signed-off-by: Yogesh Marathe 
---
 include/drm-uapi/i915_drm.h |  8 
 src/mesa/drivers/dri/i965/brw_bufmgr.c  | 19 +++
 src/mesa/drivers/dri/i965/brw_bufmgr.h  |  4 
 src/mesa/drivers/dri/i965/brw_context.c |  8 +++-
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/drm-uapi/i915_drm.h b/include/drm-uapi/i915_drm.h
index 16e452a..f07c55a 100644
--- a/include/drm-uapi/i915_drm.h
+++ b/include/drm-uapi/i915_drm.h
@@ -319,6 +319,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_PERF_ADD_CONFIG   0x37
 #define DRM_I915_PERF_REMOVE_CONFIG0x38
 #define DRM_I915_QUERY 0x39
+#define DRM_I915_LOAD_TYPE 0x3d
 
 #define DRM_IOCTL_I915_INITDRM_IOW( DRM_COMMAND_BASE + 
DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH   DRM_IO ( DRM_COMMAND_BASE + 
DRM_I915_FLUSH)
@@ -377,6 +378,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_PERF_ADD_CONFIG DRM_IOW(DRM_COMMAND_BASE + 
DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
 #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG  DRM_IOW(DRM_COMMAND_BASE + 
DRM_I915_PERF_REMOVE_CONFIG, __u64)
 #define DRM_IOCTL_I915_QUERY   DRM_IOWR(DRM_COMMAND_BASE + 
DRM_I915_QUERY, struct drm_i915_query)
+#define DRM_IOCTL_I915_LOAD_TYPEDRM_IOWR (DRM_COMMAND_BASE + 
DRM_I915_LOAD_TYPE, struct drm_i915_load_type)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -1387,6 +1389,12 @@ struct drm_i915_gem_context_create {
__u32 pad;
 };
 
+/* Dynamic Eu control */
+struct drm_i915_load_type {
+   __u32 ctx_id;
+   __u32 load_type;
+};
+
 struct drm_i915_gem_context_destroy {
__u32 ctx_id;
__u32 pad;
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 8ba915b..ac74dfd 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -1332,6 +1332,25 @@ brw_create_hw_context(struct brw_bufmgr *bufmgr)
return create.ctx_id;
 }
 
+/* DYNAMIC EU CONTROL */
+int
+brw_hw_context_load_type(struct brw_bufmgr *bufmgr,
+uint32_t ctx_id,
+int load_type)
+{
+   struct drm_i915_load_type type = {
+   .ctx_id = ctx_id,
+   .load_type = load_type,
+   };
+   int err;
+
+   err = 0;
+   if(drmIoctl(bufmgr->fd, DRM_IOCTL_I915_LOAD_TYPE, ))
+   err = -errno;
+
+   return err;
+}
+
 int
 brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
 uint32_t ctx_id,
diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h 
b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 68f5e0c..9e9419b 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -313,6 +313,10 @@ int brw_bo_wait(struct brw_bo *bo, int64_t timeout_ns);
 
 uint32_t brw_create_hw_context(struct brw_bufmgr *bufmgr);
 
+int brw_hw_context_load_type(struct brw_bufmgr *bufmgr,
+ uint32_t ctx_id,
+ int load_type);
+
 int brw_hw_context_set_priority(struct brw_bufmgr *bufmgr,
 uint32_t ctx_id,
 int priority);
diff --git a/src/mesa/drivers/dri/i965/brw_context.c 
b/src/mesa/drivers/dri/i965/brw_context.c
index 01a3e16..2ef21b6 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -888,7 +888,8 @@ brwCreateContext(gl_api api,
 
if (ctx_config->attribute_mask &
~(__DRIVER_CONTEXT_ATTRIB_RESET_STRATEGY |
- __DRIVER_CONTEXT_ATTRIB_PRIORITY)) {
+ __DRIVER_CONTEXT_ATTRIB_PRIORITY |
+ __DRIVER_CONTEXT_ATTRIB_LOAD_TYPE)) {
   *dri_ctx_error = __DRI_CTX_ERROR_UNKNOWN_ATTRIBUTE;
   return false;
}
@@ -1005,6 +1006,11 @@ brwCreateContext(gl_api api,
  return false;
   }
 
+  if(ctx_config->attribute_mask & __

[Mesa-dev] [PATCH 1/2] RFC : Context aware user space Resource control

2018-07-20 Thread aravindan . muthukumar
From: "Muthukumar, Aravindan" 

 This patch lets the user/application declare the maximum GPU load
 a context is expected to generate. When that is known in advance,
 the RPCS can be programmed accordingly.

 Here, the application passes gpu_load_type = {high, medium, low}
 when the context is created. The default is 'High'; applications
 roughly know whether they are going to use the entire GPU. A typical
 use case for 'Low' is an idle screen or minor mouse movements. Users
 can look up what high/medium/low means on their platform and then
 program contexts accordingly. Here, gpu_load_type translates directly
 to the number of shader cores/EUs a particular GPU has.

 Signed-off-by: Aravindan Muthukumar 
 Signed-off-by: Kedar J Karanje 
 Signed-off-by: Praveen Diwakar 
 Signed-off-by: Yogesh Marathe  
---
 include/EGL/eglext.h   |  8 
 include/GL/glxext.h|  9 +
 include/GL/internal/dri_interface.h| 12 
 src/egl/drivers/dri2/egl_dri2.c| 16 
 src/egl/generate/egl.xml   |  8 
 src/egl/main/eglcontext.c  |  5 +
 src/egl/main/eglcontext.h  |  2 ++
 src/glx/dri2_glx.c |  7 ++-
 src/glx/dri3_glx.c | 10 --
 src/glx/dri_common.c   | 19 ++-
 src/glx/dri_common.h   |  2 +-
 src/glx/drisw_glx.c|  9 -
 src/mesa/drivers/dri/common/dri_util.c |  6 +-
 src/mesa/drivers/dri/common/dri_util.h |  5 -
 14 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h
index 2f990cc..a29eccc 100644
--- a/include/EGL/eglext.h
+++ b/include/EGL/eglext.h
@@ -918,6 +918,14 @@ EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurfaceHI 
(EGLDisplay dpy, EGLConfi
 #define EGL_CONTEXT_PRIORITY_LOW_IMG  0x3103
 #endif /* EGL_IMG_context_priority */
 
+#ifndef EGL_CONTEXT_load_type
+#define EGL_CONTEXT_load_type 1
+#define EGL_CONTEXT_LOAD_TYPE 0x31A0
+#define EGL_CONTEXT_LOAD_LOW  0x31A1
+#define EGL_CONTEXT_LOAD_MEDIUM   0x31A2
+#define EGL_CONTEXT_LOAD_HIGH 0x31A3
+#endif /* EGL_CONTEXT_load_type */
+
 #ifndef EGL_IMG_image_plane_attribs
 #define EGL_IMG_image_plane_attribs 1
 #define EGL_NATIVE_BUFFER_MULTIPLANE_SEPARATE_IMG 0x3105
diff --git a/include/GL/glxext.h b/include/GL/glxext.h
index 0f60a38..9d4c3d0 100644
--- a/include/GL/glxext.h
+++ b/include/GL/glxext.h
@@ -163,6 +163,15 @@ __GLXextFuncPtr glXGetProcAddress (const GLubyte 
*procName);
 #define GLX_CONTEXT_RELEASE_BEHAVIOR_FLUSH_ARB 0x2098
 #endif /* GLX_ARB_context_flush_control */
 
+#ifndef GLX_CONTEXT_load_type
+#define GLX_CONTEXT_load_type 1
+#define GLX_CONTEXT_LOAD_TYPE 0x31A0
+#define GLX_CONTEXT_LOAD_LOW 0x31A1
+#define GLX_CONTEXT_LOAD_MEDIUM 0x31A2
+#define GLX_CONTEXT_LOAD_HIGH 0x31A3
+#endif
+
+
 #ifndef GLX_ARB_create_context
 #define GLX_ARB_create_context 1
 #define GLX_CONTEXT_DEBUG_BIT_ARB 0x0001
diff --git a/include/GL/internal/dri_interface.h 
b/include/GL/internal/dri_interface.h
index 4f4795c..0a8492a 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1140,6 +1140,18 @@ struct __DRIdri2LoaderExtensionRec {
 #define __DRI_CTX_RELEASE_BEHAVIOR_FLUSH1
 /*@}*/
 
+
+/**
+ * \name Context Load Types.
+ */
+/*@{*/
+#define __DRI_CTX_ATTRIB_LOAD_TYPE  6
+
+#define __DRI_CTX_ATTRIB_LOAD_LOW   0
+#define __DRI_CTX_ATTRIB_LOAD_MEDIUM1
+#define __DRI_CTX_ATTRIB_LOAD_HIGH  2
+/*@}*/
+
 /**
  * \name Reasons that __DRIdri2Extension::createContextAttribs might fail
  */
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index 45d0c72..c0dc97a 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -1212,6 +1212,22 @@ dri2_fill_context_attribs(struct dri2_egl_context 
*dri2_ctx,
   ctx_attribs[pos++] = __DRI_CTX_RELEASE_BEHAVIOR_NONE;
}
 
+   if(dri2_ctx->base.LoadType != EGL_CONTEXT_LOAD_HIGH) {
+  ctx_attribs[pos++] = __DRI_CTX_ATTRIB_LOAD_TYPE;
+
+  switch(dri2_ctx->base.LoadType) {
+  case EGL_CONTEXT_LOAD_MEDIUM:
+ ctx_attribs[pos++] = __DRI_CTX_ATTRIB_LOAD_MEDIUM;
+ break;
+  case EGL_CONTEXT_LOAD_LOW:
+ ctx_attribs[pos++] = __DRI_CTX_ATTRIB_LOAD_LOW;
+ break;
+  default:
+_eglError(EGL_BAD_CONFIG, "eglCreateContext");
+ return false;
+  }
+   }
+
*num_attribs = pos;
 
return true;
diff --git a/src/egl/generate/egl.xml b/src/egl/generate/egl.xml
index 9250f93..52b0c9f 100644
--- a/src/egl/generate/egl.xml
+++ b/src/egl/generate/egl.xml
@@ -460,6 +460,14 @@
 
 
 
+
+
+
+
+
+
+
+
 
 
 
diff --git a/src/egl/main/eglcontext.c b/src/egl/m

[Mesa-dev] [PATCH v4] i965 : optimized bucket index calculation

2017-11-08 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

Reducing Bucket index calculation to O(1).

This algorithm calculates the index using matrix method.
Matrix arrangement is as below:
Assuming PAGE_SIZE is 4096.

  1*4096   2*4096    3*4096    4*4096
  5*4096   6*4096    7*4096    8*4096
  10*4096  12*4096   14*4096   16*4096
  20*4096  24*4096   28*4096   32*4096
   ...      ...       ...       ...
   ...      ...       ...       ...
   ...      ...       ...       max_cache_size

From this matrix it is clearly seen that every row
follows the pattern below:
  ...       ...       ...       n
n+(1/4)n  n+(1/2)n  n+(3/4)n    2n

Row is calculated as log2(size/PAGE_SIZE)
Column is calculated as converting the difference
between the elements to fit into power size of two
and indexing it.

Final Index is (row*4)+(col-1)

Tested with Intel Mesa CI.

Improves performance of 3DMark on BXT by 0.705966% +/- 0.229767% (n=20)

v4: Review comments on style and code comments implemented (Ian).
v3: Review comments implemented (Ian).
v2: Review comments implemented (Jason).

Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Kedar Karanje <kedar.j.kara...@intel.com>
Reviewed-by: Yogesh Marathe <yogesh.mara...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c | 47 --
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 17036b5..f21df5a 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -86,6 +86,8 @@
 
 #define memclear(s) memset(, 0, sizeof(s))
 
+#define PAGE_SIZE 4096
+
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
 
 static inline int
@@ -180,19 +182,44 @@ bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, 
uint32_t tiling)
return ALIGN(pitch, tile_width);
 }
 
+/**
+ * This function finds the correct bucket fit for the input size.
+ * The function works with O(1) complexity when the requested size
+ * was queried instead of iterating the size through all the buckets.
+ */
 static struct bo_cache_bucket *
 bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
 {
-   int i;
+   /* Calculating the pages and rounding up to the page size. */
+   const unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+
+   /* Row  Bucket sizes    clz((x-1) | 3)   Row    Column
+    *        in pages                      stride   size
+    *   0:   1  2  3  4 -> 30 30 30 30        4       1
+    *   1:   5  6  7  8 -> 29 29 29 29        4       1
+    *   2:  10 12 14 16 -> 28 28 28 28        8       2
+    *   3:  20 24 28 32 -> 27 27 27 27       16       4
+    */
+   const unsigned row = 30 - __builtin_clz((pages - 1) | 3);
+   const unsigned row_max_pages = 4 << row;
+   
+   /* The '& ~2' is the special case for the first row (row 0). In that
+    * row, max pages / 2 is 2, but the previous row maximum is zero
+    * (because there is no previous row). All row maximum sizes are
+    * powers of 2, so that is the only case where that bit will be set.
+    */
+   const unsigned prev_row_max_pages = (row_max_pages / 2) & ~2;
+   int col_size_log2 = row - 1;
+   col_size_log2 += (col_size_log2 < 0);
 
-   for (i = 0; i < bufmgr->num_buckets; i++) {
-  struct bo_cache_bucket *bucket = >cache_bucket[i];
-  if (bucket->size >= size) {
- return bucket;
-  }
-   }
+   const unsigned col = (pages - prev_row_max_pages +
+((1 << col_size_log2) - 1)) >> col_size_log2;
 
-   return NULL;
+   /* Calculating the index based on the row and column. */
+   const unsigned index = (row * 4) + (col - 1);
+
+   return (index < bufmgr->num_buckets) ?
+  >cache_bucket[index] : NULL;
 }
 
 int
@@ -1254,6 +1281,10 @@ add_bucket(struct brw_bufmgr *bufmgr, int size)
list_inithead(>cache_bucket[i].head);
bufmgr->cache_bucket[i].size = size;
bufmgr->num_buckets++;
+
+   assert(bucket_for_size(bufmgr, size) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size - 2048) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size + 1) != >cache_bucket[i]);
 }
 
 static void
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3] i965 : optimized bucket index calculation

2017-11-06 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

Now the complexity has been reduced to O(1)

Algorithm calculates the index using matrix method.
Matrix arrangement is as below:
Assuming PAGE_SIZE is 4096.

  1*4096   2*4096    3*4096    4*4096
  5*4096   6*4096    7*4096    8*4096
  10*4096  12*4096   14*4096   16*4096
  20*4096  24*4096   28*4096   32*4096
   ...      ...       ...       ...
   ...      ...       ...       ...
   ...      ...       ...       max_cache_size

From this matrix it is clearly seen that every row
follows the pattern below:
  ...       ...       ...       n
n+(1/4)n  n+(1/2)n  n+(3/4)n    2n

Row is calculated as log2(size/PAGE_SIZE)
Column is calculated as converting the difference
between the elements to fit into power size of two
and indexing it.

Final Index is (row*4)+(col-1)

Tested with Intel Mesa CI.

Improves performance of 3DMark on BXT by 0.705966% +/- 0.229767% (n=20)

v3: review comments implemented (Ian).
v2: review comments implemented (Jason).
 
Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Kedar Karanje <kedar.j.kara...@intel.com>
Reviewed-by: Yogesh Marathe <yogesh.mara...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c | 38 +++---
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 17036b5..9a423da 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -86,6 +86,8 @@
 
 #define memclear(s) memset(, 0, sizeof(s))
 
+#define PAGE_SIZE 4096
+
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
 
 static inline int
@@ -180,19 +182,35 @@ bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, 
uint32_t tiling)
return ALIGN(pitch, tile_width);
 }
 
+/*
+ * This function finds the correct bucket fit for the input size.
+ * The function works with O(1) complexity when the requested size
+ * was queried instead of iterating the size through all the buckets.
+ */
 static struct bo_cache_bucket *
 bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
 {
-   int i;
+   /* Calculating the pages and rounding up to the page size. */
+   const unsigned int pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
 
-   for (i = 0; i < bufmgr->num_buckets; i++) {
-  struct bo_cache_bucket *bucket = >cache_bucket[i];
-  if (bucket->size >= size) {
- return bucket;
-  }
-   }
+   /* Finding the row number based on the calculated pages. */
+   const unsigned int rows = 30 - __builtin_clz((pages - 1) | 3);
 
-   return NULL;
+   const unsigned int row_max_pages = 4 << rows;
+   const unsigned int prev_row_max_pages = (row_max_pages / 2) & ~2;
+
+   /* Finding the column number using column interval. */
+   int col_size_log2 = rows - 1;
+   col_size_log2 += (col_size_log2 < 0);
+
+   const unsigned int col = ( (pages - prev_row_max_pages +
+( (1 << col_size_log2) - 1) ) >> col_size_log2 );
+
+   /* Calculating the index based on the row and column. */
+   const unsigned int index = (rows * 4) + (col - 1);
+
+   return (index < bufmgr->num_buckets) ?
+  >cache_bucket[index] : NULL;
 }
 
 int
@@ -1254,6 +1272,10 @@ add_bucket(struct brw_bufmgr *bufmgr, int size)
list_inithead(>cache_bucket[i].head);
bufmgr->cache_bucket[i].size = size;
bufmgr->num_buckets++;
+
+   assert(bucket_for_size(bufmgr, size) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size - 2048) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size + 1) != >cache_bucket[i]);
 }
 
 static void
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3] i965 : optimized bucket index calculation.

2017-10-26 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

 Avoiding the loop which was running with O(n) complexity.
 Now the complexity has been reduced to O(1)

 Algorithm calculates the index using matrix method.
 Matrix arrangement is as below:
 Assuming PAGE_SIZE is 4096.

  1*4096   2*4096    3*4096    4*4096
  5*4096   6*4096    7*4096    8*4096
  10*4096  12*4096   14*4096   16*4096
  20*4096  24*4096   28*4096   32*4096
   ...      ...       ...       ...
   ...      ...       ...       ...
   ...      ...       ...       max_cache_size

 From this matrix it is clearly seen that every row
 follows the pattern below:
   ...       ...       ...       n
 n+(1/4)n  n+(1/2)n  n+(3/4)n    2n

 Row is calculated as log2(size/PAGE_SIZE)
 Column is calculated as converting the difference
 between the elements to fit into power size of two
 and indexing it.

 Final Index is (row*4)+(col-1)

 Tested with Intel Mesa CI.

 Improves performance of 3d Mark on Broxton.
 Analyzed using Compare Perf Analyser:
 Average : 201.2 +/- 65.4836 (n=20)
 Percentage : 0.705966% +/- 0.229767% (n=20)

 v3: Review comments implemented

 Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
 Signed-off-by: Kedar Karanje <kedar.j.kara...@intel.com>
 Reviewed-by: Yogesh Marathe <yogesh.mara...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c | 42 --
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 17036b5..49514a4 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -86,6 +86,8 @@
 
 #define memclear(s) memset(, 0, sizeof(s))
 
+#define PAGE_SIZE 4096
+
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
 
 static inline int
@@ -180,19 +182,41 @@ bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, 
uint32_t tiling)
return ALIGN(pitch, tile_width);
 }
 
+static inline int
+ilog2_round_up(int value)
+{
+   assert(value != 0);
+   return 32 - __builtin_clz(value - 1);
+}
+
+/*
+ * This function finds the correct bucket fit for the input size.
+ * The function works with O(1) complexity when the requested size
+ * was queried instead of iterating the size through all the buckets.
+ */
 static struct bo_cache_bucket *
 bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
 {
-   int i;
+   int index;
 
-   for (i = 0; i < bufmgr->num_buckets; i++) {
-  struct bo_cache_bucket *bucket = >cache_bucket[i];
-  if (bucket->size >= size) {
- return bucket;
-  }
+   /* Condition for size less than 4*4096 (16KB) page size. */
+   if (size <= 4 * PAGE_SIZE) {
+  index = DIV_ROUND_UP(size, PAGE_SIZE) - 1;
+   } else {
+  /* Number of pages of page size */
+  const int pages = DIV_ROUND_UP(size, PAGE_SIZE);
+  const int pages_log2 = ilog2_round_up(pages) - 1;
+
+  /* Finding the row and column of the matrix */
+  const int row = pages_log2 - 1;
+  const int col = DIV_ROUND_UP((pages - (1 << pages_log2)),
+   (1 << (pages_log2 - 2)));
+  /* Using the calculated row and column to index into the matrix */
+  index = (row << 2) + (col - 1);
}
 
-   return NULL;
+   return (index >= 0 && index < bufmgr->num_buckets) ?
+  >cache_bucket[index] : NULL;
 }
 
 int
@@ -1254,6 +1278,10 @@ add_bucket(struct brw_bufmgr *bufmgr, int size)
list_inithead(>cache_bucket[i].head);
bufmgr->cache_bucket[i].size = size;
bufmgr->num_buckets++;
+
+   assert(bucket_for_size(bufmgr, size) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size - 2048) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size + 1) != >cache_bucket[i]);
 }
 
 static void
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2] i965 : optimized bucket index calculation

2017-09-14 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

Avoiding the loop which was running with O(n) complexity.
Now the complexity has been reduced to O(1)

Algorithm calculates the index using matrix method.
Matrix arrangement is as below:
Assuming PAGE_SIZE is 4096.

 1*4096   2*4096    3*4096    4*4096
 5*4096   6*4096    7*4096    8*4096
  ...      ...       ...       ...
  ...      ...       ...       ...
  ...      ...       ...       max_cache_size

From this matrix it is clearly seen that every row
follows the pattern below:
  ...       ...       ...       n
n+(1/4)n  n+(1/2)n  n+(3/4)n    2n

Row is calculated as log2(size/PAGE_SIZE)
Column is calculated as converting the difference
between the elements to fit into power size of two
and indexing it.

Final Index is (row*4)+(col-1)

Tested with Intel Mesa CI.

Improves performance of 3d Mark on Broxton.
Analyzed using Compare Perf Analyser:
Average : 201.2 +/- 65.4836 (n=20)
Percentage : 0.705966% +/- 0.229767% (n=20)

v2: Review comments regarding cosmetics and asserts implemented

Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Kedar Karanje <kedar.j.kara...@intel.com>
Reviewed-by: Yogesh Marathe <yogesh.mara...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c | 46 --
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 8017219..8013ccb 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -87,6 +87,8 @@
 
 #define memclear(s) memset(, 0, sizeof(s))
 
+#define PAGE_SIZE 4096
+
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
 
 static inline int
@@ -181,19 +183,45 @@ bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, 
uint32_t tiling)
return ALIGN(pitch, tile_width);
 }
 
+static inline int
+ilog2_round_up(int value)
+{
+   assert(value != 0);
+   return 32 - __builtin_clz(value - 1);
+}
+
+/*
+ * This function finds the correct bucket fit for the input size.
+ * The function works with O(1) complexity when the requested size
+ * was queried instead of iterating the size through all the buckets.
+ */
 static struct bo_cache_bucket *
 bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
 {
-   int i;
+   int index = -1;
+   int row, col = 0;
+   int pages, pages_log2;
 
-   for (i = 0; i < bufmgr->num_buckets; i++) {
-  struct bo_cache_bucket *bucket = >cache_bucket[i];
-  if (bucket->size >= size) {
- return bucket;
-  }
+   /* condition for size less  than 4*4096 (16KB) page size */
+   if(size <= 4 * PAGE_SIZE) {
+  index = DIV_ROUND_UP(size, PAGE_SIZE) - 1;;
+   } else {
+  /* Number of pages of page size */
+  pages = DIV_ROUND_UP(size, PAGE_SIZE);
+  pages_log2 = ilog2_round_up(pages) - 1;
+
+  /* Finding the row and column of the matrix */
+  row = pages_log2 - 1;
+  col = DIV_ROUND_UP((pages - (1 << pages_log2)),
+(1 << (pages_log2 - 2)));
+
+  /* Using the calculated row and column to index into the matrix */
+  index = (row << 2) + (col - 1);
}
 
-   return NULL;
+   /* Checking the error condition */
+   return (index >= 0 && index < bufmgr->num_buckets) ?
+  (>cache_bucket[index]) : NULL;
 }
 
 int
@@ -1239,6 +1267,10 @@ add_bucket(struct brw_bufmgr *bufmgr, int size)
list_inithead(>cache_bucket[i].head);
bufmgr->cache_bucket[i].size = size;
bufmgr->num_buckets++;
+
+   assert(bucket_for_size(bufmgr, size) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size - 2048) == >cache_bucket[i]);
+   assert(bucket_for_size(bufmgr, size + 1) != >cache_bucket[i]);
 }
 
 static void
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965 : optimized bucket index calculation

2017-09-08 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

Avoiding the loop which was running with O(n) complexity.
Now the complexity has been reduced to O(1)

Tested with piglit.
Slight performance improvement (~1%) in 3d mark.

Change-Id: Id099f1cd24ad5b691a69070eda79b8f4e9be39a6
Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Kedar Karanje <kedar.j.kara...@intel.com>
Reviewed-by: Yogesh Marathe <yogesh.mara...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.c | 48 +-
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.c 
b/src/mesa/drivers/dri/i965/brw_bufmgr.c
index 5b4e784..18cb166 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.c
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.c
@@ -87,6 +87,11 @@
 
 #define memclear(s) memset(, 0, sizeof(s))
 
+/* Macros for BO cache size */
+#define CACHE_PAGE_SIZE4096
+#define PAGE_SIZE_SHIFT12
+#define BO_CACHE_PAGE_SIZE (4 * CACHE_PAGE_SIZE)
+
 #define FILE_DEBUG_FLAG DEBUG_BUFMGR
 
 static inline int
@@ -181,19 +186,48 @@ bo_tile_pitch(struct brw_bufmgr *bufmgr, uint32_t pitch, 
uint32_t tiling)
return ALIGN(pitch, tile_width);
 }
 
+/*
+ * This functions is to find the correct bucket fit for the input size.
+ * This function works with O(1) complexity when the requested size
+ * was queried instead of iterating the size through all the buckets.
+ */
 static struct bo_cache_bucket *
 bucket_for_size(struct brw_bufmgr *bufmgr, uint64_t size)
 {
-   int i;
+   struct bo_cache_bucket *bucket = NULL;
+   int x=0,index = -1;
+   int row, col=0;
 
-   for (i = 0; i < bufmgr->num_buckets; i++) {
-  struct bo_cache_bucket *bucket = >cache_bucket[i];
-  if (bucket->size >= size) {
- return bucket;
-  }
+   /* condition for sizes below 4*4096 bytes (16KB, i.e. four pages) */
+   if(size < BO_CACHE_PAGE_SIZE){
+  index = (size>>PAGE_SIZE_SHIFT)+((size%(1<<PAGE_SIZE_SHIFT)?1:0))-1;
}
+   else{
+  /* When the size is more than 4*4096, the logic follows a matrix method
+   * where the index will be searched using Arithmetico-Geometric 
progression.
+   * So the given size will be divided by 4096 & the index will be traced 
out.
+   */
+  x = size>>PAGE_SIZE_SHIFT;
 
-   return NULL;
+  /* Find the row using Geometric Progression. The highest bit set will 
give
+   * the row number. num = a * r^(n-1) where num = size a = 4 r = 2
+   */
+  row = 31 - __builtin_clz(x>>1);
+
+ /* Find the column using AP but using the row value
+  * calculated using GP.
+  */
+  col =((x-(1<<(row+1)))/(1<<(row-1)))+1;
+  col += (size%(1<<PAGE_SIZE_SHIFT<<(row-1)))?1:0;
+
+  /* Finding the index value using calculated row and col number */
+  index = ((row-1)<<2) + col + 2;
+   }
+
+   /* Checking the error condition */
+   bucket = (index >= 0 && index < 
bufmgr->num_buckets)?(>cache_bucket[index]):NULL;
+   return bucket;
 }
 
 int
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965: Avoids loop for buffer object availability in add_exec_bo

2017-07-28 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

The original logic loops over the exec list for every buffer object.
Maintain a per-BO flag instead to record whether the BO is already in the list.

Improves performance - 3DMark by 2%
Tested with piglit

Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Yogesh Marathe <yogesh.mara...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_bufmgr.h|  5 +
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 24 +++-
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_bufmgr.h 
b/src/mesa/drivers/dri/i965/brw_bufmgr.h
index 6a6051b..912ffb0 100644
--- a/src/mesa/drivers/dri/i965/brw_bufmgr.h
+++ b/src/mesa/drivers/dri/i965/brw_bufmgr.h
@@ -132,6 +132,11 @@ struct brw_bo {
 * Boolean of whether this buffer is cache coherent
 */
bool cache_coherent;
+
+   /**
+ * Boolean to check whether bo is available in exec buffer objects list
+ */
+   bool bo_available;
 };
 
 #define BO_ALLOC_FOR_RENDER (1<<0)
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c 
b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index e2f208a..0814e8b 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -513,10 +513,8 @@ static void
 add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
if (bo != batch->bo) {
-  for (int i = 0; i < batch->exec_count; i++) {
- if (batch->exec_bos[i] == bo)
-return;
-  }
+  if(brw_batch_references(batch,bo) == true)
+ return;
 
   brw_bo_reference(bo);
}
@@ -548,6 +546,12 @@ add_exec_bo(struct intel_batchbuffer *batch, struct brw_bo 
*bo)
validation_entry->rsvd2 = 0;
 
batch->exec_bos[batch->exec_count] = bo;
+
+   /* Marking the current bo as true since this
+* is added to the exec_bos list
+*/
+   bo->bo_available = true;
+
batch->exec_count++;
batch->aperture_space += bo->size;
 }
@@ -592,6 +596,12 @@ execbuffer(int fd,
 
   bo->idle = false;
 
+  /* Marking the flags as false for all the bo's
+   * in the list to ensure it is added in the next
+   * list of exec buffers
+   */
+  bo->bo_available = false;
+
   /* Update brw_bo::offset64 */
   if (batch->validation_list[i].offset != bo->offset64) {
  DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%llx\n",
@@ -736,11 +746,7 @@ brw_batch_has_aperture_space(struct brw_context *brw, 
unsigned extra_space)
 bool
 brw_batch_references(struct intel_batchbuffer *batch, struct brw_bo *bo)
 {
-   for (int i = 0; i < batch->exec_count; i++) {
-  if (batch->exec_bos[i] == bo)
- return true;
-   }
-   return false;
+   return (bo->bo_available) ? true : false;
 }
 
 /*  This is the only way buffers get added to the validate list.
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH V3] i965 : Optimize atom state flag checks

2017-07-21 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

This patch improves the CPI rate (cycles per instruction) and branch
misprediction for i965. The function check_state() was showing a high CPI
retired rate.

Performance stats with android:
- CPI retired lowered by 28% (lower is better)
- Branch misprediction lowered by 13% (lower is better)
- 3DMark improved by 2%

The disassembly doesn't show a difference, although the above results were
observed with the patch.

V2:
- Removed memset() change
- Changed commit message as per review comments

V3:
- Indentation and changes to remove check_state as function

Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Yogesh Marathe <yogesh.mara...@intel.com>
Tested-by: Asish <as...@intel.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h  |  4 
 src/mesa/drivers/dri/i965/brw_state_upload.c | 24 +++-
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 2a8dbf8..8c9a510 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1687,3 +1687,7 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
 
 #endif
+
+/* Checking the state of mesa and brw before emitting atoms */
+#define CHECK_BRW_STATE(a,b) ((a.mesa & b.mesa) | (a.brw & b.brw))
+
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index acaa97e..1846624 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -217,12 +217,6 @@ void brw_destroy_state( struct brw_context *brw )
 /***
  */
 
-static bool
-check_state(const struct brw_state_flags *a, const struct brw_state_flags *b)
-{
-   return ((a->mesa & b->mesa) | (a->brw & b->brw)) != 0;
-}
-
 static void accumulate_state( struct brw_state_flags *a,
  const struct brw_state_flags *b )
 {
@@ -443,10 +437,8 @@ check_and_emit_atom(struct brw_context *brw,
 struct brw_state_flags *state,
 const struct brw_tracked_state *atom)
 {
-   if (check_state(state, >dirty)) {
-  atom->emit(brw);
-  merge_ctx_state(brw, state);
-   }
+   atom->emit(brw);
+   merge_ctx_state(brw, state);
 }
 
 static inline void
@@ -541,7 +533,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
 const struct brw_tracked_state *atom = [i];
 struct brw_state_flags generated;
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
 
 accumulate_state(, >dirty);
 
@@ -550,7 +545,7 @@ brw_upload_pipeline_state(struct brw_context *brw,
  * fail;
  */
 xor_states(, , );
-assert(!check_state(, ));
+assert(!CHECK_BRW_STATE(examined, generated));
 prev = state;
   }
}
@@ -558,7 +553,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
   for (i = 0; i < num_atoms; i++) {
 const struct brw_tracked_state *atom = [i];
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
   }
}
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH V2] i965 : Optimize atom state flag checks

2017-07-20 Thread aravindan . muthukumar
From: Aravindan Muthukumar <aravindan.muthuku...@intel.com>

This patch improves the CPI rate (cycles per instruction)
and branch misprediction for i965. The function check_state()
was showing a high CPI retired rate.

Performance stats with android:
CPI retired lowered by 28% (lower is better)
Branch misprediction lowered by 13% (lower is better)
3DMark improved by 2%

The disassembly doesn't show a difference, although the above
results were observed with the patch.

Signed-off-by: Aravindan Muthukumar <aravindan.muthuku...@intel.com>
Signed-off-by: Yogesh Marathe <yogesh.mara...@intel.com>
Tested-by: Asish <as...@intel.com>
---

Changes since V1:
- Removed memset() change
- Changed commit message as per review comments

 src/mesa/drivers/dri/i965/brw_defines.h  |  4 
 src/mesa/drivers/dri/i965/brw_state_upload.c | 12 
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 2a8dbf8..8c9a510 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1687,3 +1687,7 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define CSDBG2_CONSTANT_BUFFER_ADDRESS_OFFSET_DISABLE (1 << 4)
 
 #endif
+
+/* Checking the state of mesa and brw before emitting atoms */
+#define CHECK_BRW_STATE(a,b) ((a.mesa & b.mesa) | (a.brw & b.brw))
+
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index acaa97e..1c8b969 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -443,10 +443,8 @@ check_and_emit_atom(struct brw_context *brw,
 struct brw_state_flags *state,
 const struct brw_tracked_state *atom)
 {
-   if (check_state(state, >dirty)) {
   atom->emit(brw);
   merge_ctx_state(brw, state);
-   }
 }
 
 static inline void
@@ -541,7 +539,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
 const struct brw_tracked_state *atom = [i];
 struct brw_state_flags generated;
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
 
 accumulate_state(, >dirty);
 
@@ -558,7 +559,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
   for (i = 0; i < num_atoms; i++) {
 const struct brw_tracked_state *atom = [i];
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
   }
}
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965 : Performance Improvement

2017-07-13 Thread aravindan . muthukumar
From: Aravindan M 

This patch improves the CPI rate (cycles per instruction)
and CPU time utilization for i965. The functions
check_state and brw_pipeline_state_finished were found to have
poor CPU utilization during performance analysis.

Change-Id: I17c7e719a16e222764217a0e67b4482748537b67
Signed-off-by: Aravindan M 
Reviewed-by: Yogesh M 
Tested-by: Asish 
---
 src/mesa/drivers/dri/i965/brw_defines.h  |  3 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c | 14 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index a4794c6..60f88ca 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1681,3 +1681,6 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define GEN8_L3CNTLREG_ALL_ALLOC_MASK INTEL_MASK(31, 25)
 
 #endif
+
+/* Checking the state of mesa and brw before emitting atoms */
+#define CHECK_BRW_STATE(a,b) ((a.mesa & b.mesa) | (a.brw & b.brw))
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 5e82c1b..434decf 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -515,7 +515,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
 const struct brw_tracked_state *atom = [i];
 struct brw_state_flags generated;
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting the atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
 
 accumulate_state(, >dirty);
 
@@ -532,7 +535,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
   for (i = 0; i < num_atoms; i++) {
 const struct brw_tracked_state *atom = [i];
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting the atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
   }
}
 
@@ -567,7 +573,9 @@ brw_pipeline_state_finished(struct brw_context *brw,
  brw->state.pipelines[i].mesa |= brw->NewGLState;
  brw->state.pipelines[i].brw |= brw->ctx.NewDriverState;
   } else {
- memset(>state.pipelines[i], 0, sizeof(struct brw_state_flags));
+ /* Avoiding the memset with initialization */
+ brw->state.pipelines[i].mesa = 0;
+ brw->state.pipelines[i].brw = 0ull;
   }
}
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965 : Performance Improvement

2017-07-13 Thread aravindan . muthukumar
From: Aravindan M 

This patch improves the CPI rate (cycles per instruction)
and CPU time utilization for i965. The functions
check_state and brw_pipeline_state_finished were found to have
poor CPU utilization during performance analysis.

Change-Id: I17c7e719a16e222764217a0e67b4482748537b67
Signed-off-by: Aravindan M 
Reviewed-by: Yogesh M 
Tested-by: Asish 
---
 src/mesa/drivers/dri/i965/brw_defines.h  |  3 +++
 src/mesa/drivers/dri/i965/brw_state_upload.c | 14 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index a4794c6..60f88ca 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1681,3 +1681,6 @@ enum brw_pixel_shader_coverage_mask_mode {
 # define GEN8_L3CNTLREG_ALL_ALLOC_MASK INTEL_MASK(31, 25)
 
 #endif
+
+/* Checking the state of mesa and brw before emitting atoms */
+#define CHECK_BRW_STATE(a,b) ((a.mesa & b.mesa) | (a.brw & b.brw))
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c 
b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 5e82c1b..434decf 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -515,7 +515,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
 const struct brw_tracked_state *atom = [i];
 struct brw_state_flags generated;
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting the atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
 
 accumulate_state(, >dirty);
 
@@ -532,7 +535,10 @@ brw_upload_pipeline_state(struct brw_context *brw,
   for (i = 0; i < num_atoms; i++) {
 const struct brw_tracked_state *atom = [i];
 
- check_and_emit_atom(brw, , atom);
+ /* Checking the state and emitting the atoms */
+ if (CHECK_BRW_STATE(state, atom->dirty)) {
+check_and_emit_atom(brw, , atom);
+ }
   }
}
 
@@ -567,7 +573,9 @@ brw_pipeline_state_finished(struct brw_context *brw,
  brw->state.pipelines[i].mesa |= brw->NewGLState;
  brw->state.pipelines[i].brw |= brw->ctx.NewDriverState;
   } else {
- memset(>state.pipelines[i], 0, sizeof(struct brw_state_flags));
+ /* Avoiding the memset with initialization */
+ brw->state.pipelines[i].mesa = 0;
+ brw->state.pipelines[i].brw = 0ull;
   }
}
 
-- 
2.7.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev