Re: [Beignet] [PATCH v2] Add missed kernel names into built-in kernel list.

2017-06-22 Thread yan . wang
Sorry for this.
Thanks for your modification.



yan.wang
 
From: Yang, Rong R
Date: 2017-06-22 14:23
To: yan.w...@linux.intel.com; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH v2] Add missed kernel names into built-in kernel 
list.
Rename "__cl_cpy_region_unalign_same_offset;" to 
"__cl_copy_region_unalign_same_offset;",
and "__cl_copy_image_3d_to_2d;" is duplicated.
I have modified them and pushed, thanks.
 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> yan.w...@linux.intel.com
> Sent: Thursday, June 22, 2017 13:52
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [PATCH v2] Add missed kernel names into built-in kernel
> list.
> 
> From: Yan Wang <yan.w...@linux.intel.com>
> 
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  src/cl_gt_device.h | 17 +
>  1 file changed, 17 insertions(+)
> 
> diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index f6cb5f8..ff23b32
> 100644
> --- a/src/cl_gt_device.h
> +++ b/src/cl_gt_device.h
> @@ -115,16 +115,33 @@ DECL_INFO_STRING(built_in_kernels,
> "__cl_copy_region_align4;"
> "__cl_cpy_region_unalign_same_offset;"
> "__cl_copy_region_unalign_dst_offset;"
> "__cl_copy_region_unalign_src_offset;"
> +   "__cl_copy_region_unalign_same_offset;"
> "__cl_copy_buffer_rect;"
> +   "__cl_copy_buffer_rect_align4;"
> "__cl_copy_image_1d_to_1d;"
> "__cl_copy_image_2d_to_2d;"
> "__cl_copy_image_3d_to_2d;"
> "__cl_copy_image_2d_to_3d;"
> "__cl_copy_image_3d_to_3d;"
> +   "__cl_copy_image_3d_to_2d;"
> "__cl_copy_image_2d_to_buffer;"
> +   "__cl_copy_image_2d_to_buffer_align4;"
> +   "__cl_copy_image_2d_to_buffer_align16;"
> "__cl_copy_image_3d_to_buffer;"
> +   "__cl_copy_image_3d_to_buffer_align4;"
> +   "__cl_copy_image_3d_to_buffer_align16;"
> "__cl_copy_buffer_to_image_2d;"
> +   "__cl_copy_buffer_to_image_2d_align4;"
> +   "__cl_copy_buffer_to_image_2d_align16;"
> "__cl_copy_buffer_to_image_3d;"
> +   "__cl_copy_buffer_to_image_3d_align4;"
> +   "__cl_copy_buffer_to_image_3d_align16;"
> +   "__cl_copy_image_1d_array_to_1d_array;"
> +   "__cl_copy_image_2d_array_to_2d_array;"
> +   "__cl_copy_image_2d_array_to_2d;"
> +   "__cl_copy_image_2d_array_to_3d;"
> +   "__cl_copy_image_2d_to_2d_array;"
> +   "__cl_copy_image_3d_to_2d_array;"
> "__cl_fill_region_unalign;"
> "__cl_fill_region_align2;"
> "__cl_fill_region_align4;"
> --
> 2.7.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2] Add missed kernel names into built-in kernel list.

2017-06-21 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_gt_device.h | 17 +
 1 file changed, 17 insertions(+)

diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f6cb5f8..ff23b32 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -115,16 +115,33 @@ DECL_INFO_STRING(built_in_kernels, 
"__cl_copy_region_align4;"
"__cl_cpy_region_unalign_same_offset;"
"__cl_copy_region_unalign_dst_offset;"
"__cl_copy_region_unalign_src_offset;"
+   "__cl_copy_region_unalign_same_offset;"
"__cl_copy_buffer_rect;"
+   "__cl_copy_buffer_rect_align4;"
"__cl_copy_image_1d_to_1d;"
"__cl_copy_image_2d_to_2d;"
"__cl_copy_image_3d_to_2d;"
"__cl_copy_image_2d_to_3d;"
"__cl_copy_image_3d_to_3d;"
+   "__cl_copy_image_3d_to_2d;"
"__cl_copy_image_2d_to_buffer;"
+   "__cl_copy_image_2d_to_buffer_align4;"
+   "__cl_copy_image_2d_to_buffer_align16;"
"__cl_copy_image_3d_to_buffer;"
+   "__cl_copy_image_3d_to_buffer_align4;"
+   "__cl_copy_image_3d_to_buffer_align16;"
"__cl_copy_buffer_to_image_2d;"
+   "__cl_copy_buffer_to_image_2d_align4;"
+   "__cl_copy_buffer_to_image_2d_align16;"
"__cl_copy_buffer_to_image_3d;"
+   "__cl_copy_buffer_to_image_3d_align4;"
+   "__cl_copy_buffer_to_image_3d_align16;"
+   "__cl_copy_image_1d_array_to_1d_array;"
+   "__cl_copy_image_2d_array_to_2d_array;"
+   "__cl_copy_image_2d_array_to_2d;"
+   "__cl_copy_image_2d_array_to_3d;"
+   "__cl_copy_image_2d_to_2d_array;"
+   "__cl_copy_image_3d_to_2d_array;"
"__cl_fill_region_unalign;"
"__cl_fill_region_align2;"
"__cl_fill_region_align4;"
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Add aligned copy kernels into built-in kernel list.

2017-06-21 Thread yan . wang
Sure. I will.
Thanks.



yan.wang
 
From: Yang, Rong R
Date: 2017-06-22 09:16
To: yan.w...@linux.intel.com; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH] Add aligned copy kernels into built-in kernel 
list.
There are still some kernels missing; can you add them together?
 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> yan.w...@linux.intel.com
> Sent: Wednesday, June 21, 2017 11:26
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [PATCH] Add aligned copy kernels into built-in kernel list.
> 
> From: Yan Wang <yan.w...@linux.intel.com>
> 
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  src/cl_gt_device.h | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index f6cb5f8..8008606
> 100644
> --- a/src/cl_gt_device.h
> +++ b/src/cl_gt_device.h
> @@ -122,9 +122,17 @@ DECL_INFO_STRING(built_in_kernels,
> "__cl_copy_region_align4;"
> "__cl_copy_image_2d_to_3d;"
> "__cl_copy_image_3d_to_3d;"
> "__cl_copy_image_2d_to_buffer;"
> +   "__cl_copy_image_2d_to_buffer_align4;"
> +   "__cl_copy_image_2d_to_buffer_align16;"
> "__cl_copy_image_3d_to_buffer;"
> +   "__cl_copy_image_3d_to_buffer_align4;"
> +   "__cl_copy_image_3d_to_buffer_align16;"
> "__cl_copy_buffer_to_image_2d;"
> +   "__cl_copy_buffer_to_image_2d_align4;"
> +   "__cl_copy_buffer_to_image_2d_align16;"
> "__cl_copy_buffer_to_image_3d;"
> +   "__cl_copy_buffer_to_image_3d_align4;"
> +   "__cl_copy_buffer_to_image_3d_align16;"
> "__cl_fill_region_unalign;"
> "__cl_fill_region_align2;"
> "__cl_fill_region_align4;"
> --
> 2.7.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Add aligned copy kernels into built-in kernel list.

2017-06-20 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_gt_device.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index f6cb5f8..8008606 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -122,9 +122,17 @@ DECL_INFO_STRING(built_in_kernels, 
"__cl_copy_region_align4;"
"__cl_copy_image_2d_to_3d;"
"__cl_copy_image_3d_to_3d;"
"__cl_copy_image_2d_to_buffer;"
+   "__cl_copy_image_2d_to_buffer_align4;"
+   "__cl_copy_image_2d_to_buffer_align16;"
"__cl_copy_image_3d_to_buffer;"
+   "__cl_copy_image_3d_to_buffer_align4;"
+   "__cl_copy_image_3d_to_buffer_align16;"
"__cl_copy_buffer_to_image_2d;"
+   "__cl_copy_buffer_to_image_2d_align4;"
+   "__cl_copy_buffer_to_image_2d_align16;"
"__cl_copy_buffer_to_image_3d;"
+   "__cl_copy_buffer_to_image_3d_align4;"
+   "__cl_copy_buffer_to_image_3d_align16;"
"__cl_fill_region_unalign;"
"__cl_fill_region_align2;"
"__cl_fill_region_align4;"
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/2] Use aligned16 and aligne4 kernel to copy for large 3D image with TILE_Y.

2017-06-14 Thread yan . wang
Thank you very much.



yan.wang
 
From: Yang, Rong R
Date: 2017-06-14 15:36
To: yan.w...@linux.intel.com; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH 2/2] Use aligned16 and aligne4 kernel to copy for 
large 3D image with TILE_Y.
LGTM, except for some formatting. I have run git clang-format manually and pushed, 
thanks.
 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> yan.w...@linux.intel.com
> Sent: Tuesday, June 13, 2017 16:32
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [PATCH 2/2] Use aligned16 and aligne4 kernel to copy for
> large 3D image with TILE_Y.
> 
> From: Yan Wang <yan.w...@linux.intel.com>
> 
> It is similar with 2D image for avoiding extended image width truncated.
> 
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  src/CMakeLists.txt |  2 +
>  src/cl_context.h   |  4 ++
>  src/cl_mem.c   | 46 
> +++---
>  .../cl_internal_copy_buffer_to_image_3d_align16.cl | 19
> +  .../cl_internal_copy_buffer_to_image_3d_align4.cl  | 19
> +  .../cl_internal_copy_image_3d_to_buffer_align16.cl | 20
> ++  .../cl_internal_copy_image_3d_to_buffer_align4.cl  | 20
> ++
>  7 files changed, 125 insertions(+), 5 deletions(-)  create mode 100644
> src/kernels/cl_internal_copy_buffer_to_image_3d_align16.cl
>  create mode 100644
> src/kernels/cl_internal_copy_buffer_to_image_3d_align4.cl
>  create mode 100644
> src/kernels/cl_internal_copy_image_3d_to_buffer_align16.cl
>  create mode 100644
> src/kernels/cl_internal_copy_image_3d_to_buffer_align4.cl
> 
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 87ad48b..ecb98b9
> 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -54,6 +54,8 @@ cl_internal_copy_image_2d_array_to_3d
> cl_internal_copy_image_3d_to_2d_array
>  cl_internal_copy_image_2d_to_buffer
> cl_internal_copy_image_2d_to_buffer_align16
> cl_internal_copy_image_3d_to_buffer
>  cl_internal_copy_buffer_to_image_2d
> cl_internal_copy_buffer_to_image_2d_align16
> cl_internal_copy_buffer_to_image_3d
>  cl_internal_copy_buffer_to_image_2d_align4
> cl_internal_copy_image_2d_to_buffer_align4
> +cl_internal_copy_buffer_to_image_3d_align4
> +cl_internal_copy_image_3d_to_buffer_align4
> +cl_internal_copy_buffer_to_image_3d_align16
> +cl_internal_copy_image_3d_to_buffer_align16
>  cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
>  cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
>  cl_internal_fill_buf_align128 cl_internal_fill_image_1d diff --git
> a/src/cl_context.h b/src/cl_context.h index 75bf895..b3a79bc 100644
> --- a/src/cl_context.h
> +++ b/src/cl_context.h
> @@ -64,10 +64,14 @@ enum _cl_internal_ker_type {
>CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
>CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4,
>CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN16,
> +  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN4,
>CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
>CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
>CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN4,
>CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
> +  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN16,
> +  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN4,
>CL_ENQUEUE_FILL_BUFFER_UNALIGN,  //fill buffer with 1 aligne pattern,
> pattern size=1
>CL_ENQUEUE_FILL_BUFFER_ALIGN2,   //fill buffer with 2 aligne pattern,
> pattern size=2
>CL_ENQUEUE_FILL_BUFFER_ALIGN4,   //fill buffer with 4 aligne pattern,
> pattern size=4
> diff --git a/src/cl_mem.c b/src/cl_mem.c index b6dce3f..307db50 100644
> --- a/src/cl_mem.c
> +++ b/src/cl_mem.c
> @@ -2162,13 +2162,13 @@ get_align_size_for_copy_kernel(struct
> _cl_mem_image* image, const size_t origin0
>  const size_t offset, cl_image_format *fmt) {
>size_t align_size = 0;
> 
> -  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w *
> image->bpp) % ALIGN16 == 0) &&
> +  if(((image->w * image->bpp) % ALIGN16 == 0) &&
>((origin0 * image->bpp) % ALIGN16 == 0) && (region0 % ALIGN16 == 0)
> && (offset % ALIGN16 == 0)){
>  fmt->image_channel_order = CL_RGBA;
>  fmt->image_channel_data_type = CL_UNSIGNED_INT32;
>  align_size = ALIGN16;
>}
> -  else if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image-
> >w * image->bpp) % A

[Beignet] [PATCH 2/2] Use aligned16 and aligne4 kernel to copy for large 3D image with TILE_Y.

2017-06-13 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

This is similar to the 2D image case, to avoid the extended image width being truncated.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/CMakeLists.txt |  2 +
 src/cl_context.h   |  4 ++
 src/cl_mem.c   | 46 +++---
 .../cl_internal_copy_buffer_to_image_3d_align16.cl | 19 +
 .../cl_internal_copy_buffer_to_image_3d_align4.cl  | 19 +
 .../cl_internal_copy_image_3d_to_buffer_align16.cl | 20 ++
 .../cl_internal_copy_image_3d_to_buffer_align4.cl  | 20 ++
 7 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d_align16.cl
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_3d_align4.cl
 create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer_align16.cl
 create mode 100644 src/kernels/cl_internal_copy_image_3d_to_buffer_align4.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 87ad48b..ecb98b9 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -54,6 +54,8 @@ cl_internal_copy_image_2d_array_to_3d 
cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer 
cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
 cl_internal_copy_buffer_to_image_2d 
cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
 cl_internal_copy_buffer_to_image_2d_align4 
cl_internal_copy_image_2d_to_buffer_align4
+cl_internal_copy_buffer_to_image_3d_align4 
cl_internal_copy_image_3d_to_buffer_align4
+cl_internal_copy_buffer_to_image_3d_align16 
cl_internal_copy_image_3d_to_buffer_align16
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
diff --git a/src/cl_context.h b/src/cl_context.h
index 75bf895..b3a79bc 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -64,10 +64,14 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN4,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN4,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN4,
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,  //fill buffer with 1 aligne pattern, 
pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,   //fill buffer with 2 aligne pattern, 
pattern size=2
   CL_ENQUEUE_FILL_BUFFER_ALIGN4,   //fill buffer with 4 aligne pattern, 
pattern size=4
diff --git a/src/cl_mem.c b/src/cl_mem.c
index b6dce3f..307db50 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -2162,13 +2162,13 @@ get_align_size_for_copy_kernel(struct _cl_mem_image* 
image, const size_t origin0
 const size_t offset, cl_image_format *fmt) {
   size_t align_size = 0;
 
-  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) 
% ALIGN16 == 0) &&
+  if(((image->w * image->bpp) % ALIGN16 == 0) &&
   ((origin0 * image->bpp) % ALIGN16 == 0) && (region0 % ALIGN16 == 0) && 
(offset % ALIGN16 == 0)){
 fmt->image_channel_order = CL_RGBA;
 fmt->image_channel_data_type = CL_UNSIGNED_INT32;
 align_size = ALIGN16;
   }
-  else if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * 
image->bpp) % ALIGN4 == 0) &&
+  else if(((image->w * image->bpp) % ALIGN4 == 0) &&
   ((origin0 * image->bpp) % ALIGN4 == 0) && (region0 % ALIGN4 == 0) && 
(offset % ALIGN4 == 0)){
 fmt->image_channel_order = CL_R;
 fmt->image_channel_data_type = CL_UNSIGNED_INT32;
@@ -2247,11 +2247,29 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, 
cl_event event, struct _cl_m
   cl_internal_copy_image_2d_to_buffer_str, 
(size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
 }
   }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) {
-extern char cl_internal_copy_image_3d_to_buffer_str[];
-extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
+if(align_size == ALIGN16){
+  extern char cl_internal_copy_image_3d_to_buffer_align16_str[];
+  extern size_t cl_internal_copy_image_3d_to_buffer_align16_str_size;
+
+  ker = cl_context_get_static_kernel_from_bin(queue->ctx, 
CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN16,
+cl_internal_copy_image_3d_to_buffer_align16_str,
+(size_t)cl_internal_copy_image_3d_to_buffer_align16_str_

[Beignet] [PATCH 1/2] Add test case for large 3D image with TILE_Y.

2017-06-13 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will test aligned4 and aligned16 kernel for 3D image.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_fill_large_image.cpp | 98 
 1 file changed, 98 insertions(+)

diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
index 3894d6f..61f3c3e 100644
--- a/utests/compiler_fill_large_image.cpp
+++ b/utests/compiler_fill_large_image.cpp
@@ -214,3 +214,101 @@ static void compiler_fill_large_image_3(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_3);
+
+static void compiler_fill_large_image_4(void)
+{
+  const size_t w = 8191;
+  const size_t h = 8192;
+  const size_t depth = 2;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, depth};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  buf_data[0] = (unsigned char*) malloc(sizeof(unsigned char) * 8192 * 8192 * 
2 * 4);
+  buf_data[1] = (unsigned char*) malloc(sizeof(unsigned char) * 8192 * 8192 * 
2 * 4);
+  for (uint32_t m = 0; m < depth; ++m)
+for (uint32_t j = 0; j < h; ++j)
+  for (uint32_t i = 0; i < w; i++)
+for (uint32_t k = 0; k < 4; k++)
+  ((unsigned char*)buf_data[0])[(m * w * h + j * w + i) * 4 + k] = 
(unsigned char)rand();
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  OCL_READ_IMAGE(buf[0], origin, region, buf_data[1]);
+
+  // Check result
+  for (uint32_t m = 0; m < depth; ++m)
+for (uint32_t j = 0; j < h; ++j)
+  for (uint32_t i = 0; i < w; i++)
+for (uint32_t k = 0; k < 4; k++)
+  OCL_ASSERT(((uint8_t*)buf_data[0])[(m * w * h + j * w + i) * 4 + k] 
==
+  ((uint8_t*)buf_data[1])[(m * w * h + j * w + i) * 4 + k]);
+
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_4);
+
+static void compiler_fill_large_image_5(void)
+{
+  const size_t w = 8192;
+  const size_t h = 8192;
+  const size_t depth = 2;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, depth};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  buf_data[0] = (unsigned char*) malloc(sizeof(unsigned char) * 8192 * 8192 * 
2 * 4);
+  buf_data[1] = (unsigned char*) malloc(sizeof(unsigned char) * 8192 * 8192 * 
2 * 4);
+  for (uint32_t m = 0; m < depth; ++m)
+for (uint32_t j = 0; j < h; ++j)
+  for (uint32_t i = 0; i < w; i++)
+for (uint32_t k = 0; k < 4; k++)
+  ((unsigned char*)buf_data[0])[(m * w * h + j * w + i) * 4 + k] = 
(unsigned char)rand();
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_depth = depth;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  OCL_READ_IMAGE(buf[0], origin, region, buf_data[1]);
+
+  // Check result
+  for (uint32_t m = 0; m < depth; ++m)
+for (uint32_t j = 0; j < h; ++j)
+  for (uint32_t i = 0; i < w; i++)
+for (uint32_t k = 0; k < 4; k++)
+  OCL_ASSERT(((uint8_t*)buf_data[0])[(m * w * h + j * w + i) * 4 + k] 
==
+  ((uint8_t*)buf_data[1])[(m * w * h + j * w + i) * 4 + k]);
+
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_5);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v5 7/7] Optimize clEnqueueWriteImageByKernel and clEnqueuReadImageByKernel.

2017-06-13 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. Only copy the data by origin and region defined.
2. Add clFinish to guarantee the kernel copying is finished when blocking 
writing.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 00567b9..1daf403 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1857,23 +1857,28 @@ clEnqueueReadImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, 
CL_MEM_ALLOC_HOST_PTR,
-mem->size, NULL, &err);
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR,
+buf_size, ptr, &err);
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
   }
 
+  cl_event e;
   err = clEnqueueCopyImageToBuffer(command_queue, mem, image->tmp_ker_buf, 
origin,
-region, 0, 0, NULL, NULL);
+region, 0, num_events_in_wait_list, event_wait_list, &e);
   if (err != CL_SUCCESS) {
 clReleaseMemObject(image->tmp_ker_buf);
+clReleaseEvent(e);
 image->tmp_ker_buf = NULL;
 return err;
   }
 
-  return clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
-mem->size, ptr, num_events_in_wait_list, event_wait_list, event);
+  err = clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
+buf_size, ptr, 1, &e, event);
+  clReleaseEvent(e);
+  return err;
 }
 
 cl_int
@@ -2064,14 +2069,20 @@ clEnqueueWriteImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, &err);
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
buf_size, (void*)ptr, &err);
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
   }
 
-  return clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
+  err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
 num_events_in_wait_list, event_wait_list, event);
+
+  if (blocking_write)
+err = clFinish(command_queue);
+
+  return err;
 }
 
 cl_int
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v5 6/7] Fix bug of clEnqueueUnmapMemObjectForKernel and clEnqueueMapImageByKernel.

2017-06-13 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. Support writing data by mapping/unmapping mode.
2. Add mapping record logic.
3. Add clFinish to guarantee the kernel copying is finished.
4. Fix the error of calling clEnqueueMapImageByKernel.
   blocking_map and map_flags need to be switched.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 147 ++-
 1 file changed, 113 insertions(+), 34 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7b58236..00567b9 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -21,6 +21,7 @@
 #include "cl_command_queue.h"
 #include "cl_event.h"
 #include "CL/cl.h"
+#include <assert.h>
 
 cl_int
 clSetMemObjectDestructorCallback(cl_mem memobj,
@@ -350,33 +351,64 @@ clEnqueueUnmapMemObjectForKernel(cl_command_queue 
command_queue,
 cl_event *event)
 {
   cl_int err = CL_SUCCESS;
-  struct _cl_mem_image *image = NULL;
-  size_t region[3];
-  size_t origin[3];
-
-  image = cl_mem_image(memobj);
+  int i, j;
+  uint8_t write_map = 0;
+  cl_mem tmp_ker_buf = NULL;
+  size_t origin[3], region[3];
+  void *v_ptr = NULL;
+
+  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+  memobj->mapped_ptr[i].ptr = NULL;
+  v_ptr = memobj->mapped_ptr[i].v_ptr;
+  write_map = memobj->mapped_ptr[i].ker_write_map;
+  tmp_ker_buf = memobj->mapped_ptr[i].tmp_ker_buf;
+  for (j = 0; j < 3; j++) {
+region[j] = memobj->mapped_ptr[i].region[j];
+origin[j] = memobj->mapped_ptr[i].origin[j];
+memobj->mapped_ptr[i].region[j] = 0;
+memobj->mapped_ptr[i].origin[j] = 0;
+  }
+  memobj->mapped_ptr[i].size = 0;
+  memobj->mapped_ptr[i].ker_write_map = 0;
+  memobj->mapped_ptr[i].tmp_ker_buf = 0;
+  memobj->mapped_ptr[i].v_ptr = NULL;
+  memobj->map_ref--;
+  break;
+}
+  }
 
-  if (!image->tmp_ker_buf)
+  if (!tmp_ker_buf)
 return CL_INVALID_MEM_OBJECT;
 
-  origin[0] = origin[1] = origin[2] = 0;
-  region[0] = image->w;
-  region[1] = image->h;
-  region[2] = image->depth;
-
-  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
-err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, 
memobj, 0, origin, region,
-  num_events_in_wait_list, event_wait_list, event);
-if (err != CL_SUCCESS)
-  return err;
+  cl_event e;
+  err = clEnqueueUnmapMemObject(command_queue, tmp_ker_buf, v_ptr,
+num_events_in_wait_list, event_wait_list, &e);
+  if (err != CL_SUCCESS) {
+clReleaseEvent(e);
+return err;
   }
 
-  err = clEnqueueUnmapMemObject(command_queue, image->tmp_ker_buf, mapped_ptr,
-num_events_in_wait_list, event_wait_list, event);
+  if (write_map) {
+err = clEnqueueCopyBufferToImage(command_queue, tmp_ker_buf, memobj, 0, 
origin, region,
+1, &e, event);
+if (err != CL_SUCCESS) {
+  clReleaseEvent(e);
+  return err;
+}
 
-  clReleaseMemObject(image->tmp_ker_buf);
-  image->tmp_ker_buf = NULL;
+if (event == NULL) {
+  err = clFinish(command_queue);
+  if (err != CL_SUCCESS) {
+clReleaseEvent(e);
+return err;
+  }
+}
+  }
 
+  clReleaseEvent(e);
+  clReleaseMemObject(tmp_ker_buf);
   return err;
 }
 
@@ -1516,20 +1548,24 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL;
   struct _cl_mem_image *image = NULL;
-  size_t region[3];
-  size_t origin[3];
+  size_t region[3], copy_origin[3];
+  size_t origin[3], copy_region[3];
+  size_t offset = 0;
+  size_t buf_size = 0;
 
   image = cl_mem_image(mem);
 
   err = check_image_region(image, pregion, region);
-  if (err != CL_SUCCESS) {
+  if (err != CL_SUCCESS && errcode_ret) {
 *errcode_ret = err;
 return NULL;
   }
 
   err = check_image_origin(image, porigin, origin);
-  if (err != CL_SUCCESS) {
+  if (err != CL_SUCCESS && errcode_ret) {
 *errcode_ret = err;
 return NULL;
   }
@@ -1537,29 +1573,72 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  if(mem->flags & CL_MEM_USE_HOST_PTR)
+  if (mem->flags & CL_MEM_USE_HOST_PTR) {
+buf_size = image->w * image->h * image->depth * image->bpp;
+memset(copy_origin, 0, sizeof(size_t) * 3);
+copy_region[0] = image->w;
+copy_region[1] = image->h;
+copy_region[2] = image->depth;
 image->tmp_ker_buf =
   clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, mem->size, 
mem->host_ptr, &err);
-  else
+  } else {
+buf_size = region[0] * region[1] * region[2] * image->bpp;
+memcp

[Beignet] [PATCH v4 7/7] Optimize clEnqueueWriteImageByKernel and clEnqueuReadImageByKernel.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. Only copy the data by origin and region defined.
2. Add clFinish to guarantee the kernel copying is finished when blocking 
writing.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index f4500f5..f93066f 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1847,15 +1847,17 @@ clEnqueueReadImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, 
CL_MEM_ALLOC_HOST_PTR,
-mem->size, NULL, );
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR,
+buf_size, ptr, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
   }
 
+  cl_event e;
   err = clEnqueueCopyImageToBuffer(command_queue, mem, image->tmp_ker_buf, 
origin,
-region, 0, 0, NULL, NULL);
+region, 0, num_events_in_wait_list, event_wait_list, );
   if (err != CL_SUCCESS) {
 clReleaseMemObject(image->tmp_ker_buf);
 image->tmp_ker_buf = NULL;
@@ -1863,7 +1865,7 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   }
 
   return clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
-mem->size, ptr, num_events_in_wait_list, event_wait_list, event);
+buf_size, ptr, 1, , event);
 }
 
 cl_int
@@ -2054,14 +2056,20 @@ clEnqueueWriteImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, );
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
buf_size, (void*)ptr, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
   }
 
-  return clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
+  err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
 num_events_in_wait_list, event_wait_list, event);
+
+  if (blocking_write)
+err = clFinish(command_queue);
+
+  return err;
 }
 
 cl_int
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v4 3/7] Add utest to test writing data into large image (TILE_Y) by map/unmap and USE_HOST_PTR mode.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/runtime_use_host_ptr_large_image.cpp | 115 
 1 file changed, 115 insertions(+)

diff --git a/utests/runtime_use_host_ptr_large_image.cpp 
b/utests/runtime_use_host_ptr_large_image.cpp
index c8200b3..3c77cae 100644
--- a/utests/runtime_use_host_ptr_large_image.cpp
+++ b/utests/runtime_use_host_ptr_large_image.cpp
@@ -73,3 +73,118 @@ static void runtime_use_host_ptr_large_image(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(runtime_use_host_ptr_large_image);
+
+static void runtime_use_host_ptr_large_image_1(void)
+{
+  cl_int status;
+  const size_t w = 4096;
+  const size_t h = 4096;
+  size_t image_row_pitch, image_slice_pitch;
+  size_t origin[3] = {5, 5, 0};
+  size_t region[3] = {8, 8, 1};
+  uint8_t *p = NULL;
+  uint8_t *q = NULL;
+
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+
+  size_t alignment = 4096;  //page size
+  if (cl_check_beignet())
+alignment = 64; //cacheline size, beignet has loose limitaiont to 
enable userptr
+
+  //src image
+  int ret = posix_memalign(_data[0], alignment, sizeof(uint32_t) * w * h * 
4);
+  OCL_ASSERT(ret == 0);
+  for (size_t i = 0; i < w*h*4; ++i)
+((uint32_t*)buf_data[0])[i] = i;
+
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_USE_HOST_PTR, , , buf_data[0]);
+
+  // Use mapping mode to fill data into src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 
origin, region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_slice_pitch == 0);
+  for (uint32_t j = 0; j < region[1]; ++j)
+for (uint32_t i = 0; i < region[0]; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[1])[(j * w + i) * 4 + k] = rand();
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  // Check src image
+  origin[0] = 0;
+  origin[1] = 0;
+  origin[2] = 0;
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_READ, origin, 
region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_slice_pitch == 0);
+
+  for (uint32_t j = 0; j < h; ++j) {
+p = ((uint8_t*)buf_data[0]) + j * image_row_pitch;
+q = ((uint8_t*)buf_data[1]) + j * image_row_pitch;
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)p)[i * 4 + k] == ((uint32_t*)q)[i * 4 + k]);
+  }
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  //dst image
+  ret = posix_memalign(_data[1], alignment, sizeof(uint32_t) * w * h * 4);
+  OCL_ASSERT(ret == 0);
+  for (size_t i = 0; i < w*h*4; ++i)
+((uint32_t*)buf_data[1])[i] = 0;
+
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_USE_HOST_PTR, , , buf_data[1]);
+
+  OCL_CREATE_KERNEL("runtime_use_host_ptr_image");
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  origin[0] = 0;
+  origin[1] = 0;
+  origin[2] = 0;
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  void* mapptr = (int*)clEnqueueMapImage(queue, buf[1], CL_TRUE, CL_MAP_READ, 
origin, region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, NULL);
+  OCL_ASSERT(mapptr == buf_data[1]);
+  for (uint32_t j = 0; j < h; ++j) {
+p = ((uint8_t*)buf_data[0]) + j * image_row_pitch;
+q = ((uint8_t*)buf_data[1]) + j * image_row_pitch;
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)p)[i * 4 + k] == ((uint32_t*)q)[i * 4 + k]);
+  }
+  clEnqueueUnmapMemObject(queue, buf[1], mapptr, 0, NULL, NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+  free(buf_data[1]);
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_use_host_ptr_large_image_1);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v4 6/7] Fix bug of clEnqueueUnmapMemObjectForKernel and clEnqueueMapImageByKernel.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. Support writing data by mapping/unmapping mode.
2. Add mapping record logic.
3. Add clFinish to guarantee the kernel copying is finished.
4. Fix the error of calling clEnqueueMapImageByKernel.
   blocking_map and map_flags need to be switched.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 135 +--
 1 file changed, 102 insertions(+), 33 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7b58236..f4500f5 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -21,6 +21,7 @@
 #include "cl_command_queue.h"
 #include "cl_event.h"
 #include "CL/cl.h"
+#include 
 
 cl_int
 clSetMemObjectDestructorCallback(cl_mem memobj,
@@ -350,33 +351,57 @@ clEnqueueUnmapMemObjectForKernel(cl_command_queue 
command_queue,
 cl_event *event)
 {
   cl_int err = CL_SUCCESS;
-  struct _cl_mem_image *image = NULL;
-  size_t region[3];
-  size_t origin[3];
-
-  image = cl_mem_image(memobj);
+  int i, j;
+  uint8_t write_map = 0;
+  cl_mem tmp_ker_buf = NULL;
+  size_t origin[3], region[3];
+  void *v_ptr = NULL;
+
+  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+  memobj->mapped_ptr[i].ptr = NULL;
+  v_ptr = memobj->mapped_ptr[i].v_ptr;
+  write_map = memobj->mapped_ptr[i].ker_write_map;
+  tmp_ker_buf = memobj->mapped_ptr[i].tmp_ker_buf;
+  for (j = 0; j < 3; j++) {
+region[j] = memobj->mapped_ptr[i].region[j];
+origin[j] = memobj->mapped_ptr[i].origin[j];
+memobj->mapped_ptr[i].region[j] = 0;
+memobj->mapped_ptr[i].origin[j] = 0;
+  }
+  memobj->mapped_ptr[i].size = 0;
+  memobj->mapped_ptr[i].ker_write_map = 0;
+  memobj->mapped_ptr[i].tmp_ker_buf = 0;
+  memobj->mapped_ptr[i].v_ptr = NULL;
+  memobj->map_ref--;
+  break;
+}
+  }
 
-  if (!image->tmp_ker_buf)
+  if (!tmp_ker_buf)
 return CL_INVALID_MEM_OBJECT;
 
-  origin[0] = origin[1] = origin[2] = 0;
-  region[0] = image->w;
-  region[1] = image->h;
-  region[2] = image->depth;
+  cl_event e;
+  err = clEnqueueUnmapMemObject(command_queue, tmp_ker_buf, v_ptr,
+num_events_in_wait_list, event_wait_list, );
+  if (err != CL_SUCCESS)
+return err;
 
-  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
-err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, 
memobj, 0, origin, region,
-  num_events_in_wait_list, event_wait_list, event);
+  if (write_map) {
+err = clEnqueueCopyBufferToImage(command_queue, tmp_ker_buf, memobj, 0, 
origin, region,
+1, , event);
 if (err != CL_SUCCESS)
   return err;
-  }
 
-  err = clEnqueueUnmapMemObject(command_queue, image->tmp_ker_buf, mapped_ptr,
-num_events_in_wait_list, event_wait_list, event);
-
-  clReleaseMemObject(image->tmp_ker_buf);
-  image->tmp_ker_buf = NULL;
+if (event == NULL) {
+  err = clFinish(command_queue);
+  if (err != CL_SUCCESS)
+return err;
+}
+  }
 
+  clReleaseMemObject(tmp_ker_buf);
   return err;
 }
 
@@ -1516,20 +1541,24 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL;
   struct _cl_mem_image *image = NULL;
-  size_t region[3];
-  size_t origin[3];
+  size_t region[3], copy_origin[3];
+  size_t origin[3], copy_region[3];
+  size_t offset = 0;
+  size_t buf_size = 0;
 
   image = cl_mem_image(mem);
 
   err = check_image_region(image, pregion, region);
-  if (err != CL_SUCCESS) {
+  if (err != CL_SUCCESS && errcode_ret) {
 *errcode_ret = err;
 return NULL;
   }
 
   err = check_image_origin(image, porigin, origin);
-  if (err != CL_SUCCESS) {
+  if (err != CL_SUCCESS && errcode_ret) {
 *errcode_ret = err;
 return NULL;
   }
@@ -1537,29 +1566,69 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  if(mem->flags & CL_MEM_USE_HOST_PTR)
+  if (mem->flags & CL_MEM_USE_HOST_PTR) {
+buf_size = image->w * image->h * image->depth * image->bpp;
+memset(copy_origin, 0, sizeof(size_t) * 3);
+copy_region[0] = image->w;
+copy_region[1] = image->h;
+copy_region[2] = image->depth;
 image->tmp_ker_buf =
   clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, mem->size, 
mem->host_ptr, );
-  else
+  } else {
+buf_size = region[0] * region[1] * region[2] * image->bpp;
+memcpy(copy_origin, origin, sizeof(size_t) * 3);
+memcpy(copy_region, region, sizeof(size_t) * 3);
 image->tmp_ker_buf =
-  clCreateBuffer(command_queue->ctx, CL_MEM_

[Beignet] [PATCH v4 5/7] Add clFinish to guarantee the kernel copying is finished when creating a TILE_Y large image.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_mem.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 3f41fd8..b6dce3f 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -817,6 +817,13 @@ _cl_new_image_copy_from_host_ptr(cl_context ctx,
 return NULL;
   }
 
+  err = clFinish(ctx->image_queue);
+  if(err != CL_SUCCESS) {
+clReleaseMemObject(buf);
+clReleaseMemObject(mem);
+return NULL;
+  }
+
   clReleaseMemObject(buf);
   if (flags & CL_MEM_USE_HOST_PTR && data) {
 mem->host_ptr = data;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v4 4/7] Add cl_mem_record_map_mem_for_kernel() to record the map address for TILE_Y image by kernel copying.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_mem.c | 109 +--
 src/cl_mem.h |   5 +++
 2 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index a8543c9..3f41fd8 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -2650,38 +2650,17 @@ error:
   goto exit;
 }
 
-LOCAL cl_int
-cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
-  size_t size, const size_t *origin, const size_t *region)
+static cl_int
+get_mapped_address(cl_mem mem)
 {
-  // TODO: Need to add MT safe logic.
-
   cl_int slot = -1;
-  int err = CL_SUCCESS;
-  size_t sub_offset = 0;
-
-  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
-struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
-sub_offset = buffer->sub_offset;
-  }
-
-  ptr = (char*)ptr + offset + sub_offset;
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
-assert(mem->host_ptr);
-//only calc ptr here, will do memcpy in enqueue
-*mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
-  } else {
-*mem_ptr = ptr;
-  }
-  /* Record the mapped address. */
   if (!mem->mapped_ptr_sz) {
 mem->mapped_ptr_sz = 16;
 mem->mapped_ptr = (cl_mapped_ptr *)malloc(
 sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
 if (!mem->mapped_ptr) {
   cl_mem_unmap_auto(mem);
-  err = CL_OUT_OF_HOST_MEMORY;
-  goto error;
+  return slot;
 }
 memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
 slot = 0;
@@ -2698,8 +2677,7 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void 
**mem_ptr, size_t offset,
   sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
   if (!new_ptr) {
 cl_mem_unmap_auto(mem);
-err = CL_OUT_OF_HOST_MEMORY;
-goto error;
+return slot;
   }
   memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
   memcpy(new_ptr, mem->mapped_ptr,
@@ -2710,7 +2688,86 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void 
**mem_ptr, size_t offset,
   mem->mapped_ptr = new_ptr;
 }
   }
+
   assert(slot != -1);
+  return slot;
+}
+
+LOCAL cl_int
+cl_mem_record_map_mem_for_kernel(cl_mem mem, void *ptr, void **mem_ptr, size_t 
offset,
+  size_t size, const size_t *origin, const size_t *region,
+  cl_mem tmp_ker_buf, uint8_t write_map)
+{
+  // TODO: Need to add MT safe logic.
+
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+
+  //ptr = (char*)ptr + offset + sub_offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+assert(mem->host_ptr);
+//only calc ptr here, will do memcpy in enqueue
+*mem_ptr = (char*)ptr + offset + sub_offset;
+  } else {
+*mem_ptr = ptr;
+  }
+  /* Record the mapped address. */
+  slot = get_mapped_address(mem);
+  if (slot == -1) {
+err = CL_OUT_OF_HOST_MEMORY;
+goto error;
+  }
+  mem->mapped_ptr[slot].ptr = *mem_ptr;
+  mem->mapped_ptr[slot].v_ptr = ptr;
+  mem->mapped_ptr[slot].size = size;
+  mem->mapped_ptr[slot].ker_write_map = write_map;
+  mem->mapped_ptr[slot].tmp_ker_buf = tmp_ker_buf;
+  if(origin) {
+assert(region);
+mem->mapped_ptr[slot].origin[0] = origin[0];
+mem->mapped_ptr[slot].origin[1] = origin[1];
+mem->mapped_ptr[slot].origin[2] = origin[2];
+mem->mapped_ptr[slot].region[0] = region[0];
+mem->mapped_ptr[slot].region[1] = region[1];
+mem->mapped_ptr[slot].region[2] = region[2];
+  }
+  mem->map_ref++;
+error:
+  if (err != CL_SUCCESS)
+*mem_ptr = NULL;
+  return err;
+}
+
+LOCAL cl_int
+cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
+  size_t size, const size_t *origin, const size_t *region)
+{
+  // TODO: Need to add MT safe logic.
+
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+
+  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+sub_offset = buffer->sub_offset;
+  }
+
+  ptr = (char*)ptr + offset + sub_offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+assert(mem->host_ptr);
+//only calc ptr here, will do memcpy in enqueue
+*mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
+  } else {
+*mem_ptr = ptr;
+  }
+  /* Record the mapped address. */
+  slot = get_mapped_address(mem);
+  if (slot == -1) {
+err = CL_OUT_OF_HOST_MEMORY;
+goto error;
+  }
   mem->mapped_ptr[slot].ptr = *mem_ptr;
   mem->mapped_ptr[slot].v_ptr = ptr;
   mem->mapped_ptr[slot].size = size;
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 0b33c31..ce1294d 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -61,6 +61,8 @@ typedef struct _cl_mapped_ptr {
   size_t size;
   size_t origin[3];  /* mapped origin */
   size_t region[3];  /* mappe

[Beignet] [PATCH v4 2/7] Add utest to test writing data into large image (TILE_Y) by map/unmap mode.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is used to reproduce the bug of clCopyImage/clFillImage of conformance test.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_copy_large_image.cpp | 198 +++
 1 file changed, 198 insertions(+)

diff --git a/utests/compiler_copy_large_image.cpp 
b/utests/compiler_copy_large_image.cpp
index 66998a7..37fdaab 100644
--- a/utests/compiler_copy_large_image.cpp
+++ b/utests/compiler_copy_large_image.cpp
@@ -119,3 +119,201 @@ static void compiler_copy_large_image_1(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image_1);
+
+static void compiler_copy_large_image_2(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  size_t image_row_pitch, image_slice_pitch;
+  cl_int status;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+  uint8_t *p = NULL;
+  uint8_t *q = NULL;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  // Use mapping mode to fill data into src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 
origin, region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_slice_pitch == 0);
+  memcpy(buf_data[1], buf_data[0], image_row_pitch * h);
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  // Check src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_READ, origin, 
region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_slice_pitch == 0);
+
+  for (uint32_t j = 0; j < h; ++j) {
+p = ((uint8_t*)buf_data[0]) + j * image_row_pitch;
+q = ((uint8_t*)buf_data[1]) + j * image_row_pitch;
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)p)[i * 4 + k] == ((uint32_t*)q)[i * 4 + k]);
+  }
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(sampler), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  buf_data[0] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_READ, origin, 
region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+  buf_data[1] = clEnqueueMapImage(queue, buf[1], CL_TRUE, CL_MAP_READ, origin, 
region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  for (uint32_t j = 0; j < h; ++j) {
+p = ((uint8_t*)buf_data[0]) + j * image_row_pitch;
+q = ((uint8_t*)buf_data[1]) + j * image_row_pitch;
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)p)[i * 4 + k] == ((uint32_t*)q)[i * 4 + k]);
+  }
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[0], 0, NULL, NULL);
+  clEnqueueUnmapMemObject(queue, buf[1], buf_data[1], 0, NULL, NULL);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image_2);
+
+static void compiler_copy_large_image_3(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  size_t origin[3] = {5, 5, 0};
+  size_t region[3] = {8, 8, 1};
+  size_t image_row_pitch, image_slice_pitch;
+  cl_int status;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+  uint8_t *p = NULL;
+  uint8_t *q = NULL;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * region[0] * region[1] * 
4);
+  for (uint32_t j = 0; j < region[1]; ++j)
+for (uint32_t i = 0; i < region[0]; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * region[0] + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE

[Beignet] [PATCH v4 1/7] Add utest case for filling image by small region.

2017-06-12 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is used to reproduce the bug of allocations of conformance test.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_fill_large_image.cpp | 50 
 1 file changed, 50 insertions(+)

diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
index 1ecf65b..3894d6f 100644
--- a/utests/compiler_fill_large_image.cpp
+++ b/utests/compiler_fill_large_image.cpp
@@ -164,3 +164,53 @@ static void compiler_fill_large_image_2(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_2);
+
+static void compiler_fill_large_image_3(void)
+{
+  const size_t w = 8192;
+  const size_t h = 8192;
+  const size_t num_of_lines = 8;
+  size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, num_of_lines, 1};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * num_of_lines * 4);
+  buf_data[1] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+
+  memset(buf_data[0], 0, sizeof(uint32_t) * w * num_of_lines * 4);
+  memset(buf_data[1], 0, sizeof(uint32_t) * w * h * 4);
+
+  for (uint32_t j = 0; j < num_of_lines; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = (uint32_t)rand();
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  OCL_READ_IMAGE(buf[0], origin, region, buf_data[1]);
+
+  // Check result
+  for (uint32_t j = 0; j < num_of_lines; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_3);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 7/7] Optimize clEnqueueWriteImageByKernel and clEnqueueReadImageByKernel.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. Only copy the data by origin and region defined.
2. Add clFinish to guarantee the kernel copying is finished when blocking 
writing.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index a450e19..46672b7 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1843,8 +1843,9 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
   image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, 
CL_MEM_ALLOC_HOST_PTR,
-mem->size, NULL, );
+buf_size, NULL, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
@@ -1859,7 +1860,7 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   }
 
   return clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
-mem->size, ptr, num_events_in_wait_list, event_wait_list, event);
+buf_size, ptr, num_events_in_wait_list, event_wait_list, event);
 }
 
 cl_int
@@ -2050,14 +2051,20 @@ clEnqueueWriteImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, );
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
buf_size, (void*)ptr, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
   }
 
-  return clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
+  err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
 num_events_in_wait_list, event_wait_list, event);
+
+  if (blocking_write)
+err = clFinish(command_queue);
+
+  return err;
 }
 
 cl_int
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 6/7] Fix bug of clEnqueueUnmapMemObjectForKernel and clEnqueueMapImageByKernel.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. Support writing data by mapping/unmapping mode.
2. Add mapping record logic.
3. Add clFinish to guarantee the kernel copying is finished.
4. Fix the error of calling clEnqueueMapImageByKernel.
   blocking_map and map_flags need to be switched.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 131 +--
 1 file changed, 98 insertions(+), 33 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7b58236..a450e19 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -21,6 +21,7 @@
 #include "cl_command_queue.h"
 #include "cl_event.h"
 #include "CL/cl.h"
+#include 
 
 cl_int
 clSetMemObjectDestructorCallback(cl_mem memobj,
@@ -350,33 +351,54 @@ clEnqueueUnmapMemObjectForKernel(cl_command_queue 
command_queue,
 cl_event *event)
 {
   cl_int err = CL_SUCCESS;
-  struct _cl_mem_image *image = NULL;
-  size_t region[3];
-  size_t origin[3];
-
-  image = cl_mem_image(memobj);
+  int i, j;
+  uint8_t write_map = 0;
+  cl_mem tmp_ker_buf = NULL;
+  size_t origin[3], region[3];
+  void *v_ptr = NULL;
+
+  assert(memobj->mapped_ptr_sz >= memobj->map_ref);
+  for (i = 0; i < memobj->mapped_ptr_sz; i++) {
+if (memobj->mapped_ptr[i].ptr == mapped_ptr) {
+  memobj->mapped_ptr[i].ptr = NULL;
+  v_ptr = memobj->mapped_ptr[i].v_ptr;
+  write_map = memobj->mapped_ptr[i].ker_write_map;
+  tmp_ker_buf = memobj->mapped_ptr[i].tmp_ker_buf;
+  for (j = 0; j < 3; j++) {
+region[j] = memobj->mapped_ptr[i].region[j];
+origin[j] = memobj->mapped_ptr[i].origin[j];
+memobj->mapped_ptr[i].region[j] = 0;
+memobj->mapped_ptr[i].origin[j] = 0;
+  }
+  memobj->mapped_ptr[i].size = 0;
+  memobj->mapped_ptr[i].ker_write_map = 0;
+  memobj->mapped_ptr[i].tmp_ker_buf = 0;
+  memobj->mapped_ptr[i].v_ptr = NULL;
+  memobj->map_ref--;
+  break;
+}
+  }
 
-  if (!image->tmp_ker_buf)
+  if (!tmp_ker_buf)
 return CL_INVALID_MEM_OBJECT;
 
-  origin[0] = origin[1] = origin[2] = 0;
-  region[0] = image->w;
-  region[1] = image->h;
-  region[2] = image->depth;
+  err = clEnqueueUnmapMemObject(command_queue, tmp_ker_buf, v_ptr,
+num_events_in_wait_list, event_wait_list, event);
+  if (err != CL_SUCCESS)
+return err;
 
-  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
-err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, 
memobj, 0, origin, region,
-  num_events_in_wait_list, event_wait_list, event);
+  if (write_map) {
+err = clEnqueueCopyBufferToImage(command_queue, tmp_ker_buf, memobj, 0, 
origin, region,
+num_events_in_wait_list, event_wait_list, event);
 if (err != CL_SUCCESS)
   return err;
-  }
-
-  err = clEnqueueUnmapMemObject(command_queue, image->tmp_ker_buf, mapped_ptr,
-num_events_in_wait_list, event_wait_list, event);
 
-  clReleaseMemObject(image->tmp_ker_buf);
-  image->tmp_ker_buf = NULL;
+err = clFinish(command_queue);
+if (err != CL_SUCCESS)
+  return err;
+  }
 
+  clReleaseMemObject(tmp_ker_buf);
   return err;
 }
 
@@ -1516,20 +1538,24 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  void *ptr = NULL;
+  void *mem_ptr = NULL;
   struct _cl_mem_image *image = NULL;
-  size_t region[3];
-  size_t origin[3];
+  size_t region[3], copy_origin[3];
+  size_t origin[3], copy_region[3];
+  size_t offset = 0;
+  size_t buf_size = 0;
 
   image = cl_mem_image(mem);
 
   err = check_image_region(image, pregion, region);
-  if (err != CL_SUCCESS) {
+  if (err != CL_SUCCESS && errcode_ret) {
 *errcode_ret = err;
 return NULL;
   }
 
   err = check_image_origin(image, porigin, origin);
-  if (err != CL_SUCCESS) {
+  if (err != CL_SUCCESS && errcode_ret) {
 *errcode_ret = err;
 return NULL;
   }
@@ -1537,29 +1563,68 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  if(mem->flags & CL_MEM_USE_HOST_PTR)
+  if (mem->flags & CL_MEM_USE_HOST_PTR) {
+buf_size = image->w * image->h * image->depth * image->bpp;
+memset(copy_origin, 0, sizeof(size_t) * 3);
+copy_region[0] = image->w;
+copy_region[1] = image->h;
+copy_region[2] = image->depth;
 image->tmp_ker_buf =
   clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, mem->size, 
mem->host_ptr, );
-  else
+  } else {
+buf_size = region[0] * region[1] * region[2] * image->bpp;
+memcpy(copy_origin, origin, sizeof(size_t) * 3);
+memcpy(copy_region, region, sizeof(size_t) * 3);
 image->tmp_ker_buf =
-  clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR,

[Beignet] [PATCH v3 5/7] Add clFinish to guarantee the kernel copying is finished when creating a TILE_Y large image.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_mem.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 3f41fd8..b6dce3f 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -817,6 +817,13 @@ _cl_new_image_copy_from_host_ptr(cl_context ctx,
 return NULL;
   }
 
+  err = clFinish(ctx->image_queue);
+  if(err != CL_SUCCESS) {
+clReleaseMemObject(buf);
+clReleaseMemObject(mem);
+return NULL;
+  }
+
   clReleaseMemObject(buf);
   if (flags & CL_MEM_USE_HOST_PTR && data) {
 mem->host_ptr = data;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 1/7] Add utest case for filling image by small region.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is used to reproduce the bug of allocations of conformance test.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_fill_large_image.cpp | 50 
 1 file changed, 50 insertions(+)

diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
index 1ecf65b..3894d6f 100644
--- a/utests/compiler_fill_large_image.cpp
+++ b/utests/compiler_fill_large_image.cpp
@@ -164,3 +164,53 @@ static void compiler_fill_large_image_2(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_2);
+
+static void compiler_fill_large_image_3(void)
+{
+  const size_t w = 8192;
+  const size_t h = 8192;
+  const size_t num_of_lines = 8;
+  size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, num_of_lines, 1};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * num_of_lines * 4);
+  buf_data[1] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+
+  memset(buf_data[0], 0, sizeof(uint32_t) * w * num_of_lines * 4);
+  memset(buf_data[1], 0, sizeof(uint32_t) * w * h * 4);
+
+  for (uint32_t j = 0; j < num_of_lines; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = (uint32_t)rand();
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  OCL_READ_IMAGE(buf[0], origin, region, buf_data[1]);
+
+  // Check result
+  for (uint32_t j = 0; j < num_of_lines; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_3);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 2/7] Add utest to test writing data into large image (TILE_Y) by map/unmap mode.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is used to reproduce the bug of clCopyImage/clFillImage of conformance test.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_copy_large_image.cpp | 176 +++
 1 file changed, 176 insertions(+)

diff --git a/utests/compiler_copy_large_image.cpp 
b/utests/compiler_copy_large_image.cpp
index 66998a7..94b46ed 100644
--- a/utests/compiler_copy_large_image.cpp
+++ b/utests/compiler_copy_large_image.cpp
@@ -119,3 +119,179 @@ static void compiler_copy_large_image_1(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image_1);
+
+static void compiler_copy_large_image_2(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  size_t image_row_pitch, image_slice_pitch;
+  cl_int status;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  // Use mapping mode to fill data into src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 
origin, region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_row_pitch == w * 4 * sizeof(uint32_t));
+  OCL_ASSERT(image_slice_pitch == 0);
+  memcpy(buf_data[1], buf_data[0], image_row_pitch * h);
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  // Check src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_READ, origin, 
region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_row_pitch == w * 4 * sizeof(uint32_t));
+  OCL_ASSERT(image_slice_pitch == 0);
+
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(sampler), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image_2);
+
+static void compiler_copy_large_image_3(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {5, 5, 0};
+  const size_t region[3] = {8, 8, 1};
+  size_t image_row_pitch, image_slice_pitch;
+  cl_int status;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * region[0] * region[1] * 
4);
+  for (uint32_t j = 0; j < region[1]; ++j)
+for (uint32_t i = 0; i < region[0]; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * region[0] + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+
+  // Use mapping mode to fill data into src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 
origin, region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_row_pitch == region[0] * sizeof(uint32_t) * 4);
+  OCL_ASSERT(i

[Beignet] [PATCH v3 4/7] Add cl_mem_record_map_mem_for_kernel() to record the map address for TILE_Y image by kernel copying.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_mem.c | 109 +--
 src/cl_mem.h |   5 +++
 2 files changed, 88 insertions(+), 26 deletions(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index a8543c9..3f41fd8 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -2650,38 +2650,17 @@ error:
   goto exit;
 }
 
-LOCAL cl_int
-cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
-  size_t size, const size_t *origin, const size_t *region)
+static cl_int
+get_mapped_address(cl_mem mem)
 {
-  // TODO: Need to add MT safe logic.
-
   cl_int slot = -1;
-  int err = CL_SUCCESS;
-  size_t sub_offset = 0;
-
-  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
-struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
-sub_offset = buffer->sub_offset;
-  }
-
-  ptr = (char*)ptr + offset + sub_offset;
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
-assert(mem->host_ptr);
-//only calc ptr here, will do memcpy in enqueue
-*mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
-  } else {
-*mem_ptr = ptr;
-  }
-  /* Record the mapped address. */
   if (!mem->mapped_ptr_sz) {
 mem->mapped_ptr_sz = 16;
 mem->mapped_ptr = (cl_mapped_ptr *)malloc(
 sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
 if (!mem->mapped_ptr) {
   cl_mem_unmap_auto(mem);
-  err = CL_OUT_OF_HOST_MEMORY;
-  goto error;
+  return slot;
 }
 memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
 slot = 0;
@@ -2698,8 +2677,7 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void 
**mem_ptr, size_t offset,
   sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
   if (!new_ptr) {
 cl_mem_unmap_auto(mem);
-err = CL_OUT_OF_HOST_MEMORY;
-goto error;
+return slot;
   }
   memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
   memcpy(new_ptr, mem->mapped_ptr,
@@ -2710,7 +2688,86 @@ cl_mem_record_map_mem(cl_mem mem, void *ptr, void 
**mem_ptr, size_t offset,
   mem->mapped_ptr = new_ptr;
 }
   }
+
   assert(slot != -1);
+  return slot;
+}
+
+LOCAL cl_int
+cl_mem_record_map_mem_for_kernel(cl_mem mem, void *ptr, void **mem_ptr, size_t 
offset,
+  size_t size, const size_t *origin, const size_t *region,
+  cl_mem tmp_ker_buf, uint8_t write_map)
+{
+  // TODO: Need to add MT safe logic.
+
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+
+  //ptr = (char*)ptr + offset + sub_offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+assert(mem->host_ptr);
+//only calc ptr here, will do memcpy in enqueue
+*mem_ptr = (char*)ptr + offset + sub_offset;
+  } else {
+*mem_ptr = ptr;
+  }
+  /* Record the mapped address. */
+  slot = get_mapped_address(mem);
+  if (slot == -1) {
+err = CL_OUT_OF_HOST_MEMORY;
+goto error;
+  }
+  mem->mapped_ptr[slot].ptr = *mem_ptr;
+  mem->mapped_ptr[slot].v_ptr = ptr;
+  mem->mapped_ptr[slot].size = size;
+  mem->mapped_ptr[slot].ker_write_map = write_map;
+  mem->mapped_ptr[slot].tmp_ker_buf = tmp_ker_buf;
+  if(origin) {
+assert(region);
+mem->mapped_ptr[slot].origin[0] = origin[0];
+mem->mapped_ptr[slot].origin[1] = origin[1];
+mem->mapped_ptr[slot].origin[2] = origin[2];
+mem->mapped_ptr[slot].region[0] = region[0];
+mem->mapped_ptr[slot].region[1] = region[1];
+mem->mapped_ptr[slot].region[2] = region[2];
+  }
+  mem->map_ref++;
+error:
+  if (err != CL_SUCCESS)
+*mem_ptr = NULL;
+  return err;
+}
+
+LOCAL cl_int
+cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
+  size_t size, const size_t *origin, const size_t *region)
+{
+  // TODO: Need to add MT safe logic.
+
+  cl_int slot = -1;
+  int err = CL_SUCCESS;
+  size_t sub_offset = 0;
+
+  if(mem->type == CL_MEM_SUBBUFFER_TYPE) {
+struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+sub_offset = buffer->sub_offset;
+  }
+
+  ptr = (char*)ptr + offset + sub_offset;
+  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+assert(mem->host_ptr);
+//only calc ptr here, will do memcpy in enqueue
+*mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
+  } else {
+*mem_ptr = ptr;
+  }
+  /* Record the mapped address. */
+  slot = get_mapped_address(mem);
+  if (slot == -1) {
+err = CL_OUT_OF_HOST_MEMORY;
+goto error;
+  }
   mem->mapped_ptr[slot].ptr = *mem_ptr;
   mem->mapped_ptr[slot].v_ptr = ptr;
   mem->mapped_ptr[slot].size = size;
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 0b33c31..ce1294d 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -61,6 +61,8 @@ typedef struct _cl_mapped_ptr {
   size_t size;
   size_t origin[3];  /* mapped origin */
   size_t region[3];  /* mappe

[Beignet] [PATCH v3 3/7] Add utest to test writing data into large image (TILE_Y) by map/unmap and USE_HOST_PTR mode.

2017-06-07 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/runtime_use_host_ptr_large_image.cpp | 109 
 1 file changed, 109 insertions(+)

diff --git a/utests/runtime_use_host_ptr_large_image.cpp 
b/utests/runtime_use_host_ptr_large_image.cpp
index c8200b3..8f3e330 100644
--- a/utests/runtime_use_host_ptr_large_image.cpp
+++ b/utests/runtime_use_host_ptr_large_image.cpp
@@ -73,3 +73,112 @@ static void runtime_use_host_ptr_large_image(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(runtime_use_host_ptr_large_image);
+
+static void runtime_use_host_ptr_large_image_1(void)
+{
+  cl_int status;
+  const size_t w = 4096;
+  const size_t h = 4096;
+  size_t image_row_pitch, image_slice_pitch;
+  size_t origin[3] = {5, 5, 0};
+  size_t region[3] = {8, 8, 1};
+
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+
+  size_t alignment = 4096;  //page size
+  if (cl_check_beignet())
+alignment = 64; //cacheline size, beignet has loose limitaiont to 
enable userptr
+
+  //src image
+  int ret = posix_memalign(_data[0], alignment, sizeof(uint32_t) * w * h * 
4);
+  OCL_ASSERT(ret == 0);
+  for (size_t i = 0; i < w*h*4; ++i)
+((uint32_t*)buf_data[0])[i] = i;
+
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_USE_HOST_PTR, , , buf_data[0]);
+
+  // Use mapping mode to fill data into src image
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_WRITE, 
origin, region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_row_pitch == w * 4 * sizeof(uint32_t));
+  OCL_ASSERT(image_slice_pitch == 0);
+  for (uint32_t j = 0; j < region[1]; ++j)
+for (uint32_t i = 0; i < region[0]; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[1])[(j * w + i) * 4 + k] = rand();
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  // Check src image
+  origin[0] = 0;
+  origin[1] = 0;
+  origin[2] = 0;
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  buf_data[1] = clEnqueueMapImage(queue, buf[0], CL_TRUE, CL_MAP_READ, origin, 
region,
+_row_pitch, _slice_pitch, 0, NULL, NULL, );
+
+  OCL_ASSERT(image_row_pitch == w * 4 * sizeof(uint32_t));
+  OCL_ASSERT(image_slice_pitch == 0);
+
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+
+  clEnqueueUnmapMemObject(queue, buf[0], buf_data[1], 0, NULL, NULL);
+
+  //dst image
+  ret = posix_memalign(_data[1], alignment, sizeof(uint32_t) * w * h * 4);
+  OCL_ASSERT(ret == 0);
+  for (size_t i = 0; i < w*h*4; ++i)
+((uint32_t*)buf_data[1])[i] = 0;
+
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_USE_HOST_PTR, , , buf_data[1]);
+
+  OCL_CREATE_KERNEL("runtime_use_host_ptr_image");
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  origin[0] = 0;
+  origin[1] = 0;
+  origin[2] = 0;
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  size_t pitch = 0;
+  void* mapptr = (int*)clEnqueueMapImage(queue, buf[1], CL_TRUE, CL_MAP_READ, 
origin, region, , NULL, 0, NULL, NULL, NULL);
+  OCL_ASSERT(mapptr == buf_data[1]);
+  for (uint32_t i = 0; i < w*h*4; ++i) {
+OCL_ASSERT(((uint32_t*)buf_data[0])[i] == ((uint32_t*)buf_data[1])[i]);
+  }
+  clEnqueueUnmapMemObject(queue, buf[1], mapptr, 0, NULL, NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+  free(buf_data[1]);
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_use_host_ptr_large_image_1);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2 2/2] Fix bug of size of tmp_ker_buf for TILE_Y copying of large image.

2017-05-26 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

1. The size should be calculated based on the region and bpp of the image
instead of the whole image size.
2. When blocking mode is used, the copying kernel needs to be finished.
Otherwise, it will cause the allocations conformance test to fail.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 26 +++---
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7b58236..96f3272 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -371,6 +371,10 @@ clEnqueueUnmapMemObjectForKernel(cl_command_queue 
command_queue,
   return err;
   }
 
+  err = clFinish(command_queue);
+  if (err != CL_SUCCESS)
+return err;
+
   err = clEnqueueUnmapMemObject(command_queue, image->tmp_ker_buf, mapped_ptr,
 num_events_in_wait_list, event_wait_list, event);
 
@@ -1537,12 +1541,13 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
   if(mem->flags & CL_MEM_USE_HOST_PTR)
 image->tmp_ker_buf =
-  clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, mem->size, 
mem->host_ptr, );
+  clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, buf_size, 
mem->host_ptr, );
   else
 image->tmp_ker_buf =
-  clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, mem->size, 
NULL, );
+  clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, buf_size, 
NULL, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 *errcode_ret = err;
@@ -1559,7 +1564,7 @@ clEnqueueMapImageByKernel(cl_command_queue command_queue,
   }
 
   return clEnqueueMapBuffer(command_queue, image->tmp_ker_buf, blocking_map, 
map_flags, 0,
-mem->size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+buf_size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
 }
 
 void *
@@ -1778,8 +1783,9 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
   image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, 
CL_MEM_ALLOC_HOST_PTR,
-mem->size, NULL, );
+buf_size, NULL, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
@@ -1794,7 +1800,7 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   }
 
   return clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
-mem->size, ptr, num_events_in_wait_list, event_wait_list, event);
+buf_size, ptr, num_events_in_wait_list, event_wait_list, event);
 }
 
 cl_int
@@ -1985,14 +1991,20 @@ clEnqueueWriteImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, );
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
buf_size, (void*)ptr, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
   }
 
-  return clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
+  err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
 num_events_in_wait_list, event_wait_list, event);
+
+  if (blocking_write)
+err = clFinish(command_queue);
+
+  return err;
 }
 
 cl_int
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/2] Add utest case for filling image by small region.

2017-05-26 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is used to reproduce the bug found by the allocations conformance test.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_fill_large_image.cpp | 50 
 1 file changed, 50 insertions(+)

diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
index 1ecf65b..3894d6f 100644
--- a/utests/compiler_fill_large_image.cpp
+++ b/utests/compiler_fill_large_image.cpp
@@ -164,3 +164,53 @@ static void compiler_fill_large_image_2(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_2);
+
+static void compiler_fill_large_image_3(void)
+{
+  const size_t w = 8192;
+  const size_t h = 8192;
+  const size_t num_of_lines = 8;
+  size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, num_of_lines, 1};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * num_of_lines * 4);
+  buf_data[1] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+
+  memset(buf_data[0], 0, sizeof(uint32_t) * w * num_of_lines * 4);
+  memset(buf_data[1], 0, sizeof(uint32_t) * w * h * 4);
+
+  for (uint32_t j = 0; j < num_of_lines; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = (uint32_t)rand();
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  OCL_READ_IMAGE(buf[0], origin, region, buf_data[1]);
+
+  // Check result
+  for (uint32_t j = 0; j < num_of_lines; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_3);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 2/2] Fix bug of size of tmp_ker_buf for TILE_Y copying of large image.

2017-05-26 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

The size should be calculated based on the region and bpp of the image
instead of the whole image size.
Otherwise, it will cause the allocations conformance test to fail.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7b58236..209d0dc 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1537,12 +1537,13 @@ clEnqueueMapImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
   if(mem->flags & CL_MEM_USE_HOST_PTR)
 image->tmp_ker_buf =
-  clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, mem->size, 
mem->host_ptr, );
+  clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, buf_size, 
mem->host_ptr, );
   else
 image->tmp_ker_buf =
-  clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, mem->size, 
NULL, );
+  clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, buf_size, 
NULL, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 *errcode_ret = err;
@@ -1559,7 +1560,7 @@ clEnqueueMapImageByKernel(cl_command_queue command_queue,
   }
 
   return clEnqueueMapBuffer(command_queue, image->tmp_ker_buf, blocking_map, 
map_flags, 0,
-mem->size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+buf_size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
 }
 
 void *
@@ -1778,8 +1779,9 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
   image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, 
CL_MEM_ALLOC_HOST_PTR,
-mem->size, NULL, );
+buf_size, NULL, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
@@ -1794,7 +1796,7 @@ clEnqueueReadImageByKernel(cl_command_queue command_queue,
   }
 
   return clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
-mem->size, ptr, num_events_in_wait_list, event_wait_list, event);
+buf_size, ptr, num_events_in_wait_list, event_wait_list, event);
 }
 
 cl_int
@@ -1985,7 +1987,8 @@ clEnqueueWriteImageByKernel(cl_command_queue 
command_queue,
   if (image->tmp_ker_buf)
 clReleaseMemObject(image->tmp_ker_buf);
 
-  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, );
+  size_t buf_size = region[0] * region[1] * region[2] * image->bpp;
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
buf_size, (void*)ptr, );
   if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
 image->tmp_ker_buf = NULL;
 return err;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2 2/2] Fix bug of clEnqueueCopyBufferToImage and clEnqueueCopyImageToBuffer.

2017-05-25 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

The "imagedim_non_pow_2" cases of the basic module of the conformance test show
a regression after the previous patch enabled TILE_Y mode for large images.
This bug comes from the non-align16 kernel of clEnqueueCopyBufferToImage
and clEnqueueCopyImageToBuffer.
It forces the CL_RGBA/CL_UNORM_INT8/8191x8192 image of the conformance test
into a CL_R/CL_UNSIGNED_INT8/32764x8192 image for copying.
That makes the width 8191 x 4 = 32764, which exceeds the maximum width
(16 x 1024 = 16384) of the GEN surface state structure, which only has 14 bits for it.
So use the align4 copy kernel to avoid this bug.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/CMakeLists.txt |  1 +
 src/cl_context.h   |  2 +
 src/cl_mem.c   | 78 ++
 .../cl_internal_copy_buffer_to_image_2d_align4.cl  | 18 +
 .../cl_internal_copy_image_2d_to_buffer_align4.cl  | 18 +
 5 files changed, 89 insertions(+), 28 deletions(-)
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align4.cl
 create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer_align4.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 77a1c87..6433566 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -53,6 +53,7 @@ cl_internal_copy_image_2d_array_to_2d_array 
cl_internal_copy_image_2d_array_to_2
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer 
cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
 cl_internal_copy_buffer_to_image_2d 
cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d_align4 
cl_internal_copy_image_2d_to_buffer_align4
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
diff --git a/src/cl_context.h b/src/cl_context.h
index 8ba499f..75bf895 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -62,9 +62,11 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,   //copy image 3d to image 2d array
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN4,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,  //fill buffer with 1 aligne pattern, 
pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,   //fill buffer with 2 aligne pattern, 
pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 0c49c3d..a8543c9 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -2146,6 +2146,36 @@ fail:
   return ret;
 }
 
+#define ALIGN16 16
+#define ALIGN4 4
+#define ALIGN1 1
+
+static size_t
+get_align_size_for_copy_kernel(struct _cl_mem_image* image, const size_t 
origin0, const size_t region0,
+const size_t offset, cl_image_format *fmt) {
+  size_t align_size = 0;
+
+  if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) 
% ALIGN16 == 0) &&
+  ((origin0 * image->bpp) % ALIGN16 == 0) && (region0 % ALIGN16 == 0) && 
(offset % ALIGN16 == 0)){
+fmt->image_channel_order = CL_RGBA;
+fmt->image_channel_data_type = CL_UNSIGNED_INT32;
+align_size = ALIGN16;
+  }
+  else if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * 
image->bpp) % ALIGN4 == 0) &&
+  ((origin0 * image->bpp) % ALIGN4 == 0) && (region0 % ALIGN4 == 0) && 
(offset % ALIGN4 == 0)){
+fmt->image_channel_order = CL_R;
+fmt->image_channel_data_type = CL_UNSIGNED_INT32;
+align_size = ALIGN4;
+  }
+  else{
+fmt->image_channel_order = CL_R;
+fmt->image_channel_data_type = CL_UNSIGNED_INT8;
+align_size = ALIGN1;
+  }
+
+  return align_size;
+}
+
 LOCAL cl_int
 cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct 
_cl_mem_image* image, cl_mem buffer,
  const size_t *src_origin, const size_t dst_offset, 
const size_t *region) {
@@ -2158,7 +2188,6 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, 
cl_event event, struct _cl_m
   cl_image_format fmt;
   size_t origin0, region0;
   size_t kn_dst_offset;
-  int align16 = 0;
   size_t align_size = 1;
   size_t w_saved;
 
@@ -2176,18 +2205,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, 
cl_event event, struct _cl_m
   w_saved = image->w;
   region0 = region[0] * bpp;
   kn_dst_offset = dst_offset;
-  if((image->image_type == CL_MEM_OBJ

[Beignet] [PATCH 2/2] Fix bug of clEnqueueCopyBufferToImage and clEnqueueCopyImageToBuffer.

2017-05-24 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

The "imagedim_non_pow_2" cases of the basic module of the conformance test show
a regression after the previous patch enabled TILE_Y mode for large images.
This bug comes from the non-align16 kernel of clEnqueueCopyBufferToImage
and clEnqueueCopyImageToBuffer.
It forces the CL_RGBA/CL_UNORM_INT8/8191x8192 image of the conformance test
into a CL_R/CL_UNSIGNED_INT8/32764x8192 image for copying.
That makes the width 8191 x 4 = 32764, which exceeds the maximum width
(16 x 1024 = 16384) of the GEN surface state structure, which only has 14 bits for it.
So use the align4 copy kernel to avoid this bug.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/CMakeLists.txt |  1 +
 src/cl_context.h   |  2 ++
 src/cl_mem.c   | 32 ++
 .../cl_internal_copy_buffer_to_image_2d_align4.cl  | 18 
 .../cl_internal_copy_image_2d_to_buffer_align4.cl  | 18 
 5 files changed, 71 insertions(+)
 create mode 100644 src/kernels/cl_internal_copy_buffer_to_image_2d_align4.cl
 create mode 100644 src/kernels/cl_internal_copy_image_2d_to_buffer_align4.cl

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 77a1c87..6433566 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -53,6 +53,7 @@ cl_internal_copy_image_2d_array_to_2d_array 
cl_internal_copy_image_2d_array_to_2
 cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array
 cl_internal_copy_image_2d_to_buffer 
cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer
 cl_internal_copy_buffer_to_image_2d 
cl_internal_copy_buffer_to_image_2d_align16 cl_internal_copy_buffer_to_image_3d
+cl_internal_copy_buffer_to_image_2d_align4 
cl_internal_copy_image_2d_to_buffer_align4
 cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
diff --git a/src/cl_context.h b/src/cl_context.h
index 8ba499f..75bf895 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -62,9 +62,11 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,   //copy image 3d to image 2d array
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,   //copy image 2d to buffer
   CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
+  CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4,
   CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,   //copy image 3d tobuffer
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,   //copy buffer to image 2d
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
+  CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN4,
   CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,   //copy buffer to image 3d
   CL_ENQUEUE_FILL_BUFFER_UNALIGN,  //fill buffer with 1 aligne pattern, 
pattern size=1
   CL_ENQUEUE_FILL_BUFFER_ALIGN2,   //fill buffer with 2 aligne pattern, 
pattern size=2
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 0c49c3d..3b9a3be 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -2159,6 +2159,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, 
cl_event event, struct _cl_m
   size_t origin0, region0;
   size_t kn_dst_offset;
   int align16 = 0;
+  int align4 = 0;
   size_t align_size = 1;
   size_t w_saved;
 
@@ -2183,6 +2184,13 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, 
cl_event event, struct _cl_m
 align16 = 1;
 align_size = 16;
   }
+  else if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * 
image->bpp) % 4 == 0) &&
+  ((src_origin[0] * bpp) % 4 == 0) && (region0 % 4 == 0) && (dst_offset % 
4 == 0)){
+fmt.image_channel_order = CL_R;
+fmt.image_channel_data_type = CL_UNSIGNED_INT32;
+align4 = 1;
+align_size = 4;
+  }
   else{
 fmt.image_channel_order = CL_R;
 fmt.image_channel_data_type = CL_UNSIGNED_INT8;
@@ -2206,6 +2214,14 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, 
cl_event event, struct _cl_m
 cl_internal_copy_image_2d_to_buffer_align16_str,
 (size_t)cl_internal_copy_image_2d_to_buffer_align16_str_size, 
NULL);
 }
+else if(align4){
+  extern char cl_internal_copy_image_2d_to_buffer_align4_str[];
+  extern size_t cl_internal_copy_image_2d_to_buffer_align4_str_size;
+
+  ker = cl_context_get_static_kernel_from_bin(queue->ctx, 
CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4,
+cl_internal_copy_image_2d_to_buffer_align4_str,
+(size_t)cl_internal_copy_image_2d_to_buffer_align4_str_size, 
NULL);
+}
 else{
   extern char cl_internal_copy_image_2d_to_buffer_str[];
   extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
@@ -2263,6 +2279,7 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, 
cl_event event, cl_mem buffe
   size_t origin0, region0;
   size_t kn_src_offset;
   int align16 = 0;
+  int align4 = 0;
   size_t align_size = 1;
   size_t w_saved = 0;
 
@@ -2287,6 +2304,13 @@ c

[Beignet] [PATCH 1/2] Add utest to reproduce the bug of imagedim_non_pow_2 cases of conformance test.

2017-05-24 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_fill_large_image.cpp | 46 
 1 file changed, 46 insertions(+)

diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
index 6fb872d..1ecf65b 100644
--- a/utests/compiler_fill_large_image.cpp
+++ b/utests/compiler_fill_large_image.cpp
@@ -118,3 +118,49 @@ static void compiler_fill_large_image_1(void)
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_1);
+
+static void compiler_fill_large_image_2(void)
+{
+  const size_t w = 8191;
+  const size_t h = 8192;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (unsigned char*) malloc(sizeof(unsigned char) * 8192 * 8192 * 
4);
+  buf_data[1] = (unsigned char*) malloc(sizeof(unsigned char) * 8192 * 8192 * 
4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((unsigned char*)buf_data[0])[(j * w + i) * 4 + k] = (unsigned 
char)rand();
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNORM_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  OCL_READ_IMAGE(buf[0], origin, region, buf_data[1]);
+
+  // Check result
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint8_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint8_t*)buf_data[1])[(j * w + i) * 4 + k]);
+
+  free(buf_data[0]);
+  free(buf_data[1]);
+  buf_data[0] = NULL;
+  buf_data[1] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_2);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 8/8] Implement TILE_Y large image in clEnqueueWriteImage.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from host ptr to TILE_Y large image by memcpy.
Use clEnqueueCopyBufferToImage to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 46 ++
 1 file changed, 46 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 91525b1..7b58236 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1954,6 +1954,47 @@ clEnqueueReadImage(cl_command_queue command_queue,
   return err;
 }
 
+static cl_int
+clEnqueueWriteImageByKernel(cl_command_queue command_queue,
+cl_mem mem,
+cl_bool blocking_write,
+const size_t *porigin,
+const size_t *pregion,
+size_t row_pitch,
+size_t slice_pitch,
+const void *ptr,
+cl_uint num_events_in_wait_list,
+const cl_event *event_wait_list,
+cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS)
+return err;
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS)
+return err;
+
+  if (image->tmp_ker_buf)
+clReleaseMemObject(image->tmp_ker_buf);
+
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, );
+  if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+image->tmp_ker_buf = NULL;
+return err;
+  }
+
+  return clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, mem, 0, 
origin, region,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 cl_int
 clEnqueueWriteImage(cl_command_queue command_queue,
 cl_mem mem,
@@ -2039,6 +2080,11 @@ clEnqueueWriteImage(cl_command_queue command_queue,
   break;
 }
 
+if (image->is_ker_copy) {
+  return clEnqueueWriteImageByKernel(command_queue, mem, blocking_write, 
origin,
+region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, 
event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 7/8] Implement TILE_Y large image in clEnqueueReadImage.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from TILE_Y large image to buffer by memcpy.
Use clEnqueueCopyImageToBuffer to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 8678bd9..91525b1 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1747,6 +1747,56 @@ clEnqueueMapImage(cl_command_queue command_queue,
   return mem_ptr;
 }
 
+static cl_int
+clEnqueueReadImageByKernel(cl_command_queue command_queue,
+   cl_mem mem,
+   cl_bool blocking_read,
+   const size_t *porigin,
+   const size_t *pregion,
+   size_t row_pitch,
+   size_t slice_pitch,
+   void *ptr,
+   cl_uint num_events_in_wait_list,
+   const cl_event *event_wait_list,
+   cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS)
+return err;
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS)
+return err;
+
+  if (image->tmp_ker_buf)
+clReleaseMemObject(image->tmp_ker_buf);
+
+  image->tmp_ker_buf = clCreateBuffer(command_queue->ctx, 
CL_MEM_ALLOC_HOST_PTR,
+mem->size, NULL, );
+  if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+image->tmp_ker_buf = NULL;
+return err;
+  }
+
+  err = clEnqueueCopyImageToBuffer(command_queue, mem, image->tmp_ker_buf, 
origin,
+region, 0, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+clReleaseMemObject(image->tmp_ker_buf);
+image->tmp_ker_buf = NULL;
+return err;
+  }
+
+  return clEnqueueReadBuffer(command_queue, image->tmp_ker_buf, blocking_read, 
0,
+mem->size, ptr, num_events_in_wait_list, event_wait_list, event);
+}
+
 cl_int
 clEnqueueReadImage(cl_command_queue command_queue,
cl_mem mem,
@@ -1832,6 +1882,11 @@ clEnqueueReadImage(cl_command_queue command_queue,
   break;
 }
 
+if (image->is_ker_copy) {
+  return clEnqueueReadImageByKernel(command_queue, mem, blocking_read, 
origin,
+region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, 
event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 5/8] Create image with TILE_Y mode still when image size>128MB for performance.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It may fail to copy data from host ptr to a TILE_Y large image.
So use clEnqueueCopyBufferToImage to do this on the GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_context.c |   6 
 src/cl_context.h |   2 +-
 src/cl_mem.c | 107 ---
 src/cl_mem.h |   2 ++
 4 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/src/cl_context.c b/src/cl_context.c
index 1ba2302..4b8281c 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -342,6 +342,7 @@ cl_context_new(struct _cl_context_prop *props, cl_uint 
dev_num, cl_device_id* al
   TRY_ALLOC_NO_ERR (ctx->drv, cl_driver_new(props));
   ctx->props = *props;
   ctx->ver = cl_driver_get_ver(ctx->drv);
+  ctx->image_queue = NULL;
 
 exit:
   return ctx;
@@ -362,6 +363,11 @@ cl_context_delete(cl_context ctx)
   if (CL_OBJECT_DEC_REF(ctx) > 1)
 return;
 
+  if (ctx->image_queue) {
+clReleaseCommandQueue(ctx->image_queue);
+ctx->image_queue = NULL;
+  }
+
   /* delete the internal programs. */
   for (i = CL_INTERNAL_KERNEL_MIN; i < CL_INTERNAL_KERNEL_MAX; i++) {
 if (ctx->internal_kernels[i]) {
diff --git a/src/cl_context.h b/src/cl_context.h
index 4812afd..8ba499f 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -129,7 +129,7 @@ struct _cl_context {
   void (CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *);
  /* User's callback when error occur in 
context */
   void *user_data;   /* A pointer to user supplied data */
-
+  cl_command_queue image_queue;  /* A internal command queue for image 
data copying */
 };
 
 #define CL_OBJECT_CONTEXT_MAGIC 0x20BBCADE993134AALL
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4a7bec8..0c49c3d 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -153,6 +153,8 @@ cl_mem_allocate(enum cl_mem_type type,
   if (mem->type == CL_MEM_IMAGE_TYPE) {
 cl_mem_image(mem)->is_image_from_buffer = 0;
 cl_mem_image(mem)->is_image_from_nv12_image = 0;
+cl_mem_image(mem)->is_ker_copy = 0;
+cl_mem_image(mem)->tmp_ker_buf = NULL;
   }
 
   if (sz != 0) {
@@ -751,6 +753,80 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
 }
 
 static cl_mem
+_cl_new_image_copy_from_host_ptr(cl_context ctx,
+  cl_mem_flags flags,
+  const cl_image_format *fmt,
+  const cl_mem_object_type image_type,
+  size_t w,
+  size_t h,
+  size_t depth,
+  size_t pitch,
+  size_t slice_pitch,
+  size_t sz,
+  size_t aligned_pitch,
+  uint32_t intel_fmt,
+  uint32_t bpp,
+  cl_image_tiling_t tiling,
+  void *data,   //pointer from application
+  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, depth};
+  size_t aligned_slice_pitch = 0;
+
+  if (ctx->image_queue == NULL) {
+ctx->image_queue = clCreateCommandQueueWithProperties(ctx, 
ctx->devices[0], 0, );
+if (err != CL_SUCCESS || !ctx->image_queue) {
+  *errcode_ret = err;
+  ctx->image_queue = NULL;
+  return NULL;
+}
+  }
+
+  // Map host ptr to OCL buffer
+  cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, );
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != 
CL_NO_TILE, NULL, NULL, );
+  if (mem == NULL || err != CL_SUCCESS) {
+clReleaseMemObject(buf);
+return NULL;
+  }
+
+  cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
+  if (image_type == CL_MEM_OBJECT_IMAGE2D)
+aligned_slice_pitch = 0;
+  else
+//SKL need use tiling's aligned_h to calc slice_pitch and IVB to BDW need 
CL_NO_TILE's aligned_h to calc.
+aligned_slice_pitch = aligned_pitch * ALIGN(h, 
cl_buffer_get_tiling_align(ctx, tiling, 2));
+
+  cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+0, 0, 0);
+
+  err = clEnqueueCopyBufferToImage(ctx->image_queue, buf, mem, 0, origin, 
region, 0, NULL, NULL);
+  if(err != CL_SUCCESS) {
+clReleaseMemObject(buf);
+clReleaseMemObject(mem);
+return NULL;
+  }
+
+  clReleaseMemObject(buf);
+  if (flags & CL_MEM_USE_HOST_PTR && data) {
+mem->host_ptr = data;
+cl_mem_image(mem)->host_row_pitch = pitch;
+cl_mem_image(mem)->host_slice_pitch = slice_pitch;
+  }
+  return mem;
+}
+
+static cl_mem
 _cl_mem_new_image(cl_context ctx,
   cl_mem_flags flags,
   const cl_image_format *fmt,
@@ -765,6 +841,7 @@ _cl_mem_new_image(cl_context 

[Beignet] [PATCH v3 6/8] Implement TILE_Y large image in clEnqueueMapImage and clEnqueueUnmapMemObject.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from TILE_Y large image to buffer by memcpy.
Use clEnqueueCopyImageToBuffer to do this.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 111 +++
 1 file changed, 111 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 0d19bf8..8678bd9 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -341,6 +341,45 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
   return mem_ptr;
 }
 
+static cl_int
+clEnqueueUnmapMemObjectForKernel(cl_command_queue command_queue,
+cl_mem memobj,
+void *mapped_ptr,
+cl_uint num_events_in_wait_list,
+const cl_event *event_wait_list,
+cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(memobj);
+
+  if (!image->tmp_ker_buf)
+return CL_INVALID_MEM_OBJECT;
+
+  origin[0] = origin[1] = origin[2] = 0;
+  region[0] = image->w;
+  region[1] = image->h;
+  region[2] = image->depth;
+
+  if (memobj->flags & CL_MEM_USE_HOST_PTR) {
+err = clEnqueueCopyBufferToImage(command_queue, image->tmp_ker_buf, 
memobj, 0, origin, region,
+  num_events_in_wait_list, event_wait_list, event);
+if (err != CL_SUCCESS)
+  return err;
+  }
+
+  err = clEnqueueUnmapMemObject(command_queue, image->tmp_ker_buf, mapped_ptr,
+num_events_in_wait_list, event_wait_list, event);
+
+  clReleaseMemObject(image->tmp_ker_buf);
+  image->tmp_ker_buf = NULL;
+
+  return err;
+}
+
 cl_int
 clEnqueueUnmapMemObject(cl_command_queue command_queue,
 cl_mem memobj,
@@ -370,6 +409,11 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
   break;
 }
 
+if (CL_OBJECT_IS_IMAGE(memobj) && cl_mem_image(memobj)->is_ker_copy) {
+  return clEnqueueUnmapMemObjectForKernel(command_queue, memobj, 
mapped_ptr,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
@@ -1457,6 +1501,67 @@ check_image_origin(struct _cl_mem_image *image, const 
size_t *porigin, size_t *o
   return CL_SUCCESS;
 }
 
+static void *
+clEnqueueMapImageByKernel(cl_command_queue command_queue,
+  cl_mem mem,
+  cl_bool blocking_map,
+  cl_map_flags map_flags,
+  const size_t *porigin,
+  const size_t *pregion,
+  size_t *image_row_pitch,
+  size_t *image_slice_pitch,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event,
+  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  if (image->tmp_ker_buf)
+clReleaseMemObject(image->tmp_ker_buf);
+
+  if(mem->flags & CL_MEM_USE_HOST_PTR)
+image->tmp_ker_buf =
+  clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, mem->size, 
mem->host_ptr, );
+  else
+image->tmp_ker_buf =
+  clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, mem->size, 
NULL, );
+  if (image->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+image->tmp_ker_buf = NULL;
+*errcode_ret = err;
+return NULL;
+  }
+
+  err = clEnqueueCopyImageToBuffer(command_queue, mem, image->tmp_ker_buf, 
origin, region,
+0, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+clReleaseMemObject(image->tmp_ker_buf);
+image->tmp_ker_buf = NULL;
+*errcode_ret = err;
+return NULL;
+  }
+
+  return clEnqueueMapBuffer(command_queue, image->tmp_ker_buf, blocking_map, 
map_flags, 0,
+mem->size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+}
+
 void *
 clEnqueueMapImage(cl_command_queue command_queue,
   cl_mem mem,
@@ -1530,6 +1635,12 @@ clEnqueueMapImage(cl_command_queue command_queue,
   break;
 }
 
+if (CL_OBJECT_IS_IMAGE(mem) && cl_mem_image(mem)->is_ker_copy) {
+  return clEnqueueMapImageByKernel(command_queue, mem, map_flags, 
blocking_map, origin, region,
+image_row_pitch, image_slice_pitch, num_events_in_wait_list, 
event_wait_list,
+event, errcode_ret);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_li

[Beignet] [PATCH v3 4/8] Add image use_hostptr benchmark case for testing large image operations.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is for testing large image with TILE_Y mode.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 benchmark/CMakeLists.txt |  1 +
 benchmark/benchmark_use_host_ptr_large_image.cpp | 84 
 2 files changed, 85 insertions(+)
 create mode 100644 benchmark/benchmark_use_host_ptr_large_image.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index f9b246b..4c43daf 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -13,6 +13,7 @@ set (benchmark_sources
   ../utests/vload_bench.cpp
   benchmark_copy_buf.cpp
   benchmark_use_host_ptr_buffer.cpp
+  benchmark_use_host_ptr_large_image.cpp
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
   benchmark_copy_buffer_to_image.cpp
diff --git a/benchmark/benchmark_use_host_ptr_large_image.cpp 
b/benchmark/benchmark_use_host_ptr_large_image.cpp
new file mode 100644
index 000..c943a87
--- /dev/null
+++ b/benchmark/benchmark_use_host_ptr_large_image.cpp
@@ -0,0 +1,84 @@
+#include "utests/utest_helper.hpp"
+#include 
+#include 
+
+double benchmark_use_host_ptr_large_image(void)
+{
+  struct timeval start,stop;
+
+  const size_t w = 4096;
+  const size_t h = 4096;
+
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+
+  size_t alignment = 4096;  //page size
+  if (cl_check_beignet())
+alignment = 64; //cacheline size, beignet has loose limitaiont to 
enable userptr
+
+  //src image
+  int ret = posix_memalign(_data[0], alignment, sizeof(uint32_t) * w * h * 
4);
+  OCL_ASSERT(ret == 0);
+  for (size_t i = 0; i < w*h*4; ++i)
+((uint32_t*)buf_data[0])[i] = i;
+
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_USE_HOST_PTR, , , buf_data[0]);
+
+  //dst image
+  ret = posix_memalign(_data[1], alignment, sizeof(uint32_t) * w * h * 4);
+  OCL_ASSERT(ret == 0);
+  for (size_t i = 0; i < w*h*4; ++i)
+((uint32_t*)buf_data[1])[i] = 0;
+
+  OCL_CREATE_IMAGE(buf[1], CL_MEM_USE_HOST_PTR, , , buf_data[1]);
+
+  OCL_CREATE_KERNEL("runtime_use_host_ptr_image");
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+
+  size_t origin[3];
+  origin[0] = 0;
+  origin[1] = 0;
+  origin[2] = 0;
+  size_t region[3];
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  size_t pitch = 0;
+
+  gettimeofday(,0);
+  for (size_t i=0; i<100; i++) {
+OCL_NDRANGE(2);
+void* mapptr = (int*)clEnqueueMapImage(queue, buf[1], CL_TRUE, 
CL_MAP_READ, origin,
+  region, , NULL, 0, NULL, NULL, NULL);
+OCL_ASSERT(mapptr == buf_data[1]);
+clEnqueueUnmapMemObject(queue, buf[1], mapptr, 0, NULL, NULL);
+  }
+  gettimeofday(,0);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+  free(buf_data[1]);
+  buf_data[1] = NULL;
+
+  double elapsed = time_subtract(, , 0);
+
+  return BANDWIDTH(w*h*sizeof(uint32_t)*4*100*2, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_use_host_ptr_large_image, "GB/S");
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 2/8] Add image filling case for testing large image operations.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is for testing large image with TILE_Y mode.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/CMakeLists.txt|   1 +
 utests/compiler_fill_large_image.cpp | 120 +++
 2 files changed, 121 insertions(+)
 create mode 100644 utests/compiler_fill_large_image.cpp

diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 145b02c..b02068a 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -111,6 +111,7 @@ set (utests_sources
   compiler_abs.cpp
   compiler_abs_diff.cpp
   compiler_fill_image.cpp
+  compiler_fill_large_image.cpp
   compiler_fill_image0.cpp
   compiler_fill_image_1d.cpp
   compiler_fill_image_3d.cpp
diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
new file mode 100644
index 000..6fb872d
--- /dev/null
+++ b/utests/compiler_fill_large_image.cpp
@@ -0,0 +1,120 @@
+#include 
+#include "utest_helper.hpp"
+
+static void compiler_fill_large_image(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  uint32_t color = 0x12345678;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image");
+
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(color), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  uint32_t *data = (uint32_t*)malloc(sizeof(uint32_t) * w * h * 4);
+  OCL_READ_IMAGE(buf[0], origin, region, data);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  uint32_t k = (j * w + i) * 4;
+  OCL_ASSERT(data[k] == 0x12);
+  OCL_ASSERT(data[k + 1] == 0x34);
+  OCL_ASSERT(data[k + 2] == 0x56);
+  OCL_ASSERT(data[k + 3] == 0x78);
+}
+  }
+  free(data);
+
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  uint32_t k = (j * w + i) * 4;
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k] == 0x12);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 1] == 0x34);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 2] == 0x56);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 3] == 0x78);
+}
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image);
+
+static void compiler_fill_large_image_1(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  uint32_t color[4] = {0x12, 0x34, 0x56, 0x78};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+
+  // Fill Image
+  clEnqueueFillImage(queue, buf[0], color, origin, region, 0, NULL, NULL);
+
+  // Check result
+  uint32_t *data = (uint32_t*)malloc(sizeof(uint32_t) * w * h * 4);
+  OCL_READ_IMAGE(buf[0], origin, region, data);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  int k = (j * w + i) * 4;
+  OCL_ASSERT(data[k] == 0x12);
+  OCL_ASSERT(data[k + 1] == 0x34);
+  OCL_ASSERT(data[k + 2] == 0x56);
+  OCL_ASSERT(data[k + 3] == 0x78);
+}
+  }
+  free(data);
+
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  int k = (j * w + i) * 4;
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k] == 0x12);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 1] == 0x34);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 2] == 0x56);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 3] == 0x78);
+}
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_1);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v3 1/8] Add image copying case for testing large image operations.

2017-05-16 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is for testing large image with TILE_Y mode.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/CMakeLists.txt|   1 +
 utests/compiler_copy_large_image.cpp | 121 +++
 2 files changed, 122 insertions(+)
 create mode 100644 utests/compiler_copy_large_image.cpp

diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b7ef742..145b02c 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -101,6 +101,7 @@ set (utests_sources
   compiler_convert_uchar_sat.cpp
   compiler_copy_buffer.cpp
   compiler_copy_image.cpp
+  compiler_copy_large_image.cpp
   compiler_copy_image_1d.cpp
   compiler_copy_image_3d.cpp
   compiler_copy_buffer_row.cpp
diff --git a/utests/compiler_copy_large_image.cpp 
b/utests/compiler_copy_large_image.cpp
new file mode 100644
index 000..66998a7
--- /dev/null
+++ b/utests/compiler_copy_large_image.cpp
@@ -0,0 +1,121 @@
+#include 
+#include "utest_helper.hpp"
+
+static void compiler_copy_large_image(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t) * 4;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, , , buf_data[0]);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(sampler), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image);
+
+static void compiler_copy_large_image_1(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(sampler), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image_1);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2 6/6] Implement TILE_Y large image in clEnqueueWriteImage.

2017-05-15 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from host ptr to TILE_Y large image by memcpy.
Use clEnqueueCopyBufferToImage to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 46 ++
 1 file changed, 46 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 8cbddf4..b8f27b2 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1929,6 +1929,47 @@ clEnqueueReadImage(cl_command_queue command_queue,
   return err;
 }
 
+static cl_int
+clEnqueueWriteImageByKernel(cl_command_queue command_queue,
+cl_mem mem,
+cl_bool blocking_write,
+const size_t *porigin,
+const size_t *pregion,
+size_t row_pitch,
+size_t slice_pitch,
+const void *ptr,
+cl_uint num_events_in_wait_list,
+const cl_event *event_wait_list,
+cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS)
+return err;
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS)
+return err;
+
+  if (mem->tmp_ker_buf)
+clReleaseMemObject(mem->tmp_ker_buf);
+
+  mem->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, (void*)ptr, );
+  if (mem->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+mem->tmp_ker_buf = NULL;
+return err;
+  }
+
+  return clEnqueueCopyBufferToImage(command_queue, mem->tmp_ker_buf, mem, 0, 
origin, region,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 cl_int
 clEnqueueWriteImage(cl_command_queue command_queue,
 cl_mem mem,
@@ -2014,6 +2055,11 @@ clEnqueueWriteImage(cl_command_queue command_queue,
   break;
 }
 
+if (mem->is_ker_copy) {
+  return clEnqueueWriteImageByKernel(command_queue, mem, blocking_write, 
origin,
+region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, 
event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2 5/6] Implement TILE_Y large image in clEnqueueReadImage.

2017-05-15 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from TILE_Y large image to buffer by memcpy.
Use clEnqueueCopyImageToBuffer to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 7c6699d..8cbddf4 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1724,6 +1724,54 @@ clEnqueueMapImage(cl_command_queue command_queue,
   return mem_ptr;
 }
 
+static cl_int
+clEnqueueReadImageByKernel(cl_command_queue command_queue,
+   cl_mem mem,
+   cl_bool blocking_read,
+   const size_t *porigin,
+   const size_t *pregion,
+   size_t row_pitch,
+   size_t slice_pitch,
+   void *ptr,
+   cl_uint num_events_in_wait_list,
+   const cl_event *event_wait_list,
+   cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS)
+return err;
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS)
+return err;
+
+  if (mem->tmp_ker_buf)
+clReleaseMemObject(mem->tmp_ker_buf);
+
+  mem->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, 
mem->size, NULL, );
+  if (mem->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+mem->tmp_ker_buf = NULL;
+return err;
+  }
+
+  err = clEnqueueCopyImageToBuffer(command_queue, mem, mem->tmp_ker_buf, 
origin, region,
+0, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+clReleaseMemObject(mem->tmp_ker_buf);
+return err;
+  }
+
+  return clEnqueueReadBuffer(command_queue, mem->tmp_ker_buf, blocking_read, 
0, mem->size, ptr,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 cl_int
 clEnqueueReadImage(cl_command_queue command_queue,
cl_mem mem,
@@ -1809,6 +1857,11 @@ clEnqueueReadImage(cl_command_queue command_queue,
   break;
 }
 
+if (mem->is_ker_copy) {
+  return clEnqueueReadImageByKernel(command_queue, mem, blocking_read, 
origin,
+region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, 
event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2 4/6] Implement TILE_Y large image in clEnqueueMapImage and clEnqueueUnmapMemObject.

2017-05-15 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from TILE_Y large image to buffer by memcpy.
Use clEnqueueCopyImageToBuffer to do this.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 88 
 1 file changed, 88 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 0d19bf8..7c6699d 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -341,6 +341,26 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
   return mem_ptr;
 }
 
+static cl_int
+clEnqueueUnmapMemObjectForKernel(cl_command_queue command_queue,
+cl_mem memobj,
+void *mapped_ptr,
+cl_uint num_events_in_wait_list,
+const cl_event *event_wait_list,
+cl_event *event)
+{
+  if (!memobj->tmp_ker_buf)
+return CL_INVALID_MEM_OBJECT;
+
+  cl_int err = clEnqueueUnmapMemObject(command_queue, memobj->tmp_ker_buf, 
mapped_ptr,
+num_events_in_wait_list, event_wait_list, event);
+
+  clReleaseMemObject(memobj->tmp_ker_buf);
+  memobj->tmp_ker_buf = NULL;
+
+  return err;
+}
+
 cl_int
 clEnqueueUnmapMemObject(cl_command_queue command_queue,
 cl_mem memobj,
@@ -370,6 +390,11 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
   break;
 }
 
+if (memobj->is_ker_copy) {
+  return clEnqueueUnmapMemObjectForKernel(command_queue, memobj, 
mapped_ptr,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
@@ -1457,6 +1482,63 @@ check_image_origin(struct _cl_mem_image *image, const 
size_t *porigin, size_t *o
   return CL_SUCCESS;
 }
 
+static void *
+clEnqueueMapImageByKernel(cl_command_queue command_queue,
+  cl_mem mem,
+  cl_bool blocking_map,
+  cl_map_flags map_flags,
+  const size_t *porigin,
+  const size_t *pregion,
+  size_t *image_row_pitch,
+  size_t *image_slice_pitch,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event,
+  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  if (mem->tmp_ker_buf)
+clReleaseMemObject(mem->tmp_ker_buf);
+
+  mem->tmp_ker_buf =
+clCreateBuffer(command_queue->ctx, CL_MEM_ALLOC_HOST_PTR, mem->size, NULL, 
);
+  if (mem->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+mem->tmp_ker_buf = NULL;
+*errcode_ret = err;
+return NULL;
+  }
+
+  err = clEnqueueCopyImageToBuffer(command_queue, mem, mem->tmp_ker_buf, 
origin, region,
+0, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+clReleaseMemObject(mem->tmp_ker_buf);
+mem->tmp_ker_buf = NULL;
+*errcode_ret = err;
+return NULL;
+  }
+
+  return clEnqueueMapBuffer(command_queue, mem->tmp_ker_buf, blocking_map, 
map_flags, 0, mem->size,
+num_events_in_wait_list, event_wait_list, event, errcode_ret);
+}
+
 void *
 clEnqueueMapImage(cl_command_queue command_queue,
   cl_mem mem,
@@ -1530,6 +1612,12 @@ clEnqueueMapImage(cl_command_queue command_queue,
   break;
 }
 
+if (mem->is_ker_copy) {
+  return clEnqueueMapImageByKernel(command_queue, mem, map_flags, 
blocking_map, origin, region,
+image_row_pitch, image_slice_pitch, num_events_in_wait_list, 
event_wait_list,
+event, errcode_ret);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 5/6] Implement TILE_Y large image in clEnqueueReadImage.

2017-05-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from TILE_Y large image to buffer by memcpy.
Use clEnqueueCopyImageToBuffer to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 53 +
 1 file changed, 53 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 4ae70f7..c2aeb11 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1724,6 +1724,54 @@ clEnqueueMapImage(cl_command_queue command_queue,
   return mem_ptr;
 }
 
+static cl_int
+clEnqueueReadImageByKernel(cl_command_queue command_queue,
+   cl_mem mem,
+   cl_bool blocking_read,
+   const size_t *porigin,
+   const size_t *pregion,
+   size_t row_pitch,
+   size_t slice_pitch,
+   void *ptr,
+   cl_uint num_events_in_wait_list,
+   const cl_event *event_wait_list,
+   cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS)
+return err;
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS)
+return err;
+
+  if (mem->tmp_ker_buf)
+clReleaseMemObject(mem->tmp_ker_buf);
+
+  mem->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_READ_WRITE, 
mem->size, NULL, );
+  if (mem->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+mem->tmp_ker_buf = NULL;
+return err;
+  }
+
+  err = clEnqueueCopyImageToBuffer(command_queue, mem, mem->tmp_ker_buf, 
origin, region,
+0, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+clReleaseMemObject(mem->tmp_ker_buf);
+return err;
+  }
+
+  return clEnqueueReadBuffer(command_queue, mem->tmp_ker_buf, blocking_read, 
0, mem->size, ptr,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 cl_int
 clEnqueueReadImage(cl_command_queue command_queue,
cl_mem mem,
@@ -1809,6 +1857,11 @@ clEnqueueReadImage(cl_command_queue command_queue,
   break;
 }
 
+if (mem->is_ker_copy) {
+  return clEnqueueReadImageByKernel(command_queue, mem, blocking_read, 
origin,
+region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, 
event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 6/6] Implement TILE_Y large image in clEnqueueWriteImage.

2017-05-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from host ptr to TILE_Y large image by memcpy.
Use clEnqueueCopyBufferToImage to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index c2aeb11..c009c56 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -1929,6 +1929,48 @@ clEnqueueReadImage(cl_command_queue command_queue,
   return err;
 }
 
+static cl_int
+clEnqueueWriteImageByKernel(cl_command_queue command_queue,
+cl_mem mem,
+cl_bool blocking_write,
+const size_t *porigin,
+const size_t *pregion,
+size_t row_pitch,
+size_t slice_pitch,
+const void *ptr,
+cl_uint num_events_in_wait_list,
+const cl_event *event_wait_list,
+cl_event *event)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+  size_t aligned_slice_pitch = 0;
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS)
+return err;
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS)
+return err;
+
+  if (mem->tmp_ker_buf)
+clReleaseMemObject(mem->tmp_ker_buf);
+
+  mem->tmp_ker_buf = clCreateBuffer(command_queue->ctx, CL_MEM_USE_HOST_PTR, 
mem->size, ptr, );
+  if (mem->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+mem->tmp_ker_buf = NULL;
+return err;
+  }
+
+  return clEnqueueCopyBufferToImage(command_queue, mem->tmp_ker_buf, mem, 0, 
origin, region,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 cl_int
 clEnqueueWriteImage(cl_command_queue command_queue,
 cl_mem mem,
@@ -2014,6 +2056,11 @@ clEnqueueWriteImage(cl_command_queue command_queue,
   break;
 }
 
+if (mem->is_ker_copy) {
+  return clEnqueueWriteImageByKernel(command_queue, mem, blocking_write, 
origin,
+region, row_pitch, slice_pitch, ptr, num_events_in_wait_list, 
event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 4/6] Implement TILE_Y large image in clEnqueueMapImage and clEnqueueUnmapMemObject.

2017-05-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It will fail to copy data from TILE_Y large image to buffer by memcpy.
Use clEnqueueCopyImageToBuffer to do this.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_api_mem.c | 88 
 1 file changed, 88 insertions(+)

diff --git a/src/cl_api_mem.c b/src/cl_api_mem.c
index 0d19bf8..4ae70f7 100644
--- a/src/cl_api_mem.c
+++ b/src/cl_api_mem.c
@@ -341,6 +341,26 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
   return mem_ptr;
 }
 
+static cl_int
+clEnqueueUnmapMemObjectForKernel(cl_command_queue command_queue,
+cl_mem memobj,
+void *mapped_ptr,
+cl_uint num_events_in_wait_list,
+const cl_event *event_wait_list,
+cl_event *event)
+{
+  if (!memobj->tmp_ker_buf)
+return CL_INVALID_MEM_OBJECT;
+
+  cl_int err = clEnqueueUnmapMemObject(command_queue, memobj->tmp_ker_buf, 
mapped_ptr,
+num_events_in_wait_list, event_wait_list, event);
+
+  clReleaseMemObject(memobj->tmp_ker_buf);
+  memobj->tmp_ker_buf = NULL;
+
+  return err;
+}
+
 cl_int
 clEnqueueUnmapMemObject(cl_command_queue command_queue,
 cl_mem memobj,
@@ -370,6 +390,11 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
   break;
 }
 
+if (memobj->is_ker_copy) {
+  return clEnqueueUnmapMemObjectForKernel(command_queue, memobj, 
mapped_ptr,
+num_events_in_wait_list, event_wait_list, event);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
@@ -1457,6 +1482,63 @@ check_image_origin(struct _cl_mem_image *image, const 
size_t *porigin, size_t *o
   return CL_SUCCESS;
 }
 
+static void *
+clEnqueueMapImageByKernel(cl_command_queue command_queue,
+  cl_mem mem,
+  cl_bool blocking_map,
+  cl_map_flags map_flags,
+  const size_t *porigin,
+  const size_t *pregion,
+  size_t *image_row_pitch,
+  size_t *image_slice_pitch,
+  cl_uint num_events_in_wait_list,
+  const cl_event *event_wait_list,
+  cl_event *event,
+  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  struct _cl_mem_image *image = NULL;
+  size_t region[3];
+  size_t origin[3];
+
+  image = cl_mem_image(mem);
+
+  err = check_image_region(image, pregion, region);
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  err = check_image_origin(image, porigin, origin);
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  if (mem->tmp_ker_buf)
+clReleaseMemObject(mem->tmp_ker_buf);
+
+  mem->tmp_ker_buf =
+clCreateBuffer(command_queue->ctx, CL_MEM_READ_WRITE, mem->size, NULL, 
);
+  if (mem->tmp_ker_buf == NULL || err != CL_SUCCESS) {
+mem->tmp_ker_buf = NULL;
+*errcode_ret = err;
+return NULL;
+  }
+
+  err = clEnqueueCopyImageToBuffer(command_queue, mem, mem->tmp_ker_buf, 
origin, region,
+0, 0, NULL, NULL);
+  if (err != CL_SUCCESS) {
+clReleaseMemObject(mem->tmp_ker_buf);
+mem->tmp_ker_buf = NULL;
+*errcode_ret = err;
+return NULL;
+  }
+
+  return clEnqueueMapBuffer(command_queue, mem->tmp_ker_buf, blocking_map, 
map_flags, 0, mem->size,
+num_events_in_wait_list, event_wait_list, event, errcode_ret);
+}
+
 void *
 clEnqueueMapImage(cl_command_queue command_queue,
   cl_mem mem,
@@ -1530,6 +1612,12 @@ clEnqueueMapImage(cl_command_queue command_queue,
   break;
 }
 
+if (mem->is_ker_copy) {
+  return clEnqueueMapImageByKernel(command_queue, mem, map_flags, 
blocking_map, origin, region,
+image_row_pitch, image_slice_pitch, num_events_in_wait_list, 
event_wait_list,
+event, errcode_ret);
+}
+
 err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list,
   event, command_queue->ctx);
 if (err != CL_SUCCESS) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 3/6] Create image with TILE_Y mode still when image size > 128MB for performance.

2017-05-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It may fail to copy data from host ptr to TILE_Y large image.
So use clCopyBufferToImage to do this on GPU side.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_mem.c | 100 ---
 src/cl_mem.h |   2 ++
 2 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/src/cl_mem.c b/src/cl_mem.c
index 4a7bec8..fe0dd2f 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -149,6 +149,8 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->is_userptr = 0;
   mem->offset = 0;
   mem->is_svm = 0;
+  mem->is_ker_copy = 0;
+  mem->tmp_ker_buf = NULL;
   mem->cmrt_mem = NULL;
   if (mem->type == CL_MEM_IMAGE_TYPE) {
 cl_mem_image(mem)->is_image_from_buffer = 0;
@@ -750,6 +752,77 @@ cl_image_tiling_t cl_get_default_tiling(cl_driver drv)
   return tiling;
 }
 
+cl_command_queue image_queue = NULL;
+
+static cl_mem
+_cl_new_image_copy_from_host_ptr(cl_context ctx,
+  cl_mem_flags flags,
+  const cl_image_format *fmt,
+  const cl_mem_object_type image_type,
+  size_t w,
+  size_t h,
+  size_t depth,
+  size_t pitch,
+  size_t slice_pitch,
+  size_t sz,
+  size_t aligned_pitch,
+  uint32_t intel_fmt,
+  uint32_t bpp,
+  cl_image_tiling_t tiling,
+  void *data,   //pointer from application
+  cl_int *errcode_ret)
+{
+  cl_int err = CL_SUCCESS;
+  cl_mem mem = NULL;
+  size_t origin[3] = {0, 0, 0};
+  size_t region[3] = {w, h, depth};
+  size_t aligned_slice_pitch = 0;
+
+  if (image_queue == NULL) {
+image_queue = clCreateCommandQueueWithProperties(ctx, ctx->devices[0], 0, 
);
+if (err != CL_SUCCESS) {
+  *errcode_ret = err;
+  return NULL;
+}
+  }
+
+  // Map host ptr to OCL buffer
+  cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, );
+  if (err != CL_SUCCESS) {
+*errcode_ret = err;
+return NULL;
+  }
+
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != 
CL_NO_TILE, NULL, NULL, );
+  if (mem == NULL || err != CL_SUCCESS) {
+clReleaseMemObject(buf);
+return NULL;
+  }
+
+  cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch);
+
+  if (image_type == CL_MEM_OBJECT_IMAGE2D)
+aligned_slice_pitch = 0;
+  else
+//SKL need use tiling's aligned_h to calc slice_pitch and IVB to BDW need 
CL_NO_TILE's aligned_h to calc.
+aligned_slice_pitch = aligned_pitch * ALIGN(h, 
cl_buffer_get_tiling_align(ctx, tiling, 2));
+
+  cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
+intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
+0, 0, 0);
+
+  err = clEnqueueCopyBufferToImage(image_queue, buf, mem, 0, origin, region, 
0, NULL, NULL);
+  if(err != CL_SUCCESS) {
+clReleaseMemObject(buf);
+clReleaseMemObject(mem);
+return NULL;
+  }
+
+  mem->is_ker_copy = 1;
+  clReleaseMemObject(buf);
+  return mem;
+}
+
 static cl_mem
 _cl_mem_new_image(cl_context ctx,
   cl_mem_flags flags,
@@ -765,6 +838,7 @@ _cl_mem_new_image(cl_context ctx,
   cl_int *errcode_ret)
 {
   cl_int err = CL_SUCCESS;
+  cl_bool is_ker_copy = 0;
   cl_mem mem = NULL;
   cl_mem_object_type image_type = orig_image_type;
   uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT;
@@ -931,11 +1005,22 @@ _cl_mem_new_image(cl_context ctx,
 
   /* If sz is large than 128MB, map gtt may fail in some system.
  Because there is no obviours performance drop, disable tiling. */
-  if(tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
-tiling = CL_NO_TILE;
-aligned_pitch = w * bpp;
-aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
-sz = aligned_pitch * aligned_h * depth;
+  if (tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) {
+if ((image_type == CL_MEM_OBJECT_IMAGE2D || image_type == 
CL_MEM_OBJECT_IMAGE3D) &&
+(flags & CL_MEM_COPY_HOST_PTR)) {
+  mem = _cl_new_image_copy_from_host_ptr(ctx, flags, fmt, image_type, w, 
h, depth, pitch,
+  slice_pitch, sz, aligned_pitch, intel_fmt, bpp, tiling, data, );
+  if (mem != NULL)
+goto exit;
+}
+
+if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) {
+  tiling = CL_NO_TILE;
+  aligned_pitch = w * bpp;
+  aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
+  sz = aligned_pitch * aligned_h * depth;
+} else
+  is_ker_copy = 1;
   }
 
   if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) {
@@ -992,6 +1077,8 @@ _cl_mem_new_image(cl_context ctx,
   cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data);
   }
 
+  mem->is_ker_copy = is_ker_copy;
+
 exit:
   if (errcode_ret)
  

[Beignet] [PATCH 2/6] Add image filling case for testing large image operations.

2017-05-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is for testing large image with TILE_Y mode.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/CMakeLists.txt|   1 +
 utests/compiler_fill_large_image.cpp | 124 +++
 2 files changed, 125 insertions(+)
 create mode 100644 utests/compiler_fill_large_image.cpp

diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 145b02c..b02068a 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -111,6 +111,7 @@ set (utests_sources
   compiler_abs.cpp
   compiler_abs_diff.cpp
   compiler_fill_image.cpp
+  compiler_fill_large_image.cpp
   compiler_fill_image0.cpp
   compiler_fill_image_1d.cpp
   compiler_fill_image_3d.cpp
diff --git a/utests/compiler_fill_large_image.cpp 
b/utests/compiler_fill_large_image.cpp
new file mode 100644
index 000..6c4b54a
--- /dev/null
+++ b/utests/compiler_fill_large_image.cpp
@@ -0,0 +1,124 @@
+#include 
+#include "utest_helper.hpp"
+
+static void compiler_fill_large_image(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  uint32_t color = 0x12345678;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image");
+
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(color), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  uint32_t *data = (uint32_t*)malloc(sizeof(uint32_t) * w * h * 4);
+  OCL_READ_IMAGE(buf[0], origin, region, data);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  uint32_t k = (j * w + i) * 4;
+  OCL_ASSERT(data[k] == 0x12);
+  OCL_ASSERT(data[k + 1] == 0x34);
+  OCL_ASSERT(data[k + 2] == 0x56);
+  OCL_ASSERT(data[k + 3] == 0x78);
+}
+  }
+  free(data);
+
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  uint32_t k = (j * w + i) * 4;
+  /*printf("((uint32_t*)buf_data[0])[k] = %d\n", 
((uint32_t*)buf_data[0])[k]);
+  printf("((uint32_t*)buf_data[0])[k + 1] = %d\n", 
((uint32_t*)buf_data[0])[k + 1]);
+  printf("((uint32_t*)buf_data[0])[k + 2] = %d\n", 
((uint32_t*)buf_data[0])[k + 2]);
+  printf("((uint32_t*)buf_data[0])[k + 3] = %d\n", 
((uint32_t*)buf_data[0])[k + 3]);*/
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k] == 0x12);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 1] == 0x34);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 2] == 0x56);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 3] == 0x78);
+}
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image);
+
+static void compiler_fill_large_image_1(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  uint32_t color[4] = {0x12, 0x34, 0x56, 0x78};
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+
+  // Setup kernel and images
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+
+  // Fill Image
+  clEnqueueFillImage(queue, buf[0], color, origin, region, 0, NULL, NULL);
+
+  // Check result
+  uint32_t *data = (uint32_t*)malloc(sizeof(uint32_t) * w * h * 4);
+  OCL_READ_IMAGE(buf[0], origin, region, data);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  int k = (j * w + i) * 4;
+  OCL_ASSERT(data[k] == 0x12);
+  OCL_ASSERT(data[k + 1] == 0x34);
+  OCL_ASSERT(data[k + 2] == 0x56);
+  OCL_ASSERT(data[k + 3] == 0x78);
+}
+  }
+  free(data);
+
+  OCL_MAP_BUFFER_GTT(0);
+  for (uint32_t j = 0; j < h; ++j) {
+for (uint32_t i = 0; i < w; i++) {
+  int k = (j * w + i) * 4;
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k] == 0x12);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 1] == 0x34);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 2] == 0x56);
+  OCL_ASSERT(((uint32_t*)buf_data[0])[k + 3] == 0x78);
+}
+  }
+  OCL_UNMAP_BUFFER_GTT(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_large_image_1);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 1/6] Add image copying case for testing large image operations.

2017-05-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It is for testing large image with TILE_Y mode.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/CMakeLists.txt|   1 +
 utests/compiler_copy_large_image.cpp | 121 +++
 2 files changed, 122 insertions(+)
 create mode 100644 utests/compiler_copy_large_image.cpp

diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index b7ef742..145b02c 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -101,6 +101,7 @@ set (utests_sources
   compiler_convert_uchar_sat.cpp
   compiler_copy_buffer.cpp
   compiler_copy_image.cpp
+  compiler_copy_large_image.cpp
   compiler_copy_image_1d.cpp
   compiler_copy_image_3d.cpp
   compiler_copy_buffer_row.cpp
diff --git a/utests/compiler_copy_large_image.cpp 
b/utests/compiler_copy_large_image.cpp
new file mode 100644
index 000..66998a7
--- /dev/null
+++ b/utests/compiler_copy_large_image.cpp
@@ -0,0 +1,121 @@
+#include 
+#include "utest_helper.hpp"
+
+static void compiler_copy_large_image(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = w * sizeof(uint32_t) * 4;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, , , buf_data[0]);
+
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(sampler), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image);
+
+static void compiler_copy_large_image_1(void)
+{
+  const size_t w = 4096;
+  const size_t h = 4096;
+  const size_t origin[3] = {0, 0, 0};
+  const size_t region[3] = {w, h, 1};
+  cl_image_format format;
+  cl_image_desc desc;
+  cl_sampler sampler;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_copy_image");
+  buf_data[0] = (uint32_t*) malloc(sizeof(uint32_t) * w * h * 4);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] = k;
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT32;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;
+  OCL_CREATE_IMAGE(buf[0], 0, , , NULL);
+  OCL_CREATE_IMAGE(buf[1], 0, , , NULL);
+  OCL_CREATE_SAMPLER(sampler, CL_ADDRESS_REPEAT, CL_FILTER_NEAREST);
+  OCL_WRITE_IMAGE(buf[0], origin, region, buf_data[0]);
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(sampler), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+  OCL_NDRANGE(2);
+
+  // Check result
+  OCL_MAP_BUFFER_GTT(0);
+  OCL_MAP_BUFFER_GTT(1);
+  for (uint32_t j = 0; j < h; ++j)
+for (uint32_t i = 0; i < w; i++)
+  for (uint32_t k = 0; k < 4; k++)
+OCL_ASSERT(((uint32_t*)buf_data[0])[(j * w + i) * 4 + k] ==
+  ((uint32_t*)buf_data[1])[(j * w + i) * 4 + k]);
+  OCL_UNMAP_BUFFER_GTT(0);
+  OCL_UNMAP_BUFFER_GTT(1);
+
+  OCL_CALL(clReleaseSampler, sampler);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_copy_large_image_1);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH v2] Provide more possible candidate of load/store as possible.

2017-03-09 Thread yan . wang
Some typo. Sorry for it.
I have modified it.



yan.wang
 
From: yan.wang
Date: 2017-03-10 10:52
To: ruiling.song; beignet
Subject: Re: [Beignet] [PATCH v2] Provide more possible candidate of load/store 
as possible.
It comes from darktable performance tuning.
For float type, maxVecSize is 4, so maxLimit = 4 * 8 = 32.
I am not sure the reason of maxLimit = maxVecSize * 8.
32 is too small for searching and could not find more available load after 
leading load.
It will improve eaw_decompose kernel of darktable from 2.1876s to 1.8855s 
because reduce send from 3 send (2 float, 2 float, 1 float) to 2 send (4 float, 
1 float).
There is another issue when compiling eaw_decompose kernel and I will submit 
another patch for it.
At least need set one low bound for maxLimit like 150 to avoid searching range 
too small.



yan.wang
 
From: Song, Ruiling
Date: 2017-03-10 10:39
To: yan.w...@linux.intel.com; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH v2] Provide more possible candidate of load/store 
as possible.
 
 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> yan.w...@linux.intel.com
> Sent: Thursday, March 9, 2017 5:41 PM
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [PATCH v2] Provide more possible candidate of load/store as
> possible.
> 
> From: Yan Wang <yan.w...@linux.intel.com>
> 
> Avoid searching range too small in some case like vector of float.
> It will lead more load/store merged for improving performance.
> 
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  backend/src/llvm/llvm_loadstore_optimization.cpp | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp
> b/backend/src/llvm/llvm_loadstore_optimization.cpp
> index e797e98..e569a8e 100644
> --- a/backend/src/llvm/llvm_loadstore_optimization.cpp
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -180,7 +180,7 @@ namespace gbe {
>  BasicBlock::iterator J = start;
>  ++J;
> 
> -unsigned maxLimit = maxVecSize * 8;
> +unsigned maxLimit = std::max(maxVecSize * 8, 150u);
 
Could you give some performance number against some known benchmarks?
Please select some complex enough OpenCL kernel. Maybe luxmark? Darktable?
How it would benefit the runtime performance and how much it would hurt the 
compile-time performance?
So we could know whether the change is reasonable.
 
Thanks!
Ruiling
>  bool reordered = false;
> 
>  for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
> --
> 2.7.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH v2] Provide more possible candidate of load/store as possible.

2017-03-09 Thread yan . wang
It comes from darktable performance tuning.
For float type, maxVecSize is 4, so maxLimit = 4 * 8 = 32.
I am not sure the reason of maxLimit = maxVecSize * 8.
32 is too small for searching and could not find more available load after 
leading load.
It will improve eaw_decompose kernel of darktable from 2.1876s to 1.8855s 
because reduce send from 3 send (2 float, 2 float, 1 float) to 2 send (4 float, 
1 float).
There is another issue when compiling eaw_decompose kernel and I will submit 
another patch for it.
At least need set one low bound for maxLimit like 150 to avoid searching range 
too small.



yan.wang
 
From: Song, Ruiling
Date: 2017-03-10 10:39
To: yan.w...@linux.intel.com; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH v2] Provide more possible candidate of load/store 
as possible.
 
 
> -Original Message-
> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of
> yan.w...@linux.intel.com
> Sent: Thursday, March 9, 2017 5:41 PM
> To: beignet@lists.freedesktop.org
> Cc: Yan Wang <yan.w...@linux.intel.com>
> Subject: [Beignet] [PATCH v2] Provide more possible candidate of load/store as
> possible.
> 
> From: Yan Wang <yan.w...@linux.intel.com>
> 
> Avoid searching range too small in some case like vector of float.
> It will lead more load/store merged for improving performance.
> 
> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
> ---
>  backend/src/llvm/llvm_loadstore_optimization.cpp | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp
> b/backend/src/llvm/llvm_loadstore_optimization.cpp
> index e797e98..e569a8e 100644
> --- a/backend/src/llvm/llvm_loadstore_optimization.cpp
> +++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
> @@ -180,7 +180,7 @@ namespace gbe {
>  BasicBlock::iterator J = start;
>  ++J;
> 
> -unsigned maxLimit = maxVecSize * 8;
> +unsigned maxLimit = std::max(maxVecSize * 8, 150u);
 
Could you give some performance number against some known benchmarks?
Please select some complex enough OpenCL kernel. Maybe luxmark? Darktable?
How it would benefit the runtime performance and how much it would hurt the 
compile-time performance?
So we could know whether the change is reasonable.
 
Thanks!
Ruiling
>  bool reordered = false;
> 
>  for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
> --
> 2.7.4
> 
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2] Provide more possible candidate of load/store as possible.

2017-03-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Avoid searching range too small in some case like vector of float.
It will lead more load/store merged for improving performance.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_loadstore_optimization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp 
b/backend/src/llvm/llvm_loadstore_optimization.cpp
index e797e98..e569a8e 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -180,7 +180,7 @@ namespace gbe {
 BasicBlock::iterator J = start;
 ++J;
 
-unsigned maxLimit = maxVecSize * 8;
+unsigned maxLimit = std::max(maxVecSize * 8, 150u);
 bool reordered = false;
 
 for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Provide more possible candidate of load/store as possible.

2017-03-09 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Avoid a search range that is too small in some cases, such as vectors of float.
This allows more loads/stores to be merged, improving performance.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_loadstore_optimization.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp 
b/backend/src/llvm/llvm_loadstore_optimization.cpp
index e797e98..e569a8e 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -180,7 +180,7 @@ namespace gbe {
 BasicBlock::iterator J = start;
 ++J;
 
-unsigned maxLimit = maxVecSize * 8;
+unsigned maxLimit = std::max(maxVecSize * 8, 150u);
 bool reordered = false;
 
 for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] MAD compact instrcution could not support "absolute" attribute.

2017-02-23 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

If the absolute flag is set on any source of a MAD instruction, don't use the
compact instruction.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/gen_insn_compact.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/src/backend/gen_insn_compact.cpp 
b/backend/src/backend/gen_insn_compact.cpp
index 62fcb61..22305f7 100644
--- a/backend/src/backend/gen_insn_compact.cpp
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -804,6 +804,8 @@ namespace gbe {
 if( control_index == -1) return false;
 if( src0.negation + src1.negation + src2.negation > 1)
   return false;
+if( src0.absolute + src1.absolute + src2.absolute > 0)
+  return false;
 
 GenCompactInstruction *insn = p->nextCompact(opcode);
 insn->src3Insn.bits1.control_index = control_index;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Avoid possible invalid pointer by vector interator.

2016-12-28 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

"revisit" is a vector container that has more elements pushed into it inside
findPointerEscape(), which can invalidate the iterator currently in use and
leave it as a possibly invalid pointer.
When compiling a huge kernel like Blender's, this causes random
segmentation-fault crashes.
Indexing with the [] operator is safer.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 8c7a230..e3543ae 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1437,8 +1437,8 @@ namespace gbe
   }
 }
 // storing/loading pointer would introduce revisit
-for (std::vector::iterator iter = revisit.begin(); iter != 
revisit.end(); ++iter) {
-  findPointerEscape(*iter, mixedPtr, true, revisit);
+for (size_t i = 0; i < revisit.size(); ++i) {
+  findPointerEscape(revisit[i], mixedPtr, true, revisit);
 }
 
 // the second pass starts from mixed pointer
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Avoid possible invalid pointer by vector interator.

2016-12-28 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

"revisit" is a vector container that has more elements pushed into it inside
findPointerEscape(), which can invalidate the iterator currently in use and
leave it as a possibly invalid pointer.
When compiling a huge kernel like Blender's, this causes random
segmentation-fault crashes.
Indexing with the [] operator is safer.
---
 backend/src/llvm/llvm_gen_backend.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 8c7a230..e3543ae 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1437,8 +1437,8 @@ namespace gbe
   }
 }
 // storing/loading pointer would introduce revisit
-for (std::vector::iterator iter = revisit.begin(); iter != 
revisit.end(); ++iter) {
-  findPointerEscape(*iter, mixedPtr, true, revisit);
+for (size_t i = 0; i < revisit.size(); ++i) {
+  findPointerEscape(revisit[i], mixedPtr, true, revisit);
 }
 
 // the second pass starts from mixed pointer
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] GBE: reorder the LLVM pass to reduce the compilation time.

2016-12-25 Thread Yan Wang
LGTM.
Thanks.

Yan Wang

On Fri, 2016-12-16 at 16:38 +0800, Yang Rong wrote:
> Set all functions' linkage to LinkOnceAnyLinkage, so that the inlining pass
> can delete the inlined functions.
> Also, reordering createFunctionInliningPass before
> createStripAttributesPass
> reduces the compilation time significantly, but the root cause has not
> been found
> yet.
> 
> Signed-off-by: Yang Rong <rong.r.y...@intel.com>
> ---
>  backend/src/llvm/StripAttributes.cpp | 6 --
>  backend/src/llvm/llvm_to_gen.cpp | 6 +++---
>  2 files changed, 7 insertions(+), 5 deletions(-)
> 
> diff --git a/backend/src/llvm/StripAttributes.cpp
> b/backend/src/llvm/StripAttributes.cpp
> index 3bf3853..9d07c29 100644
> --- a/backend/src/llvm/StripAttributes.cpp
> +++ b/backend/src/llvm/StripAttributes.cpp
> @@ -89,10 +89,12 @@ namespace {
>  char StripAttributes::ID = 0;
>  
>  bool StripAttributes::runOnFunction(Function ) {
> -  if (!gbe::isKernelFunction(Func))
> -Func.addFnAttr(Attribute::AlwaysInline);
>Func.setCallingConv(CallingConv::C);
>Func.setLinkage(GlobalValue::ExternalLinkage);
> +  if (!gbe::isKernelFunction(Func)) {
> +Func.addFnAttr(Attribute::AlwaysInline);
> +Func.setLinkage(GlobalValue::LinkOnceAnyLinkage);
> +  }
>  
>for (Function::iterator BB = Func.begin(), E = Func.end();
> BB != E; ++BB) {
> diff --git a/backend/src/llvm/llvm_to_gen.cpp
> b/backend/src/llvm/llvm_to_gen.cpp
> index e108810..a889c56 100644
> --- a/backend/src/llvm/llvm_to_gen.cpp
> +++ b/backend/src/llvm/llvm_to_gen.cpp
> @@ -136,6 +136,9 @@ namespace gbe
>  MPM.add(createBasicAliasAnalysisPass());
>  #endif
>  MPM.add(createIntrinsicLoweringPass());
> +MPM.add(createBarrierNodupPass(false));   // remove noduplicate
> fnAttr before inlining.
> +MPM.add(createFunctionInliningPass(2));
> +MPM.add(createBarrierNodupPass(true));// restore noduplicate
> fnAttr after inlining.
>  MPM.add(createStripAttributesPass()); // Strip unsupported
> attributes and calling conventions.
>  MPM.add(createSamplerFixPass());
>  MPM.add(createGlobalOptimizerPass()); // Optimize out global
> vars
> @@ -146,9 +149,6 @@ namespace gbe
>  MPM.add(createInstructionCombiningPass());// Clean up after IPCP
> & DAE
>  MPM.add(createCFGSimplificationPass());   // Clean up after IPCP
> & DAE
>  MPM.add(createPruneEHPass()); // Remove dead EH info
> -MPM.add(createBarrierNodupPass(false));   // remove noduplicate
> fnAttr before inlining.
> -MPM.add(createFunctionInliningPass(2));
> -MPM.add(createBarrierNodupPass(true));// restore noduplicate
> fnAttr after inlining.
>  #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
>  MPM.add(createPostOrderFunctionAttrsLegacyPass());
>  #elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Restore jump threading pass for reducing compiling time when run the large and complex kernel like Luxmark.

2016-12-08 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

The jump threading pass can optimize the connections between the LLVM
basic blocks of a function, providing the chance to merge and
remove unnecessary basic blocks, which reduces the compilation time and
ASM code size.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_to_gen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 42f24b3..e108810 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -214,7 +214,7 @@ namespace gbe
 // Run instcombine after redundancy elimination to exploit opportunities
 // opened up by them.
 MPM.add(createInstructionCombiningPass());
-//MPM.add(createJumpThreadingPass()); // Thread jumps
+MPM.add(createJumpThreadingPass()); // Thread jumps
 MPM.add(createCorrelatedValuePropagationPass());
 MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
 MPM.add(createAggressiveDCEPass()); // Delete dead instructions
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 1/2] remove some redundant code for printf

2016-11-29 Thread Yan Wang
LGTM. 
Thanks.

Yan Wang

On Mon, 2016-11-21 at 18:16 +0800, Guo, Yejun wrote:
> tmp0 is added to src in the selection stage and then simply ignored at the
> context
> stage, so it is redundant.
> 
> Signed-off-by: Guo, Yejun <yejun@intel.com>
> ---
>  backend/src/backend/gen_context.cpp|  2 --
>  backend/src/backend/gen_insn_selection.cpp | 54 +---
> --
>  2 files changed, 15 insertions(+), 41 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index c38b7af..186c8d9 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3474,8 +3474,6 @@ namespace gbe
>  const GenRegister tmp1 = ra->genReg(insn.dst(2));
>  GenRegister src;
>  uint32_t srcNum = insn.srcNum;
> -if (insn.extra.continueFlag)
> -  srcNum--;
>  
>  GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
>  GenRegister data = GenRegister::retype(tmp1, GEN_TYPE_UD);
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index c14e0bc..1808c7b 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -2131,49 +2131,25 @@ namespace gbe
>  
>void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti,
> GenRegister tmp0, GenRegister tmp1,
> GenRegister src[8], int srcNum, uint16_t num, bool
> isContinue, uint32_t totalSize) {
> -if (isContinue) {
> -  SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF,
> 3, srcNum + 1);
> -  SelectionVector *vector = this->appendVector();
> -
> -  for (int i = 0; i < srcNum; i++)
> -insn->src(i) = src[i];
> -
> -  insn->src(srcNum) = tmp0;
> -
> -  insn->dst(0) = dst;
> -  insn->dst(1) = tmp0;
> -  insn->dst(2) = tmp1;
> -
> -  vector->regNum = 2;
> -  vector->reg = >dst(1);
> -  vector->offsetID = 0;
> -  vector->isSrc = 0;
> -
> -  insn->extra.printfSize = static_cast(totalSize);
> -  insn->extra.continueFlag = isContinue;
> -  insn->extra.printfBTI = bti;
> -  insn->extra.printfNum = num;
> -} else {
> -  SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF,
> 3, srcNum);
> -  SelectionVector *vector = this->appendVector();
> +SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3,
> srcNum);
> +SelectionVector *vector = this->appendVector();
>  
> -  for (int i = 0; i < srcNum; i++)
> -insn->src(i) = src[i];
> +for (int i = 0; i < srcNum; i++)
> +  insn->src(i) = src[i];
>  
> -  insn->dst(0) = dst;
> -  insn->dst(1) = tmp0;
> -  insn->dst(2) = tmp1;
> +insn->dst(0) = dst;
> +insn->dst(1) = tmp0;
> +insn->dst(2) = tmp1;
>  
> -  vector->regNum = 2;
> -  vector->reg = >dst(1);
> -  vector->offsetID = 0;
> -  vector->isSrc = 0;
> +vector->regNum = 2;
> +vector->reg = >dst(1);
> +vector->offsetID = 0;
> +vector->isSrc = 0;
>  
> -  insn->extra.printfSize = static_cast(totalSize);
> -  insn->extra.continueFlag = isContinue;
> -  insn->extra.printfBTI = bti;
> -  insn->extra.printfNum = num;
> -}
> +insn->extra.printfSize = static_cast(totalSize);
> +insn->extra.continueFlag = isContinue;
> +insn->extra.printfBTI = bti;
> +insn->extra.printfNum = num;
>}
>  
>void Selection::Opaque::WORKGROUP_OP(uint32_t wg_op,
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH 2/2] do not care dst for printf

2016-11-29 Thread Yan Wang
LGTM.
Thanks.

Yan Wang

On Mon, 2016-11-21 at 18:16 +0800, Guo, Yejun wrote:
> Actually, the dst of printf means nothing, so there is no need to touch it.
> 
> Signed-off-by: Guo, Yejun <yejun@intel.com>
> ---
>  backend/src/backend/gen_context.cpp| 14 ++
>  backend/src/backend/gen_insn_selection.cpp | 20 +---
>  2 files changed, 11 insertions(+), 23 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp
> b/backend/src/backend/gen_context.cpp
> index 186c8d9..a73ccb6 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -3469,9 +3469,8 @@ namespace gbe
>}
>  
>void GenContext::emitPrintfInstruction(const SelectionInstruction
> ) {
> -const GenRegister dst = ra->genReg(insn.dst(0));
> -const GenRegister tmp0 = ra->genReg(insn.dst(1));
> -const GenRegister tmp1 = ra->genReg(insn.dst(2));
> +const GenRegister tmp0 = ra->genReg(insn.dst(0));
> +const GenRegister tmp1 = ra->genReg(insn.dst(1));
>  GenRegister src;
>  uint32_t srcNum = insn.srcNum;
>  
> @@ -3518,15 +3517,6 @@ namespace gbe
>  emitPrintfLongInstruction(addr, data, src,
> insn.extra.printfBTI);
>}
>  }
> -
> -if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
> -  p->push();
> -  p->curr.execWidth = 1;
> -}
> -p->MOV(dst, GenRegister::immd(0));
> -if (dst.hstride == GEN_HORIZONTAL_STRIDE_0) {
> -  p->pop();
> -}
>}
>  
>void GenContext::setA0Content(uint16_t new_a0[16], uint16_t
> max_offset, int sz) {
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 1808c7b..88fe1a6 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -705,7 +705,7 @@ namespace gbe
>  /*! Store the profiling info */
>  void STORE_PROFILING(uint32_t profilingType, uint32_t bti,
> GenRegister tmp0, GenRegister tmp1, GenRegister ts[5], int tsNum);
>  /*! Printf */
> -void PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0,
> GenRegister tmp1, GenRegister src[8],
> +void PRINTF(uint8_t bti, GenRegister tmp0, GenRegister tmp1,
> GenRegister src[8],
>  int srcNum, uint16_t num, bool isContinue, uint32_t
> totalSize);
>  /*! Multiply 64-bit integers */
>  void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool
> native_long);
> @@ -2129,20 +2129,19 @@ namespace gbe
>  }
>}
>  
> -  void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti,
> GenRegister tmp0, GenRegister tmp1,
> +  void Selection::Opaque::PRINTF(uint8_t bti, GenRegister tmp0,
> GenRegister tmp1,
> GenRegister src[8], int srcNum, uint16_t num, bool
> isContinue, uint32_t totalSize) {
> -SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3,
> srcNum);
> +SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 2,
> srcNum);
>  SelectionVector *vector = this->appendVector();
>  
>  for (int i = 0; i < srcNum; i++)
>insn->src(i) = src[i];
>  
> -insn->dst(0) = dst;
> -insn->dst(1) = tmp0;
> -insn->dst(2) = tmp1;
> +insn->dst(0) = tmp0;
> +insn->dst(1) = tmp1;
>  
>  vector->regNum = 2;
> -vector->reg = >dst(1);
> +vector->reg = >dst(0);
>  vector->offsetID = 0;
>  vector->isSrc = 0;
>  
> @@ -7041,8 +7040,7 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>uint8_t BTI = insn.getBti();
>GenRegister tmp0, tmp1;
>uint32_t srcNum = insn.getSrcNum();
> -  GenRegister dst = sel.selReg(insn.getDst(0), TYPE_S32);
> -  //GBE_ASSERT(srcNum);
> +
>uint32_t i = 0;
>uint32_t totalSize = 0;
>bool isContinue = false;
> @@ -7063,14 +7061,14 @@ extern bool OCL_DEBUGINFO; // first defined
> by calling BVAR in program.cpp
>i = 0;
>GenRegister regs[8];
>if (srcNum == 0) {
> -  sel.PRINTF(dst, BTI, tmp0, tmp1, regs, srcNum, num,
> isContinue, totalSize);
> +  sel.PRINTF(BTI, tmp0, tmp1, regs, srcNum, num, isContinue,
> totalSize);
>} else {
>  do {
>uint32_t s = srcNum < 8 ? srcNum : 8;
>for (uint32_t j = 0; j < s; j++) {
>  regs[j] = sel.selReg(insn.getSrc(i + j), insn.getType(i
> + j));
>}
> -  sel.PRINTF(dst, BTI, tmp0, tmp1, regs, s, num, isContinue,
> totalSize);
> +  sel.PRINTF(BTI, tmp0, tmp1, regs, s, num, isContinue,
> totalSize);
>  
>if (srcNum > 8) {
>  srcNum -= 8;
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2] Fix bug: Initialize bti of LoadInstuctionPattern::shootByteGatherMsg().

2016-11-23 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

If it isn't initialized, the Luxmark hotel scene is displayed incorrectly.
---
 backend/src/backend/gen_insn_selection.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index c14e0bc..c9a5be7 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -4487,7 +4487,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   using namespace ir;
   unsigned addrBytes = typeSize(addr.type);
   AddressMode AM = insn.getAddressMode();
-  vector btiTemp;
+  vector btiTemp = sel.getBTITemps(AM);
   if (AM == AM_DynamicBti || AM == AM_StaticBti) {
 if (AM == AM_DynamicBti) {
   Register btiReg = insn.getBtiReg();
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix bug: Initialize bti of LoadInstuctionPattern::shootByteGatherMsg().

2016-11-23 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

If it isn't initialized, the Luxmark hotel scene is displayed incorrectly.
---
 backend/src/backend/gen_insn_selection.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index c14e0bc..4cdf2cd 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -4487,8 +4487,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   using namespace ir;
   unsigned addrBytes = typeSize(addr.type);
   AddressMode AM = insn.getAddressMode();
-  vector btiTemp;
   if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+vector btiTemp = sel.getBTITemps(AM);
 if (AM == AM_DynamicBti) {
   Register btiReg = insn.getBtiReg();
   sel.BYTE_GATHER(dst, addr, elemSize, sel.selReg(btiReg, TYPE_U32), 
btiTemp);
@@ -4499,6 +4499,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
 unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
 GenRegister addrDW = addr;
+vector btiTemp;
 if (addrBytes == 8) {
   addrDW = convertU64ToU32(sel, addr);
 }
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix bug: Initialize bti LoadInstuctionPattern::shootUntypedReadMsg().

2016-11-23 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

If it isn't initialized, the Luxmark hotel scene is displayed incorrectly.
---
 backend/src/backend/gen_insn_selection.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index c14e0bc..4cdf2cd 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -4487,8 +4487,8 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   using namespace ir;
   unsigned addrBytes = typeSize(addr.type);
   AddressMode AM = insn.getAddressMode();
-  vector btiTemp;
   if (AM == AM_DynamicBti || AM == AM_StaticBti) {
+vector btiTemp = sel.getBTITemps(AM);
 if (AM == AM_DynamicBti) {
   Register btiReg = insn.getBtiReg();
   sel.BYTE_GATHER(dst, addr, elemSize, sel.selReg(btiReg, TYPE_U32), 
btiTemp);
@@ -4499,6 +4499,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
   } else if (addrSpace == ir::MEM_LOCAL || isReadConstantLegacy(insn)) {
 unsigned bti = addrSpace == ir::MEM_CONSTANT ? BTI_CONSTANT : 0xfe;
 GenRegister addrDW = addr;
+vector btiTemp;
 if (addrBytes == 8) {
   addrDW = convertU64ToU32(sel, addr);
 }
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix getting bitwidth of PointerType of LLVM.

2016-11-17 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

A PointerType cannot be treated as an IntegerType to get its bit width.
Following Rong's comments, use getTypeBitSize() instead of
Type::getIntegerBitWidth().
---
 backend/src/llvm/llvm_gen_backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 397c721..d6b0665 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -1366,7 +1366,7 @@ namespace gbe
   }
   Builder.SetInsertPoint(cast(theUser));
 
-  Type *ptyTy = IntegerType::get(ptr->getContext(), 
ptr->getType()->getIntegerBitWidth());
+  Type *ptyTy = IntegerType::get(ptr->getContext(), 
getTypeBitSize(unit, ptr->getType()));
   Value *v1 = Builder.CreatePtrToInt(pointerOp, ptyTy);
 
   Value *v2 = 
Builder.CreatePtrToInt(getSinglePointerOrigin(pointerOp), ptyTy);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Reduce the compilation time of inline pass in runModulePass().

2016-10-25 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

This can greatly reduce compilation time when running Luxmark scenes.
Avoid calling the inline pass many times in runModulePass when the module is
changed by other passes.
Create a separate function to run the inline pass.
In this function, the lowering pass and the strict-math-related passes are
also added to keep enough precision.
---
 backend/src/llvm/llvm_to_gen.cpp | 53 +---
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index e108810..0d51ee3 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -106,6 +106,55 @@ namespace gbe
 FPM.doFinalization();
   }
 
+  void runInlinePass(Module , TARGETLIBRARY *libraryInfo, const DataLayout 
, int optLevel, bool strictMath)
+  {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+legacy::PassManager MPM;
+#else
+PassManager MPM;
+#endif
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 6
+MPM.add(new DataLayoutPass());
+#elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 5
+MPM.add(new DataLayoutPass(DL));
+#else
+MPM.add(new DataLayout(DL));
+#endif
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+MPM.add(new TargetLibraryInfoWrapperPass(*libraryInfo));
+#else
+MPM.add(new TargetLibraryInfo(*libraryInfo));
+#endif
+MPM.add(createIntrinsicLoweringPass());
+MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr 
before inlining.
+MPM.add(createFunctionInliningPass(2));
+//MPM.add(createAlwaysInlinerPass());
+MPM.add(createBarrierNodupPass(true));// restore noduplicate fnAttr 
after inlining.
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+// FIXME Workaround: we find that CustomLoopUnroll may increase register 
pressure greatly,
+// and it may even make som cl kernel cannot compile because of limited 
scratch memory for spill.
+// As we observe this under strict math. So we disable CustomLoopUnroll if 
strict math is enabled.
+if (!strictMath) {
+#if !defined(__ANDROID__)
+  MPM.add(createCustomLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll 
loops
+#endif
+  MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+  if(optLevel > 0) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
+MPM.add(createSROAPass());
+#else
+MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+#endif
+MPM.add(createGVNPass()); // Remove redundancies
+  }
+}
+#endif
+MPM.run(mod);
+  }
+
   void runModulePass(Module , TARGETLIBRARY *libraryInfo, const DataLayout 
, int optLevel, bool strictMath)
   {
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
@@ -146,9 +195,6 @@ namespace gbe
 MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
 MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
 MPM.add(createPruneEHPass()); // Remove dead EH info
-MPM.add(createBarrierNodupPass(false));   // remove noduplicate fnAttr 
before inlining.
-MPM.add(createFunctionInliningPass(2));
-MPM.add(createBarrierNodupPass(true));// restore noduplicate fnAttr 
after inlining.
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 9
 MPM.add(createPostOrderFunctionAttrsLegacyPass());
 #elif LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 8
@@ -344,6 +390,7 @@ namespace gbe
 OUTPUT_BITCODE(AFTER_LINK, mod);
 
 runFuntionPass(mod, libraryInfo, DL);
+runInlinePass(mod, libraryInfo, DL, optLevel, strictMath);
 runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
 legacy::PassManager passes;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Add read_imagef benchmark for optimization.

2016-09-13 Thread Yan Wang
On Mon, 2016-09-12 at 06:53 +, Yang, Rong R wrote:
> 
> > -Original Message-
> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On
> > Behalf Of
> > yan.w...@linux.intel.com
> > Sent: Monday, September 5, 2016 14:52
> > To: beignet@lists.freedesktop.org
> > Cc: Yan Wang <yan.w...@linux.intel.com>
> > Subject: [Beignet] [PATCH] Add read_imagef benchmark for
> > optimization.
> > 
> > From: Yan Wang <yan.w...@linux.intel.com>
> > 
> > ---
> >  benchmark/CMakeLists.txt |  1 +
> >  benchmark/benchmark_read_image_float.cpp | 65
> > 
> >  kernels/compiler_read_image_float.cl |  9 +
> >  3 files changed, 75 insertions(+)
> >  create mode 100644 benchmark/benchmark_read_image_float.cpp
> >  create mode 100644 kernels/compiler_read_image_float.cl
> > 
> > diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
> > index
> > f9b246b..556275e 100644
> > --- a/benchmark/CMakeLists.txt
> > +++ b/benchmark/CMakeLists.txt
> > @@ -15,6 +15,7 @@ set (benchmark_sources
> >benchmark_use_host_ptr_buffer.cpp
> >benchmark_read_buffer.cpp
> >benchmark_read_image.cpp
> > +  benchmark_read_image_float.cpp
> >benchmark_copy_buffer_to_image.cpp
> >benchmark_copy_image_to_buffer.cpp
> >benchmark_copy_buffer.cpp
> > diff --git a/benchmark/benchmark_read_image_float.cpp
> > b/benchmark/benchmark_read_image_float.cpp
> > new file mode 100644
> > index 000..b0c2fb4
> > --- /dev/null
> > +++ b/benchmark/benchmark_read_image_float.cpp
> > @@ -0,0 +1,65 @@
> > +#include 
> > +#include "utests/utest_helper.hpp"
> > +#include 
> > +
> > +#define NUM 400
> > +
> > +double benchmark_read_image_float(void) {
> > +  struct timeval start,stop;
> > +
> > +  const size_t w = 128;
> > +  const size_t h = 128;
> > +  const size_t sz = w * h;
> > +  cl_image_format format;
> > +  cl_image_desc desc;
> > +
> > +  memset(, 0x0, sizeof(cl_image_desc));  memset(, 0x0,
> > + sizeof(cl_image_format));
> > +
> > +  // Setup kernel and images
> > +  OCL_CREATE_KERNEL("compiler_read_image_float");
> > +  buf_data[0] = (uint32_t*) malloc(sizeof(float) * sz);  for
> > (uint32_t
> > + i = 0; i < sz; ++i) {
> > +((float*)buf_data[0])[i] = rand();
> > +  }
> > +
> > +  format.image_channel_order = CL_R;
> > +  format.image_channel_data_type = CL_FLOAT;  desc.image_type =
> > + CL_MEM_OBJECT_IMAGE2D;  desc.image_width = w;  desc.image_height
> > =
> > h;
> > + OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, , ,
> > + buf_data[0]);  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(float),
> > NULL);
> > +
> > +  free(buf_data[0]);
> > +  buf_data[0] = NULL;
> > +
> > +  // Run the kernel
> > +  OCL_SET_ARG(0, sizeof(cl_mem), [0]);  OCL_SET_ARG(1,
> > + sizeof(cl_mem), [1]);  OCL_SET_ARG(2, sizeof(cl_int), );
> > + globals[0] = w;  globals[1] = h;  locals[0] = 16;  locals[1] =
> > 16;
> > +
> > +  OCL_NDRANGE(2);
> > +  OCL_FINISH();
> > +
> > +  gettimeofday(,0);
> > +  for (size_t i=0; i<NUM; i++) {
> > +OCL_NDRANGE(2);
> > +  }
> > +  OCL_FINISH();
> > +  gettimeofday(,0);
> > +
> > +  free(buf_data[0]);
> > +  buf_data[0] = NULL;
> > +
> > +  double elapsed = time_subtract(, , 0);
> > +
> > +  return BANDWIDTH(sz * sizeof(float) * NUM, elapsed); }
> > +
> > +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_image_float,
> > "GB/S");
> > diff --git a/kernels/compiler_read_image_float.cl
> > b/kernels/compiler_read_image_float.cl
> > new file mode 100644
> > index 000..f581438
> > --- /dev/null
> > +++ b/kernels/compiler_read_image_float.cl
> > @@ -0,0 +1,9 @@
> > +__constant sampler_t sampler= CLK_NORMALIZED_COORDS_FALSE |
> > CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
> Do you really need CLK_FILTER_LINEAR?
> CLK_FILTER_LINEAR's image filter/cache's behavior is more complicated 
> than CLK_FILTER_NEAREST. For benchmark, I think CLK_FILTER_NEAREST is 
> better.
This benchmark is meant for comparison with the OpenCV optical flow cases, which use
CLK_FILTER_LINEAR. For now I think CLK_FILTER_NEAREST is OK too.
I can submit it again.
> 
> > +
> > +__kernel void compiler_read_image_float(__read_only image2d_t src,
> > +__global float* dst, int w) {
> > +int xi = get_global_id(0);
> > +int yi = get_global_id(1);
> > +float4 v = read_imagef(src, sampler, (float2)(xi, yi));
> > +*(dst + yi * w + xi) = v.x;
> > +}
> > --
> > 1.9.1
> > 
> > ___
> > Beignet mailing list
> > Beignet@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/beignet
> > 
> > ___
> > Beignet mailing list
> > Beignet@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Add read_imagef benchmark for optimization.

2016-09-05 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

---
 benchmark/CMakeLists.txt |  1 +
 benchmark/benchmark_read_image_float.cpp | 65 
 kernels/compiler_read_image_float.cl |  9 +
 3 files changed, 75 insertions(+)
 create mode 100644 benchmark/benchmark_read_image_float.cpp
 create mode 100644 kernels/compiler_read_image_float.cl

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index f9b246b..556275e 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -15,6 +15,7 @@ set (benchmark_sources
   benchmark_use_host_ptr_buffer.cpp
   benchmark_read_buffer.cpp
   benchmark_read_image.cpp
+  benchmark_read_image_float.cpp
   benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp
   benchmark_copy_buffer.cpp
diff --git a/benchmark/benchmark_read_image_float.cpp 
b/benchmark/benchmark_read_image_float.cpp
new file mode 100644
index 000..b0c2fb4
--- /dev/null
+++ b/benchmark/benchmark_read_image_float.cpp
@@ -0,0 +1,65 @@
+#include 
+#include "utests/utest_helper.hpp"
+#include 
+
+#define NUM 400
+
+double benchmark_read_image_float(void)
+{
+  struct timeval start,stop;
+
+  const size_t w = 128;
+  const size_t h = 128;
+  const size_t sz = w * h;
+  cl_image_format format;
+  cl_image_desc desc;
+
+  memset(, 0x0, sizeof(cl_image_desc));
+  memset(, 0x0, sizeof(cl_image_format));
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("compiler_read_image_float");
+  buf_data[0] = (uint32_t*) malloc(sizeof(float) * sz);
+  for (uint32_t i = 0; i < sz; ++i) {
+((float*)buf_data[0])[i] = rand();
+  }
+
+  format.image_channel_order = CL_R;
+  format.image_channel_data_type = CL_FLOAT;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+  desc.image_width = w;
+  desc.image_height = h;
+  OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, , , buf_data[0]);
+  OCL_CREATE_BUFFER(buf[1], 0, sz * sizeof(float), NULL);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), [0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), [1]);
+  OCL_SET_ARG(2, sizeof(cl_int), );
+  globals[0] = w;
+  globals[1] = h;
+  locals[0] = 16;
+  locals[1] = 16;
+
+  OCL_NDRANGE(2);
+  OCL_FINISH();
+
+  gettimeofday(,0);
+  for (size_t i=0; i<NUM; i++) {
+OCL_NDRANGE(2);
+  }
+  OCL_FINISH();
+  gettimeofday(,0);
+
+  free(buf_data[0]);
+  buf_data[0] = NULL;
+
+  double elapsed = time_subtract(, , 0);
+
+  return BANDWIDTH(sz * sizeof(float) * NUM, elapsed);
+}
+
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_read_image_float, "GB/S");
diff --git a/kernels/compiler_read_image_float.cl 
b/kernels/compiler_read_image_float.cl
new file mode 100644
index 000..f581438
--- /dev/null
+++ b/kernels/compiler_read_image_float.cl
@@ -0,0 +1,9 @@
+__constant sampler_t sampler= CLK_NORMALIZED_COORDS_FALSE | 
CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;
+
+__kernel void compiler_read_image_float(__read_only image2d_t src, __global 
float* dst, int w)
+{
+int xi = get_global_id(0);
+int yi = get_global_id(1);
+float4 v = read_imagef(src, sampler, (float2)(xi, yi));
+*(dst + yi * w + xi) = v.x;
+}
-- 
1.9.1

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Add cl_khr_3d_image_writes into info string.

2016-06-02 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

The extension is in fact supported, so list it in the info string to avoid misunderstanding.
---
 src/cl_extensions.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 349f2f1..183aafc 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -48,6 +48,8 @@ void check_opt1_extension(cl_extensions_t *extensions)
 #endif
 if (id == EXT_ID(khr_image2d_from_buffer))
   extensions->extensions[id].base.ext_enabled = 1;
+if (id == EXT_ID(khr_3d_image_writes))
+  extensions->extensions[id].base.ext_enabled = 1;
   }
 }
 
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Remove unnecessary assertion in printf processing.

2016-05-03 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

It causes an assertion failure when printf is given a long vector.
---
 backend/src/llvm/llvm_gen_backend.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index 51a1dab..7d21ebf 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -4558,7 +4558,7 @@ namespace gbe
 uint32_t srcElemNum = 0;
 Value *srcValue = I.getOperand(n + 1);
 ir::Type srcType = getVectorInfo(ctx, srcValue, srcElemNum);
-GBE_ASSERT(!(srcType == ir::TYPE_S64 || srcType == 
ir::TYPE_DOUBLE));
+GBE_ASSERT(!(srcType == ir::TYPE_DOUBLE));
 
 uint32_t elemID = 0;
 for (elemID = 0; elemID < srcElemNum; ++elemID) {
-- 
2.4.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Add condition checking of residuals because it may be NULL.

2016-03-28 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

---
 src/kernels/cl_internal_block_motion_estimate_intel.cl | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl 
b/src/kernels/cl_internal_block_motion_estimate_intel.cl
index 23c5488..e56520a 100644
--- a/src/kernels/cl_internal_block_motion_estimate_intel.cl
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -341,7 +341,8 @@ void block_motion_estimate_intel(accelerator_intel_t accel,
 mv_index = index * 1;
 if( lid_x == 0 ){
   motion_vector_buffer[mv_index] = mv[lid_x];
-  residuals[mv_index] = 2 * res[lid_x];
+  if(residuals)
+residuals[mv_index] = 2 * res[lid_x];
 }
   }
   //CL_ME_MB_TYPE_8x8_INTEL
@@ -350,7 +351,8 @@ void block_motion_estimate_intel(accelerator_intel_t accel,
   mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2;
   mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2);
   motion_vector_buffer[mv_index] = mv[lid_x];
-  residuals[mv_index] = 2 * res[lid_x];
+  if(residuals)
+residuals[mv_index] = 2 * res[lid_x];
 }
   }
   //CL_ME_MB_TYPE_4x4_INTEL
@@ -359,7 +361,8 @@ void block_motion_estimate_intel(accelerator_intel_t accel,
   mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4;
   mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4);
   motion_vector_buffer[mv_index] = mv[lid_x];
-  residuals[mv_index] = 2 * res[lid_x];
+  if(residuals)
+residuals[mv_index] = 2 * res[lid_x];
 }
   }
 
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] utest: do not check MV near image border

2016-03-19 Thread yan . wang
Now this case can pass when the previous test_printf case has multiple tests.
The VME engine seems to read data outside the specified image buffer, which is
backed by a drm bo.
If the drm bo of the src/ref image object happens to reuse a previous bo that
contains garbage, it will cause different MV results.

Yan Wang

> if the image width and height is not aligned, the VME hardware block
> could use the data out of the image, there is no clear rule defines
> the behavior of this case, so do not check the MVs near the border.
>
> Signed-off-by: Guo Yejun <yejun@intel.com>
> ---
>  utests/builtin_kernel_block_motion_estimate_intel.cpp | 18
> +++---
>  1 file changed, 11 insertions(+), 7 deletions(-)
>
> diff --git a/utests/builtin_kernel_block_motion_estimate_intel.cpp
> b/utests/builtin_kernel_block_motion_estimate_intel.cpp
> index 12bcb7d..008b27c 100644
> --- a/utests/builtin_kernel_block_motion_estimate_intel.cpp
> +++ b/utests/builtin_kernel_block_motion_estimate_intel.cpp
> @@ -48,7 +48,7 @@ void builtin_kernel_block_motion_estimate_intel(void)
>if (i >= 32 && i <= 47 && j >= 16 && j <= 31)
>  image_data2[w * j + i] = image_data1[w * j + i] = 100;
>else
> -image_data2[w * j + i] = image_data1[w * j + i] = 0;
> +image_data2[w * j + i] = image_data1[w * j + i] = 17;
>  }
>}
>
> @@ -61,8 +61,9 @@ void builtin_kernel_block_motion_estimate_intel(void)
>OCL_CREATE_IMAGE(buf[0], CL_MEM_COPY_HOST_PTR, , ,
> image_data1);//src
>OCL_CREATE_IMAGE(buf[1], CL_MEM_COPY_HOST_PTR, , ,
> image_data2);//ref
>
> -  const size_t mv = (80/16) * (48/16);
> -  OCL_CREATE_BUFFER(buf[2], 0, mv * sizeof(int) * 4, NULL);
> +  const size_t mv_w = (w + 15) / 16;
> +  const size_t mv_h = (h + 15) / 16;
> +  OCL_CREATE_BUFFER(buf[2], 0, mv_w * mv_h * sizeof(short) * 2, NULL);
>
>OCL_SET_ARG(0, sizeof(cl_accelerator_intel), );
>OCL_SET_ARG(1, sizeof(cl_mem), [0]);
> @@ -76,7 +77,7 @@ void builtin_kernel_block_motion_estimate_intel(void)
>OCL_CALL(clEnqueueNDRangeKernel, queue, kernel, 2, NULL, globals, NULL,
> 0, NULL, NULL);
>
>OCL_MAP_BUFFER(2);
> -  short expected[] = {-64, -48,
> +  short expected[] = {-64, -48,   //S13.2 fixed point value
>  -64, -48,
>  -64, -48,
>  -64, -48,
> @@ -92,9 +93,12 @@ void builtin_kernel_block_motion_estimate_intel(void)
>  0, -48,
>  -64, -48};
>short* res = (short*)buf_data[2];
> -  for (uint32_t j = 0; j < mv; ++j) {
> -OCL_ASSERT(res[j * 2 + 0] == expected[j * 2 + 0]);
> -OCL_ASSERT(res[j * 2 + 1] == expected[j * 2 + 1]);
> +  for (uint32_t j = 0; j < mv_h - 1; ++j) {
> +for (uint32_t i = 0; i < mv_w - 1; ++i) {
> +uint32_t index = j * mv_w * 2 + i * 2;
> +OCL_ASSERT(res[index + 0] == expected[index + 0]);
> +OCL_ASSERT(res[index + 1] == expected[index + 1]);
> +}
>}
>OCL_UNMAP_BUFFER(2);
>
> --
> 1.9.1
>
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/beignet
>

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [Printf v2][PATCH 07/12] Implement emission of printf instruction.

2016-02-04 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp | 95 +--
 1 file changed, 80 insertions(+), 15 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index dba9dba..4870285 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -486,6 +486,9 @@ namespace gbe
 typedef map>::iterator PtrOrigMapIter;
 // map pointer source to bti
 map BtiMap;
+// map printf pointer source to bti
+int printfBti;
+uint32_t printfNum;
 // map ptr to its bti register
 map BtiValueMap;
 // map ptr to it's base
@@ -520,6 +523,8 @@ namespace gbe
 unit(unit),
 ctx(unit),
 regTranslator(ctx),
+printfBti(-1),
+printfNum(0),
 LI(0),
 TheModule(0),
 btiBase(BTI_RESERVED_NUM),
@@ -586,6 +591,7 @@ namespace gbe
   addrStoreInst.clear();
   // Reset for next function
   btiBase = BTI_RESERVED_NUM;
+  printfBti = -1;
   return false;
 }
 /*! Given a possible pointer value, find out the interested escape like
@@ -594,7 +600,7 @@ namespace gbe
 /*! For all possible pointers, GlobalVariable, function pointer argument,
 alloca instruction, find their pointer escape points */
 void analyzePointerOrigin(Function );
-unsigned getNewBti(Value *origin, bool isImage);
+unsigned getNewBti(Value *origin, bool force);
 void assignBti(Function );
 bool isSingleBti(Value *Val);
 Value *getBtiRegister(Value *v);
@@ -717,11 +723,10 @@ namespace gbe
 // handle load of dword/qword with unaligned address
 void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, 
ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool 
fixedBTI);
 void visitInstruction(Instruction ) {NOT_SUPPORTED;}
-void* getPrintfInfo(CallInst* inst)
-{
-  if ([inst])
-return (void*)[inst];
-  return NULL;
+ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
+  if (unit.printfs.find(inst) == unit.printfs.end())
+return NULL;
+  return [inst];
 }
 private:
   void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug 
infomation in context for subsequently passing to Gen insn
@@ -1127,21 +1132,15 @@ namespace gbe
 }
   }
 
-  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
+  unsigned GenWriter::getNewBti(Value *origin, bool force) {
 unsigned new_bti = 0;
-if (isImage) {
+if (force) {
   new_bti = btiBase;
   incBtiBase();
   return new_bti;
 }
 
-if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
-  new_bti = btiBase;
-  incBtiBase();
-} else if 
(origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
-  new_bti = btiBase;
-  incBtiBase();
-} else if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) 
{
+if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
   new_bti = btiBase;
   incBtiBase();
 }
@@ -3716,6 +3715,16 @@ namespace gbe
 this->newRegister();
 break;
   case GEN_OCL_PRINTF:
+this->newRegister();  // fall through
+  case GEN_OCL_PUTS:
+  {
+ // We need a new BTI as printf output.
+ if (printfBti < 0) {
+   printfBti = this->getNewBti(, true);
+   ctx.getFunction().getPrintfSet()->setBufBTI(printfBti);
+ }
+ break;
+  }
   case GEN_OCL_CALC_TIMESTAMP:
   case GEN_OCL_STORE_PROFILING:
   case GEN_OCL_DEBUGWAIT:
@@ -4527,6 +4536,62 @@ namespace gbe
 
   case GEN_OCL_PRINTF:
   {
+ir::PrintfSet::PrintfFmt* fmt = getPrintfInfo();
+if (fmt == NULL)
+  break;
+
+ctx.getFunction().getPrintfSet()->append(printfNum, fmt);
+
+vector tupleData;
+vector tupleTypeData;
+int argNum = static_cast(I.getNumOperands());
+argNum -= 2; // no fmt and last NULL.
+int realArgNum = argNum;
+
+for (int n = 0; n < argNum; n++) {
+  /* First, ignore %s, the strings are recorded and not passed to 
GPU. */
+  llvm::Constant* args = 
dyn_cast(I.getOperand(n + 1));
+  llvm::Constant* args_ptr = NULL;
+  if (args)
+args_ptr = dyn_cast(args->getOperand(0));
+
+  if (args_ptr) {
+ConstantDataSequential* fmt_arg = 
dyn_cast(args_ptr->getOperand(0));
+if (fmt_arg && fmt_arg->isCString()) {
+  realArgNum--;
+  continue;
+}
+   

Re: [Beignet] [Printf v2][PATCH 07/12] Add the implementation of printf ir instruction.

2016-02-04 Thread yan . wang
Sorry. I have re-sent 7/12.

Yan Wang

> patch of 06 and 07 have the same title?
> I think it is a typo here.
> Please correct it.
> All the other things are OK, just rename this one and
> the whole patchset can be pushed later.
>
> Also can push my patch about printf test cases together.
>
> On Mon, Feb 01, 2016 at 03:42:16PM +0800, yan.w...@linux.intel.com wrote:
>> Date: Mon,  1 Feb 2016 15:42:16 +0800
>> From: yan.w...@linux.intel.com
>> To: beignet@lists.freedesktop.org
>> Cc: Yan Wang <yan.w...@linux.intel.com>
>> Subject: [Beignet] [Printf v2][PATCH 07/12] Add the implementation of
>>  printf ir instruction.
>> X-Mailer: git-send-email 2.5.0
>>
>> From: Yan Wang <yan.w...@linux.intel.com>
>>
>> Contributor: Junyan He <junyan...@linux.intel.com>
>> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
>> ---
>>  backend/src/llvm/llvm_gen_backend.cpp | 95
>> +--
>>  1 file changed, 80 insertions(+), 15 deletions(-)
>>
>> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
>> b/backend/src/llvm/llvm_gen_backend.cpp
>> index dba9dba..4870285 100644
>> --- a/backend/src/llvm/llvm_gen_backend.cpp
>> +++ b/backend/src/llvm/llvm_gen_backend.cpp
>> @@ -486,6 +486,9 @@ namespace gbe
>>  typedef map>::iterator
>> PtrOrigMapIter;
>>  // map pointer source to bti
>>  map BtiMap;
>> +// map printf pointer source to bti
>> +int printfBti;
>> +uint32_t printfNum;
>>  // map ptr to its bti register
>>  map BtiValueMap;
>>  // map ptr to it's base
>> @@ -520,6 +523,8 @@ namespace gbe
>>  unit(unit),
>>  ctx(unit),
>>  regTranslator(ctx),
>> +printfBti(-1),
>> +printfNum(0),
>>  LI(0),
>>  TheModule(0),
>>  btiBase(BTI_RESERVED_NUM),
>> @@ -586,6 +591,7 @@ namespace gbe
>>addrStoreInst.clear();
>>// Reset for next function
>>btiBase = BTI_RESERVED_NUM;
>> +  printfBti = -1;
>>return false;
>>  }
>>  /*! Given a possible pointer value, find out the interested escape
>> like
>> @@ -594,7 +600,7 @@ namespace gbe
>>  /*! For all possible pointers, GlobalVariable, function pointer
>> argument,
>>  alloca instruction, find their pointer escape points */
>>  void analyzePointerOrigin(Function );
>> -unsigned getNewBti(Value *origin, bool isImage);
>> +unsigned getNewBti(Value *origin, bool force);
>>  void assignBti(Function );
>>  bool isSingleBti(Value *Val);
>>  Value *getBtiRegister(Value *v);
>> @@ -717,11 +723,10 @@ namespace gbe
>>  // handle load of dword/qword with unaligned address
>>  void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues,
>> ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool
>> dwAligned, bool fixedBTI);
>>  void visitInstruction(Instruction ) {NOT_SUPPORTED;}
>> -void* getPrintfInfo(CallInst* inst)
>> -{
>> -  if ([inst])
>> -return (void*)[inst];
>> -  return NULL;
>> +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
>> +  if (unit.printfs.find(inst) == unit.printfs.end())
>> +return NULL;
>> +  return [inst];
>>  }
>>  private:
>>void setDebugInfo_CTX(llvm::Instruction * insn); // store the
>> debug infomation in context for subsequently passing to Gen insn
>> @@ -1127,21 +1132,15 @@ namespace gbe
>>  }
>>}
>>
>> -  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
>> +  unsigned GenWriter::getNewBti(Value *origin, bool force) {
>>  unsigned new_bti = 0;
>> -if (isImage) {
>> +if (force) {
>>new_bti = btiBase;
>>incBtiBase();
>>return new_bti;
>>  }
>>
>> -if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
>> -  new_bti = btiBase;
>> -  incBtiBase();
>> -} else if
>> (origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
>> -  new_bti = btiBase;
>> -  incBtiBase();
>> -} else if
>> (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
>> +if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf")))
>> {
>>new_bti = btiBase;
>>incBtiBase();
>>  }
>> @@ -3716,6

Re: [Beignet] [PATCH] Fix type assert error generated by lstPartSum incorrect type

2016-02-03 Thread yan . wang
After applying this patch, the benchmark for the workgroup add optimization
runs on my BSW platform.
Thanks.

Yan Wang

> Signed-off-by: Grigore Lupescu <grigore.lupe...@intel.com>
> ---
>  backend/src/backend/gen_insn_selection.cpp | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index 5eccfc6..0b9fe45 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -6241,10 +6241,10 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>  GenRegister result = sel.selReg(sel.reg(FAMILY_WORD), type);
>
>  vector lstPartSum;
> -lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32));
> -lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32));
> -lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32));
> -lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32));
> +lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
> +lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
> +lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
> +lstPartSum.push_back(sel.selReg(sel.reg(FAMILY_DWORD), type));
>  vector fakeTemps;
>  fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_WORD), type));
>  fakeTemps.push_back(sel.selReg(sel.reg(FAMILY_WORD), type));
> --
> 2.5.0
>
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
>

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [Printf v2][PATCH 03/12] Reconstruct printf parser.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/unit.cpp |   1 -
 backend/src/ir/unit.hpp |   2 +-
 backend/src/llvm/llvm_gen_backend.cpp   |   4 +-
 backend/src/llvm/llvm_printf_parser.cpp | 115 +++-
 4 files changed, 56 insertions(+), 66 deletions(-)

diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index a350c60..5604244 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -34,7 +34,6 @@ namespace ir {
   Unit::~Unit(void) {
 for (const auto  : functions) GBE_DELETE(pair.second);
 delete profilingInfo;
-for (const auto  : printfs) GBE_DELETE(pair.second);
   }
   Function *Unit::getFunction(const std::string ) const {
 auto it = functions.find(name);
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index 10a1af6..9b9e41f 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -47,7 +47,7 @@ namespace ir {
   public:
 typedef map<std::string, Function*> FunctionSet;
 /*! Moved from printf pass */
-map<llvm::CallInst*, PrintfSet::PrintfFmt*> printfs;
+map<llvm::CallInst*, PrintfSet::PrintfFmt> printfs;
 /*! Create an empty unit */
 Unit(PointerSize pointerSize = POINTER_32_BITS);
 /*! Release everything (*including* the function pointers) */
diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index dec023c..dba9dba 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -719,8 +719,8 @@ namespace gbe
 void visitInstruction(Instruction ) {NOT_SUPPORTED;}
 void* getPrintfInfo(CallInst* inst)
 {
-  if (unit.printfs[inst])
-return (void*)unit.printfs[inst];
+  if ([inst])
+return (void*)[inst];
   return NULL;
 }
 private:
diff --git a/backend/src/llvm/llvm_printf_parser.cpp 
b/backend/src/llvm/llvm_printf_parser.cpp
index 1c88981..c9ec85f 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -293,41 +293,21 @@ error:
   public:
 static char ID;
 typedef std::pair<Instruction*, bool> PrintfInst;
-std::vector deadprintfs;
 Module* module;
 IRBuilder<>* builder;
 Type* intTy;
-Value* pbuf_ptr;
-Value* index_buf_ptr;
-Value* g1Xg2Xg3;
-Value* wg_offset;
-int out_buf_sizeof_offset;
 ir::Unit 
-int printf_num;
-int totalSizeofSize;
-
-struct PrintfParserInfo {
-  llvm::CallInst* call;
-  PrintfSet::PrintfFmt* printf_fmt;
-};
 
 PrintfParser(ir::Unit ) : FunctionPass(ID),
-unit(unit)
+  unit(unit)
 {
   module = NULL;
   builder = NULL;
   intTy = NULL;
-  out_buf_sizeof_offset = 0;
-  pbuf_ptr = NULL;
-  index_buf_ptr = NULL;
-  g1Xg2Xg3 = NULL;
-  wg_offset = NULL;
-  printf_num = 0;
-  totalSizeofSize = 0;
 }
 
-bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, 
int& sizeof_size);
-bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& 
dst_type, int& sizeof_size);
+bool parseOnePrintfInstruction(CallInst * call);
+bool generateOneParameterInst(PrintfSlot& slot, Value* arg, Value*& 
new_arg);
 
 virtual const char *getPassName() const
 {
@@ -337,7 +317,7 @@ error:
 virtual bool runOnFunction(llvm::Function );
   };
 
-  bool PrintfParser::parseOnePrintfInstruction(CallInst * call, 
PrintfParserInfo& info, int& sizeof_size)
+  bool PrintfParser::parseOnePrintfInstruction(CallInst * call)
   {
 CallSite CS(call);
 CallSite::arg_iterator CI_FMT = CS.arg_begin();
@@ -355,20 +335,50 @@ error:
 }
 
 std::string fmt = fmt_arg->getAsCString();
+if (fmt.size() == 0)
+  return false;
 
 PrintfSet::PrintfFmt* printf_fmt = NULL;
 
 if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) 
{//at lease print something
+  printf("Warning: Parse the printf inst %s failed, no output for it\n", 
fmt.c_str());
   return false;
 }
 
 /* iff parameter more than %, error. */
 /* str_fmt arg0 arg1 ... NULL */
-if (param_num + 2 < static_cast(call->getNumOperands())) {
+if (param_num + 2 != static_cast(call->getNumOperands())) {
   delete printf_fmt;
+  printf("Warning: Parse the printf inst %s failed, parameters do not 
match the %% number, no output for it\n",
+ fmt.c_str());
   return false;
 }
 
+/* Insert some conversion if types do not match. */
+builder->SetInsertPoint(call);
+int i = 1;
+for (auto  : *printf_fmt) {
+  if (s.type == PRINTF_SLOT_TYPE_STRING)
+continue;
+
+  assert(i < static_cast(call->getNumOperands()) - 1);
+  

[Beignet] [Printf v2][PATCH 04/12] Add LLVM function definition of printf.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_gen_ocl_function.hxx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx 
b/backend/src/llvm/llvm_gen_ocl_function.hxx
index e3d89a3..dd7816c 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -171,7 +171,8 @@ DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
 
 // printf function
-DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
+DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf_stub)
+DECL_LLVM_GEN_FUNCTION(PUTS, __gen_ocl_puts_stub)
 
 // store timestamp function
 DECL_LLVM_GEN_FUNCTION(CALC_TIMESTAMP, __gen_ocl_calc_timestamp)
-- 
2.4.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [Printf v2][PATCH 11/12] Output printf result.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/printf.cpp | 122 +-
 backend/src/ir/printf.hpp |   2 +-
 2 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index 19daa19..7ca127d 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -35,7 +35,7 @@ namespace gbe
 static void generatePrintfFmtString(PrintfState& state, std::string& str)
 {
   char num_str[16];
-  str += "%";
+  str = "%";
 
   if (state.left_justified) {
 str += "-";
@@ -87,21 +87,121 @@ namespace gbe
 #define PRINT_SOMETHING(target_ty, conv)  do {  \
   if (!vec_i)   \
 pf_str = pf_str + std::string(#conv);   \
-  char *ptr = ((char *)buf_addr + sizeOfSize * global_wk_sz0 * 
global_wk_sz1 * global_wk_sz2 * n \
-   + slot.state->out_buf_sizeof_offset *\
-   global_wk_sz0 * global_wk_sz1 * global_wk_sz2);  \
-  target_ty* obj_ptr = ((target_ty *)ptr) + (k*global_wk_sz0*global_wk_sz1 
+ j*global_wk_sz0 + i) * vec_num + vec_i; \
-  if ((char *)obj_ptr + sizeof(target_ty) > (char *)buf_addr + output_sz) 
{\
-printf("\n\n!!!The printf message is out of range because of the 
limited buffer, ignore.\n"); \
-return; \
-  } \
-  printf(pf_str.c_str(),  *obj_ptr);\
+  printf(pf_str.c_str(), log.getData()); \
 } while (0)
 
+static void printOutOneStatement(PrintfSet::PrintfFmt& fmt, PrintfLog& log)
+{
+  std::string pf_str = "";
+  for (auto& slot : fmt) {
+if (slot.type == PRINTF_SLOT_TYPE_STRING) {
+  printf("%s", slot.str.c_str());
+  continue;
+}
+assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+
+generatePrintfFmtString(slot.state, pf_str);
+
+int vec_num;
+vec_num = slot.state.vector_n > 0 ? slot.state.vector_n : 1;
+
+for (int vec_i = 0; vec_i < vec_num; vec_i++) {
+  if (vec_i)
+printf(",");
+
+  switch (slot.state.conversion_specifier) {
+case PRINTF_CONVERSION_D:
+case PRINTF_CONVERSION_I:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, d);
+  else
+PRINT_SOMETHING(int, d);
+  break;
+
+case PRINTF_CONVERSION_O:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, o);
+  else
+PRINT_SOMETHING(int, o);
+  break;
+case PRINTF_CONVERSION_U:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, u);
+  else
+PRINT_SOMETHING(int, u);
+  break;
+case PRINTF_CONVERSION_X:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, X);
+  else
+PRINT_SOMETHING(int, X);
+  break;
+case PRINTF_CONVERSION_x:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, x);
+  else
+PRINT_SOMETHING(int, x);
+  break;
+
+case PRINTF_CONVERSION_C:
+  PRINT_SOMETHING(char, c);
+  break;
+
+case PRINTF_CONVERSION_F:
+  PRINT_SOMETHING(float, F);
+  break;
+case PRINTF_CONVERSION_f:
+  PRINT_SOMETHING(float, f);
+  break;
+case PRINTF_CONVERSION_E:
+  PRINT_SOMETHING(float, E);
+  break;
+case PRINTF_CONVERSION_e:
+  PRINT_SOMETHING(float, e);
+  break;
+case PRINTF_CONVERSION_G:
+  PRINT_SOMETHING(float, G);
+  break;
+case PRINTF_CONVERSION_g:
+  PRINT_SOMETHING(float, g);
+  break;
+case PRINTF_CONVERSION_A:
+  PRINT_SOMETHING(float, A);
+  break;
+case PRINTF_CONVERSION_a:
+  PRINT_SOMETHING(float, a);
+  break;
+case PRINTF_CONVERSION_P:
+  PRINT_SOMETHING(int, p);
+  break;
+
+case PRINTF_CONVERSION_S:
+  pf_str = pf_str + "s";
+  printf(pf_s

[Beignet] [Printf v2][PATCH 01/12] Change printf data structure and remove old code.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/program.cpp |  14 --
 backend/src/backend/program.hpp |  10 +-
 backend/src/gbe_bin_interpreter.cpp |   2 -
 backend/src/ir/printf.cpp   | 168 -
 backend/src/ir/printf.hpp   |  76 +++-
 backend/src/llvm/llvm_gen_backend.cpp   |  13 +-
 backend/src/llvm/llvm_printf_parser.cpp | 313 ++--
 7 files changed, 36 insertions(+), 560 deletions(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 7d12f73..4eca9f1 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -1265,24 +1265,12 @@ namespace gbe {
 return ps->getBufBTI();
   }
 
-  static uint8_t kernelGetPrintfIndexBufBTI(void * printf_info) {
-if (printf_info == NULL) return 0;
-const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
-return ps->getIndexBufBTI();
-  }
-
   static void kernelReleasePrintfSet(void * printf_info) {
 if (printf_info == NULL) return;
 ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
 delete ps;
   }
 
-  static uint32_t kernelGetPrintfSizeOfSize(void * printf_info) {
-if (printf_info == NULL) return 0;
-const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
-return ps->getPrintfSizeOfSize();
-  }
-
   static void kernelOutputPrintf(void * printf_info, void* index_addr,
  void* buf_addr, size_t global_wk_sz0,
  size_t global_wk_sz1, size_t global_wk_sz2,
@@ -1426,9 +1414,7 @@ namespace gbe
   gbe_dup_profiling = gbe::kernelDupProfiling;
   gbe_output_profiling = gbe::kernelOutputProfiling;
   gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
-  gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
   gbe_dup_printfset = gbe::kernelDupPrintfSet;
-  gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
   gbe_release_printf_info = gbe::kernelReleasePrintfSet;
   gbe_output_printf = gbe::kernelOutputPrintf;
   genSetupCallBacks();
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 775e560..e5c4b95 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -152,10 +152,7 @@ namespace gbe {
 void setPrintfSet(ir::PrintfSet * from) {
   printfSet = from;
 }
-/* ! Return the offset in the sizeof(xxx). */
-uint32_t getPrintfSizeOfSize(void) const {
-  return printfSet ? printfSet->getPrintfSizeOfSize() : 0;
-}
+
 uint32_t getPrintfNum() const {
   return printfSet ? printfSet->getPrintfNum() : 0;
 }
@@ -169,11 +166,6 @@ namespace gbe {
   return printfSet->getBufBTI();
 }
 
-uint8_t getPrintfIndexBufBTI() const {
-  GBE_ASSERT(printfSet);
-  return printfSet->getIndexBufBTI();
-}
-
 void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
   size_t global_wk_sz1, size_t global_wk_sz2, size_t 
output_sz) {
   if(printfSet)
diff --git a/backend/src/gbe_bin_interpreter.cpp 
b/backend/src/gbe_bin_interpreter.cpp
index 4756842..34d04dd 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -70,9 +70,7 @@ struct BinInterpCallBackInitializer
 gbe_output_profiling = gbe::kernelOutputProfiling;
 gbe_get_printf_num = gbe::kernelGetPrintfNum;
 gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
-gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
 gbe_dup_printfset = gbe::kernelDupPrintfSet;
-gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
 gbe_release_printf_info = gbe::kernelReleasePrintfSet;
 gbe_output_printf = gbe::kernelOutputPrintf;
   }
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index 2e08248..3873ca9 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -32,43 +32,6 @@ namespace gbe
 
 pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
 
-PrintfSlot::~PrintfSlot(void)
-{
-if (ptr)
-{
-  if (type == PRINTF_SLOT_TYPE_STRING) {
-free(ptr);
-ptr = NULL;
-  } else if (type == PRINTF_SLOT_TYPE_STATE) {
-delete state;
-state = NULL;
-  } else {
-type = PRINTF_SLOT_TYPE_NONE;
-ptr = NULL;
-  }
-}
-}
-
-uint32_t PrintfSet::append(PrintfFmt* fmt, Unit& unit)
-{
-  fmts.push_back(*fmt);
-  vector& vp = fmts.back().first;
-
-  for (vector::iterator f = vp.begin(); f !=  vp.end(); ++f) {
-if (f->type == PRINTF_SLOT_TYPE_STRING)
-  continue;
-
-slots.push_back(*f);
-  }
-
-  /* Update the total size of size. *

[Beignet] [Printf v2][PATCH 02/12] Add PrintfLog structure.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/printf.hpp | 25 +
 1 file changed, 25 insertions(+)

diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index def6331..6b2b741 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -144,6 +144,31 @@ namespace gbe
   }
 };
 
+struct PrintfLog {
+  uint32_t magic;  // 0xAABBCCDD as magic for ASSERT.
+  uint32_t size;  // Size of this printf log, include header.
+  uint32_t statementNum; // which printf within one kernel.
+  const char* content;
+
+  PrintfLog(const char* p) {
+GBE_ASSERT(*((uint32_t *)p) == 0xAABBCCDD);
+magic = *((uint32_t *)p);
+p += sizeof(uint32_t);
+size = *((uint32_t *)p);
+p += sizeof(uint32_t);
+statementNum = *((uint32_t *)p);
+p += sizeof(uint32_t);
+content = p;
+  }
+
+  template 
+  T getData(void) {
+T D = *((T *)content);
+content += sizeof(T);
+return D;
+  }
+};
+
 class Context;
 
 class PrintfSet //: public Serializable
-- 
2.4.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [Printf v2][PATCH 05/12] Add tuple processing logic for printf.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/context.hpp  | 5 +
 backend/src/ir/function.hpp | 8 
 2 files changed, 13 insertions(+)

diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index b95741f..877d639 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -149,6 +149,11 @@ namespace ir {
   GBE_ASSERTM(fn != NULL, "No function currently defined");
   return fn->file.appendArrayTuple(reg, regNum);
 }
+/*! Make a tuple from an array of types */
+INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) {
+  GBE_ASSERTM(fn != NULL, "No function currently defined");
+  return fn->file.appendArrayTypeTuple((uint8_t*)type, num);
+}
 /*! We just use variadic templates to forward instruction functions */
 #define DECL_INSN(NAME, FAMILY) \
 template  INLINE void NAME(Args...args);
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 78250cf..5785bee 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -341,6 +341,14 @@ namespace ir {
 INLINE void setRegister(Tuple ID, uint32_t which, Register reg) {
   file.set(ID, which, reg);
 }
+/*! Get the type from the tuple vector */
+INLINE uint8_t getType(Tuple ID, uint32_t which) const {
+  return file.getType(ID, which);
+}
+/*! Set the type into the tuple vector */
+INLINE void setType(Tuple ID, uint32_t which, uint8_t type) {
+  file.setType(ID, which, type);
+}
 /*! Get the register file */
 INLINE const RegisterFile (void) const { return file; }
 /*! Get the given value ie immediate from the function */
-- 
2.4.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [Printf v2][PATCH 09/12] Implement ASM generation of printf.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/gen8_context.cpp | 36 +++
 backend/src/backend/gen8_context.hpp |  1 +
 backend/src/backend/gen_context.cpp  | 70 
 backend/src/backend/gen_context.hpp  |  1 +
 4 files changed, 108 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 7455bfc..2e76f53 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1264,6 +1264,42 @@ namespace gbe
 p->ADD(dst, dst, res);
   }
 
+  void Gen8Context::emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data,
+ GenRegister& src, uint32_t bti) {
+GenRegister tempSrc, tempDst;
+GenRegister nextSrc, nextDst;
+p->push();
+  tempSrc = GenRegister::h2(GenRegister::retype(src, GEN_TYPE_UD));
+  tempDst = GenRegister::retype(data, GEN_TYPE_UD);
+  p->curr.execWidth = 8;
+  p->curr.quarterControl = GEN_COMPRESSION_Q1;
+  p->MOV(tempDst, tempSrc);
+
+  p->curr.quarterControl = GEN_COMPRESSION_Q2;
+  nextSrc = GenRegister::Qn(tempSrc, 1);
+  nextDst = GenRegister::Qn(tempDst, 1);
+  p->MOV(nextDst, nextSrc);
+p->pop();
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+
+p->push();
+  tempSrc = GenRegister::h2(
+GenRegister::retype(GenRegister::offset(src, 0, 4), GEN_TYPE_UD));
+  tempDst = GenRegister::retype(data, GEN_TYPE_UD);
+  p->curr.execWidth = 8;
+  p->curr.quarterControl = GEN_COMPRESSION_Q1;
+  p->MOV(tempDst, tempSrc);
+
+  p->curr.quarterControl = GEN_COMPRESSION_Q2;
+  nextSrc = GenRegister::Qn(tempSrc, 1);
+  nextDst = GenRegister::Qn(tempDst, 1);
+  p->MOV(nextDst, nextSrc);
+p->pop();
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+  }
+
   void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int 
sz) {
 if (sz == 0)
   sz = 16;
diff --git a/backend/src/backend/gen8_context.hpp 
b/backend/src/backend/gen8_context.hpp
index cc415c6..2e6eae5 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -82,6 +82,7 @@ namespace gbe
 virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, 
int sz = 0);
 virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
 virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
+virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data, GenRegister& src, uint32_t bti);
 virtual GenEncoder* generateEncoder(void) {
   return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
 }
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 7807289..8acf67d 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3148,7 +3148,77 @@ do { \
   p->patchJMPI(oneThreadJip, (p->n_instruction() - oneThreadJip), 0);
   }
 
+  void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data,
+ GenRegister& src, uint32_t bti) {
+p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.bottom_half());
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+
+p->MOV(GenRegister::retype(data, GEN_TYPE_UD), 
src.top_half(this->simdWidth));
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+  }
+
   void GenContext::emitPrintfInstruction(const SelectionInstruction ) {
+const GenRegister dst = ra->genReg(insn.dst(0));
+const GenRegister tmp0 = ra->genReg(insn.dst(1));
+GenRegister src;
+uint32_t srcNum = insn.srcNum;
+if (insn.extra.continueFlag)
+  srcNum--;
+
+GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
+GenRegister data = GenRegister::offset(addr, 2);
+
+if (!insn.extra.continueFlag) {
+  p->push(); {
+p->curr.predicate = GEN_PREDICATE_NONE;
+p->curr.noMask = 1;
+//ptr[0] is the total count of the log size.
+p->MOV(addr, GenRegister::immud(0));
+p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
+  } p->pop();
+
+  p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, 
GenRegister::immud(insn.extra.printfBTI), 2);
+  /* Write out the header. */
+  p->MOV(data, GenRegister::immud(

[Beignet] [Printf v2][PATCH 10/12] Implement printf buffer management.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/program.cpp | 10 +
 backend/src/backend/program.h   | 12 +-
 backend/src/backend/program.hpp |  7 
 backend/src/ir/printf.cpp   |  3 +-
 backend/src/ir/printf.hpp   |  3 +-
 backend/src/ir/profile.cpp  |  3 --
 backend/src/ir/profile.hpp  | 20 +-
 src/cl_command_queue.c  | 21 +++
 src/cl_command_queue_gen7.c | 36 ++
 src/cl_driver.h | 12 +++---
 src/cl_gbe_loader.cpp   | 10 -
 src/cl_gbe_loader.h |  2 -
 src/intel/intel_gpgpu.c | 82 ++---
 src/intel/intel_gpgpu.h |  4 +-
 14 files changed, 58 insertions(+), 167 deletions(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 4eca9f1..b3c3229 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -1271,15 +1271,11 @@ namespace gbe {
 delete ps;
   }
 
-  static void kernelOutputPrintf(void * printf_info, void* index_addr,
- void* buf_addr, size_t global_wk_sz0,
- size_t global_wk_sz1, size_t global_wk_sz2,
- size_t output_sz)
+  static void kernelOutputPrintf(void * printf_info, void* buf_addr)
   {
 if (printf_info == NULL) return;
 ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
-ps->outputPrintf(index_addr, buf_addr, global_wk_sz0,
- global_wk_sz1, global_wk_sz2, output_sz);
+ps->outputPrintf(buf_addr);
   }
 
   static void kernelGetCompileWorkGroupSize(gbe_kernel gbeKernel, size_t 
wg_size[3]) {
@@ -1363,9 +1359,7 @@ GBE_EXPORT_SYMBOL gbe_get_profiling_bti_cb 
*gbe_get_profiling_bti = NULL;
 GBE_EXPORT_SYMBOL gbe_get_printf_num_cb *gbe_get_printf_num = NULL;
 GBE_EXPORT_SYMBOL gbe_dup_printfset_cb *gbe_dup_printfset = NULL;
 GBE_EXPORT_SYMBOL gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti = NULL;
-GBE_EXPORT_SYMBOL gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti 
= NULL;
 GBE_EXPORT_SYMBOL gbe_release_printf_info_cb *gbe_release_printf_info = NULL;
-GBE_EXPORT_SYMBOL gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size = 
NULL;
 GBE_EXPORT_SYMBOL gbe_output_printf_cb *gbe_output_printf = NULL;
 
 #ifdef GBE_COMPILER_AVAILABLE
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 45805f9..db770a6 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -97,8 +97,6 @@ enum gbe_curbe_type {
   GBE_CURBE_GROUP_NUM_Z,
   GBE_CURBE_WORK_DIM,
   GBE_CURBE_IMAGE_INFO,
-  GBE_CURBE_PRINTF_BUF_POINTER,
-  GBE_CURBE_PRINTF_INDEX_POINTER,
   GBE_CURBE_KERNEL_ARGUMENT,
   GBE_CURBE_EXTRA_ARGUMENT,
   GBE_CURBE_BLOCK_IP,
@@ -163,9 +161,6 @@ extern gbe_get_printf_num_cb *gbe_get_printf_num;
 typedef uint8_t (gbe_get_printf_buf_bti_cb)(void* printf_info);
 extern gbe_get_printf_buf_bti_cb *gbe_get_printf_buf_bti;
 
-typedef uint8_t (gbe_get_printf_indexbuf_bti_cb)(void* printf_info);
-extern gbe_get_printf_indexbuf_bti_cb *gbe_get_printf_indexbuf_bti;
-
 /*! Release the printfset */
 typedef void (gbe_release_printf_info_cb)(void* printf_info);
 extern gbe_release_printf_info_cb *gbe_release_printf_info;
@@ -174,12 +169,7 @@ extern gbe_release_printf_info_cb *gbe_release_printf_info;
 typedef void* (gbe_dup_printfset_cb)(gbe_kernel gbeKernel);
 extern gbe_dup_printfset_cb *gbe_dup_printfset;
 
-/*! Get the printf buffer const offset */
-typedef uint32_t (gbe_get_printf_sizeof_size_cb)(void* printf_info);
-extern gbe_get_printf_sizeof_size_cb *gbe_get_printf_sizeof_size;
-
-typedef void (gbe_output_printf_cb) (void* printf_info, void* index_addr, 
void* buf_addr,
-  size_t global_wk_sz0, size_t global_wk_sz1, size_t 
global_wk_sz2, size_t outbuf_sz);
+typedef void (gbe_output_printf_cb) (void* printf_info, void* buf_addr);
 extern gbe_output_printf_cb* gbe_output_printf;
 
 /*! Create a new program from the given source code (zero terminated string) */
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index e5c4b95..9570806 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -166,13 +166,6 @@ namespace gbe {
   return printfSet->getBufBTI();
 }
 
-void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
-  size_t global_wk_sz1, size_t global_wk_sz2, size_t 
output_sz) {
-  if(printfSet)
-printfSet->outputPrintf(index_addr, buf_addr, global_wk_sz0,
-global_wk_sz1, global_wk_sz2, output_sz);
-}
-
 uint32_t getProfilingBufBTI() const {
   GBE_ASSERT(profilingInfo);
   return profilingInfo->getBTI();
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/

[Beignet] [Printf v2][PATCH 06/12] Add the implementation of printf ir instruction.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/instruction.cpp | 57 +-
 backend/src/ir/instruction.hpp | 13 ++
 backend/src/ir/instruction.hxx |  1 +
 backend/src/ir/register.cpp|  8 ++
 backend/src/ir/register.hpp| 21 
 5 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index bb5aac5..652c1fb 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -994,6 +994,40 @@ namespace ir {
 Register dst[1];
 };
 
+class ALIGNED_INSTRUCTION PrintfInstruction :
+  public BasePolicy,
+  public TupleSrcPolicy,
+  public NDstPolicy<PrintfInstruction, 1>
+{
+  public:
+INLINE PrintfInstruction(Register dst, Tuple srcTuple, Tuple typeTuple,
+ uint8_t srcNum, uint8_t bti, uint16_t num) {
+  this->opcode = OP_PRINTF;
+  this->dst[0] = dst;
+  this->src = srcTuple;
+  this->type = typeTuple;
+  this->srcNum = srcNum;
+  this->bti = bti;
+  this->num = num;
+}
+INLINE bool wellFormed(const Function , std::string ) const;
+INLINE void out(std::ostream , const Function ) const;
+
+uint32_t getNum(void) const { return this->num; }
+uint32_t getBti(void) const { return this->bti; }
+Type getType(const Function& fn, uint32_t ID) const {
+  GBE_ASSERTM(ID < this->srcNum, "Out-of-bound types");
+  return (Type)fn.getType(type, ID);
+}
+
+uint32_t srcNum:8;//!< Source Number
+uint32_t bti:8;   //!< The BTI
+uint32_t num:16;  //!< The printf statement number of one kernel.
+Tuple src;
+Tuple type;
+Register dst[1];
+};
+
 #undef ALIGNED_INSTRUCTION
 
 /
@@ -1473,6 +1507,10 @@ namespace ir {
   return true;
 }
 
+INLINE bool PrintfInstruction::wellFormed(const Function , std::string 
) const {
+  return true;
+}
+
 #undef CHECK_TYPE
 
 /
@@ -1702,6 +1740,11 @@ namespace ir {
 
   out << "TheadID Map at SLM: " << this->slmAddr;
 }
+
+INLINE void PrintfInstruction::out(std::ostream , const Function ) 
const {
+  this->outOpcode(out);
+}
+
   } /* namespace internal */
 
   std::ostream << (std::ostream , AddressSpace addrSpace) {
@@ -1862,6 +1905,10 @@ START_INTROSPECTION(WorkGroupInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(WorkGroupInstruction)
 
+START_INTROSPECTION(PrintfInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(PrintfInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -2008,7 +2055,8 @@ END_FUNCTION(Instruction, Register)
opcode == OP_ATOMIC ||
opcode == OP_CALC_TIMESTAMP ||
opcode == OP_STORE_PROFILING ||
-   opcode == OP_WAIT;
+   opcode == OP_WAIT ||
+   opcode == OP_PRINTF;
   }
 
 #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
@@ -2071,6 +2119,9 @@ DECL_MEM_FN(StoreProfilingInstruction, uint32_t, 
getBTI(void), getBTI())
 DECL_MEM_FN(WorkGroupInstruction, Type, getType(void), getType())
 DECL_MEM_FN(WorkGroupInstruction, WorkGroupOps, getWorkGroupOpcode(void), 
getWorkGroupOpcode())
 DECL_MEM_FN(WorkGroupInstruction, uint32_t, getSlmAddr(void), getSlmAddr())
+DECL_MEM_FN(PrintfInstruction, uint32_t, getNum(void), getNum())
+DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
+DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), 
getType(fn, ID))
 
 #undef DECL_MEM_FN
 
@@ -2369,6 +2420,10 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register 
reg), setBtiReg(reg))
 return internal::WorkGroupInstruction(opcode, slmAddr, dst, srcTuple, 
srcNum, type).convert();
   }
 
+  Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t 
srcNum, uint8_t bti, uint16_t num) {
+return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, 
num).convert();
+  }
+
   std::ostream << (std::ostream , const Instruction ) {
 const Function  = insn.getFunction();
 const BasicBlock *bb = insn.getParent();
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index ec7b9b5..9cc926d 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -611,6 +611,17 @@ namespace ir {
 uint32_t getSlmAddr(void) const;
   };
 
+  /*! Printf instruction. */
+  class PrintfInstruction : public Instr

[Beignet] [Printf v2][PATCH 08/12] Implement instruction selection of printf.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/gen_context.cpp|   3 +
 backend/src/backend/gen_context.hpp|   1 +
 .../src/backend/gen_insn_gen7_schedule_info.hxx|   3 +-
 backend/src/backend/gen_insn_selection.cpp | 116 +
 backend/src/backend/gen_insn_selection.hpp |   6 ++
 backend/src/backend/gen_insn_selection.hxx |   1 +
 6 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 0ea0dd0..7807289 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3148,6 +3148,9 @@ do { \
   p->patchJMPI(oneThreadJip, (p->n_instruction() - oneThreadJip), 0);
   }
 
+  void GenContext::emitPrintfInstruction(const SelectionInstruction ) {
+  }
+
   void GenContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int 
sz) {
 if (sz == 0)
   sz = 8;
diff --git a/backend/src/backend/gen_context.hpp 
b/backend/src/backend/gen_context.hpp
index 22ec0ea..036fa78 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -180,6 +180,7 @@ namespace gbe
 void emitCalcTimestampInstruction(const SelectionInstruction );
 void emitStoreProfilingInstruction(const SelectionInstruction );
 void emitWorkGroupOpInstruction(const SelectionInstruction );
+void emitPrintfInstruction(const SelectionInstruction );
 void scratchWrite(const GenRegister header, uint32_t offset, uint32_t 
reg_num, uint32_t reg_type, uint32_t channel_mode);
 void scratchRead(const GenRegister dst, const GenRegister header, uint32_t 
offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
 unsigned beforeMessage(const SelectionInstruction , GenRegister bti, 
GenRegister flagTemp, GenRegister btiTmp, unsigned desc);
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx 
b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 8ef422f..112df32 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -47,4 +47,5 @@ DECL_GEN7_SCHEDULE(I64SATSUB,   20,40,  20)
 DECL_GEN7_SCHEDULE(F64DIV,  20,40,  20)
 DECL_GEN7_SCHEDULE(CalcTimestamp,   80,1,1)
 DECL_GEN7_SCHEDULE(StoreProfiling,  80,1,1)
-DECL_GEN7_SCHEDULE(WorkGroupOp,80, 1,   1)
+DECL_GEN7_SCHEDULE(WorkGroupOp, 80,1,1)
+DECL_GEN7_SCHEDULE(Printf,  80,1,1)
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index 001a3c5..6eecde2 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -676,6 +676,9 @@ namespace gbe
 void CALC_TIMESTAMP(GenRegister ts[4], int tsN, GenRegister tmp, uint32_t 
pointNum, uint32_t tsType);
 /*! Store the profiling info */
 void STORE_PROFILING(uint32_t profilingType, uint32_t bti, GenRegister 
tmp0, GenRegister tmp1, GenRegister ts[4], int tsNum);
+/*! Printf */
+void PRINTF(GenRegister dst, uint8_t bti, GenRegister tmp0, GenRegister 
tmp1, GenRegister src[8],
+int srcNum, uint16_t num, bool isContinue, uint32_t totalSize);
 /*! Multiply 64-bit integers */
 void I64MUL(Reg dst, Reg src0, Reg src1, GenRegister *tmp, bool 
native_long);
 /*! 64-bit integer division */
@@ -1905,6 +1908,53 @@ namespace gbe
 }
   }
 
+  void Selection::Opaque::PRINTF(GenRegister dst, uint8_t bti, GenRegister 
tmp0, GenRegister tmp1,
+   GenRegister src[8], int srcNum, uint16_t num, bool isContinue, 
uint32_t totalSize) {
+if (isContinue) {
+  SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum + 
1);
+  SelectionVector *vector = this->appendVector();
+
+  for (int i = 0; i < srcNum; i++)
+insn->src(i) = src[i];
+
+  insn->src(srcNum) = tmp0;
+
+  insn->dst(0) = dst;
+  insn->dst(1) = tmp0;
+  insn->dst(2) = tmp1;
+
+  vector->regNum = 2;
+  vector->reg = >dst(1);
+  vector->offsetID = 0;
+  vector->isSrc = 0;
+
+  insn->extra.printfSize = static_cast(totalSize);
+  insn->extra.continueFlag = isContinue;
+  insn->extra.printfBTI = bti;
+  insn->extra.printfNum = num;
+} else {
+  SelectionInstruction *insn = this->appendInsn(SEL_OP_PRINTF, 3, srcNum);
+  SelectionVector *vector = this->appendVector();
+
+  for (int i = 0; i < srcNum; i++)
+insn->src(i) = src[i];
+
+  insn->dst(0) = dst;
+  insn->dst(1) = tmp0;
+  insn->dst(2) = tmp1;
+
+  vector->regNum = 2;
+  vector->

[Beignet] [Printf v2][PATCH 07/12] Add the implementation of printf ir instruction.

2016-01-31 Thread yan . wang
From: Yan Wang <yan.w...@linux.intel.com>

Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_gen_backend.cpp | 95 +--
 1 file changed, 80 insertions(+), 15 deletions(-)

diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index dba9dba..4870285 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -486,6 +486,9 @@ namespace gbe
 typedef map>::iterator PtrOrigMapIter;
 // map pointer source to bti
 map BtiMap;
+// map printf pointer source to bti
+int printfBti;
+uint32_t printfNum;
 // map ptr to its bti register
 map BtiValueMap;
 // map ptr to it's base
@@ -520,6 +523,8 @@ namespace gbe
 unit(unit),
 ctx(unit),
 regTranslator(ctx),
+printfBti(-1),
+printfNum(0),
 LI(0),
 TheModule(0),
 btiBase(BTI_RESERVED_NUM),
@@ -586,6 +591,7 @@ namespace gbe
   addrStoreInst.clear();
   // Reset for next function
   btiBase = BTI_RESERVED_NUM;
+  printfBti = -1;
   return false;
 }
 /*! Given a possible pointer value, find out the interested escape like
@@ -594,7 +600,7 @@ namespace gbe
 /*! For all possible pointers, GlobalVariable, function pointer argument,
 alloca instruction, find their pointer escape points */
 void analyzePointerOrigin(Function );
-unsigned getNewBti(Value *origin, bool isImage);
+unsigned getNewBti(Value *origin, bool force);
 void assignBti(Function );
 bool isSingleBti(Value *Val);
 Value *getBtiRegister(Value *v);
@@ -717,11 +723,10 @@ namespace gbe
 // handle load of dword/qword with unaligned address
 void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, 
ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool 
fixedBTI);
 void visitInstruction(Instruction ) {NOT_SUPPORTED;}
-void* getPrintfInfo(CallInst* inst)
-{
-  if ([inst])
-return (void*)[inst];
-  return NULL;
+ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) {
+  if (unit.printfs.find(inst) == unit.printfs.end())
+return NULL;
+  return [inst];
 }
 private:
   void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug 
infomation in context for subsequently passing to Gen insn
@@ -1127,21 +1132,15 @@ namespace gbe
 }
   }
 
-  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
+  unsigned GenWriter::getNewBti(Value *origin, bool force) {
 unsigned new_bti = 0;
-if (isImage) {
+if (force) {
   new_bti = btiBase;
   incBtiBase();
   return new_bti;
 }
 
-if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
-  new_bti = btiBase;
-  incBtiBase();
-} else if 
(origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
-  new_bti = btiBase;
-  incBtiBase();
-} else if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) 
{
+if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
   new_bti = btiBase;
   incBtiBase();
 }
@@ -3716,6 +3715,16 @@ namespace gbe
 this->newRegister();
 break;
   case GEN_OCL_PRINTF:
+this->newRegister();  // fall through
+  case GEN_OCL_PUTS:
+  {
+ // We need a new BTI as printf output.
+ if (printfBti < 0) {
+   printfBti = this->getNewBti(, true);
+   ctx.getFunction().getPrintfSet()->setBufBTI(printfBti);
+ }
+ break;
+  }
   case GEN_OCL_CALC_TIMESTAMP:
   case GEN_OCL_STORE_PROFILING:
   case GEN_OCL_DEBUGWAIT:
@@ -4527,6 +4536,62 @@ namespace gbe
 
   case GEN_OCL_PRINTF:
   {
+ir::PrintfSet::PrintfFmt* fmt = getPrintfInfo();
+if (fmt == NULL)
+  break;
+
+ctx.getFunction().getPrintfSet()->append(printfNum, fmt);
+
+vector tupleData;
+vector tupleTypeData;
+int argNum = static_cast(I.getNumOperands());
+argNum -= 2; // no fmt and last NULL.
+int realArgNum = argNum;
+
+for (int n = 0; n < argNum; n++) {
+  /* First, ignore %s, the strings are recorded and not passed to 
GPU. */
+  llvm::Constant* args = 
dyn_cast(I.getOperand(n + 1));
+  llvm::Constant* args_ptr = NULL;
+  if (args)
+args_ptr = dyn_cast(args->getOperand(0));
+
+  if (args_ptr) {
+ConstantDataSequential* fmt_arg = 
dyn_cast(args_ptr->getOperand(0));
+if (fmt_arg && fmt_arg->isCString()) {
+  realArgNum--;
+  continue;
+}
+   

Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf instruction.

2016-01-31 Thread yan . wang
The root cause has now been found.
The allocated surface size is not enough because it depends on the global
size.
I will fix it and resend the patch set based on all previous review comments.
Thanks.

Yan Wang

> After applying the printf patch set, I find the last test still
> fails; please help to check.
>
> On Thu, Jan 28, 2016 at 12:33:05PM +0800, He Junyan wrote:
>> Date: Thu, 28 Jan 2016 12:33:05 +0800
>> From: He Junyan <junyan...@inbox.com>
>> To: beignet@lists.freedesktop.org
>> Subject: Re: [Beignet] [Printf][PATCH 06/11] Implement emision of printf
>>  instruction.
>>
>> On Thu, Jan 21, 2016 at 11:30:21AM +0800, Yan Wang wrote:
>> > Date: Thu, 21 Jan 2016 11:30:21 +0800
>> > From: Yan Wang <yan.w...@linux.intel.com>
>> > To: beignet@lists.freedesktop.org
>> > Cc: Yan Wang <yan.w...@linux.intel.com>
>> > Subject: [Beignet] [Printf][PATCH 06/11] Implement emision of printf
>> >  instruction.
>> > X-Mailer: git-send-email 2.5.0
>> >
>> > Contributor: Junyan He <junyan...@linux.intel.com>
>> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
>> > ---
>> >  backend/src/ir/context.hpp|  5 ++
>> >  backend/src/llvm/llvm_gen_backend.cpp | 89
>> ---
>> >  2 files changed, 78 insertions(+), 16 deletions(-)
>> >
>> I think it is better to write a separate patch for the type TUPLE logic
>> > diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
>> > index b95741f..877d639 100644
>> > --- a/backend/src/ir/context.hpp
>> > +++ b/backend/src/ir/context.hpp
>> > @@ -149,6 +149,11 @@ namespace ir {
>> >GBE_ASSERTM(fn != NULL, "No function currently defined");
>> >return fn->file.appendArrayTuple(reg, regNum);
>> >  }
>> > +/*! Make a tuple from an array of types */
>> > +INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) {
>> > +  GBE_ASSERTM(fn != NULL, "No function currently defined");
>> > +  return fn->file.appendArrayTypeTuple((uint8_t*)type, num);
>> > +}
>> >  /*! We just use variadic templates to forward instruction
>> functions */
>> >  #define DECL_INSN(NAME, FAMILY) \
>> >  template  INLINE void NAME(Args...args);
>> > diff --git a/backend/src/llvm/llvm_gen_backend.cpp
>> b/backend/src/llvm/llvm_gen_backend.cpp
>> > index dba9dba..cc736d7 100644
>> > --- a/backend/src/llvm/llvm_gen_backend.cpp
>> > +++ b/backend/src/llvm/llvm_gen_backend.cpp
>> > @@ -486,6 +486,9 @@ namespace gbe
>> >  typedef map>::iterator
>> PtrOrigMapIter;
>> >  // map pointer source to bti
>> >  map BtiMap;
>> > +// map printf pointer source to bti
>> > +int printfBti;
>> > +uint32_t printfNum;
>> >  // map ptr to its bti register
>> >  map BtiValueMap;
>> >  // map ptr to it's base
>> > @@ -520,6 +523,8 @@ namespace gbe
>> >  unit(unit),
>> >  ctx(unit),
>> >  regTranslator(ctx),
>> > +printfBti(-1),
>> Also need to reset printfBti for each runOnFunction.
>>
>> > +printfNum(0),
>> >  LI(0),
>> >  TheModule(0),
>> >  btiBase(BTI_RESERVED_NUM),
>> > @@ -594,7 +599,7 @@ namespace gbe
>> >  /*! For all possible pointers, GlobalVariable, function pointer
>> argument,
>> >  alloca instruction, find their pointer escape points */
>> >  void analyzePointerOrigin(Function );
>> > -unsigned getNewBti(Value *origin, bool isImage);
>> > +unsigned getNewBti(Value *origin, bool force);
>> >  void assignBti(Function );
>> >  bool isSingleBti(Value *Val);
>> >  Value *getBtiRegister(Value *v);
>> > @@ -717,12 +722,7 @@ namespace gbe
>> >  // handle load of dword/qword with unaligned address
>> >  void emitUnalignedDQLoadStore(ir::Register ptr, Value
>> *llvmValues, ir::AddressSpace addrSpace, ir::Register bti, bool
>> isLoad, bool dwAligned, bool fixedBTI);
>> >  void visitInstruction(Instruction ) {NOT_SUPPORTED;}
>> > -void* getPrintfInfo(CallInst* inst)
>> > -{
>> > -  if ([inst])
>> > -return (void*)[inst];
>> > -  return NULL;
>> > -}
>> > +ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { return
>> [inst]

[Beignet] [Printf][PATCH 08/11] Implement ASM generation of printf.

2016-01-20 Thread Yan Wang
Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/gen8_context.cpp | 36 +++
 backend/src/backend/gen8_context.hpp |  1 +
 backend/src/backend/gen_context.cpp  | 70 
 backend/src/backend/gen_context.hpp  |  1 +
 4 files changed, 108 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp 
b/backend/src/backend/gen8_context.cpp
index 7455bfc..2e76f53 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1264,6 +1264,42 @@ namespace gbe
 p->ADD(dst, dst, res);
   }
 
+  void Gen8Context::emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data,
+ GenRegister& src, uint32_t bti) {
+GenRegister tempSrc, tempDst;
+GenRegister nextSrc, nextDst;
+p->push();
+  tempSrc = GenRegister::h2(GenRegister::retype(src, GEN_TYPE_UD));
+  tempDst = GenRegister::retype(data, GEN_TYPE_UD);
+  p->curr.execWidth = 8;
+  p->curr.quarterControl = GEN_COMPRESSION_Q1;
+  p->MOV(tempDst, tempSrc);
+
+  p->curr.quarterControl = GEN_COMPRESSION_Q2;
+  nextSrc = GenRegister::Qn(tempSrc, 1);
+  nextDst = GenRegister::Qn(tempDst, 1);
+  p->MOV(nextDst, nextSrc);
+p->pop();
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+
+p->push();
+  tempSrc = GenRegister::h2(
+GenRegister::retype(GenRegister::offset(src, 0, 4), GEN_TYPE_UD));
+  tempDst = GenRegister::retype(data, GEN_TYPE_UD);
+  p->curr.execWidth = 8;
+  p->curr.quarterControl = GEN_COMPRESSION_Q1;
+  p->MOV(tempDst, tempSrc);
+
+  p->curr.quarterControl = GEN_COMPRESSION_Q2;
+  nextSrc = GenRegister::Qn(tempSrc, 1);
+  nextDst = GenRegister::Qn(tempDst, 1);
+  p->MOV(nextDst, nextSrc);
+p->pop();
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+  }
+
   void ChvContext::setA0Content(uint16_t new_a0[16], uint16_t max_offset, int 
sz) {
 if (sz == 0)
   sz = 16;
diff --git a/backend/src/backend/gen8_context.hpp 
b/backend/src/backend/gen8_context.hpp
index cc415c6..2e6eae5 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -82,6 +82,7 @@ namespace gbe
 virtual void setA0Content(uint16_t new_a0[16], uint16_t max_offset = 0, 
int sz = 0);
 virtual void subTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
 virtual void addTimestamps(GenRegister& t0, GenRegister& t1, GenRegister& 
tmp);
+virtual void emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data, GenRegister& src, uint32_t bti);
 virtual GenEncoder* generateEncoder(void) {
   return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
 }
diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 7807289..8acf67d 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3148,7 +3148,77 @@ do { \
   p->patchJMPI(oneThreadJip, (p->n_instruction() - oneThreadJip), 0);
   }
 
+  void GenContext::emitPrintfLongInstruction(GenRegister& addr, GenRegister& 
data,
+ GenRegister& src, uint32_t bti) {
+p->MOV(GenRegister::retype(data, GEN_TYPE_UD), src.bottom_half());
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+
+p->MOV(GenRegister::retype(data, GEN_TYPE_UD), 
src.top_half(this->simdWidth));
+p->UNTYPED_WRITE(addr, GenRegister::immud(bti), 1);
+p->ADD(addr, addr, GenRegister::immud(sizeof(uint32_t)));
+  }
+
   void GenContext::emitPrintfInstruction(const SelectionInstruction ) {
+const GenRegister dst = ra->genReg(insn.dst(0));
+const GenRegister tmp0 = ra->genReg(insn.dst(1));
+GenRegister src;
+uint32_t srcNum = insn.srcNum;
+if (insn.extra.continueFlag)
+  srcNum--;
+
+GenRegister addr = GenRegister::retype(tmp0, GEN_TYPE_UD);
+GenRegister data = GenRegister::offset(addr, 2);
+
+if (!insn.extra.continueFlag) {
+  p->push(); {
+p->curr.predicate = GEN_PREDICATE_NONE;
+p->curr.noMask = 1;
+//ptr[0] is the total count of the log size.
+p->MOV(addr, GenRegister::immud(0));
+p->MOV(data, GenRegister::immud(insn.extra.printfSize + 12));
+  } p->pop();
+
+  p->ATOMIC(addr, GEN_ATOMIC_OP_ADD, addr, 
GenRegister::immud(insn.extra.printfBTI), 2);
+  /* Write out the header. */
+  p->MOV(data, GenRegister::immud(0xAABBCCDD));
+  p->UNTYPED_WRITE(addr, GenRegister::immud

[Beignet] [Printf][PATCH 06/11] Implement emission of printf instruction.

2016-01-20 Thread Yan Wang
Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/context.hpp|  5 ++
 backend/src/llvm/llvm_gen_backend.cpp | 89 ---
 2 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index b95741f..877d639 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -149,6 +149,11 @@ namespace ir {
   GBE_ASSERTM(fn != NULL, "No function currently defined");
   return fn->file.appendArrayTuple(reg, regNum);
 }
+/*! Make a tuple from an array of types */
+INLINE Tuple arrayTypeTuple(const ir::Type *type, uint32_t num) {
+  GBE_ASSERTM(fn != NULL, "No function currently defined");
+  return fn->file.appendArrayTypeTuple((uint8_t*)type, num);
+}
 /*! We just use variadic templates to forward instruction functions */
 #define DECL_INSN(NAME, FAMILY) \
 template  INLINE void NAME(Args...args);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
b/backend/src/llvm/llvm_gen_backend.cpp
index dba9dba..cc736d7 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -486,6 +486,9 @@ namespace gbe
 typedef map>::iterator PtrOrigMapIter;
 // map pointer source to bti
 map BtiMap;
+// map printf pointer source to bti
+int printfBti;
+uint32_t printfNum;
 // map ptr to its bti register
 map BtiValueMap;
 // map ptr to it's base
@@ -520,6 +523,8 @@ namespace gbe
 unit(unit),
 ctx(unit),
 regTranslator(ctx),
+printfBti(-1),
+printfNum(0),
 LI(0),
 TheModule(0),
 btiBase(BTI_RESERVED_NUM),
@@ -594,7 +599,7 @@ namespace gbe
 /*! For all possible pointers, GlobalVariable, function pointer argument,
 alloca instruction, find their pointer escape points */
 void analyzePointerOrigin(Function );
-unsigned getNewBti(Value *origin, bool isImage);
+unsigned getNewBti(Value *origin, bool force);
 void assignBti(Function );
 bool isSingleBti(Value *Val);
 Value *getBtiRegister(Value *v);
@@ -717,12 +722,7 @@ namespace gbe
 // handle load of dword/qword with unaligned address
 void emitUnalignedDQLoadStore(ir::Register ptr, Value *llvmValues, 
ir::AddressSpace addrSpace, ir::Register bti, bool isLoad, bool dwAligned, bool 
fixedBTI);
 void visitInstruction(Instruction ) {NOT_SUPPORTED;}
-void* getPrintfInfo(CallInst* inst)
-{
-  if ([inst])
-return (void*)[inst];
-  return NULL;
-}
+ir::PrintfSet::PrintfFmt* getPrintfInfo(CallInst* inst) { return 
[inst]; }
 private:
   void setDebugInfo_CTX(llvm::Instruction * insn); // store the debug 
infomation in context for subsequently passing to Gen insn
   ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t 
index = 0u);
@@ -1127,21 +1127,15 @@ namespace gbe
 }
   }
 
-  unsigned GenWriter::getNewBti(Value *origin, bool isImage) {
+  unsigned GenWriter::getNewBti(Value *origin, bool force) {
 unsigned new_bti = 0;
-if (isImage) {
+if (force) {
   new_bti = btiBase;
   incBtiBase();
   return new_bti;
 }
 
-if(origin->getName().equals(StringRef("__gen_ocl_printf_buf"))) {
-  new_bti = btiBase;
-  incBtiBase();
-} else if 
(origin->getName().equals(StringRef("__gen_ocl_printf_index_buf"))) {
-  new_bti = btiBase;
-  incBtiBase();
-} else if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) 
{
+if (origin->getName().equals(StringRef("__gen_ocl_profiling_buf"))) {
   new_bti = btiBase;
   incBtiBase();
 }
@@ -3716,6 +3710,16 @@ namespace gbe
 this->newRegister();
 break;
   case GEN_OCL_PRINTF:
+this->newRegister();  // fall through
+  case GEN_OCL_PUTS:
+  {
+ // We need a new BTI as printf output.
+ if (printfBti < 0) {
+   printfBti = this->getNewBti(, true);
+   ctx.getFunction().getPrintfSet()->setBufBTI(printfBti);
+ }
+ break;
+  }
   case GEN_OCL_CALC_TIMESTAMP:
   case GEN_OCL_STORE_PROFILING:
   case GEN_OCL_DEBUGWAIT:
@@ -4527,6 +4531,59 @@ namespace gbe
 
   case GEN_OCL_PRINTF:
   {
+ir::PrintfSet::PrintfFmt* fmt = getPrintfInfo();
+ctx.getFunction().getPrintfSet()->append(printfNum, fmt);
+
+vector tupleData;
+vector tupleTypeData;
+int argNum = static_cast(I.getNumOperands());
+argNum -= 2; // no fmt and last NULL.
+int realArgNum = argNum;
+
+for (int n = 0; n < argNum; n++) {
+  /* First, ignore %s, the strings are recorded and not passed to 
GPU. */
+  llvm::Consta

[Beignet] [Printf][PATCH 10/11] Output printf result.

2016-01-20 Thread Yan Wang
Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/printf.cpp | 122 +-
 backend/src/ir/printf.hpp |   2 +-
 2 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index 19daa19..7ca127d 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -35,7 +35,7 @@ namespace gbe
 static void generatePrintfFmtString(PrintfState& state, std::string& str)
 {
   char num_str[16];
-  str += "%";
+  str = "%";
 
   if (state.left_justified) {
 str += "-";
@@ -87,21 +87,121 @@ namespace gbe
 #define PRINT_SOMETHING(target_ty, conv)  do {  \
   if (!vec_i)   \
 pf_str = pf_str + std::string(#conv);   \
-  char *ptr = ((char *)buf_addr + sizeOfSize * global_wk_sz0 * 
global_wk_sz1 * global_wk_sz2 * n \
-   + slot.state->out_buf_sizeof_offset *\
-   global_wk_sz0 * global_wk_sz1 * global_wk_sz2);  \
-  target_ty* obj_ptr = ((target_ty *)ptr) + (k*global_wk_sz0*global_wk_sz1 
+ j*global_wk_sz0 + i) * vec_num + vec_i; \
-  if ((char *)obj_ptr + sizeof(target_ty) > (char *)buf_addr + output_sz) 
{\
-printf("\n\n!!!The printf message is out of range because of the 
limited buffer, ignore.\n"); \
-return; \
-  } \
-  printf(pf_str.c_str(),  *obj_ptr);\
+  printf(pf_str.c_str(), log.getData()); \
 } while (0)
 
+static void printOutOneStatement(PrintfSet::PrintfFmt& fmt, PrintfLog& log)
+{
+  std::string pf_str = "";
+  for (auto& slot : fmt) {
+if (slot.type == PRINTF_SLOT_TYPE_STRING) {
+  printf("%s", slot.str.c_str());
+  continue;
+}
+assert(slot.type == PRINTF_SLOT_TYPE_STATE);
+
+generatePrintfFmtString(slot.state, pf_str);
+
+int vec_num;
+vec_num = slot.state.vector_n > 0 ? slot.state.vector_n : 1;
+
+for (int vec_i = 0; vec_i < vec_num; vec_i++) {
+  if (vec_i)
+printf(",");
+
+  switch (slot.state.conversion_specifier) {
+case PRINTF_CONVERSION_D:
+case PRINTF_CONVERSION_I:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, d);
+  else
+PRINT_SOMETHING(int, d);
+  break;
+
+case PRINTF_CONVERSION_O:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, o);
+  else
+PRINT_SOMETHING(int, o);
+  break;
+case PRINTF_CONVERSION_U:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, u);
+  else
+PRINT_SOMETHING(int, u);
+  break;
+case PRINTF_CONVERSION_X:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, X);
+  else
+PRINT_SOMETHING(int, X);
+  break;
+case PRINTF_CONVERSION_x:
+  if (slot.state.length_modifier == PRINTF_LM_L)
+PRINT_SOMETHING(uint64_t, x);
+  else
+PRINT_SOMETHING(int, x);
+  break;
+
+case PRINTF_CONVERSION_C:
+  PRINT_SOMETHING(char, c);
+  break;
+
+case PRINTF_CONVERSION_F:
+  PRINT_SOMETHING(float, F);
+  break;
+case PRINTF_CONVERSION_f:
+  PRINT_SOMETHING(float, f);
+  break;
+case PRINTF_CONVERSION_E:
+  PRINT_SOMETHING(float, E);
+  break;
+case PRINTF_CONVERSION_e:
+  PRINT_SOMETHING(float, e);
+  break;
+case PRINTF_CONVERSION_G:
+  PRINT_SOMETHING(float, G);
+  break;
+case PRINTF_CONVERSION_g:
+  PRINT_SOMETHING(float, g);
+  break;
+case PRINTF_CONVERSION_A:
+  PRINT_SOMETHING(float, A);
+  break;
+case PRINTF_CONVERSION_a:
+  PRINT_SOMETHING(float, a);
+  break;
+case PRINTF_CONVERSION_P:
+  PRINT_SOMETHING(int, p);
+  break;
+
+case PRINTF_CONVERSION_S:
+  pf_str = pf_str + "s";
+  printf(pf_str.c_str(), slot.state.str.c_str());
+  b

[Beignet] [Printf][PATCH 04/11] Add the implementation of printf ir instruction.

2016-01-20 Thread Yan Wang
Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/function.hpp|  8 ++
 backend/src/ir/instruction.cpp | 57 +-
 backend/src/ir/instruction.hpp | 13 ++
 backend/src/ir/instruction.hxx |  1 +
 backend/src/ir/register.cpp|  8 ++
 backend/src/ir/register.hpp| 21 
 6 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 78250cf..5785bee 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -341,6 +341,14 @@ namespace ir {
 INLINE void setRegister(Tuple ID, uint32_t which, Register reg) {
   file.set(ID, which, reg);
 }
+/*! Get the type from the tuple vector */
+INLINE uint8_t getType(Tuple ID, uint32_t which) const {
+  return file.getType(ID, which);
+}
+/*! Set the type into the tuple vector */
+INLINE void setType(Tuple ID, uint32_t which, uint8_t type) {
+  file.setType(ID, which, type);
+}
 /*! Get the register file */
 INLINE const RegisterFile (void) const { return file; }
 /*! Get the given value ie immediate from the function */
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index bb5aac5..652c1fb 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -994,6 +994,40 @@ namespace ir {
 Register dst[1];
 };
 
+class ALIGNED_INSTRUCTION PrintfInstruction :
+  public BasePolicy,
+  public TupleSrcPolicy,
+  public NDstPolicy<PrintfInstruction, 1>
+{
+  public:
+INLINE PrintfInstruction(Register dst, Tuple srcTuple, Tuple typeTuple,
+ uint8_t srcNum, uint8_t bti, uint16_t num) {
+  this->opcode = OP_PRINTF;
+  this->dst[0] = dst;
+  this->src = srcTuple;
+  this->type = typeTuple;
+  this->srcNum = srcNum;
+  this->bti = bti;
+  this->num = num;
+}
+INLINE bool wellFormed(const Function , std::string ) const;
+INLINE void out(std::ostream , const Function ) const;
+
+uint32_t getNum(void) const { return this->num; }
+uint32_t getBti(void) const { return this->bti; }
+Type getType(const Function& fn, uint32_t ID) const {
+  GBE_ASSERTM(ID < this->srcNum, "Out-of-bound types");
+  return (Type)fn.getType(type, ID);
+}
+
+uint32_t srcNum:8;//!< Source Number
+uint32_t bti:8;   //!< The BTI
+uint32_t num:16;  //!< The printf statement number of one kernel.
+Tuple src;
+Tuple type;
+Register dst[1];
+};
+
 #undef ALIGNED_INSTRUCTION
 
 /
@@ -1473,6 +1507,10 @@ namespace ir {
   return true;
 }
 
+INLINE bool PrintfInstruction::wellFormed(const Function , std::string 
) const {
+  return true;
+}
+
 #undef CHECK_TYPE
 
 /
@@ -1702,6 +1740,11 @@ namespace ir {
 
   out << "TheadID Map at SLM: " << this->slmAddr;
 }
+
+INLINE void PrintfInstruction::out(std::ostream , const Function ) 
const {
+  this->outOpcode(out);
+}
+
   } /* namespace internal */
 
   std::ostream << (std::ostream , AddressSpace addrSpace) {
@@ -1862,6 +1905,10 @@ START_INTROSPECTION(WorkGroupInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(WorkGroupInstruction)
 
+START_INTROSPECTION(PrintfInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(PrintfInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -2008,7 +2055,8 @@ END_FUNCTION(Instruction, Register)
opcode == OP_ATOMIC ||
opcode == OP_CALC_TIMESTAMP ||
opcode == OP_STORE_PROFILING ||
-   opcode == OP_WAIT;
+   opcode == OP_WAIT ||
+   opcode == OP_PRINTF;
   }
 
 #define DECL_MEM_FN(CLASS, RET, PROTOTYPE, CALL) \
@@ -2071,6 +2119,9 @@ DECL_MEM_FN(StoreProfilingInstruction, uint32_t, 
getBTI(void), getBTI())
 DECL_MEM_FN(WorkGroupInstruction, Type, getType(void), getType())
 DECL_MEM_FN(WorkGroupInstruction, WorkGroupOps, getWorkGroupOpcode(void), 
getWorkGroupOpcode())
 DECL_MEM_FN(WorkGroupInstruction, uint32_t, getSlmAddr(void), getSlmAddr())
+DECL_MEM_FN(PrintfInstruction, uint32_t, getNum(void), getNum())
+DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti())
+DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), 
getType(fn, ID))
 
 #undef DECL_MEM_FN
 
@@ -2369,6 +2420,10 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register 
reg), setBtiReg(reg))
 return internal::WorkGroupIn

[Beignet] [Printf][PATCH 01/11] Change printf data structure and remove old code.

2016-01-20 Thread Yan Wang
Contributor: Junyan He <junyan...@linux.intel.com>
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/backend/program.cpp |  14 --
 backend/src/backend/program.hpp |  10 +-
 backend/src/gbe_bin_interpreter.cpp |   2 -
 backend/src/ir/printf.cpp   | 168 -
 backend/src/ir/printf.hpp   |  76 +++-
 backend/src/llvm/llvm_gen_backend.cpp   |  13 +-
 backend/src/llvm/llvm_printf_parser.cpp | 313 ++--
 7 files changed, 36 insertions(+), 560 deletions(-)

diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 7d12f73..4eca9f1 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -1265,24 +1265,12 @@ namespace gbe {
 return ps->getBufBTI();
   }
 
-  static uint8_t kernelGetPrintfIndexBufBTI(void * printf_info) {
-if (printf_info == NULL) return 0;
-const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
-return ps->getIndexBufBTI();
-  }
-
   static void kernelReleasePrintfSet(void * printf_info) {
 if (printf_info == NULL) return;
 ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
 delete ps;
   }
 
-  static uint32_t kernelGetPrintfSizeOfSize(void * printf_info) {
-if (printf_info == NULL) return 0;
-const ir::PrintfSet *ps = (ir::PrintfSet *)printf_info;
-return ps->getPrintfSizeOfSize();
-  }
-
   static void kernelOutputPrintf(void * printf_info, void* index_addr,
  void* buf_addr, size_t global_wk_sz0,
  size_t global_wk_sz1, size_t global_wk_sz2,
@@ -1426,9 +1414,7 @@ namespace gbe
   gbe_dup_profiling = gbe::kernelDupProfiling;
   gbe_output_profiling = gbe::kernelOutputProfiling;
   gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
-  gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
   gbe_dup_printfset = gbe::kernelDupPrintfSet;
-  gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
   gbe_release_printf_info = gbe::kernelReleasePrintfSet;
   gbe_output_printf = gbe::kernelOutputPrintf;
   genSetupCallBacks();
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 775e560..e5c4b95 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -152,10 +152,7 @@ namespace gbe {
 void setPrintfSet(ir::PrintfSet * from) {
   printfSet = from;
 }
-/* ! Return the offset in the sizeof(xxx). */
-uint32_t getPrintfSizeOfSize(void) const {
-  return printfSet ? printfSet->getPrintfSizeOfSize() : 0;
-}
+
 uint32_t getPrintfNum() const {
   return printfSet ? printfSet->getPrintfNum() : 0;
 }
@@ -169,11 +166,6 @@ namespace gbe {
   return printfSet->getBufBTI();
 }
 
-uint8_t getPrintfIndexBufBTI() const {
-  GBE_ASSERT(printfSet);
-  return printfSet->getIndexBufBTI();
-}
-
 void outputPrintf(void* index_addr, void* buf_addr, size_t global_wk_sz0,
   size_t global_wk_sz1, size_t global_wk_sz2, size_t 
output_sz) {
   if(printfSet)
diff --git a/backend/src/gbe_bin_interpreter.cpp 
b/backend/src/gbe_bin_interpreter.cpp
index 4756842..34d04dd 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -70,9 +70,7 @@ struct BinInterpCallBackInitializer
 gbe_output_profiling = gbe::kernelOutputProfiling;
 gbe_get_printf_num = gbe::kernelGetPrintfNum;
 gbe_get_printf_buf_bti = gbe::kernelGetPrintfBufBTI;
-gbe_get_printf_indexbuf_bti = gbe::kernelGetPrintfIndexBufBTI;
 gbe_dup_printfset = gbe::kernelDupPrintfSet;
-gbe_get_printf_sizeof_size = gbe::kernelGetPrintfSizeOfSize;
 gbe_release_printf_info = gbe::kernelReleasePrintfSet;
 gbe_output_printf = gbe::kernelOutputPrintf;
   }
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index 2e08248..3873ca9 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -32,43 +32,6 @@ namespace gbe
 
 pthread_mutex_t PrintfSet::lock = PTHREAD_MUTEX_INITIALIZER;
 
-PrintfSlot::~PrintfSlot(void)
-{
-if (ptr)
-{
-  if (type == PRINTF_SLOT_TYPE_STRING) {
-free(ptr);
-ptr = NULL;
-  } else if (type == PRINTF_SLOT_TYPE_STATE) {
-delete state;
-state = NULL;
-  } else {
-type = PRINTF_SLOT_TYPE_NONE;
-ptr = NULL;
-  }
-}
-}
-
-uint32_t PrintfSet::append(PrintfFmt* fmt, Unit& unit)
-{
-  fmts.push_back(*fmt);
-  vector& vp = fmts.back().first;
-
-  for (vector::iterator f = vp.begin(); f !=  vp.end(); ++f) {
-if (f->type == PRINTF_SLOT_TYPE_STRING)
-  continue;
-
-slots.push_back(*f);
-  }
-
-  /* Update the total size of size. */
-  if (slots.size() > 0)
-sizeOfS

Re: [Beignet] [PATCH v2] Use CreateCall instead of CreateCall2.

2015-11-19 Thread yan . wang
So should we roll back to v1? It should be safe because it only applies to
LLVM >= 3.7.
Thanks.

Yan Wang

> Build fail in LLVM3.5.2.
>
>> -Original Message-
>> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
>> Of
>> Yan Wang
>> Sent: Thursday, November 19, 2015 10:41
>> To: beignet@lists.freedesktop.org
>> Cc: Yan Wang
>> Subject: [Beignet] [PATCH v2] Use CreateCall instead of CreateCall2.
>>
>> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
>> ---
>>  backend/src/llvm/llvm_profiling.cpp | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/backend/src/llvm/llvm_profiling.cpp
>> b/backend/src/llvm/llvm_profiling.cpp
>> index 8c9157c..3fbd00d 100644
>> --- a/backend/src/llvm/llvm_profiling.cpp
>> +++ b/backend/src/llvm/llvm_profiling.cpp
>> @@ -177,12 +177,12 @@ namespace gbe
>>builder->SetInsertPoint(instI);
>>/* Add the timestamp store function call. */
>>// __gen_ocl_store_timestamp(int nth, int type);
>> -  builder->CreateCall2(cast(module-
>> >getOrInsertFunction(
>> +
>> + builder->CreateCall(cast(module->getOrInsertFunction(
>>"__gen_ocl_calc_timestamp", Type::getVoidTy(module-
>> >getContext()),
>>IntegerType::getInt32Ty(module->getContext()),
>>IntegerType::getInt32Ty(module->getContext()),
>>NULL)),
>> -  /* the args */ ConstantInt::get(intTy, pointNum++),
>> ConstantInt::get(intTy, profilingType));
>> +  /* the args */ {ConstantInt::get(intTy, pointNum++),
>> + ConstantInt::get(intTy, profilingType)});
>>  }
>>  /* We insert one store_profiling at the end of the last block to
>> hold the
>> place. */
>>  llvm::Function::iterator BE = F.end(); @@ -190,12 +190,12 @@
>> namespace
>> gbe
>>  BasicBlock::iterator retInst = BE->end();
>>  retInst--;
>>  builder->SetInsertPoint(retInst);
>> -
>> builder->CreateCall2(cast(module->getOrInsertFunction(
>> +
>> + builder->CreateCall(cast(module->getOrInsertFunction(
>>  "__gen_ocl_store_profiling",
>> Type::getVoidTy(module->getContext()),
>>  ptrTy,
>>  IntegerType::getInt32Ty(module->getContext()),
>>  NULL)),
>> -/* the args */profilingBuf, ConstantInt::get(intTy,
>> profilingType));
>> +/* the args */{profilingBuf, ConstantInt::get(intTy,
>> + profilingType)});
>>
>>  delete builder;
>>  return changed;
>> --
>> 2.5.0
>>
>> ___
>> Beignet mailing list
>> Beignet@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
> ___
> Beignet mailing list
> Beignet@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/beignet
>

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH v2] Use CreateCall instead of CreateCall2.

2015-11-19 Thread yan . wang
Thanks.

Yan Wang

> The LLVM function prototype is CreateCall(Value *Callee, ArrayRef<Value *> Args = None, const Twine &Name = "").
> Casting from std::initializer_list to ArrayRef<> is not supported on older
> LLVM versions.
> Please try:
>/* Add the timestamp store function call. */
>// __gen_ocl_store_timestamp(int nth, int type);
> -
> builder->CreateCall2(cast(module->getOrInsertFunction(
> +  Value *Args[2] = {ConstantInt::get(intTy, pointNum++),
> ConstantInt::get(intTy, profilingType)};
> +
> builder->CreateCall(cast(module->getOrInsertFunction(
>"__gen_ocl_calc_timestamp",
> Type::getVoidTy(module->getContext()),
>IntegerType::getInt32Ty(module->getContext()),
>IntegerType::getInt32Ty(module->getContext()),
>NULL)),
> -  /* the args */ ConstantInt::get(intTy, pointNum++),
> ConstantInt::get(intTy, profilingType));
> +  ArrayRef<Value*>(Args));
>  }
>  /* We insert one store_profiling at the end of the last block to hold
> the place. */
>  llvm::Function::iterator BE = F.end();
> @@ -190,12 +191,14 @@ namespace gbe
>  BasicBlock::iterator retInst = BE->end();
>  retInst--;
>  builder->SetInsertPoint(retInst);
> -
> builder->CreateCall2(cast(module->getOrInsertFunction(
> +Value *Args2[2] = {profilingBuf, ConstantInt::get(intTy,
> profilingType)};
> +
> +
> builder->CreateCall(cast(module->getOrInsertFunction(
>  "__gen_ocl_store_profiling",
> Type::getVoidTy(module->getContext()),
>  ptrTy,
>  IntegerType::getInt32Ty(module->getContext()),
>  NULL)),
> -/* the args */profilingBuf, ConstantInt::get(intTy,
> profilingType));
> +ArrayRef<Value*>(Args2));
>
> It works at least on LLVM 3.6. It should also work on LLVM 3.5.
> Thanks!
> Ruiling
>
>> -Original Message-
>> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
>> Of
>> Yang, Rong R
>> Sent: Thursday, November 19, 2015 3:55 PM
>> To: Yan Wang <yan.w...@linux.intel.com>; beignet@lists.freedesktop.org
>> Subject: Re: [Beignet] [PATCH v2] Use CreateCall instead of
>> CreateCall2.
>>
>> Build fail in LLVM3.5.2.
>>
>> > -Original Message-
>> > From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On
>> Behalf
>> Of
>> > Yan Wang
>> > Sent: Thursday, November 19, 2015 10:41
>> > To: beignet@lists.freedesktop.org
>> > Cc: Yan Wang
>> > Subject: [Beignet] [PATCH v2] Use CreateCall instead of CreateCall2.
>> >
>> > Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
>> > ---
>> >  backend/src/llvm/llvm_profiling.cpp | 8 
>> >  1 file changed, 4 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/backend/src/llvm/llvm_profiling.cpp
>> > b/backend/src/llvm/llvm_profiling.cpp
>> > index 8c9157c..3fbd00d 100644
>> > --- a/backend/src/llvm/llvm_profiling.cpp
>> > +++ b/backend/src/llvm/llvm_profiling.cpp
>> > @@ -177,12 +177,12 @@ namespace gbe
>> >builder->SetInsertPoint(instI);
>> >/* Add the timestamp store function call. */
>> >// __gen_ocl_store_timestamp(int nth, int type);
>> > -  builder->CreateCall2(cast(module-
>> > >getOrInsertFunction(
>> > +
>> > +
>> builder->CreateCall(cast(module->getOrInsertFunction(
>> >"__gen_ocl_calc_timestamp", Type::getVoidTy(module-
>> > >getContext()),
>> >IntegerType::getInt32Ty(module->getContext()),
>> >IntegerType::getInt32Ty(module->getContext()),
>> >NULL)),
>> > -  /* the args */ ConstantInt::get(intTy, pointNum++),
>> > ConstantInt::get(intTy, profilingType));
>> > +  /* the args */ {ConstantInt::get(intTy, pointNum++),
>> > + ConstantInt::get(intTy, profilingType)});
>> >  }
>> >  /* We insert one store_profiling at the end of the last block to
>> hold the
>> > place. */
>> >  llvm::Function::iterator BE = F.end(); @@ -190,12 +190,12 @@
>> namespace
>> > gbe
>> >  BasicBlock::iterator retInst = BE->end();
>> >  retInst--;
>> >  builder->SetInsertPoint(retInst);
>> > -builder->CreateCall2(cast(module-
>> >getOrInsertFunction(
>> > +
>> >

[Beignet] [PATCH] Use CreateCall instead of CreateCall2 because llvm3.7 has removed CreateCall2.

2015-11-18 Thread Yan Wang
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_profiling.cpp | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/backend/src/llvm/llvm_profiling.cpp 
b/backend/src/llvm/llvm_profiling.cpp
index 8c9157c..e548305 100644
--- a/backend/src/llvm/llvm_profiling.cpp
+++ b/backend/src/llvm/llvm_profiling.cpp
@@ -177,12 +177,21 @@ namespace gbe
   builder->SetInsertPoint(instI);
   /* Add the timestamp store function call. */
   // __gen_ocl_store_timestamp(int nth, int type);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+  builder->CreateCall(cast(module->getOrInsertFunction(
+  "__gen_ocl_calc_timestamp", 
Type::getVoidTy(module->getContext()),
+  IntegerType::getInt32Ty(module->getContext()),
+  IntegerType::getInt32Ty(module->getContext()),
+  NULL)),
+  /* the args */ {ConstantInt::get(intTy, pointNum++), 
ConstantInt::get(intTy, profilingType)});
+#else
   builder->CreateCall2(cast(module->getOrInsertFunction(
   "__gen_ocl_calc_timestamp", 
Type::getVoidTy(module->getContext()),
   IntegerType::getInt32Ty(module->getContext()),
   IntegerType::getInt32Ty(module->getContext()),
   NULL)),
   /* the args */ ConstantInt::get(intTy, pointNum++), 
ConstantInt::get(intTy, profilingType));
+#endif
 }
 /* We insert one store_profiling at the end of the last block to hold the 
place. */
 llvm::Function::iterator BE = F.end();
@@ -190,12 +199,21 @@ namespace gbe
 BasicBlock::iterator retInst = BE->end();
 retInst--;
 builder->SetInsertPoint(retInst);
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
+builder->CreateCall(cast(module->getOrInsertFunction(
+"__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
+ptrTy,
+IntegerType::getInt32Ty(module->getContext()),
+NULL)),
+/* the args */{profilingBuf, ConstantInt::get(intTy, profilingType)});
+#else
 builder->CreateCall2(cast(module->getOrInsertFunction(
 "__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
 ptrTy,
 IntegerType::getInt32Ty(module->getContext()),
 NULL)),
 /* the args */profilingBuf, ConstantInt::get(intTy, profilingType));
+#endif
 
 delete builder;
 return changed;
-- 
1.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH] Use CreateCall instead of CreateCall2 because llvm3.7 has removed CreateCall2.

2015-11-18 Thread yan . wang
>
>
>> -Original Message-
>> From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf
>> Of
>> Yan Wang
>> Sent: Wednesday, November 18, 2015 6:47 PM
>> To: beignet@lists.freedesktop.org
>> Cc: Yan Wang <yan.w...@linux.intel.com>
>> Subject: [Beignet] [PATCH] Use CreateCall instead of CreateCall2
>> because
>> llvm3.7 has removed CreateCall2.
>>
>> Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
>> ---
>>  backend/src/llvm/llvm_profiling.cpp | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/backend/src/llvm/llvm_profiling.cpp
>> b/backend/src/llvm/llvm_profiling.cpp
>> index 8c9157c..e548305 100644
>> --- a/backend/src/llvm/llvm_profiling.cpp
>> +++ b/backend/src/llvm/llvm_profiling.cpp
>> @@ -177,12 +177,21 @@ namespace gbe
>>builder->SetInsertPoint(instI);
>>/* Add the timestamp store function call. */
>>// __gen_ocl_store_timestamp(int nth, int type);
>> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
>> +
>> builder->CreateCall(cast(module->getOrInsertFunction(
>> +  "__gen_ocl_calc_timestamp", Type::getVoidTy(module-
>> >getContext()),
>> +  IntegerType::getInt32Ty(module->getContext()),
>> +  IntegerType::getInt32Ty(module->getContext()),
>> +  NULL)),
>> +  /* the args */ {ConstantInt::get(intTy, pointNum++),
>> ConstantInt::get(intTy, profilingType)});
>
> I think you can directly remove the CreateCall2 version.
> LLVM supports CreateCall(Value *Callee, ArrayRef<Value *> Args = None,
> const Twine &Name = "") across all LLVM versions.
> So, let's keep code simple.
>
> Thanks!
> Ruiling

Sure. I will resubmit it after modifying it.
Thanks.

Yan Wang

>
>> +#else
>>builder->CreateCall2(cast(module-
>> >getOrInsertFunction(
>>"__gen_ocl_calc_timestamp", Type::getVoidTy(module-
>> >getContext()),
>>IntegerType::getInt32Ty(module->getContext()),
>>IntegerType::getInt32Ty(module->getContext()),
>>NULL)),
>>/* the args */ ConstantInt::get(intTy, pointNum++),
>> ConstantInt::get(intTy, profilingType));
>> +#endif
>>  }
>>  /* We insert one store_profiling at the end of the last block to
>> hold the
>> place. */
>>  llvm::Function::iterator BE = F.end();
>> @@ -190,12 +199,21 @@ namespace gbe
>>  BasicBlock::iterator retInst = BE->end();
>>  retInst--;
>>  builder->SetInsertPoint(retInst);
>> +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 7
>> +
>> builder->CreateCall(cast(module->getOrInsertFunction(
>> +"__gen_ocl_store_profiling", Type::getVoidTy(module-
>> >getContext()),
>> +ptrTy,
>> +IntegerType::getInt32Ty(module->getContext()),
>> +NULL)),
>> +/* the args */{profilingBuf, ConstantInt::get(intTy,
>> profilingType)});
>> +#else
>>  builder->CreateCall2(cast(module->getOrInsertFunction(
>>  "__gen_ocl_store_profiling",
>> Type::getVoidTy(module->getContext()),
>>  ptrTy,
>>  IntegerType::getInt32Ty(module->getContext()),
>>  NULL)),
>>  /* the args */profilingBuf, ConstantInt::get(intTy,
>> profilingType));
>> +#endif
>>
>>  delete builder;
>>  return changed;
>> --
>> 1.9.3
>>
>> ___
>> Beignet mailing list
>> Beignet@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/beignet
>

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2] Use CreateCall instead of CreateCall2.

2015-11-18 Thread Yan Wang
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/llvm/llvm_profiling.cpp | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/src/llvm/llvm_profiling.cpp 
b/backend/src/llvm/llvm_profiling.cpp
index 8c9157c..3fbd00d 100644
--- a/backend/src/llvm/llvm_profiling.cpp
+++ b/backend/src/llvm/llvm_profiling.cpp
@@ -177,12 +177,12 @@ namespace gbe
   builder->SetInsertPoint(instI);
   /* Add the timestamp store function call. */
   // __gen_ocl_store_timestamp(int nth, int type);
-  builder->CreateCall2(cast(module->getOrInsertFunction(
+  builder->CreateCall(cast(module->getOrInsertFunction(
   "__gen_ocl_calc_timestamp", 
Type::getVoidTy(module->getContext()),
   IntegerType::getInt32Ty(module->getContext()),
   IntegerType::getInt32Ty(module->getContext()),
   NULL)),
-  /* the args */ ConstantInt::get(intTy, pointNum++), 
ConstantInt::get(intTy, profilingType));
+  /* the args */ {ConstantInt::get(intTy, pointNum++), 
ConstantInt::get(intTy, profilingType)});
 }
 /* We insert one store_profiling at the end of the last block to hold the 
place. */
 llvm::Function::iterator BE = F.end();
@@ -190,12 +190,12 @@ namespace gbe
 BasicBlock::iterator retInst = BE->end();
 retInst--;
 builder->SetInsertPoint(retInst);
-builder->CreateCall2(cast(module->getOrInsertFunction(
+builder->CreateCall(cast(module->getOrInsertFunction(
 "__gen_ocl_store_profiling", Type::getVoidTy(module->getContext()),
 ptrTy,
 IntegerType::getInt32Ty(module->getContext()),
 NULL)),
-/* the args */profilingBuf, ConstantInt::get(intTy, profilingType));
+/* the args */{profilingBuf, ConstantInt::get(intTy, profilingType)});
 
 delete builder;
 return changed;
-- 
2.5.0

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix loop condition of PrintfSet constructor.

2015-01-08 Thread Yan Wang
Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/printf.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index 3b176f6..cc1f8dc 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -176,7 +176,7 @@ namespace gbe
   fmts.push_back(f);
 }
 
-for (size_t i = 0; i < other.fmts.size(); ++i) {
+for (size_t i = 0; i < other.slots.size(); ++i) {
   PrintfSlot* s = other.slots[i];
   slots.push_back(s);
 }
-- 
1.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH v2] Fix PrintfState copying.

2014-12-16 Thread Yan Wang
PrintfState includes a std::string object and shouldn't be copied by
malloc/memcpy.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/printf.hpp | 33 +
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index b9f7619..9984675 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -75,6 +75,33 @@ namespace gbe
   char conversion_specifier;
   int out_buf_sizeof_offset;  // Should *global_total_size to get the full 
offset.
   std::string str;//if %s, the string store here.
+
+  PrintfState(void) {
+left_justified = 0;
+sign_symbol = 0;
+alter_form = 0;
+zero_padding = 0;
+vector_n = 0;
+min_width = 0;
+precision = 0;
+length_modifier = 0;
+conversion_specifier = 0;
+out_buf_sizeof_offset = 0;
+  }
+
+  PrintfState(const PrintfState  other) {
+left_justified = other.left_justified;
+sign_symbol = other.sign_symbol;
+alter_form = other.alter_form;
+zero_padding = other.zero_padding;
+vector_n = other.vector_n;
+min_width = other.min_width;
+precision = other.precision;
+length_modifier = other.length_modifier;
+conversion_specifier = other.conversion_specifier;
+out_buf_sizeof_offset = other.out_buf_sizeof_offset;
+str = other.str;
+  }
 };
 
 enum {
@@ -106,8 +133,7 @@ namespace gbe
 
   PrintfSlot(PrintfState * st) {
 type = PRINTF_SLOT_TYPE_STATE;
-state = (PrintfState *)malloc(sizeof(PrintfState));
-memcpy(state, st, sizeof(PrintfState));
+state = new PrintfState(*st);
   }
 
   PrintfSlot(const PrintfSlot  other) {
@@ -119,8 +145,7 @@ namespace gbe
   type = PRINTF_SLOT_TYPE_STRING;
 } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
   type = PRINTF_SLOT_TYPE_STATE;
-  state = (PrintfState *)malloc(sizeof(PrintfState));
-  memcpy(state, other.state, sizeof(PrintfState));
+  state = new PrintfState(*other.state);
 } else {
   type = PRINTF_SLOT_TYPE_NONE;
   ptr = NULL;
-- 
1.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix PrintfState copying.

2014-12-08 Thread Yan Wang
PrintfState includes a std::string object and shouldn't be copied by
malloc/memcpy.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 backend/src/ir/printf.hpp | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index b9f7619..8ea5976 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -75,6 +75,23 @@ namespace gbe
   char conversion_specifier;
   int out_buf_sizeof_offset;  // Should *global_total_size to get the full 
offset.
   std::string str;//if %s, the string store here.
+
+  PrintfState(void) {
+  }
+
+  PrintfState(const PrintfState  other) {
+left_justified = other.left_justified;
+sign_symbol = other.sign_symbol;
+alter_form = other.alter_form;
+zero_padding = other.zero_padding;
+vector_n = other.vector_n;
+min_width = other.min_width;
+precision = other.precision;
+length_modifier = other.length_modifier;
+conversion_specifier = other.conversion_specifier;
+out_buf_sizeof_offset = other.out_buf_sizeof_offset;
+str = other.str;
+  }
 };
 
 enum {
@@ -106,8 +123,7 @@ namespace gbe
 
   PrintfSlot(PrintfState * st) {
 type = PRINTF_SLOT_TYPE_STATE;
-state = (PrintfState *)malloc(sizeof(PrintfState));
-memcpy(state, st, sizeof(PrintfState));
+state = new PrintfState(*st);
   }
 
   PrintfSlot(const PrintfSlot  other) {
@@ -119,8 +135,7 @@ namespace gbe
   type = PRINTF_SLOT_TYPE_STRING;
 } else if (other.type == PRINTF_SLOT_TYPE_STATE) {
   type = PRINTF_SLOT_TYPE_STATE;
-  state = (PrintfState *)malloc(sizeof(PrintfState));
-  memcpy(state, other.state, sizeof(PrintfState));
+  state = new PrintfState(*other.state);
 } else {
   type = PRINTF_SLOT_TYPE_NONE;
   ptr = NULL;
-- 
1.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix delete operator using.

2014-12-08 Thread Yan Wang
The two delete operators are applied to array pointers, so use delete[].

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 utests/compiler_fill_gl_image.cpp | 2 +-
 utests/utest_helper.cpp   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/utests/compiler_fill_gl_image.cpp 
b/utests/compiler_fill_gl_image.cpp
index 87d2fcd..f1eb8e7 100644
--- a/utests/compiler_fill_gl_image.cpp
+++ b/utests/compiler_fill_gl_image.cpp
@@ -70,7 +70,7 @@ static void compiler_fill_gl_image(void)
 for (uint32_t i = 0; i  w; i++)
   OCL_ASSERT(resultColor[j * w + i] == color);
   OCL_UNMAP_BUFFER(0);
-  delete resultColor;
+  delete[] resultColor;
 }
 
 MAKE_UTEST_FROM_FUNCTION(compiler_fill_gl_image);
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index 606c1bf..591054e 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -401,7 +401,7 @@ cl_ocl_init(void)
 
 error:
   if (props)
-delete props;
+delete[] props;
   return status;
 }
 
-- 
1.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix based on piglit OpenCL failed case (cl-program-tester).

2014-12-02 Thread Yan Wang
Fix tests/cl/program/build/optimization-options-cl10.cl.
After calling check_cl_version_option, the -cl-std=CLX.X option should be
removed. This option is not accepted by the subsequent
processing.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_program.c | 50 ++
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/src/cl_program.c b/src/cl_program.c
index c30f85e..07de15b 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -438,7 +438,7 @@ error:
 
 /* Before we do the real work, we need to check whether our platform
cl version can meet -cl-std= */
-static int check_cl_version_option(cl_program p, const char* options) {
+static int check_cl_version_option(cl_program p, const char* options, char** 
fopt) {
   const char* s = NULL;
   int ver1 = 0;
   int ver2 = 0;
@@ -467,16 +467,25 @@ static int check_cl_version_option(cl_program p, const 
char* options) {
 if (ver2  ver1)
   return 0;
 
+TRY_ALLOC_NO_ERR (*fopt, cl_calloc(strlen(options) + 1, sizeof(char)));
+memcpy(*fopt, options, s - options);
+if (s + strlen(-cl-std=CLX.X)  options + strlen(options))
+  memcpy((*fopt) + (s - options), s + strlen(-cl-std=CLX.X),
+options + strlen(options) - s - strlen(-cl-std=CLX.X));
+
 return 1;
   }
 
   return 1;
+error:
+  return 0;
 }
 
 LOCAL cl_int
 cl_program_build(cl_program p, const char *options)
 {
   cl_int err = CL_SUCCESS;
+  char* filter_options = NULL;
   int i = 0;
   int copyed = 0;
 
@@ -485,7 +494,7 @@ cl_program_build(cl_program p, const char *options)
 goto error;
   }
 
-  if (!check_cl_version_option(p, options)) {
+  if (!check_cl_version_option(p, options, filter_options)) {
 err = CL_BUILD_PROGRAM_FAILURE;
 goto error;
   }
@@ -495,8 +504,12 @@ cl_program_build(cl_program p, const char *options)
 cl_free(p-build_opts);
 p-build_opts = NULL;
   }
-  TRY_ALLOC (p-build_opts, cl_calloc(strlen(options) + 1, sizeof(char)));
-  memcpy(p-build_opts, options, strlen(options));
+  if (filter_options) {
+p-build_opts = filter_options; 
+  } else {
+TRY_ALLOC (p-build_opts, cl_calloc(strlen(options) + 1, 
sizeof(char)));
+memcpy(p-build_opts, options, strlen(options));
+  }
 
   p-source_type = p-source ? FROM_SOURCE : p-binary ? FROM_BINARY : 
FROM_LLVM;
 }
@@ -515,7 +528,8 @@ cl_program_build(cl_program p, const char *options)
   goto error;
 }
 
-p-opaque = compiler_program_new_from_source(p-ctx-device-vendor_id, 
p-source, p-build_log_max_sz, options, p-build_log, p-build_log_sz);
+p-opaque = compiler_program_new_from_source(p-ctx-device-vendor_id, 
p-source, p-build_log_max_sz,
+  filter_options ? filter_options : options, p-build_log, 
p-build_log_sz);
 if (UNLIKELY(p-opaque == NULL)) {
   if (p-build_log_sz  0  strstr(p-build_log, error: error reading 
'options'))
 err = CL_INVALID_BUILD_OPTIONS;
@@ -532,7 +546,8 @@ cl_program_build(cl_program p, const char *options)
   goto error;
 }
 
-compiler_program_build_from_llvm(p-opaque, p-build_log_max_sz, 
p-build_log, p-build_log_sz, options);
+compiler_program_build_from_llvm(p-opaque, p-build_log_max_sz, 
p-build_log, p-build_log_sz,
+  filter_options ? filter_options : options);
 if (UNLIKELY(p-opaque == NULL)) {
   if (p-build_log_sz  0  strstr(p-build_log, error: error reading 
'options'))
 err = CL_INVALID_BUILD_OPTIONS;
@@ -587,9 +602,10 @@ cl_program_link(cl_contextcontext,
   cl_int err = CL_SUCCESS;
   cl_int i = 0;
   int copyed = 0;
+  char* filter_options = NULL;
   p = cl_program_new(context);
 
-  if (!check_cl_version_option(p, options)) {
+  if (!check_cl_version_option(p, options, filter_options)) {
 err = CL_BUILD_PROGRAM_FAILURE;
 goto error;
   }
@@ -614,7 +630,8 @@ cl_program_link(cl_contextcontext,
 p-binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
   }
 
-  compiler_program_build_from_llvm(p-opaque, p-build_log_max_sz, 
p-build_log, p-build_log_sz, options);
+  compiler_program_build_from_llvm(p-opaque, p-build_log_max_sz, 
p-build_log, p-build_log_sz,
+filter_options ? filter_options : options);
 
   /* Create all the kernels */
   TRY (cl_program_load_gen_program, p);
@@ -633,6 +650,8 @@ cl_program_link(cl_contextcontext,
 copyed += sz;
   }
 done:
+  if (filter_options)
+cl_free(filter_options);
   p-is_built = 1;
   p-build_status = CL_BUILD_SUCCESS;
   if (errcode_ret)
@@ -640,6 +659,8 @@ done:
   return p;
 
 error:
+  if (filter_options)
+cl_free(filter_options);
   p-build_status = CL_BUILD_ERROR;
   if (errcode_ret)
 *errcode_ret = err;
@@ -655,13 +676,14 @@ cl_program_compile(cl_programp,
 {
   cl_int err = CL_SUCCESS;
   int i = 0;
+  char* filter_options = NULL;
 
   if (p-ref_n  1) {
 err = CL_INVALID_OPERATION;
 goto error;
   }
 
-  if (!check_cl_version_option(p

Re: [Beignet] [PATCH] Implement cl_khr_image2d_from_buffer extension.

2014-12-02 Thread yan . wang
Thanks for your review.
I agree with your points.

Yan Wang

 Thanks for the patch. But this implementation may hurt
 performance. The major reason is that the image has some
 special layout requirements, and if we want to create an
 image from exactly the pitch * height buffer object, we
 have to introduce an extra copy here for all cases.
 This makes things even worse if the buffer is created
 with CL_MEM_USE_HOST_PTR.

 Before we find a way to eliminate the above overhead for this
 extension, I prefer to not include it in beignet.
 What do you think?

 On Tue, Nov 25, 2014 at 07:07:13PM +0800, Yan Wang wrote:
 Implement cl_khr_image2d_from_buffer extension.

 ---
  CMakeLists.txt |   2 +
  kernels/image_2D_buffer.cl |  15 +
  src/CMakeLists.txt |   5 ++
  src/cl_api.c   |   9 +++
  src/cl_device_id.c |   4 ++
  src/cl_device_id.h |   4 ++
  src/cl_extensions.c|   2 +-
  src/cl_gt_device.h |   4 ++
  src/cl_mem.c   | 156
 +++--
  utests/CMakeLists.txt  |   6 ++
  utests/image_2D_buffer.cpp |  89 ++
  11 files changed, 290 insertions(+), 6 deletions(-)
  create mode 100644 kernels/image_2D_buffer.cl
  create mode 100644 utests/image_2D_buffer.cpp

 diff --git a/CMakeLists.txt b/CMakeLists.txt
 index 49c8929..5ca7d90 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -189,6 +189,8 @@ ELSE(OCLIcd_FOUND)
MESSAGE(STATUS Looking for OCL ICD header file - not found)
  ENDIF(OCLIcd_FOUND)

 +SET (OCL_IMAGE2D_BUFFER true)
 +
  Find_Package(PythonInterp)

  ADD_SUBDIRECTORY(include)
 diff --git a/kernels/image_2D_buffer.cl b/kernels/image_2D_buffer.cl
 new file mode 100644
 index 000..6b9060c
 --- /dev/null
 +++ b/kernels/image_2D_buffer.cl
 @@ -0,0 +1,15 @@
 +__kernel void image_2D_buffer(image2d_t image1, image2d_t image2,
 sampler_t sampler, __global int *results)
 +{
 +   int x = get_global_id(0);
 +   int y = get_global_id(1);
 +   int w = get_image_width(image1);
 +   int offset = mad24(y, w, x);
 +
 +   int4 pix = read_imagei(image1, (int2)(x, y));
 +   int4 test = (pix != read_imagei(image2, sampler, (int2)(x, y)));
 +
 +   if (test.x || test.y || test.z || test.w)
 +  results[offset] = 0;
 +   else
 +  results[offset] = 1;
 +}
 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
 index 7182bad..3ca5f1f 100644
 --- a/src/CMakeLists.txt
 +++ b/src/CMakeLists.txt
 @@ -114,6 +114,11 @@ SET(CMAKE_CXX_FLAGS -DHAS_USERPTR
 ${CMAKE_CXX_FLAGS})
  SET(CMAKE_C_FLAGS -DHAS_USERPTR ${CMAKE_C_FLAGS})
  endif (DRM_INTEL_USERPTR)

 +if (OCL_IMAGE2D_BUFFER)
 +SET(CMAKE_CXX_FLAGS -DHAS_OCLImage2dBuffer ${CMAKE_CXX_FLAGS})
 +SET(CMAKE_C_FLAGS -DHAS_OCLImage2dBuffer ${CMAKE_C_FLAGS})
 +endif (OCL_IMAGE2D_BUFFER)
 +
  set(GIT_SHA1 git_sha1.h)
  add_custom_target(${GIT_SHA1} ALL
COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
 diff --git a/src/cl_api.c b/src/cl_api.c
 index 972c687..04095a2 100644
 --- a/src/cl_api.c
 +++ b/src/cl_api.c
 @@ -548,6 +548,14 @@ clCreateImage(cl_context context,
  err = CL_INVALID_IMAGE_DESCRIPTOR;
  goto error;
}
 +#ifdef HAS_OCLImage2dBuffer
 +  if ((image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER 
 +  image_desc-image_type != CL_MEM_OBJECT_IMAGE2D) 
 + image_desc-buffer) {
 +err = CL_INVALID_IMAGE_DESCRIPTOR;
 +goto error;
 +  }
 +#else
/* buffer refers to a valid buffer memory object if image_type is
   CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
if (image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER 
 @@ -555,6 +563,7 @@ clCreateImage(cl_context context,
  err = CL_INVALID_IMAGE_DESCRIPTOR;
  goto error;
}
 +#endif
if (image_desc-num_mip_levels || image_desc-num_samples) {
  err = CL_INVALID_IMAGE_DESCRIPTOR;
  goto error;
 diff --git a/src/cl_device_id.c b/src/cl_device_id.c
 index 5ef0bde..c47d48c 100644
 --- a/src/cl_device_id.c
 +++ b/src/cl_device_id.c
 @@ -571,6 +571,10 @@ cl_get_device_info(cl_device_id device,
  DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
  DECL_FIELD(PARTITION_TYPE, partition_type)
  DECL_FIELD(REFERENCE_COUNT, device_reference_count)
 +#ifdef HAS_OCLImage2dBuffer
 +DECL_FIELD(IMAGE_PITCH_ALIGNMENT, image_pitch_alignment)
 +DECL_FIELD(IMAGE_BASE_ADDRESS_ALIGNMENT,
 image_base_address_alignment)
 +#endif

  case CL_DRIVER_VERSION:
if (param_value_size_ret) {
 diff --git a/src/cl_device_id.h b/src/cl_device_id.h
 index ee6a8e6..8d8adac 100644
 --- a/src/cl_device_id.h
 +++ b/src/cl_device_id.h
 @@ -113,6 +113,10 @@ struct _cl_device_id {
cl_device_affinity_domainaffinity_domain;
cl_device_partition_property partition_type[3];
cl_uint  device_reference_count;
 +#ifdef HAS_OCLImage2dBuffer
 +  cl_uint  image_pitch_alignment;
 +  cl_uint  image_base_address_alignment;
 +#endif
  };

  /* Get a device from the given platform */
 diff --git

[Beignet] [PATCH] Fix based on piglit OpenCL failed case (cl-api-compile-program).

2014-12-01 Thread Yan Wang
1. Return the expected error code.
2. Don't destroy the cl_program object after a compile error because it
may still be used in the future.

Signed-off-by: Yan Wang <yan.w...@linux.intel.com>
---
 src/cl_program.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/cl_program.c b/src/cl_program.c
index fa67ef2..c30f85e 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -738,9 +738,9 @@ cl_program_compile(cl_programp,
 
 if (UNLIKELY(p-opaque == NULL)) {
   if (p-build_log_sz  0  strstr(p-build_log, error: error reading 
'options'))
-err = CL_INVALID_BUILD_OPTIONS;
+err = CL_INVALID_COMPILER_OPTIONS;
   else
-err = CL_BUILD_PROGRAM_FAILURE;
+err = CL_COMPILE_PROGRAM_FAILURE;
   goto error;
 }
 
@@ -758,8 +758,6 @@ cl_program_compile(cl_programp,
 
 error:
   p-build_status = CL_BUILD_ERROR;
-  cl_program_delete(p);
-  p = NULL;
   return err;
 }
 
-- 
1.9.3

___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Implement cl_khr_image2d_from_buffer extension.

2014-11-25 Thread Yan Wang
---
 CMakeLists.txt |   2 +
 kernels/image_2D_buffer.cl |  15 +
 src/CMakeLists.txt |   5 ++
 src/cl_api.c   |   9 +++
 src/cl_device_id.c |   4 ++
 src/cl_device_id.h |   4 ++
 src/cl_extensions.c|   2 +-
 src/cl_gt_device.h |   4 ++
 src/cl_mem.c   | 156 +++--
 utests/CMakeLists.txt  |   6 ++
 utests/image_2D_buffer.cpp |  89 ++
 11 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 kernels/image_2D_buffer.cl
 create mode 100644 utests/image_2D_buffer.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49c8929..5ca7d90 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -189,6 +189,8 @@ ELSE(OCLIcd_FOUND)
   MESSAGE(STATUS Looking for OCL ICD header file - not found)
 ENDIF(OCLIcd_FOUND)
 
+SET (OCL_IMAGE2D_BUFFER true)
+
 Find_Package(PythonInterp)
 
 ADD_SUBDIRECTORY(include)
diff --git a/kernels/image_2D_buffer.cl b/kernels/image_2D_buffer.cl
new file mode 100644
index 000..6b9060c
--- /dev/null
+++ b/kernels/image_2D_buffer.cl
@@ -0,0 +1,15 @@
+__kernel void image_2D_buffer(image2d_t image1, image2d_t image2, sampler_t 
sampler, __global int *results)
+{
+   int x = get_global_id(0);
+   int y = get_global_id(1);
+   int w = get_image_width(image1);
+   int offset = mad24(y, w, x);
+
+   int4 pix = read_imagei(image1, (int2)(x, y));
+   int4 test = (pix != read_imagei(image2, sampler, (int2)(x, y)));
+
+   if (test.x || test.y || test.z || test.w)
+  results[offset] = 0;
+   else
+  results[offset] = 1;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7182bad..3ca5f1f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -114,6 +114,11 @@ SET(CMAKE_CXX_FLAGS -DHAS_USERPTR ${CMAKE_CXX_FLAGS})
 SET(CMAKE_C_FLAGS -DHAS_USERPTR ${CMAKE_C_FLAGS})
 endif (DRM_INTEL_USERPTR)
 
+if (OCL_IMAGE2D_BUFFER)
+SET(CMAKE_CXX_FLAGS -DHAS_OCLImage2dBuffer ${CMAKE_CXX_FLAGS})
+SET(CMAKE_C_FLAGS -DHAS_OCLImage2dBuffer ${CMAKE_C_FLAGS})
+endif (OCL_IMAGE2D_BUFFER)
+
 set(GIT_SHA1 git_sha1.h)
 add_custom_target(${GIT_SHA1} ALL
   COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
diff --git a/src/cl_api.c b/src/cl_api.c
index 972c687..04095a2 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -548,6 +548,14 @@ clCreateImage(cl_context context,
 err = CL_INVALID_IMAGE_DESCRIPTOR;
 goto error;
   }
+#ifdef HAS_OCLImage2dBuffer
+  if ((image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER 
+  image_desc-image_type != CL_MEM_OBJECT_IMAGE2D) 
+ image_desc-buffer) {
+err = CL_INVALID_IMAGE_DESCRIPTOR;
+goto error;
+  }
+#else
   /* buffer refers to a valid buffer memory object if image_type is
  CL_MEM_OBJECT_IMAGE1D_BUFFER. Otherwise it must be NULL. */
   if (image_desc-image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER 
@@ -555,6 +563,7 @@ clCreateImage(cl_context context,
 err = CL_INVALID_IMAGE_DESCRIPTOR;
 goto error;
   }
+#endif
   if (image_desc-num_mip_levels || image_desc-num_samples) {
 err = CL_INVALID_IMAGE_DESCRIPTOR;
 goto error;
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index 5ef0bde..c47d48c 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -571,6 +571,10 @@ cl_get_device_info(cl_device_id device,
 DECL_FIELD(PARTITION_AFFINITY_DOMAIN, affinity_domain)
 DECL_FIELD(PARTITION_TYPE, partition_type)
 DECL_FIELD(REFERENCE_COUNT, device_reference_count)
+#ifdef HAS_OCLImage2dBuffer
+DECL_FIELD(IMAGE_PITCH_ALIGNMENT, image_pitch_alignment)
+DECL_FIELD(IMAGE_BASE_ADDRESS_ALIGNMENT, image_base_address_alignment)
+#endif
 
 case CL_DRIVER_VERSION:
   if (param_value_size_ret) {
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index ee6a8e6..8d8adac 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -113,6 +113,10 @@ struct _cl_device_id {
   cl_device_affinity_domainaffinity_domain;
   cl_device_partition_property partition_type[3];
   cl_uint  device_reference_count;
+#ifdef HAS_OCLImage2dBuffer
+  cl_uint  image_pitch_alignment;
+  cl_uint  image_base_address_alignment;
+#endif
 };
 
 /* Get a device from the given platform */
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index d07a525..e31386f 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -34,7 +34,7 @@ void check_opt1_extension(cl_extensions_t *extensions)
 {
   int id;
   for(id = OPT1_EXT_START_ID; id = OPT1_EXT_END_ID; id++)
-if (id == EXT_ID(khr_icd))
+if (id == EXT_ID(khr_icd) || id == EXT_ID(khr_image2d_from_buffer))
   extensions-extensions[id].base.ext_enabled = 1;
 }
 
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 37abfd2..f9c5ad4 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -124,4 +124,8 @@ DECL_INFO_STRING(driver_version, 
LIBCL_DRIVER_VERSION_STRING)
 .affinity_domain = 0,
 .partition_type = {0},
 .device_reference_count = 1,
+#ifdef HAS_OCLImage2dBuffer

  1   2   >