[brlcad-devel] OpenCL Boolean Evaluation Status Report

Param Hanji Sat, 13 Aug 2016 11:30:39 -0700

Hi,

For GSoC this year I've worked to improve OpenCL support for BRL-CAD. I
started off with accelerating a few of the primitives.


This was basically writing kernel code for the shoot algorithms for various
primitives. I had accelerated EPA just before GSoC began. This was followed
up with ETO and then PART. PART in particular had a few changes since it
had goto statements.

https://sourceforge.net/p/brlcad/patches/435/
https://sourceforge.net/p/brlcad/patches/444/
https://sourceforge.net/p/brlcad/patches/446/

All of them were accepted (after a few changes).

Then I moved on to incorporate boolean evaluation on the GPU side so that
complex figures can be generated. We decided that it was a good idea to
first have an interface to call the serial boolweave() and boolfinal()
functions using GPU generated segments as inputs. This yielded mixed
results.

I'm pretty confident weaving happened correctly, but the final evaluation
failed. This was sort of confirmed by PGM files generated (showing number
of segments and partitions for each pixel).  PFA a patch of my attempt.

Over the past couple of weeks (and the next week) I've focused on
implementing boolweave() on the GPU. I've fixed all the compile errors,
leaving run time debugging for the upcoming week. Unfortunately, I've run
out of time and can work on the kernel bool_eval() only after GSoC.

You can find my daily development logs here:

https://catchchaos.wordpress.com/

Best,
Param

Index: include/rt/shoot.h
===================================================================
--- include/rt/shoot.h	(revision 68251)
+++ include/rt/shoot.h	(working copy)
@@ -178,10 +178,10 @@
 
 RT_EXPORT extern void
 clt_frame(void *pixels, uint8_t o[3], int cur_pixel, int last_pixel,
-	  int width, int ibackground[3], int inonbackground[3],
+	  int width, int height, int ibackground[3], int inonbackground[3],
 	  double airdensity, double haze[3], fastf_t gamma,
           mat_t view2model, fastf_t cell_width, fastf_t cell_height,
-          fastf_t aspect, int lightmodel);
+          fastf_t aspect, int lightmodel, struct application *ap);
 #endif
 
 
Index: src/librt/primitives/primitive_util.c
===================================================================
--- src/librt/primitives/primitive_util.c	(revision 68251)
+++ src/librt/primitives/primitive_util.c	(working copy)
@@ -24,6 +24,8 @@
  * librt_private.h.
  */
 
+#include <stdbool.h>
+
 #include "bu/malloc.h"
 #include "bu/opt.h"
 #include "optical.h"
@@ -814,10 +816,10 @@
 
 void
 clt_frame(void *pixels, uint8_t o[3], int cur_pixel, int last_pixel,
-	  int width, int ibackground[3], int inonbackground[3],
+	  int width, int height, int ibackground[3], int inonbackground[3],
 	  double airdensity, double haze[3], fastf_t gamma,
           mat_t view2model, fastf_t cell_width, fastf_t cell_height,
-          fastf_t aspect, int lightmodel)
+          fastf_t aspect, int lightmodel, struct application *ap)
 {
     const size_t npix = last_pixel-cur_pixel+1;
 
@@ -884,6 +886,8 @@
 	    size_t sz_segs;
 	    cl_mem psegs;
 	    size_t snpix = swxh[0]*swxh[1];
+	    struct cl_seg *segs;
+	    int count = 0;
 
 	    sz_counts = sizeof(cl_int)*npix;
 	    pcounts = clCreateBuffer(clt_context, CL_MEM_WRITE_ONLY|CL_MEM_HOST_READ_ONLY, sz_counts, NULL, &error);
@@ -916,20 +920,20 @@
 	    sz_h = sizeof(cl_uint)*(npix+1);
 	    h = (cl_uint*)bu_calloc(1, sz_h, "h");
 	    h[0] = 0;
+
 	    for (i=1; i<=npix; i++) {
 		BU_ASSERT((counts[i-1] % 2) == 0);
 		h[i] = h[i-1] + counts[i-1]/2;	/* number of segs is half the number of hits */
 	    }
-	    bu_free(counts, "counts");
 
 	    ph = clCreateBuffer(clt_context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, sz_h, h, &error);
 	    if (error != CL_SUCCESS) bu_bomb("failed to create OpenCL offs buffer");
 
 	    sz_segs = sizeof(struct cl_seg)*h[npix];
-	    bu_free(h, "h");
+	    segs = (struct cl_seg*)bu_malloc(sz_segs, "segments on gpu");
 
 	    if (sz_segs != 0) {
-		psegs = clCreateBuffer(clt_context, CL_MEM_READ_WRITE|CL_MEM_HOST_NO_ACCESS, sz_segs, NULL, &error);
+		psegs = clCreateBuffer(clt_context, CL_MEM_READ_WRITE, sz_segs, NULL, &error);
 		if (error != CL_SUCCESS) bu_bomb("failed to create OpenCL segs buffer");
 
 		bu_semaphore_acquire(clt_semaphore);
@@ -950,11 +954,37 @@
 		if (error != CL_SUCCESS) bu_bomb("failed to set OpenCL kernel arguments");
 		error = clEnqueueNDRangeKernel(clt_queue, clt_store_segs_kernel, 2, NULL, wxh,
 			swxh, 0, NULL, NULL);
+		error = clEnqueueReadBuffer(clt_queue, psegs, CL_TRUE, 0, sz_segs, segs, 0, NULL, NULL);
+		if (error != CL_SUCCESS) bu_bomb("failed to read OpenCL segs buffer");
 		bu_semaphore_release(clt_semaphore);
+
+		clt_boolweave_and_boolfinal(segs, ap, h, width, height);
+
+		h[0] = 0;
+
+		for (i=1; i<=npix; i++) {
+		    if (counts[i-1] != 0)
+			h[i] = h[i-1] + 1;
+		    else
+			h[i] = h[i-1];
+		}
+
+		bu_semaphore_acquire(clt_semaphore);
+		error = clEnqueueWriteBuffer(clt_queue, ph, CL_TRUE, 0, sz_h, h, 0, NULL, NULL);
+		if (error != CL_SUCCESS) bu_bomb("failed to write into OpenCL h buffer");
+		error = clEnqueueWriteBuffer(clt_queue, psegs, CL_TRUE, 0, sz_segs, segs, 0, NULL, NULL);
+		if (error != CL_SUCCESS) bu_bomb("failed to write into OpenCL segs buffer");
+		bu_semaphore_release(clt_semaphore);
+
+		bu_free(counts, "counts");
+
             } else {
 		psegs = NULL;
             }
 
+	    bu_free(segs, "segs");
+	    bu_free(h, "h");
+
 	    bu_semaphore_acquire(clt_semaphore);
 	    error = clSetKernelArg(clt_shade_segs_kernel, 0, sizeof(cl_mem), &ppixels);
 	    error |= clSetKernelArg(clt_shade_segs_kernel, 1, sizeof(cl_uchar3), &p.o);
@@ -1026,6 +1056,239 @@
     clEnqueueReadBuffer(clt_queue, ppixels, CL_TRUE, 0, sz_pixels, pixels, 0, NULL, NULL);
     clReleaseMemObject(ppixels);
 }
+
+static void
+copy_to_CPU(struct seg *cpu_seg, struct cl_seg *gpu_seg, struct application *ap)
+{
+    struct hit seg = RT_HIT_INIT_ZERO;
+    struct soltab *stp;
+    unsigned int i = 0;
+
+    cpu_seg->seg_in = seg;
+
+    cpu_seg->seg_in.hit_normal[X] = gpu_seg->seg_in.hit_normal.x;
+    cpu_seg->seg_in.hit_normal[Y] = gpu_seg->seg_in.hit_normal.y;
+    cpu_seg->seg_in.hit_normal[Z] = gpu_seg->seg_in.hit_normal.z;
+
+    cpu_seg->seg_in.hit_point[X] = gpu_seg->seg_in.hit_point.x;
+    cpu_seg->seg_in.hit_point[Y] = gpu_seg->seg_in.hit_point.y;
+    cpu_seg->seg_in.hit_point[Z] = gpu_seg->seg_in.hit_point.z;
+
+    cpu_seg->seg_in.hit_vpriv[X] = gpu_seg->seg_in.hit_vpriv.x;
+    cpu_seg->seg_in.hit_vpriv[Y] = gpu_seg->seg_in.hit_vpriv.y;
+    cpu_seg->seg_in.hit_vpriv[Z] = gpu_seg->seg_in.hit_vpriv.z;
+
+    cpu_seg->seg_in.hit_dist = gpu_seg->seg_in.hit_dist;
+    cpu_seg->seg_in.hit_surfno = gpu_seg->seg_in.hit_surfno;
+    cpu_seg->seg_in.hit_rayp = &ap->a_ray;
+
+
+    cpu_seg->seg_out = seg;
+
+    cpu_seg->seg_out.hit_normal[X] = gpu_seg->seg_out.hit_normal.x;
+    cpu_seg->seg_out.hit_normal[Y] = gpu_seg->seg_out.hit_normal.y;
+    cpu_seg->seg_out.hit_normal[Z] = gpu_seg->seg_out.hit_normal.z;
+
+    cpu_seg->seg_out.hit_point[X] = gpu_seg->seg_out.hit_point.x;
+    cpu_seg->seg_out.hit_point[Y] = gpu_seg->seg_out.hit_point.y;
+    cpu_seg->seg_out.hit_point[Z] = gpu_seg->seg_out.hit_point.z;
+
+    cpu_seg->seg_out.hit_vpriv[X] = gpu_seg->seg_out.hit_vpriv.x;
+    cpu_seg->seg_out.hit_vpriv[Y] = gpu_seg->seg_out.hit_vpriv.y;
+    cpu_seg->seg_out.hit_vpriv[Z] = gpu_seg->seg_out.hit_vpriv.z;
+
+    cpu_seg->seg_out.hit_dist = gpu_seg->seg_out.hit_dist;
+    cpu_seg->seg_out.hit_surfno = gpu_seg->seg_out.hit_surfno;
+    cpu_seg->seg_out.hit_rayp = &ap->a_ray;
+
+    cpu_seg->seg_stp = NULL;
+    RT_VISIT_ALL_SOLTABS_START(stp, ap->a_rt_i) {
+	/* Ignore "dead" solids in the list.  (They failed prep) */
+	if (stp->st_aradius <= 0) continue;
+	/* Infinite solids make the BVH construction explode. */
+	if (stp->st_aradius >= INFINITY) continue;
+
+	if (i == gpu_seg->seg_sti) {
+	    cpu_seg->seg_stp = stp;
+	    break;
+	}
+	i++;
+    } RT_VISIT_ALL_SOLTABS_END
+}
+
+static void
+copy_to_GPU(struct cl_seg *gpu_seg, struct partition *cpu_part, struct application *ap)
+{
+    struct soltab *stp;
+    unsigned int i = 0;
+
+    gpu_seg->seg_in.hit_normal.x = cpu_part->pt_inhit->hit_normal[X];
+    gpu_seg->seg_in.hit_normal.y = cpu_part->pt_inhit->hit_normal[Y];
+    gpu_seg->seg_in.hit_normal.z = cpu_part->pt_inhit->hit_normal[Z];
+
+    gpu_seg->seg_in.hit_point.x = cpu_part->pt_inhit->hit_point[X];
+    gpu_seg->seg_in.hit_point.y = cpu_part->pt_inhit->hit_point[Y];
+    gpu_seg->seg_in.hit_point.z = cpu_part->pt_inhit->hit_point[Z];
+
+    gpu_seg->seg_in.hit_vpriv.x = cpu_part->pt_inhit->hit_vpriv[X];
+    gpu_seg->seg_in.hit_vpriv.y = cpu_part->pt_inhit->hit_vpriv[Y];
+    gpu_seg->seg_in.hit_vpriv.z = cpu_part->pt_inhit->hit_vpriv[Z];
+
+    gpu_seg->seg_in.hit_dist = cpu_part->pt_inhit->hit_dist;
+    gpu_seg->seg_in.hit_surfno = cpu_part->pt_inhit->hit_surfno;
+
+
+    gpu_seg->seg_out.hit_normal.x = cpu_part->pt_outhit->hit_normal[X];
+    gpu_seg->seg_out.hit_normal.y = cpu_part->pt_outhit->hit_normal[Y];
+    gpu_seg->seg_out.hit_normal.z = cpu_part->pt_outhit->hit_normal[Z];
+
+    gpu_seg->seg_out.hit_point.x = cpu_part->pt_outhit->hit_point[X];
+    gpu_seg->seg_out.hit_point.y = cpu_part->pt_outhit->hit_point[Y];
+    gpu_seg->seg_out.hit_point.z = cpu_part->pt_outhit->hit_point[Z];
+
+    gpu_seg->seg_out.hit_vpriv.x = cpu_part->pt_outhit->hit_vpriv[X];
+    gpu_seg->seg_out.hit_vpriv.y = cpu_part->pt_outhit->hit_vpriv[Y];
+    gpu_seg->seg_out.hit_vpriv.z = cpu_part->pt_outhit->hit_vpriv[Z];
+
+    gpu_seg->seg_out.hit_dist = cpu_part->pt_outhit->hit_dist;
+    gpu_seg->seg_out.hit_surfno = cpu_part->pt_outhit->hit_surfno;
+
+    RT_VISIT_ALL_SOLTABS_START(stp, ap->a_rt_i) {
+	/* Ignore "dead" solids in the list.  (They failed prep) */
+	if (stp->st_aradius <= 0) continue;
+	/* Infinite solids make the BVH construction explode. */
+	if (stp->st_aradius >= INFINITY) continue;
+
+	if (cpu_part->pt_inseg->seg_stp == stp)
+	    gpu_seg->seg_sti = i;
+/*	for (BU_PTBL_FOR(segpp, (struct seg **), &cpu_part->pt_seglist))
+	    if ((*segpp)->seg_stp == stp) {
+		gpu_seg->seg_sti = i;
+		return;	    }*/
+	i++;
+    } RT_VISIT_ALL_SOLTABS_END
+}
+
+void
+clt_boolweave_and_boolfinal(struct cl_seg *segs, struct application *ap,
+			    cl_uint *h, int width, int height)
+{
+    size_t i, j;
+    size_t x, out = 0;
+    FILE *file;
+
+    for (i = 0; i < (size_t)(width * height); i++) {
+	struct seg waiting_segs;
+	struct seg finished_segs;
+	struct seg *temp_seg;
+	struct partition initial_part;
+	struct partition final_part;
+	struct bu_ptbl *regionbits;
+	struct resource *resp;
+	struct bu_bitv *solidbits;
+	struct rt_i *rtip;
+	struct partition *pp;
+
+	RT_AP_CHECK(ap);
+	if (ap->a_magic) {
+	    RT_CK_AP(ap);
+	} else {
+	    ap->a_magic = RT_AP_MAGIC;
+	}
+	if (ap->a_ray.magic) {
+	    RT_CK_RAY(&(ap->a_ray));
+	} else {
+	    ap->a_ray.magic = RT_RAY_MAGIC;
+	}
+	if (ap->a_resource == RESOURCE_NULL) {
+	    ap->a_resource = &rt_uniresource;
+	    if (rt_uniresource.re_magic == 0)
+		rt_init_resource(&rt_uniresource, 0, ap->a_rt_i);
+	}
+
+	resp = ap->a_resource;
+	RT_CK_RESOURCE(resp);
+	rtip = ap->a_rt_i;
+	RT_CK_RTI(rtip);
+
+	if (rtip->needprep)
+	    rt_prep_parallel(rtip, 1);	/* Stay on our CPU */
+
+	if (!resp->re_pieces && rtip->rti_nsolids_with_pieces > 0) {
+	    /* Initialize this processors 'solid pieces' state */
+	    rt_res_pieces_init(resp, rtip);
+	}
+
+	solidbits = rt_get_solidbitv(ap->a_rt_i->nsolids, resp);
+
+	initial_part.pt_forw = initial_part.pt_back = &initial_part;
+	initial_part.pt_magic = PT_HD_MAGIC;
+	final_part.pt_forw = final_part.pt_back = &final_part;
+	final_part.pt_magic = PT_HD_MAGIC;
+	ap->a_Final_Part_hdp = &final_part;
+
+	BU_LIST_INIT(&waiting_segs.l);
+	BU_LIST_INIT(&finished_segs.l);
+
+	ap->a_finished_segs_hdp = &finished_segs;
+
+	if (BU_LIST_IS_EMPTY(&resp->re_region_ptbl)) {
+	    BU_ALLOC(regionbits, struct bu_ptbl);
+	    bu_ptbl_init(regionbits, 7, "rt_shootray() regionbits ptbl");
+	} else {
+	    regionbits = BU_LIST_FIRST(bu_ptbl, &resp->re_region_ptbl);
+	    BU_LIST_DEQUEUE(&regionbits->l);
+	    BU_CK_PTBL(regionbits);
+	}
+
+	x = 0;
+	for (j = h[i]; j < h[i+1]; j++) {
+	    x++;
+	    RT_GET_SEG(temp_seg, ap->a_resource);
+    	    copy_to_CPU(temp_seg, &segs[j], ap);
+	    BU_LIST_INSERT(&(waiting_segs.l), &(temp_seg->l));
+	}
+	file = fopen("gpu_seg.pgm", "a");
+	fprintf(file, "%lu\n", x);
+	fclose(file);
+
+	if (BU_LIST_NON_EMPTY(&(waiting_segs.l))) {
+	    rt_boolweave(&finished_segs, &waiting_segs, &initial_part, ap);
+	}
+
+	x = 0;
+	for (BU_LIST_FOR(pp, partition, (struct bu_list *)&initial_part))
+	    x++;
+    	file = fopen("gpu_init_part.pgm", "a");
+	fprintf(file, "%lu\n", x);
+	fclose(file);
+
+	if (BU_LIST_NON_EMPTY(&(finished_segs.l))) {
+	    (void)rt_boolfinal(&initial_part, &final_part, regionbits, ap, solidbits);
+	}
+
+	x = 0;
+	for (BU_LIST_FOR(pp, partition, (struct bu_list *)&final_part))
+	    x++;
+    	file = fopen("gpu_final_part.pgm", "a");
+	fprintf(file, "%lu\n", x);
+	fclose(file);
+
+	for (BU_LIST_FOR(pp, partition, (struct bu_list *)&final_part)) {
+	    copy_to_GPU(&segs[out], pp, ap);
+	    out++;
+	    break;
+	}
+
+	BU_CK_PTBL(regionbits);
+	BU_LIST_APPEND(&resp->re_region_ptbl, &regionbits->l);
+
+	RT_FREE_PT_LIST(&initial_part, resp);
+	RT_FREE_SEG_LIST(&finished_segs, resp);
+	RT_FREE_PT_LIST(&final_part, resp);
+    }
+    printf("%d %d\n", height, width);
+}
 #endif

------------------------------------------------------------------------------
What NetFlow Analyzer can do for you? Monitors network bandwidth and traffic
patterns at an interface-level. Reveals which users, apps, and protocols are 
consuming the most bandwidth. Provides multi-vendor support for NetFlow, 
J-Flow, sFlow and other flows. Make informed decisions using capacity 
planning reports. http://sdm.link/zohodev2dev

_______________________________________________
BRL-CAD Developer mailing list
brlcad-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/brlcad-devel

[brlcad-devel] OpenCL Boolean Evaluation Status Report

Reply via email to