On Mon, Jul 4, 2016 at 9:33 PM, Param Hanji <param.catchch...@gmail.com>
wrote:
> On Mon, Jul 4, 2016 at 9:52 PM Vasco Alexandre da Silva Costa <
> vasco.co...@gmail.com> wrote:
>
>> On Mon, Jul 4, 2016 at 12:30 PM, Param Hanji <param.catchch...@gmail.com>
>> wrote:
>>
>>> Hi Vasco,
>>>
>>> I tried everything and still no luck. I'm pretty sure just copying
>>> inhits and outhits is not enough. Going by Sean's explanation of segments
>>> and partitions, it looks like I have no option but to implement partitions
>>> on the GPU.
>>>
>>
>> IIRC the code doesn't use partitions before calling rt_bool_*. Only
>> segments. So I don't see how this is a problem.
>> Have you examined the output lists in all stages?
>>
>>
>>
> Well to check if the bool functions are working as expected, I inserted
> code to print number of finalPartitions (after weave and boolfinal) for
> every pixel. I did this in shoot.c and primitive_util.c as well. Then I ran
> rt in serial as well as OpenCL mode.
>
> The results were very different. When boolean operations like subtraction
> were used, number of final partitions was 0 for every pixel. In ANSI C,
> this was not the case and partitions were indeed generated. This is why I
> think I'm missing something or doing something wrong.
>
> Here's a patch i just generated.
>
Hmmm. There seem to be a couple of issues here:
The boolean weaving code expects seg_in.hit_rayp and seg_out.hit_rayp to be
pointing to something. These values are mostly used for debug printfs in
bool.c with the exception of bool.c:bool_max_raynum. So I changed this to
point to a dummy struct xray.
The other issue is that the number of partitions per pixel is not
necessarily the same as the number of segments. IIRC it may be larger. I
changed the patch to only copy the first partition per pixel. This may not
be sufficient when doing multi-hit rendering.
There may still be issues with the regionbits and solidbits initialization.
Or, like you said, with a mismatch between the implementation of the
partitions and the segs that are currently used in the OpenCL shading code.
Get a sample .g file which demonstrates the issue with the subtraction
boolean op. Then we can compare the results from the OpenCL code with the
results from the ANSI C code. One simple way to do this is to dump the per
pixel partition length results to a grayscale image. e.g. you can use the
PPM P2 graymap image format which is simple to write:
https://en.wikipedia.org/wiki/Netpbm_format
This should make the result comparison easier than looking at a text dump.
--
Vasco Alexandre da Silva Costa
PhD in Computer Engineering (Computer Graphics)
Instituto Superior Técnico/University of Lisbon, Portugal
Index: include/rt/shoot.h
===================================================================
--- include/rt/shoot.h (revision 68233)
+++ include/rt/shoot.h (working copy)
@@ -178,10 +178,10 @@
RT_EXPORT extern void
clt_frame(void *pixels, uint8_t o[3], int cur_pixel, int last_pixel,
- int width, int ibackground[3], int inonbackground[3],
+ int width, int height, int ibackground[3], int inonbackground[3],
double airdensity, double haze[3], fastf_t gamma,
mat_t view2model, fastf_t cell_width, fastf_t cell_height,
- fastf_t aspect, int lightmodel);
+ fastf_t aspect, int lightmodel, struct application *ap);
#endif
Index: src/librt/primitives/primitive_util.c
===================================================================
--- src/librt/primitives/primitive_util.c (revision 68233)
+++ src/librt/primitives/primitive_util.c (working copy)
@@ -24,6 +24,8 @@
* librt_private.h.
*/
+#include <stdbool.h>
+
#include "bu/malloc.h"
#include "bu/opt.h"
#include "optical.h"
@@ -812,12 +814,235 @@
clt_db_nprims = 0;
}
+static void
+copy_to_CPU(struct seg *cpu_seg, struct cl_seg *gpu_seg, struct application *ap)
+{
+ struct hit seg = RT_HIT_INIT_ZERO;
+ struct soltab *stp;
+ unsigned int i = 0;
+
+ cpu_seg->seg_in = seg;
+
+ cpu_seg->seg_in.hit_normal[X] = gpu_seg->seg_in.hit_normal.x;
+ cpu_seg->seg_in.hit_normal[Y] = gpu_seg->seg_in.hit_normal.y;
+ cpu_seg->seg_in.hit_normal[Z] = gpu_seg->seg_in.hit_normal.z;
+
+ cpu_seg->seg_in.hit_point[X] = gpu_seg->seg_in.hit_point.x;
+ cpu_seg->seg_in.hit_point[Y] = gpu_seg->seg_in.hit_point.y;
+ cpu_seg->seg_in.hit_point[Z] = gpu_seg->seg_in.hit_point.z;
+
+ cpu_seg->seg_in.hit_vpriv[X] = gpu_seg->seg_in.hit_vpriv.x;
+ cpu_seg->seg_in.hit_vpriv[Y] = gpu_seg->seg_in.hit_vpriv.y;
+ cpu_seg->seg_in.hit_vpriv[Z] = gpu_seg->seg_in.hit_vpriv.z;
+
+ cpu_seg->seg_in.hit_dist = gpu_seg->seg_in.hit_dist;
+ cpu_seg->seg_in.hit_surfno = gpu_seg->seg_in.hit_surfno;
+ cpu_seg->seg_in.hit_rayp = &ap->a_ray;
+
+
+ cpu_seg->seg_out = seg;
+
+ cpu_seg->seg_out.hit_normal[X] = gpu_seg->seg_out.hit_normal.x;
+ cpu_seg->seg_out.hit_normal[Y] = gpu_seg->seg_out.hit_normal.y;
+ cpu_seg->seg_out.hit_normal[Z] = gpu_seg->seg_out.hit_normal.z;
+
+ cpu_seg->seg_out.hit_point[X] = gpu_seg->seg_out.hit_point.x;
+ cpu_seg->seg_out.hit_point[Y] = gpu_seg->seg_out.hit_point.y;
+ cpu_seg->seg_out.hit_point[Z] = gpu_seg->seg_out.hit_point.z;
+
+ cpu_seg->seg_out.hit_vpriv[X] = gpu_seg->seg_out.hit_vpriv.x;
+ cpu_seg->seg_out.hit_vpriv[Y] = gpu_seg->seg_out.hit_vpriv.y;
+ cpu_seg->seg_out.hit_vpriv[Z] = gpu_seg->seg_out.hit_vpriv.z;
+
+ cpu_seg->seg_out.hit_dist = gpu_seg->seg_out.hit_dist;
+ cpu_seg->seg_out.hit_surfno = gpu_seg->seg_out.hit_surfno;
+ cpu_seg->seg_out.hit_rayp = &ap->a_ray;
+
+ cpu_seg->seg_stp = NULL;
+
+ RT_VISIT_ALL_SOLTABS_START(stp, ap->a_rt_i) {
+ /* Ignore "dead" solids in the list. (They failed prep) */
+ if (stp->st_aradius <= 0) continue;
+ /* Infinite solids make the BVH construction explode. */
+ if (stp->st_aradius >= INFINITY) continue;
+
+ if (i == gpu_seg->seg_sti) {
+ cpu_seg->seg_stp = stp;
+ return;
+ }
+ i++;
+ } RT_VISIT_ALL_SOLTABS_END
+ printf("error\n");
+}
+
+static void
+copy_to_GPU(struct cl_seg *gpu_seg, struct partition *cpu_part, struct application *ap)
+{
+ struct soltab *stp;
+ struct seg **segpp;
+ unsigned int i = 0;
+
+ gpu_seg->seg_in.hit_normal.x = cpu_part->pt_inhit->hit_normal[X];
+ gpu_seg->seg_in.hit_normal.y = cpu_part->pt_inhit->hit_normal[Y];
+ gpu_seg->seg_in.hit_normal.z = cpu_part->pt_inhit->hit_normal[Z];
+
+ gpu_seg->seg_in.hit_point.x = cpu_part->pt_inhit->hit_point[X];
+ gpu_seg->seg_in.hit_point.y = cpu_part->pt_inhit->hit_point[Y];
+ gpu_seg->seg_in.hit_point.z = cpu_part->pt_inhit->hit_point[Z];
+
+ gpu_seg->seg_in.hit_vpriv.x = cpu_part->pt_inhit->hit_vpriv[X];
+ gpu_seg->seg_in.hit_vpriv.y = cpu_part->pt_inhit->hit_vpriv[Y];
+ gpu_seg->seg_in.hit_vpriv.z = cpu_part->pt_inhit->hit_vpriv[Z];
+
+ gpu_seg->seg_in.hit_dist = cpu_part->pt_inhit->hit_dist;
+ gpu_seg->seg_in.hit_surfno = cpu_part->pt_inhit->hit_surfno;
+
+
+ gpu_seg->seg_out.hit_normal.x = cpu_part->pt_outhit->hit_normal[X];
+ gpu_seg->seg_out.hit_normal.y = cpu_part->pt_outhit->hit_normal[Y];
+ gpu_seg->seg_out.hit_normal.z = cpu_part->pt_outhit->hit_normal[Z];
+
+ gpu_seg->seg_out.hit_point.x = cpu_part->pt_outhit->hit_point[X];
+ gpu_seg->seg_out.hit_point.y = cpu_part->pt_outhit->hit_point[Y];
+ gpu_seg->seg_out.hit_point.z = cpu_part->pt_outhit->hit_point[Z];
+
+ gpu_seg->seg_out.hit_vpriv.x = cpu_part->pt_outhit->hit_vpriv[X];
+ gpu_seg->seg_out.hit_vpriv.y = cpu_part->pt_outhit->hit_vpriv[Y];
+ gpu_seg->seg_out.hit_vpriv.z = cpu_part->pt_outhit->hit_vpriv[Z];
+
+ gpu_seg->seg_out.hit_dist = cpu_part->pt_outhit->hit_dist;
+ gpu_seg->seg_out.hit_surfno = cpu_part->pt_outhit->hit_surfno;
+
+ RT_VISIT_ALL_SOLTABS_START(stp, ap->a_rt_i) {
+ /* Ignore "dead" solids in the list. (They failed prep) */
+ if (stp->st_aradius <= 0) continue;
+ /* Infinite solids make the BVH construction explode. */
+ if (stp->st_aradius >= INFINITY) continue;
+
+ for (BU_PTBL_FOR(segpp, (struct seg **), &cpu_part->pt_seglist))
+ if ((*segpp)->seg_stp == stp) {
+ gpu_seg->seg_sti = i;
+ return;
+ }
+ i++;
+ } RT_VISIT_ALL_SOLTABS_END
+ printf("error\n");
+}
+
+static void
+clt_boolweave_and_boolfinal(struct cl_seg *segs, struct application *ap,
+ cl_uint *h, int width, int height)
+{
+ size_t i, j;
+ size_t x, out = 0;
+ FILE *file;
+
+ for (i = 0; i < (size_t)(width * height); i++) {
+ struct seg waiting_segs;
+ struct seg finished_segs;
+ struct seg *s2;
+ struct partition InitialPart;
+ struct partition FinalPart;
+ struct bu_ptbl *regionbits;
+ struct resource *resp;
+ struct bu_bitv *solidbits;
+ struct rt_i *rtip;
+ struct partition *pp;
+
+ RT_AP_CHECK(ap);
+ if (ap->a_magic) {
+ RT_CK_AP(ap);
+ } else {
+ ap->a_magic = RT_AP_MAGIC;
+ }
+ if (ap->a_ray.magic) {
+ RT_CK_RAY(&(ap->a_ray));
+ } else {
+ ap->a_ray.magic = RT_RAY_MAGIC;
+ }
+ if (ap->a_resource == RESOURCE_NULL) {
+ ap->a_resource = &rt_uniresource;
+ if (rt_uniresource.re_magic == 0)
+ rt_init_resource(&rt_uniresource, 0, ap->a_rt_i);
+ }
+
+ rtip = ap->a_rt_i;
+ RT_CK_RTI(rtip);
+ resp = ap->a_resource;
+ RT_CK_RESOURCE(resp);
+
+ if (rtip->needprep)
+ rt_prep_parallel(rtip, 1); /* Stay on our CPU */
+
+ solidbits = rt_get_solidbitv(ap->a_rt_i->nsolids, resp);
+
+ InitialPart.pt_forw = InitialPart.pt_back = &InitialPart;
+ InitialPart.pt_magic = PT_HD_MAGIC;
+ FinalPart.pt_forw = FinalPart.pt_back = &FinalPart;
+ FinalPart.pt_magic = PT_HD_MAGIC;
+ ap->a_Final_Part_hdp = &FinalPart;
+
+ BU_LIST_INIT(&waiting_segs.l);
+ BU_LIST_INIT(&finished_segs.l);
+
+ ap->a_finished_segs_hdp = &finished_segs;
+
+ if (BU_LIST_IS_EMPTY(&resp->re_region_ptbl)) {
+ BU_ALLOC(regionbits, struct bu_ptbl);
+ bu_ptbl_init(regionbits, 7, "rt_shootray() regionbits ptbl");
+ } else {
+ regionbits = BU_LIST_FIRST(bu_ptbl, &resp->re_region_ptbl);
+ BU_LIST_DEQUEUE(®ionbits->l);
+ BU_CK_PTBL(regionbits);
+ }
+
+ if (!resp->re_pieces && rtip->rti_nsolids_with_pieces > 0) {
+ /* Initialize this processors 'solid pieces' state */
+ rt_res_pieces_init(resp, rtip);
+ }
+
+ for (j = h[i]; j < h[i+1]; j++) {
+ RT_GET_SEG(s2, ap->a_resource);
+ copy_to_CPU(s2, &segs[j], ap);
+ BU_LIST_INSERT(&(waiting_segs.l), &(s2->l));
+ }
+
+ if (BU_LIST_NON_EMPTY(&(waiting_segs.l))) {
+ rt_boolweave(&finished_segs, &waiting_segs, &InitialPart, ap);
+ }
+
+ if (BU_LIST_NON_EMPTY(&(finished_segs.l))) {
+ (void)rt_boolfinal(&InitialPart, &FinalPart, BACKING_DIST,
+ INFINITY, regionbits, ap, solidbits);
+ }
+
+ x = 0;
+ for (BU_LIST_FOR(pp, partition, (struct bu_list *)&FinalPart))
+ x++;
+ file = fopen("gpu.txt", "a");
+ fprintf(file, "%lu\n", x);
+ fclose(file);
+
+ for (BU_LIST_FOR(pp, partition, (struct bu_list *)&FinalPart)) {
+ copy_to_GPU(&segs[out], pp, ap);
+ out++;
+ break;
+ }
+
+ BU_CK_PTBL(regionbits);
+ BU_LIST_APPEND(&resp->re_region_ptbl, ®ionbits->l);
+
+ RT_FREE_PT_LIST(&InitialPart, resp);
+ RT_FREE_SEG_LIST(&finished_segs, resp);
+ RT_FREE_PT_LIST(&FinalPart, resp);
+ }
+}
void
clt_frame(void *pixels, uint8_t o[3], int cur_pixel, int last_pixel,
- int width, int ibackground[3], int inonbackground[3],
+ int width, int height, int ibackground[3], int inonbackground[3],
double airdensity, double haze[3], fastf_t gamma,
mat_t view2model, fastf_t cell_width, fastf_t cell_height,
- fastf_t aspect, int lightmodel)
+ fastf_t aspect, int lightmodel, struct application *ap)
{
const size_t npix = last_pixel-cur_pixel+1;
@@ -858,6 +1083,9 @@
p.lightmodel = lightmodel;
sz_pixels = sizeof(cl_uchar)*o[2]*npix;
+
+ printf("w=%d, h=%d\n", width, height);
+
ppixels = clCreateBuffer(clt_context, CL_MEM_WRITE_ONLY|CL_MEM_HOST_READ_ONLY, sz_pixels, NULL, &error);
if (error != CL_SUCCESS) bu_bomb("failed to create OpenCL pixels buffer");
@@ -884,6 +1112,7 @@
size_t sz_segs;
cl_mem psegs;
size_t snpix = swxh[0]*swxh[1];
+ struct cl_seg *segs;
sz_counts = sizeof(cl_int)*npix;
pcounts = clCreateBuffer(clt_context, CL_MEM_WRITE_ONLY|CL_MEM_HOST_READ_ONLY, sz_counts, NULL, &error);
@@ -916,20 +1145,20 @@
sz_h = sizeof(cl_uint)*(npix+1);
h = (cl_uint*)bu_calloc(1, sz_h, "h");
h[0] = 0;
+
for (i=1; i<=npix; i++) {
BU_ASSERT((counts[i-1] % 2) == 0);
h[i] = h[i-1] + counts[i-1]/2; /* number of segs is half the number of hits */
}
- bu_free(counts, "counts");
ph = clCreateBuffer(clt_context, CL_MEM_READ_ONLY|CL_MEM_HOST_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, sz_h, h, &error);
if (error != CL_SUCCESS) bu_bomb("failed to create OpenCL offs buffer");
sz_segs = sizeof(struct cl_seg)*h[npix];
- bu_free(h, "h");
+ segs = (struct cl_seg*)bu_malloc(sz_segs, "segments on gpu");
if (sz_segs != 0) {
- psegs = clCreateBuffer(clt_context, CL_MEM_READ_WRITE|CL_MEM_HOST_NO_ACCESS, sz_segs, NULL, &error);
+ psegs = clCreateBuffer(clt_context, CL_MEM_READ_WRITE, sz_segs, NULL, &error);
if (error != CL_SUCCESS) bu_bomb("failed to create OpenCL segs buffer");
bu_semaphore_acquire(clt_semaphore);
@@ -950,10 +1179,34 @@
if (error != CL_SUCCESS) bu_bomb("failed to set OpenCL kernel arguments");
error = clEnqueueNDRangeKernel(clt_queue, clt_store_segs_kernel, 2, NULL, wxh,
swxh, 0, NULL, NULL);
+
+ error = clEnqueueReadBuffer(clt_queue, psegs, CL_TRUE, 0, sz_segs, segs, 0, NULL, NULL);
+ if (error != CL_SUCCESS) bu_bomb("failed to read OpenCL segs buffer");
bu_semaphore_release(clt_semaphore);
+
+ clt_boolweave_and_boolfinal(segs, ap, h, width, height);
+
+ h[0] = 0;
+
+ for (i=1; i<=npix; i++) {
+ if (counts[i-1] != 0)
+ h[i] = h[i-1] + 1;
+ else
+ h[i] = h[i-1];
+ }
+
+ bu_semaphore_acquire(clt_semaphore);
+ error = clEnqueueWriteBuffer(clt_queue, ph, CL_TRUE, 0, sz_h, h, 0, NULL, NULL);
+ if (error != CL_SUCCESS) bu_bomb("failed to write into OpenCL h buffer");
+ error = clEnqueueWriteBuffer(clt_queue, psegs, CL_TRUE, 0, sz_segs, segs, 0, NULL, NULL);
+ if (error != CL_SUCCESS) bu_bomb("failed to write into OpenCL segs buffer");
+ bu_semaphore_release(clt_semaphore);
+
+ bu_free(counts, "counts");
} else {
psegs = NULL;
}
+ bu_free(h, "h");
bu_semaphore_acquire(clt_semaphore);
error = clSetKernelArg(clt_shade_segs_kernel, 0, sizeof(cl_mem), &ppixels);
Index: src/librt/shoot.c
===================================================================
--- src/librt/shoot.c (revision 68233)
+++ src/librt/shoot.c (working copy)
@@ -858,6 +858,9 @@
struct rt_i *rtip;
const int debug_shoot = RT_G_DEBUG & DEBUG_SHOOT;
fastf_t pending_hit = 0; /* dist of closest odd hit pending */
+ size_t x;
+ struct partition *pp;
+ FILE *file;
RT_AP_CHECK(ap);
if (ap->a_magic) {
@@ -1456,6 +1459,13 @@
INFINITY,
regionbits, ap, solidbits);
+ x = 0;
+ for (BU_LIST_FOR(pp, partition, (struct bu_list *)&FinalPart))
+ x++;
+ file = fopen("cpu.txt", "a");
+ fprintf(file, "%lu\n", x);
+ fclose(file);
+
if (FinalPart.pt_forw == &FinalPart) {
if (ap->a_miss)
ap->a_return = ap->a_miss(ap);
Index: src/rt/do.c
===================================================================
--- src/rt/do.c (revision 68233)
+++ src/rt/do.c (working copy)
@@ -593,10 +593,10 @@
pixels = (uint8_t*)bu_calloc(size, sizeof(uint8_t), "image buffer");
- clt_frame(pixels, clt_o, cur_pixel, last_pixel, width,
+ clt_frame(pixels, clt_o, cur_pixel, last_pixel, width, height,
ibackground, inonbackground,
airdensity, haze, gamma_corr, view2model, cell_width,
- cell_height, aspect, lightmodel);
+ cell_height, aspect, lightmodel, &a);
pixelp = pixels + cur_pixel*clt_o[2];
------------------------------------------------------------------------------
Attend Shape: An AT&T Tech Expo July 15-16. Meet us at AT&T Park in San
Francisco, CA to explore cutting-edge tech and listen to tech luminaries
present their vision of the future. This family event has something for
everyone, including kids. Get more information and register today.
http://sdm.link/attshape
_______________________________________________
BRL-CAD Developer mailing list
brlcad-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/brlcad-devel