Changeset: c81334022593 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c81334022593
Modified Files:
        monetdb5/extras/bwd/cl_program_utilities.c
        monetdb5/extras/bwd/operations.c
Branch: bwd
Log Message:

* [Merge] tpch q6 now gives (consistent) incorrect results, q1 causes memory 
corruption


Unterschiede (257 Zeilen):

diff --git a/monetdb5/extras/bwd/cl_program_utilities.c 
b/monetdb5/extras/bwd/cl_program_utilities.c
--- a/monetdb5/extras/bwd/cl_program_utilities.c
+++ b/monetdb5/extras/bwd/cl_program_utilities.c
@@ -82,27 +82,30 @@ cl_program getProjectionLeftjoinProgram(
                        "  __constant static const size_t targetTypeBits = 
(sizeof(accessType)*8);\n"
                        "  __constant static const unsigned int 
approximationMask = ((1<<approximationBits)-1);\n"
                        "__kernel void project (\n"
-                       "__global struct{int count; int base; unsigned int 
values[];}* outputTail,\n"
-                       "__global struct{int count; int padding; unsigned int 
positions[];}* inputTail,\n"
-                       "__global const struct{int count; int base; unsigned 
int values[];}* approximationTail\n) {\n"
+                       /* "__global struct{int count; int base; unsigned int 
values[];}* outputTail,\n" */
+                       /* "__global struct{int count; int padding; unsigned 
int positions[];}* inputTail,\n" */
+                       /* "__global const struct{int count; int base; unsigned 
int values[];}* approximationTail\n) {\n" */
+                       "__global unsigned int* outputTail,\n"
+                       "__global unsigned int* inputTail,\n"
+                       "__global const unsigned int* approximationTail\n) {\n"
                        "    const size_t index = get_global_id(0);\n"
-                       " if(index < inputTail->count){\n"
-                       "  const size_t inputIndex = 
inputTail->positions[get_global_id(0)];"
+                       " if(index < inputTail[0]){\n"
+                       "  const size_t inputIndex = 
inputTail[2+get_global_id(0)];"
                        "  size_t slot = 
(inputIndex*approximationBits)/targetTypeBits;\n"
                        "  size_t offset = 
(inputIndex*approximationBits)%targetTypeBits;\n"
-                       "    __global const unsigned int* vals = 
approximationTail->values;\n"
+                       "    __global const unsigned int* vals = 
&(approximationTail[2]);\n"
 
-               "    const unsigned int delta = (("
-               "      
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
 + 
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
-               "    + 
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
-               "      )&approximationMask);\n"
+                       "    const unsigned int delta = (("
+                       "      
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
 + 
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
+                       "    + 
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
+                       "      )&approximationMask);\n"
                        "    size_t outslot = 
(index*approximationBits)/targetTypeBits;\n"
                        "    size_t outoffset = 
(index*approximationBits)%targetTypeBits;\n"
                        "    if(outoffset+approximationBits > targetTypeBits){"
-                       "      atomic_add(&(outputTail->values[outslot]), 
(delta >> (outoffset+approximationBits-targetTypeBits)));"
-                       "      atomic_add(&(outputTail->values[outslot+1]), 
(delta << (targetTypeBits-(outoffset+approximationBits-targetTypeBits))));"
+                       "      atomic_add(&(outputTail[2+outslot]), (delta >> 
(outoffset+approximationBits-targetTypeBits)));"
+                       "      atomic_add(&(outputTail[2+outslot+1]), (delta << 
(targetTypeBits-(outoffset+approximationBits-targetTypeBits))));"
                        "    }else{"
-                       "      atomic_add(&(outputTail->values[outslot]), 
(delta << (targetTypeBits-outoffset-approximationBits)));"
+                       "      atomic_add(&(outputTail[2+outslot]), (delta << 
(targetTypeBits-outoffset-approximationBits)));"
                        "    }"
                        " }\n"
                        "}";
@@ -114,95 +117,121 @@ cl_program getProjectionLeftjoinProgram(
 }
 
 cl_program getUSelectProgram(int type, char* predicateOperation, char* 
predicateOperation2, unsigned int approximationBits, unsigned int offsetBits, 
char inputIsVoidHeaded){
-
+       
        const char* sourceCodeTemplates[] = {
-               [0] = "  __constant static const size_t targetTypeBits = 
(sizeof(targetType)*8);\n"
+               [0] = "#define accessType unsigned int\n"
+               "  __constant static const size_t targetTypeBits = 
(sizeof(targetType)*8);\n"
                "  __constant static const unsigned int approximationMask = 
((1<<approximationBits)-1);\n"
                "__kernel void uselect (\n" // non-void-headed case
-               "__global struct{int count; int padding; int positions[];}* 
outputHead,\n"
-               "__global struct{int count; int base; unsigned char values[];}* 
outputTail,\n"
-               "__global const struct{int count; int base; unsigned char 
values[];}* approximationTail,\n"
-               "__global const struct{int count; int padding; int 
positions[];}* approximationHead\n"
-               ",\n"
-               "const %1$s operand,\n"
-               "const %1$s operand2\n"
+               /* " struct HeadColumn{int count; int padding; int 
positions[];};\n" */
+               /* " struct TailColumn{int count; int base; unsigned char 
values[];} ;\n" */
+               /* "__global struct HeadColumn* outputHead,\n" */
+               /* "__global struct TailColumn* outputTail,\n" */
+               /* "__global const struct TailColumn* approximationTail,\n" */
+               /* "__global const struct HeadColumn* approximationHead\n" */
+               "__global unsigned int* outputHead,\n"
+               "__global unsigned int* outputTail,\n"
+               "__global const unsigned int* approximationTail,\n"
+               "__global const unsigned int* approximationHead,\n"
+
+               
+               "const targetType operand,\n"
+               "const targetType operand2\n"
                ") {\n"
-               " if(get_global_id(0) < approximationTail->count){\n"
-               "    __global const unsigned int* vals = 
approximationTail->values;\n"
-               " __global const unsigned char* approximation = 
approximationTail->values;"
-               "  %1$s value  = approximationTail->base;\n"
-               "  const size_t inputOffset = get_global_id(0)*%4$d;\n"
+               " if(get_global_id(0) < approximationTail[0]){\n"
+               "    __global const unsigned int* vals = 
&(approximationTail[2]);\n"
+               /* " __global const unsigned char* approximation = 
approximationTail->values;" */
+               "  targetType value  = ((__global targetType 
*)approximationTail)[1];\n"
+
                "  size_t slot = 
(get_global_id(0)*approximationBits)/targetTypeBits;\n"
-               "  size_t offset = 
(get_global_id(0)*approximationBits)%%targetTypeBits;\n"
-               
+               "  size_t offset = 
(get_global_id(0)*approximationBits)%targetTypeBits;\n"
                "    const unsigned int delta = (("
                "      
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
 + 
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
                "    + 
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
                "      )&approximationMask);\n"
                "    value += (delta<<residualBits);\n"
-
+               "  if((value firstOperator operand)"
+               "     && (secondOperatorUnset || value secondOperator operand2)"
+               "    ){\n"
+               "    const int index = atomic_inc(outputHead);\n"
+               "    atomic_inc(outputTail);\n" // TODO: this could probably be 
done more efficiently
+               "    outputHead[2+index] = 
approximationHead[2+get_global_id(0)];\n"
+               "    size_t outslot = 
(index*approximationBits)/targetTypeBits;\n"
+               "    size_t outoffset = 
(index*approximationBits)%targetTypeBits;\n"
+               "    if(outoffset+approximationBits > 8*sizeof(accessType)){"
+               /* "      outputTail[2+outslot] |= (delta << 
(approximationBits-(8*sizeof(accessType)-outoffset)));" */
+               "      atomic_add(&(outputTail[2+outslot]), (delta >> 
(8*sizeof(accessType)+approximationBits-targetTypeBits)));"
+               "      atomic_add(&(outputTail[2+outslot+1]), (delta << 
(8*sizeof(accessType)-(outoffset+approximationBits-targetTypeBits))));"         
+               "    }else{"
+               "      atomic_add(&(outputTail[2+outslot]), (delta << 
(8*sizeof(accessType)-outoffset-approximationBits)));"
+               /* "      outputTail[2+outslot] |= (delta << 
(8*sizeof(accessType)-outoffset-approximationBits));" */
+               "    }"
+               "   }\n"
+               "  } \n"
+               " }\n",
                
-               "  if((value %2$s operand)"
-               "     && (%5$d || value %3$s operand2)"
-               "    )"
-               "{\n"
-               "    const int index = atomic_inc(&(outputHead->count));\n"
-               "    atomic_inc(&(outputTail->count));\n" // TODO: this could 
probably be done more efficiently
-               "    const int offset = index * %4$d;\n"
-               "    outputHead->positions[index] = 
approximationHead->positions[get_global_id(0)];\n"
-               "    for(int i = 0; i < %4$d; i++){\n"
-               "      outputTail->values[offset+i] = approximation[inputOffset 
+ i];\n"
-               "    }\n"
-               "  } \n"
-               " }\n"
-               "}",
-
                [1] = "#define accessType unsigned int\n"
                "  __constant static const size_t targetTypeBits = 
(sizeof(targetType)*8);\n"
                "  __constant static const unsigned int approximationMask = 
((1<<approximationBits)-1);\n"
+               /* " struct HeadColumn{int count; int padding; int 
positions[];};\n" */
+               /* " struct TailColumn{int count; int base; unsigned int 
values[];};\n" */
+               
                        "__kernel void uselect (\n" // void-headed case
-               "__global struct{int count; int padding; int positions[];}* 
outputHead,\n"
-               "__global struct{int count; int base; accessType values[];}* 
outputTail,\n"
-               "__global const struct{int count; int base; unsigned char 
values[];}* approximationTail,\n"
-               "const %1$s operand,\n"
-               "const %1$s operand2\n"
+
+               /* "__global struct HeadColumn* outputHead,\n" */
+               /* "__global struct TailColumn* outputTail,\n" */
+               /* "__global const struct TailColumn* approximationTail,\n" */
+               "__global unsigned int* outputHead,\n"
+               "__global unsigned int* outputTail,\n"
+               "__global const unsigned int* approximationTail,\n"
+
+               "const targetType operand,\n"
+               "const targetType operand2\n"
                ") {\n"
-               " if(get_global_id(0) < approximationTail->count){\n"
-               "  targetType value  = approximationTail->base;\n"
+               /* " if(get_global_id(0) < approximationTail->count){\n" */
+               /* "  targetType value  = approximationTail->base;\n" */
+               " if(get_global_id(0) < approximationTail[0]){\n" // count
+               "  targetType value  = ((__global 
targetType*)approximationTail)[1];\n"//base
                "  size_t slot = 
(get_global_id(0)*approximationBits)/targetTypeBits;\n"
-               "  size_t offset = 
(get_global_id(0)*approximationBits)%%targetTypeBits;\n"
-               "    __global const unsigned int* vals = 
approximationTail->values;\n"
+               "  size_t offset = 
(get_global_id(0)*approximationBits)%targetTypeBits;\n"
+               /* "    __global const unsigned int* vals = 
approximationTail->values;\n" */
+               "  __global const unsigned int* vals = 
&(approximationTail[2]);\n"
                "    const unsigned int delta = (("
                "      
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
 + 
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
                "    + 
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
                "      )&approximationMask);\n"
                "    value += (delta<<residualBits);\n"
-               "  if((value %2$s operand)"
-               "     && (%5$d || value %3$s operand2)"
+               "  if((value firstOperator operand)"
+               "     && (secondOperatorUnset || value secondOperator operand2)"
                "    ){\n"
-               "    const int index = atomic_inc(&(outputHead->count));\n"
-               "    atomic_inc(&(outputTail->count));\n" // TODO: this could 
probably be done more efficiently
+               "    const int index = atomic_inc(outputHead);\n"
+               "    atomic_inc(outputTail);\n" // TODO: this could probably be 
done more efficiently
                "    size_t outslot = 
(index*approximationBits)/targetTypeBits;\n"
-               "    size_t outoffset = 
(index*approximationBits)%%targetTypeBits;\n"
-               "    outputHead->positions[index] = get_global_id(0);\n"
-               "    if(outoffset+approximationBits > 8*sizeof(accessType))"
-               "      outputTail->values[outslot] |= (delta << 
(approximationBits-(8*sizeof(accessType)-outoffset)));"
-               "    else"
-               "      outputTail->values[outslot] |= (delta << 
(8*sizeof(accessType)-outoffset-approximationBits));"
+               "    size_t outoffset = 
(index*approximationBits)%targetTypeBits;\n"
+               "    outputHead[2+index] = get_global_id(0);\n"
+               "    if(outoffset+approximationBits > 8*sizeof(accessType)){"
+               "      atomic_add(&(outputTail[2+outslot]), (delta >> 
(8*sizeof(accessType)+approximationBits-targetTypeBits)));"
+               "      atomic_add(&(outputTail[2+outslot+1]), (delta << 
(8*sizeof(accessType)-(outoffset+approximationBits-targetTypeBits))));"         
+
+               /* "      outputTail[2+outslot] |= (delta << 
(approximationBits-(8*sizeof(accessType)-outoffset)));" */
+               "    }else"
+                       "      atomic_add(&(outputTail[2+outslot]), (delta << 
(8*sizeof(accessType)-outoffset-approximationBits)));"
+
+               /* "      outputTail[2+outslot] |= (delta << 
(8*sizeof(accessType)-outoffset-approximationBits));" */
                "  }\n"
                " }\n"
                "}"
 
        };
-       char* sourceCode = GDKmalloc(16384);
-       snprintf(sourceCode, 16384, sourceCodeTemplates[!!inputIsVoidHeaded], 
typeNames[type], (approximationBits == 
8*typeSizes[type])?predicateOperation:(approximateOperation(predicateOperation)),
 predicateOperation2?((approximationBits == 
8*typeSizes[type])?predicateOperation2:approximateOperation(predicateOperation2)):"==",
 approximationBits/8-offsetBits/8, predicateOperation2 == NULL?1:0, 
offsetBits/8);
+       /* char* sourceCode = malloc(16384); */
+       /* snprintf(sourceCode, 16384, 
sourceCodeTemplates[!!inputIsVoidHeaded], typeNames[type], (approximationBits 
== 
8*typeSizes[type])?predicateOperation:(approximateOperation(predicateOperation)),
 predicateOperation2?((approximationBits == 
8*typeSizes[type])?predicateOperation2:approximateOperation(predicateOperation2)):"==",
 approximationBits/8-offsetBits/8, predicateOperation2 == NULL?1:0, 
offsetBits/8); */
        {
                char options[256];
                cl_program program;
-               snprintf(options, 256, "-D targetType=int -D 
approximationBits=%d -D residualBits=%lu", approximationBits, 
typeSizes[type]*8-offsetBits-approximationBits);
+               snprintf(options, 256, "-DtargetType=%s -DapproximationBits=%d 
-DresidualBits=%lu -DfirstOperator=%s -DsecondOperator=%s 
-DsecondOperatorUnset=%d", typeNames[type], approximationBits, 
typeSizes[type]*8-offsetBits-approximationBits, (approximationBits == 
8*typeSizes[type])?predicateOperation:(approximateOperation(predicateOperation)),
 predicateOperation2?((approximationBits == 
8*typeSizes[type])?predicateOperation2:approximateOperation(predicateOperation2)):"==",
 predicateOperation2 == NULL?1:0);
                /* printf ("%s\n", sourceCode); */
-               program = compileProgram(sourceCode, options);
-               free(sourceCode);
+               program = 
compileProgram(sourceCodeTemplates[!!inputIsVoidHeaded], options);
+               /* free(sourceCode); */
                return program;
        }
 }
diff --git a/monetdb5/extras/bwd/operations.c b/monetdb5/extras/bwd/operations.c
--- a/monetdb5/extras/bwd/operations.c
+++ b/monetdb5/extras/bwd/operations.c
@@ -26,7 +26,9 @@ static const int eagerBufferLoading = 1;
 #ifdef __APPLE__
 static const int WORK_GROUP_SIZE=1;
 #else
-static const int WORK_GROUP_SIZE=16;
+static const int WORK_GROUP_SIZE=128;
+const int VALUES_PER_WORK_ITEM = 1;
+extern const int VALUES_PER_WORK_ITEM;
 #endif
 #define MAX_INTERMEDIATE_RESULT_SIZE 16777216
 
@@ -206,7 +208,7 @@ str BWDLeftJoinApproximate(bat * res, ba
                        if(err) printf("#%s, clEnqueueWriteBuffer: %s;\n", 
__func__, clError(err));
                        err = clEnqueueFillBuffer(getCommandQueue(), 
slot->tailApproximation, zeroIntPattern, sizeof(int), sizeof(clTail), 
calculatedBufferSize(headCount, slot->approximationBits), 0, NULL, NULL);
                        if(err) printf("#%s, clEnqueueFillBuffer: %s;\n", 
__func__, clError(err));
-                       free(zeroIntPattern)
+                       GDKfree(zeroIntPattern);
                }
                for (i = 0; i < 3; ++i) {
                        if((err = clSetKernelArg(projectKernel, i, 
sizeof(cl_mem), &((cl_mem[]){
@@ -435,7 +437,7 @@ static inline str uselect(bat *res, bat 
                                                if((err = 
clSetKernelArg(selectKernel, bufferI+i, sizeof(int), &(parameters[i]))))  // 
type specific
                                                        printf("#%s, 
clSetKernelArg(%d): %s;\n", __func__, bufferI+i, clError(err));
 
-                                       err = 
clEnqueueNDRangeKernel(getCommandQueue(), selectKernel, 1, (const size_t[]){0}, 
(const size_t[]){ceil(dataCount/((float)WORK_GROUP_SIZE))*WORK_GROUP_SIZE}, 
(const size_t[]){WORK_GROUP_SIZE}, 0, NULL, NULL);
+                                       err = 
clEnqueueNDRangeKernel(getCommandQueue(), selectKernel, 1, (const size_t[]){0}, 
(const size_t[]){ceil(dataCount/((float)WORK_GROUP_SIZE))*(WORK_GROUP_SIZE)}, 
(const size_t[]){WORK_GROUP_SIZE}, 0, NULL, NULL);
                                        if(err)
                                                printf("#%s, 
clEnqueueNDRangeKernel: %s;\n", __func__, clError(err));
                                        if (synchronousGPU) 
clFinish(getCommandQueue());
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to