Changeset: c81334022593 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c81334022593
Modified Files:
monetdb5/extras/bwd/cl_program_utilities.c
monetdb5/extras/bwd/operations.c
Branch: bwd
Log Message:
* [Merge] tpch q6 now gives (consistent) incorrect results, q1 causes memory
corruption
Unterschiede (257 Zeilen):
diff --git a/monetdb5/extras/bwd/cl_program_utilities.c
b/monetdb5/extras/bwd/cl_program_utilities.c
--- a/monetdb5/extras/bwd/cl_program_utilities.c
+++ b/monetdb5/extras/bwd/cl_program_utilities.c
@@ -82,27 +82,30 @@ cl_program getProjectionLeftjoinProgram(
" __constant static const size_t targetTypeBits =
(sizeof(accessType)*8);\n"
" __constant static const unsigned int
approximationMask = ((1<<approximationBits)-1);\n"
"__kernel void project (\n"
- "__global struct{int count; int base; unsigned int
values[];}* outputTail,\n"
- "__global struct{int count; int padding; unsigned int
positions[];}* inputTail,\n"
- "__global const struct{int count; int base; unsigned
int values[];}* approximationTail\n) {\n"
+ /* "__global struct{int count; int base; unsigned int
values[];}* outputTail,\n" */
+ /* "__global struct{int count; int padding; unsigned
int positions[];}* inputTail,\n" */
+ /* "__global const struct{int count; int base; unsigned
int values[];}* approximationTail\n) {\n" */
+ "__global unsigned int* outputTail,\n"
+ "__global unsigned int* inputTail,\n"
+ "__global const unsigned int* approximationTail\n) {\n"
" const size_t index = get_global_id(0);\n"
- " if(index < inputTail->count){\n"
- " const size_t inputIndex =
inputTail->positions[get_global_id(0)];"
+ " if(index < inputTail[0]){\n"
+ " const size_t inputIndex =
inputTail[2+get_global_id(0)];"
" size_t slot =
(inputIndex*approximationBits)/targetTypeBits;\n"
" size_t offset =
(inputIndex*approximationBits)%targetTypeBits;\n"
- " __global const unsigned int* vals =
approximationTail->values;\n"
+ " __global const unsigned int* vals =
&(approximationTail[2]);\n"
- " const unsigned int delta = (("
- "
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
+
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
- " +
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
- " )&approximationMask);\n"
+ " const unsigned int delta = (("
+ "
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
+
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
+ " +
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
+ " )&approximationMask);\n"
" size_t outslot =
(index*approximationBits)/targetTypeBits;\n"
" size_t outoffset =
(index*approximationBits)%targetTypeBits;\n"
" if(outoffset+approximationBits > targetTypeBits){"
- " atomic_add(&(outputTail->values[outslot]),
(delta >> (outoffset+approximationBits-targetTypeBits)));"
- " atomic_add(&(outputTail->values[outslot+1]),
(delta << (targetTypeBits-(outoffset+approximationBits-targetTypeBits))));"
+ " atomic_add(&(outputTail[2+outslot]), (delta >>
(outoffset+approximationBits-targetTypeBits)));"
+ " atomic_add(&(outputTail[2+outslot+1]), (delta <<
(targetTypeBits-(outoffset+approximationBits-targetTypeBits))));"
" }else{"
- " atomic_add(&(outputTail->values[outslot]),
(delta << (targetTypeBits-outoffset-approximationBits)));"
+ " atomic_add(&(outputTail[2+outslot]), (delta <<
(targetTypeBits-outoffset-approximationBits)));"
" }"
" }\n"
"}";
@@ -114,95 +117,121 @@ cl_program getProjectionLeftjoinProgram(
}
cl_program getUSelectProgram(int type, char* predicateOperation, char*
predicateOperation2, unsigned int approximationBits, unsigned int offsetBits,
char inputIsVoidHeaded){
-
+
const char* sourceCodeTemplates[] = {
- [0] = " __constant static const size_t targetTypeBits =
(sizeof(targetType)*8);\n"
+ [0] = "#define accessType unsigned int\n"
+ " __constant static const size_t targetTypeBits =
(sizeof(targetType)*8);\n"
" __constant static const unsigned int approximationMask =
((1<<approximationBits)-1);\n"
"__kernel void uselect (\n" // non-void-headed case
- "__global struct{int count; int padding; int positions[];}*
outputHead,\n"
- "__global struct{int count; int base; unsigned char values[];}*
outputTail,\n"
- "__global const struct{int count; int base; unsigned char
values[];}* approximationTail,\n"
- "__global const struct{int count; int padding; int
positions[];}* approximationHead\n"
- ",\n"
- "const %1$s operand,\n"
- "const %1$s operand2\n"
+ /* " struct HeadColumn{int count; int padding; int
positions[];};\n" */
+ /* " struct TailColumn{int count; int base; unsigned char
values[];} ;\n" */
+ /* "__global struct HeadColumn* outputHead,\n" */
+ /* "__global struct TailColumn* outputTail,\n" */
+ /* "__global const struct TailColumn* approximationTail,\n" */
+ /* "__global const struct HeadColumn* approximationHead\n" */
+ "__global unsigned int* outputHead,\n"
+ "__global unsigned int* outputTail,\n"
+ "__global const unsigned int* approximationTail,\n"
+ "__global const unsigned int* approximationHead,\n"
+
+
+ "const targetType operand,\n"
+ "const targetType operand2\n"
") {\n"
- " if(get_global_id(0) < approximationTail->count){\n"
- " __global const unsigned int* vals =
approximationTail->values;\n"
- " __global const unsigned char* approximation =
approximationTail->values;"
- " %1$s value = approximationTail->base;\n"
- " const size_t inputOffset = get_global_id(0)*%4$d;\n"
+ " if(get_global_id(0) < approximationTail[0]){\n"
+ " __global const unsigned int* vals =
&(approximationTail[2]);\n"
+ /* " __global const unsigned char* approximation =
approximationTail->values;" */
+ " targetType value = ((__global targetType
*)approximationTail)[1];\n"
+
" size_t slot =
(get_global_id(0)*approximationBits)/targetTypeBits;\n"
- " size_t offset =
(get_global_id(0)*approximationBits)%%targetTypeBits;\n"
-
+ " size_t offset =
(get_global_id(0)*approximationBits)%targetTypeBits;\n"
" const unsigned int delta = (("
"
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
+
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
" +
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
" )&approximationMask);\n"
" value += (delta<<residualBits);\n"
-
+ " if((value firstOperator operand)"
+ " && (secondOperatorUnset || value secondOperator operand2)"
+ " ){\n"
+ " const int index = atomic_inc(outputHead);\n"
+ " atomic_inc(outputTail);\n" // TODO: this could probably be
done more efficiently
+ " outputHead[2+index] =
approximationHead[2+get_global_id(0)];\n"
+ " size_t outslot =
(index*approximationBits)/targetTypeBits;\n"
+ " size_t outoffset =
(index*approximationBits)%targetTypeBits;\n"
+ " if(outoffset+approximationBits > 8*sizeof(accessType)){"
+ /* " outputTail[2+outslot] |= (delta <<
(approximationBits-(8*sizeof(accessType)-outoffset)));" */
+ " atomic_add(&(outputTail[2+outslot]), (delta >>
(8*sizeof(accessType)+approximationBits-targetTypeBits)));"
+ " atomic_add(&(outputTail[2+outslot+1]), (delta <<
(8*sizeof(accessType)-(outoffset+approximationBits-targetTypeBits))));"
+ " }else{"
+ " atomic_add(&(outputTail[2+outslot]), (delta <<
(8*sizeof(accessType)-outoffset-approximationBits)));"
+ /* " outputTail[2+outslot] |= (delta <<
(8*sizeof(accessType)-outoffset-approximationBits));" */
+ " }"
+ " }\n"
+ " } \n"
+ " }\n",
- " if((value %2$s operand)"
- " && (%5$d || value %3$s operand2)"
- " )"
- "{\n"
- " const int index = atomic_inc(&(outputHead->count));\n"
- " atomic_inc(&(outputTail->count));\n" // TODO: this could
probably be done more efficiently
- " const int offset = index * %4$d;\n"
- " outputHead->positions[index] =
approximationHead->positions[get_global_id(0)];\n"
- " for(int i = 0; i < %4$d; i++){\n"
- " outputTail->values[offset+i] = approximation[inputOffset
+ i];\n"
- " }\n"
- " } \n"
- " }\n"
- "}",
-
[1] = "#define accessType unsigned int\n"
" __constant static const size_t targetTypeBits =
(sizeof(targetType)*8);\n"
" __constant static const unsigned int approximationMask =
((1<<approximationBits)-1);\n"
+ /* " struct HeadColumn{int count; int padding; int
positions[];};\n" */
+ /* " struct TailColumn{int count; int base; unsigned int
values[];};\n" */
+
"__kernel void uselect (\n" // void-headed case
- "__global struct{int count; int padding; int positions[];}*
outputHead,\n"
- "__global struct{int count; int base; accessType values[];}*
outputTail,\n"
- "__global const struct{int count; int base; unsigned char
values[];}* approximationTail,\n"
- "const %1$s operand,\n"
- "const %1$s operand2\n"
+
+ /* "__global struct HeadColumn* outputHead,\n" */
+ /* "__global struct TailColumn* outputTail,\n" */
+ /* "__global const struct TailColumn* approximationTail,\n" */
+ "__global unsigned int* outputHead,\n"
+ "__global unsigned int* outputTail,\n"
+ "__global const unsigned int* approximationTail,\n"
+
+ "const targetType operand,\n"
+ "const targetType operand2\n"
") {\n"
- " if(get_global_id(0) < approximationTail->count){\n"
- " targetType value = approximationTail->base;\n"
+ /* " if(get_global_id(0) < approximationTail->count){\n" */
+ /* " targetType value = approximationTail->base;\n" */
+ " if(get_global_id(0) < approximationTail[0]){\n" // count
+ " targetType value = ((__global
targetType*)approximationTail)[1];\n"//base
" size_t slot =
(get_global_id(0)*approximationBits)/targetTypeBits;\n"
- " size_t offset =
(get_global_id(0)*approximationBits)%%targetTypeBits;\n"
- " __global const unsigned int* vals =
approximationTail->values;\n"
+ " size_t offset =
(get_global_id(0)*approximationBits)%targetTypeBits;\n"
+ /* " __global const unsigned int* vals =
approximationTail->values;\n" */
+ " __global const unsigned int* vals =
&(approximationTail[2]);\n"
" const unsigned int delta = (("
"
(((offset+approximationBits)>targetTypeBits)?(((vals[slot]<<(approximationBits-targetTypeBits+offset))
+
(vals[slot+1]>>(targetTypeBits-(approximationBits-targetTypeBits+offset))))&approximationMask):0)\n"
" +
(((offset+approximationBits)<=targetTypeBits)*(vals[slot]>>(targetTypeBits-offset-approximationBits)))"
" )&approximationMask);\n"
" value += (delta<<residualBits);\n"
- " if((value %2$s operand)"
- " && (%5$d || value %3$s operand2)"
+ " if((value firstOperator operand)"
+ " && (secondOperatorUnset || value secondOperator operand2)"
" ){\n"
- " const int index = atomic_inc(&(outputHead->count));\n"
- " atomic_inc(&(outputTail->count));\n" // TODO: this could
probably be done more efficiently
+ " const int index = atomic_inc(outputHead);\n"
+ " atomic_inc(outputTail);\n" // TODO: this could probably be
done more efficiently
" size_t outslot =
(index*approximationBits)/targetTypeBits;\n"
- " size_t outoffset =
(index*approximationBits)%%targetTypeBits;\n"
- " outputHead->positions[index] = get_global_id(0);\n"
- " if(outoffset+approximationBits > 8*sizeof(accessType))"
- " outputTail->values[outslot] |= (delta <<
(approximationBits-(8*sizeof(accessType)-outoffset)));"
- " else"
- " outputTail->values[outslot] |= (delta <<
(8*sizeof(accessType)-outoffset-approximationBits));"
+ " size_t outoffset =
(index*approximationBits)%targetTypeBits;\n"
+ " outputHead[2+index] = get_global_id(0);\n"
+ " if(outoffset+approximationBits > 8*sizeof(accessType)){"
+ " atomic_add(&(outputTail[2+outslot]), (delta >>
(8*sizeof(accessType)+approximationBits-targetTypeBits)));"
+ " atomic_add(&(outputTail[2+outslot+1]), (delta <<
(8*sizeof(accessType)-(outoffset+approximationBits-targetTypeBits))));"
+
+ /* " outputTail[2+outslot] |= (delta <<
(approximationBits-(8*sizeof(accessType)-outoffset)));" */
+ " }else"
+ " atomic_add(&(outputTail[2+outslot]), (delta <<
(8*sizeof(accessType)-outoffset-approximationBits)));"
+
+ /* " outputTail[2+outslot] |= (delta <<
(8*sizeof(accessType)-outoffset-approximationBits));" */
" }\n"
" }\n"
"}"
};
- char* sourceCode = GDKmalloc(16384);
- snprintf(sourceCode, 16384, sourceCodeTemplates[!!inputIsVoidHeaded],
typeNames[type], (approximationBits ==
8*typeSizes[type])?predicateOperation:(approximateOperation(predicateOperation)),
predicateOperation2?((approximationBits ==
8*typeSizes[type])?predicateOperation2:approximateOperation(predicateOperation2)):"==",
approximationBits/8-offsetBits/8, predicateOperation2 == NULL?1:0,
offsetBits/8);
+ /* char* sourceCode = malloc(16384); */
+ /* snprintf(sourceCode, 16384,
sourceCodeTemplates[!!inputIsVoidHeaded], typeNames[type], (approximationBits
==
8*typeSizes[type])?predicateOperation:(approximateOperation(predicateOperation)),
predicateOperation2?((approximationBits ==
8*typeSizes[type])?predicateOperation2:approximateOperation(predicateOperation2)):"==",
approximationBits/8-offsetBits/8, predicateOperation2 == NULL?1:0,
offsetBits/8); */
{
char options[256];
cl_program program;
- snprintf(options, 256, "-D targetType=int -D
approximationBits=%d -D residualBits=%lu", approximationBits,
typeSizes[type]*8-offsetBits-approximationBits);
+ snprintf(options, 256, "-DtargetType=%s -DapproximationBits=%d
-DresidualBits=%lu -DfirstOperator=%s -DsecondOperator=%s
-DsecondOperatorUnset=%d", typeNames[type], approximationBits,
typeSizes[type]*8-offsetBits-approximationBits, (approximationBits ==
8*typeSizes[type])?predicateOperation:(approximateOperation(predicateOperation)),
predicateOperation2?((approximationBits ==
8*typeSizes[type])?predicateOperation2:approximateOperation(predicateOperation2)):"==",
predicateOperation2 == NULL?1:0);
/* printf ("%s\n", sourceCode); */
- program = compileProgram(sourceCode, options);
- free(sourceCode);
+ program =
compileProgram(sourceCodeTemplates[!!inputIsVoidHeaded], options);
+ /* free(sourceCode); */
return program;
}
}
diff --git a/monetdb5/extras/bwd/operations.c b/monetdb5/extras/bwd/operations.c
--- a/monetdb5/extras/bwd/operations.c
+++ b/monetdb5/extras/bwd/operations.c
@@ -26,7 +26,9 @@ static const int eagerBufferLoading = 1;
#ifdef __APPLE__
static const int WORK_GROUP_SIZE=1;
#else
-static const int WORK_GROUP_SIZE=16;
+static const int WORK_GROUP_SIZE=128;
+const int VALUES_PER_WORK_ITEM = 1;
+extern const int VALUES_PER_WORK_ITEM;
#endif
#define MAX_INTERMEDIATE_RESULT_SIZE 16777216
@@ -206,7 +208,7 @@ str BWDLeftJoinApproximate(bat * res, ba
if(err) printf("#%s, clEnqueueWriteBuffer: %s;\n",
__func__, clError(err));
err = clEnqueueFillBuffer(getCommandQueue(),
slot->tailApproximation, zeroIntPattern, sizeof(int), sizeof(clTail),
calculatedBufferSize(headCount, slot->approximationBits), 0, NULL, NULL);
if(err) printf("#%s, clEnqueueFillBuffer: %s;\n",
__func__, clError(err));
- free(zeroIntPattern)
+ GDKfree(zeroIntPattern);
}
for (i = 0; i < 3; ++i) {
if((err = clSetKernelArg(projectKernel, i,
sizeof(cl_mem), &((cl_mem[]){
@@ -435,7 +437,7 @@ static inline str uselect(bat *res, bat
if((err =
clSetKernelArg(selectKernel, bufferI+i, sizeof(int), &(parameters[i])))) //
type specific
printf("#%s,
clSetKernelArg(%d): %s;\n", __func__, bufferI+i, clError(err));
- err =
clEnqueueNDRangeKernel(getCommandQueue(), selectKernel, 1, (const size_t[]){0},
(const size_t[]){ceil(dataCount/((float)WORK_GROUP_SIZE))*WORK_GROUP_SIZE},
(const size_t[]){WORK_GROUP_SIZE}, 0, NULL, NULL);
+ err =
clEnqueueNDRangeKernel(getCommandQueue(), selectKernel, 1, (const size_t[]){0},
(const size_t[]){ceil(dataCount/((float)WORK_GROUP_SIZE))*(WORK_GROUP_SIZE)},
(const size_t[]){WORK_GROUP_SIZE}, 0, NULL, NULL);
if(err)
printf("#%s,
clEnqueueNDRangeKernel: %s;\n", __func__, clError(err));
if (synchronousGPU)
clFinish(getCommandQueue());
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list