On 11/19/20 3:39 PM, Andreas Beckmann wrote:
> POCL built against LLVM 10 (sid) or LLVM 11 (experimental) causes a
> autopkgtest regression on armhf in libgpuarray while it succeeded with
> LLVM 9.
I finally managed to create a plain c reproducer (based on some pocl
test) which dies with this backtrace on abel.d.o:
#0 getEmissionKind () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/include/llvm/IR/DebugInfoMetadata.h:1244
#1 initialize () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LexicalScopes.cpp:53
#2 0xb13a82f0 in computeIntervals () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:979
#3 runOnMachineFunction () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:996
#4 runOnMachineFunction () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/LiveDebugVariables.cpp:1023
#5 0xb141d6c8 in runOnFunction () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/CodeGen/MachineFunctionPass.cpp:73
#6 0xb1297494 in runOnFunction () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1481
#7 0xb1297750 in runOnModule () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1517
#8 0xb1297ba8 in runOnModule () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1582
#9 run () at
/build/llvm-toolchain-10-hVI0Qp/llvm-toolchain-10-10.0.1/llvm/lib/IR/LegacyPassManager.cpp:1694
#10 0xb6dfac82 in pocl_llvm_codegen (Device=Device@entry=0x1a3dfc8,
Modp=0x20102d8, Output=Output@entry=0xbe9be8bc,
OutputSize=OutputSize@entry=0xbe9be8d0) at ./lib/CL/pocl_llvm_wg.cc:624
#11 0xb6dbf1de in llvm_codegen (output=output@entry=0x1be75e0
"/home/anbe/.cache/pocl/kcache/AP/PNFEAPBKBFEAKGGNMALGHGJEEKGMJFBFBMDHA/Sdot_kernel/0-0-0/Sdot_kernel.so",
device_i=device_i@entry=0, kernel=kernel@entry=0xbe9c0290, device=0x1a3dfc8,
command=command@entry=0xbe9c02c8, specialize=specialize@entry=0) at
./lib/CL/devices/common.c:158
#12 0xb6dc0e44 in pocl_check_kernel_disk_cache
(command=command@entry=0xbe9c02c8, specialized=specialized@entry=0) at
./lib/CL/devices/common.c:958
#13 0xb6dc1262 in pocl_check_kernel_dlhandle_cache (command=0xbe9c02c8,
initial_refcount=0, specialize=0) at ./lib/CL/devices/common.c:1081
#14 0xb6d993d4 in program_compile_dynamic_wg_binaries
(program=program@entry=0x1a18350) at ./lib/CL/pocl_build.c:179
#15 0xb6da9f20 in get_binary_sizes (sizes=0xbe9c03d4, program=0x1a18350) at
./lib/CL/clGetProgramInfo.c:36
#16 POclGetProgramInfo (program=0x1a18350, param_name=4453,
param_value_size=128, param_value=0xbe9c03d4, param_value_size_ret=0xbe9c03d0)
at ./lib/CL/clGetProgramInfo.c:115
#17 0x0045a070 in main () at 975931.c:238
I expect pocl built against llvm 11 (experimental) to fail similarily.
pocl built against llvm 9 (testing) passes.
Sylvestre, could you check whether this is an error on the LLVM side
or is POCL using LLVM incorrectly?
Andreas
#define CL_TARGET_OPENCL_VERSION 220
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
const char source[] =
"#ifdef DOUBLE_PRECISION\n"
" #ifdef cl_khr_fp64\n"
" #pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
" #else\n"
" #pragma OPENCL EXTENSION cl_amd_fp64 : enable\n"
" #endif\n"
"#endif\n"
"\n"
"__kernel void Sdot_kernel( __global float *_X, __global float *_Y, __global float *scratchBuff,\n"
" uint N, uint offx, int incx, uint offy, int incy, int doConj )\n"
"{\n"
" __global float *X = _X + offx;\n"
" __global float *Y = _Y + offy;\n"
" float dotP = (float) 0.0;\n"
"\n"
" if ( incx < 0 ) {\n"
" X = X + (N - 1) * abs(incx);\n"
" }\n"
" if ( incy < 0 ) {\n"
" Y = Y + (N - 1) * abs(incy);\n"
" }\n"
"\n"
" int gOffset;\n"
" for( gOffset=(get_global_id(0) * 4); (gOffset + 4 - 1)<N; gOffset+=( get_global_size(0) * 4 ) )\n"
" {\n"
" float4 vReg1, vReg2, res;\n"
"\n"
" #ifdef INCX_NONUNITY\n"
" vReg1 = (float4)( (X + (gOffset*incx))[0 + ( incx * 0)], (X + (gOffset*incx))[0 + ( incx * 1)], (X + (gOffset*incx))[0 + ( incx * 2)], (X + (gOffset*incx))[0 + ( incx * 3)]);\n"
" #else\n"
" vReg1 = vload4( 0, (__global float *) (X + gOffset) );\n"
" #endif\n"
"\n"
" #ifdef INCY_NONUNITY\n"
" vReg2 = (float4)( (Y + (gOffset*incy))[0 + ( incy * 0)], (Y + (gOffset*incy))[0 + ( incy * 1)], (Y + (gOffset*incy))[0 + ( incy * 2)], (Y + (gOffset*incy))[0 + ( incy * 3)]);\n"
" #else\n"
" vReg2 = vload4( 0, (__global float *) (Y + gOffset) );\n"
" #endif\n"
"\n"
" ;\n"
" res = vReg1 * vReg2 ;\n"
" dotP += res .S0 + res .S1 + res .S2 + res .S3;\n"
"; // Add-up elements in the vector to give a scalar\n"
" }\n"
"\n"
" // Loop for the last thread to handle the tail part of the vector\n"
" // Using the same gOffset used above\n"
" for( ; gOffset<N; gOffset++ )\n"
" {\n"
" float sReg1, sReg2, res;\n"
" sReg1 = X[gOffset * incx];\n"
" sReg2 = Y[gOffset * incy];\n"
"\n"
" ;\n"
" res = sReg1 * sReg2 ;\n"
" dotP = dotP + res ;\n"
" }\n"
"\n"
" // Note: this has to be called outside any if-conditions- because REDUCTION uses barrier\n"
" // dotP of work-item 0 will have the final reduced item of the work-group\n"
" __local float p1753 [ 64 ];\n"
" uint QKiD0 = get_local_id(0);\n"
" p1753 [ QKiD0 ] = dotP ;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" if( QKiD0 < 32 ) {\n"
" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 32 ];\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" if( QKiD0 < 16 ) {\n"
" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 16 ];\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" if( QKiD0 < 8 ) {\n"
" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 8 ];\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" if( QKiD0 < 4 ) {\n"
" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 4 ];\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" if( QKiD0 < 2 ) {\n"
" p1753 [ QKiD0 ] = p1753 [ QKiD0 ] + p1753 [ QKiD0 + 2 ];\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" if( QKiD0 == 0 ) {\n"
" dotP = p1753 [0] + p1753 [1];\n"
" }\n"
"\n"
" if( (get_local_id(0)) == 0 ) {\n"
" scratchBuff[ get_group_id(0) ] = dotP;\n"
" }\n"
"}\n"
"\n"
;
#define MAX_PLATFORMS 32
#define MAX_DEVICES 32
#define MAX_BINARIES 32
#define OPENCL_ERROR_CASE(ERR) \
case ERR: \
{ fprintf (stderr, "" #ERR " in %s on line %i\n", func_name, line); \
return 1; }
int
check_cl_error (cl_int cl_err, int line, const char* func_name) {
switch (cl_err)
{
case CL_SUCCESS: return 0;
OPENCL_ERROR_CASE (CL_DEVICE_NOT_FOUND)
OPENCL_ERROR_CASE (CL_DEVICE_NOT_AVAILABLE)
OPENCL_ERROR_CASE (CL_COMPILER_NOT_AVAILABLE)
OPENCL_ERROR_CASE (CL_MEM_OBJECT_ALLOCATION_FAILURE)
OPENCL_ERROR_CASE (CL_OUT_OF_RESOURCES)
OPENCL_ERROR_CASE (CL_OUT_OF_HOST_MEMORY)
OPENCL_ERROR_CASE (CL_PROFILING_INFO_NOT_AVAILABLE)
OPENCL_ERROR_CASE (CL_MEM_COPY_OVERLAP)
OPENCL_ERROR_CASE (CL_IMAGE_FORMAT_MISMATCH)
OPENCL_ERROR_CASE (CL_IMAGE_FORMAT_NOT_SUPPORTED)
OPENCL_ERROR_CASE (CL_BUILD_PROGRAM_FAILURE)
OPENCL_ERROR_CASE (CL_MAP_FAILURE)
OPENCL_ERROR_CASE (CL_MISALIGNED_SUB_BUFFER_OFFSET)
OPENCL_ERROR_CASE (CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST)
OPENCL_ERROR_CASE (CL_COMPILE_PROGRAM_FAILURE)
OPENCL_ERROR_CASE (CL_LINKER_NOT_AVAILABLE)
OPENCL_ERROR_CASE (CL_LINK_PROGRAM_FAILURE)
OPENCL_ERROR_CASE (CL_DEVICE_PARTITION_FAILED)
OPENCL_ERROR_CASE (CL_KERNEL_ARG_INFO_NOT_AVAILABLE)
OPENCL_ERROR_CASE (CL_INVALID_VALUE)
OPENCL_ERROR_CASE (CL_INVALID_DEVICE_TYPE)
OPENCL_ERROR_CASE (CL_INVALID_PLATFORM)
OPENCL_ERROR_CASE (CL_INVALID_DEVICE)
OPENCL_ERROR_CASE (CL_INVALID_CONTEXT)
OPENCL_ERROR_CASE (CL_INVALID_QUEUE_PROPERTIES)
OPENCL_ERROR_CASE (CL_INVALID_COMMAND_QUEUE)
OPENCL_ERROR_CASE (CL_INVALID_HOST_PTR)
OPENCL_ERROR_CASE (CL_INVALID_MEM_OBJECT)
OPENCL_ERROR_CASE (CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)
OPENCL_ERROR_CASE (CL_INVALID_IMAGE_SIZE)
OPENCL_ERROR_CASE (CL_INVALID_SAMPLER)
OPENCL_ERROR_CASE (CL_INVALID_BINARY)
OPENCL_ERROR_CASE (CL_INVALID_BUILD_OPTIONS)
OPENCL_ERROR_CASE (CL_INVALID_PROGRAM)
OPENCL_ERROR_CASE (CL_INVALID_PROGRAM_EXECUTABLE)
OPENCL_ERROR_CASE (CL_INVALID_KERNEL_NAME)
OPENCL_ERROR_CASE (CL_INVALID_KERNEL_DEFINITION)
OPENCL_ERROR_CASE (CL_INVALID_KERNEL)
OPENCL_ERROR_CASE (CL_INVALID_ARG_INDEX)
OPENCL_ERROR_CASE (CL_INVALID_ARG_VALUE)
OPENCL_ERROR_CASE (CL_INVALID_ARG_SIZE)
OPENCL_ERROR_CASE (CL_INVALID_KERNEL_ARGS)
OPENCL_ERROR_CASE (CL_INVALID_WORK_DIMENSION)
OPENCL_ERROR_CASE (CL_INVALID_WORK_GROUP_SIZE)
OPENCL_ERROR_CASE (CL_INVALID_WORK_ITEM_SIZE)
OPENCL_ERROR_CASE (CL_INVALID_GLOBAL_OFFSET)
OPENCL_ERROR_CASE (CL_INVALID_EVENT_WAIT_LIST)
OPENCL_ERROR_CASE (CL_INVALID_EVENT)
OPENCL_ERROR_CASE (CL_INVALID_OPERATION)
OPENCL_ERROR_CASE (CL_INVALID_GL_OBJECT)
OPENCL_ERROR_CASE (CL_INVALID_BUFFER_SIZE)
OPENCL_ERROR_CASE (CL_INVALID_MIP_LEVEL)
OPENCL_ERROR_CASE (CL_INVALID_GLOBAL_WORK_SIZE)
OPENCL_ERROR_CASE (CL_INVALID_PROPERTY)
OPENCL_ERROR_CASE (CL_INVALID_IMAGE_DESCRIPTOR)
OPENCL_ERROR_CASE (CL_INVALID_COMPILER_OPTIONS)
OPENCL_ERROR_CASE (CL_INVALID_LINKER_OPTIONS)
OPENCL_ERROR_CASE (CL_INVALID_DEVICE_PARTITION_COUNT)
default:
printf ("Unknown OpenCL error %i in %s on line %i\n", cl_err, func_name,
line);
return 1;
}
}
#define _POCLU_CHECK_CL_ERROR_INNER(cond, func, line) \
do \
{ \
if (check_cl_error (cond, line, func)) \
return (EXIT_FAILURE); \
} \
while (0)
#define CHECK_CL_ERROR(cond) _POCLU_CHECK_CL_ERROR_INNER(cond, __PRETTY_FUNCTION__, __LINE__)
#define CHECK_OPENCL_ERROR_IN(message) _POCLU_CHECK_CL_ERROR_INNER(err, message, __LINE__)
#define TEST_ASSERT(EXP) \
do { \
if (!(EXP)) { \
fprintf(stderr, "Assertion: \n" #EXP "\nfailed on %s:%i\n", \
__FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
int main(){
cl_int err;
cl_platform_id platforms[MAX_PLATFORMS];
cl_uint nplatforms;
cl_device_id devices[MAX_DEVICES];
cl_uint ndevices;
cl_program program = NULL;
size_t binsizes[MAX_BINARIES];
size_t nbinaries;
CHECK_CL_ERROR(clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms));
TEST_ASSERT(nplatforms > 0);
CHECK_CL_ERROR(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &ndevices));
TEST_ASSERT(ndevices > 0);
cl_context context = clCreateContext(NULL, 1, devices, NULL, NULL, &err);
CHECK_OPENCL_ERROR_IN("clCreateContext");
const char * src[] = {source};
program = clCreateProgramWithSource(context, 1, src, NULL, &err);
CHECK_OPENCL_ERROR_IN("clCreateProgramWithSource");
CHECK_CL_ERROR(clBuildProgram(program, 1, devices, "-g -DINCX_NONUNITY -DINCY_NONUNITY", NULL, NULL));
CHECK_CL_ERROR(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(binsizes), binsizes, &nbinaries));
printf("binary size: %zd\n", binsizes[0]);
CHECK_CL_ERROR(clReleaseProgram(program));
CHECK_CL_ERROR (clReleaseContext (context));
printf ("OK\n");
return EXIT_SUCCESS;
}