This is an automated email from the ASF dual-hosted git repository.
ruihangl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 7f02606af2 [OPENCL] Always use convert_T for type conversion (#14972)
7f02606af2 is described below
commit 7f02606af2bc034d8ba2b09f938a58f8d9982855
Author: Tianqi Chen <[email protected]>
AuthorDate: Thu Jun 1 15:52:26 2023 -0400
[OPENCL] Always use convert_T for type conversion (#14972)
This PR changes the Cast in OpenCL to always relying on convert_T to get
closer to the spec and more reliable.
---
src/target/source/codegen_opencl.cc | 10 +--
.../python/unittest/test_target_codegen_opencl.py | 80 ++++++++++------------
.../unittest/test_target_texture_codegen_opencl.py | 38 +++++-----
3 files changed, 61 insertions(+), 67 deletions(-)
diff --git a/src/target/source/codegen_opencl.cc
b/src/target/source/codegen_opencl.cc
index 61a8ee8a57..fa4ca7d34b 100644
--- a/src/target/source/codegen_opencl.cc
+++ b/src/target/source/codegen_opencl.cc
@@ -370,17 +370,19 @@ std::string CodeGenOpenCL::CastFromTo(std::string value,
DataType from, DataType
std::string CodeGenOpenCL::CastTo(std::string value, DataType target) {
std::ostringstream os;
- if (target.lanes() == 1) {
- os << "((";
+ if (target == DataType::Bool()) {
+ os << "(";
+ os << "(";
this->PrintType(target, os);
os << ")" << value << ")";
- } else { // convert vector type
+ return os.str();
+ } else {
os << "(";
os << "convert_";
this->PrintType(target, os);
os << "(" << value << "))";
+ return os.str();
}
- return os.str();
}
void CodeGenOpenCL::VisitStmt_(const AllocateNode* op) {
diff --git a/tests/python/unittest/test_target_codegen_opencl.py
b/tests/python/unittest/test_target_codegen_opencl.py
index 67dc37363e..4a426c952b 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -32,10 +32,12 @@ def test_opencl_ternary_expression():
max_lhs = tvm.tir.const(2, dtype=dtype)
max_rhs = tvm.tir.if_then_else(A[0] > 0, true_value, false_value)
C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name="C")
- s = te.create_schedule(C.op)
- s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x"))
- fun = tvm.build(s, [A, C], target)
+ func = te.create_prim_func([A, C])
+ sch = tvm.tir.Schedule(func)
+ (x,) = sch.get_loops(sch.get_block("C"))
+ sch.bind(x, "threadIdx.x")
+ fun = tvm.build(sch.mod, target=target)
a = tvm.nd.empty((n,), A.dtype, dev)
c = tvm.nd.empty((n,), A.dtype, dev)
# Only need to test compiling here
@@ -48,9 +50,11 @@ def test_opencl_ternary_expression():
max_lhs = tvm.tir.const(2, dtype=dtype)
max_rhs = tvm.tir.Select(A[0] > 0, true_value, false_value)
C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name="C")
- s = te.create_schedule(C.op)
- s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x"))
- fun = tvm.build(s, [A, C], target)
+ func = te.create_prim_func([A, C])
+ sch = tvm.tir.Schedule(func)
+ (x,) = sch.get_loops(sch.get_block("C"))
+ sch.bind(x, "threadIdx.x")
+ fun = tvm.build(sch.mod, target=target)
a = tvm.nd.empty((n,), A.dtype, dev)
c = tvm.nd.empty((n,), A.dtype, dev)
@@ -76,9 +80,11 @@ def test_opencl_inf_nan():
A = te.placeholder((n,), name="A", dtype=dtype)
inf_value = tvm.tir.const(value, dtype=dtype)
C = te.compute((n,), lambda i: inf_value, name="C")
- s = te.create_schedule(C.op)
- s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x"))
- fun = tvm.build(s, [A, C], target)
+ func = te.create_prim_func([A, C])
+ sch = tvm.tir.Schedule(func)
+ (x,) = sch.get_loops(sch.get_block("C"))
+ sch.bind(x, "threadIdx.x")
+ fun = tvm.build(sch.mod, target=target)
a = tvm.nd.empty((n,), A.dtype, dev)
c = tvm.nd.empty((n,), A.dtype, dev)
# Only need to test compiling here
@@ -102,9 +108,11 @@ def test_opencl_max():
max_lhs = A[0] + tvm.tir.const(1, dtype=dtype)
max_rhs = tvm.tir.const(0, dtype=dtype)
C = te.compute((n,), lambda i: tvm.te.max(max_lhs, max_rhs), name="C")
- s = te.create_schedule(C.op)
- s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x"))
- fun = tvm.build(s, [A, C], target)
+ func = te.create_prim_func([A, C])
+ sch = tvm.tir.Schedule(func)
+ (x,) = sch.get_loops(sch.get_block("C"))
+ sch.bind(x, "threadIdx.x")
+ fun = tvm.build(sch.mod, target=target)
a = tvm.nd.empty((n,), A.dtype, dev)
c = tvm.nd.empty((n,), A.dtype, dev)
@@ -150,7 +158,7 @@ def test_opencl_type_casting():
tvm.tir.all(
*[
i // block_size == tvm.tir.const(3, "int32"),
- i % block_size == tvm.tir.const(3, "int32"),
+ i % 3 == tvm.tir.const(1, "int32"),
]
),
tvm.tir.const(1, dtype),
@@ -158,42 +166,26 @@ def test_opencl_type_casting():
),
name="C",
)
- s = te.create_schedule(C.op)
- (tx, vx) = s[C].split(s[C].op.axis[0], factor=block_size)
- s[C].vectorize(vx)
- thrx = te.thread_axis("threadIdx.x")
-
- s[C].bind(tx, thrx)
- fun = tvm.build(s, [C], target)
-
+ # NOTE: test simple convert pattern
+ func = te.create_prim_func([C])
+ sch = tvm.tir.Schedule(func)
+ (x,) = sch.get_loops(sch.get_block("C"))
+ tx, vx = sch.split(x, factors=[None, block_size])
+ sch.bind(tx, "threadIdx.x")
+ sch.vectorize(vx)
+
+ fun = tvm.build(sch.mod, target=target)
c = tvm.nd.empty((n,), dtype, ctx)
assembly = fun.imported_modules[0].get_source()
-
- if dtype == "float32":
- false_branch = "((float4)(0.000000e+00f, 0.000000e+00f,
0.000000e+00f, 0.000000e+00f))"
- true_branch = "((float4)(1.000000e+00f, 1.000000e+00f,
1.000000e+00f, 1.000000e+00f))"
- lcond =
"convert_int4(((convert_uint4(((uint4)((((int)get_local_id(0)) == 3),
(((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3),
(((int)get_local_id(0)) == 3)))))"
- rcond = "(convert_uint4((((int4)((0)+(1*0), (0)+(1*1), (0)+(1*2),
(0)+(1*3))) == ((int4)(3, 3, 3, 3)))))"
- cond = "({} && {})".format(lcond, rcond)
- select = "select({}, {}, {})".format(false_branch, true_branch,
cond)
- count = assembly.count(select)
- assert count == 1
- fun(c)
-
- elif dtype == "float16":
- false_branch = "((half4)((half)0.000000e+00f, (half)0.000000e+00f,
(half)0.000000e+00f, (half)0.000000e+00f))"
- true_branch = "((half4)((half)1.000000e+00f, (half)1.000000e+00f,
(half)1.000000e+00f, (half)1.000000e+00f))"
- lcond =
"convert_short4(((convert_uint4(((uint4)((((int)get_local_id(0)) == 3),
(((int)get_local_id(0)) == 3), (((int)get_local_id(0)) == 3),
(((int)get_local_id(0)) == 3)))))"
- rcond = "(convert_uint4((((int4)((0)+(1*0), (0)+(1*1), (0)+(1*2),
(0)+(1*3))) == ((int4)(3, 3, 3, 3)))))))"
- cond = "({} && {})".format(lcond, rcond)
- select = "select({}, {}, {})".format(false_branch, true_branch,
cond)
- count = assembly.count(select)
- assert count == 1
- fun(c)
+ lcond =
"convert_int4(((convert_uint4(((uint4)(((convert_int(get_local_id(0))) == 3),
((convert_int(get_local_id(0))) == 3), ((convert_int(get_local_id(0))) == 3),
((convert_int(get_local_id(0))) == 3)))))"
+ rcond =
"(convert_uint4(((((int4)(((convert_int(get_local_id(0))))+(1*0),
((convert_int(get_local_id(0))))+(1*1), ((convert_int(get_local_id(0))))+(1*2),
((convert_int(get_local_id(0))))+(1*3))) % ((int4)(3, 3, 3, 3))) == ((int4)(1,
1, 1, 1))))))))"
+ pattern_cond = "({} && {})".format(lcond, rcond)
+ assert assembly.count(pattern_cond) != 0
+ fun(c)
dev = tvm.device(target, 0)
- check_type_casting(dev, 16, "float32")
+ check_type_casting(dev, 32, "float32")
# fp16 is not yet supported in ci
# check_type_casting(dev, 16, "float16")
diff --git a/tests/python/unittest/test_target_texture_codegen_opencl.py
b/tests/python/unittest/test_target_texture_codegen_opencl.py
index 639159c495..5681dcf9e6 100644
--- a/tests/python/unittest/test_target_texture_codegen_opencl.py
+++ b/tests/python/unittest/test_target_texture_codegen_opencl.py
@@ -1466,8 +1466,8 @@ class TestSimpleTextureToScalarFP16:
["global.texture", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) +
(((int)get_local_id(0)) / 40)))));",
- "out[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = ((half)((float*)&v_)[(((int)get_group_id(0)) >>
1)]);",
+ "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)(((convert_int(get_local_id(0))) % 40),
((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0)))
/ 40)))));",
+ "out[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] =
(convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));",
],
),
# 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW)
@@ -1475,7 +1475,7 @@ class TestSimpleTextureToScalarFP16:
["", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "out[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = ((half)p0_comp[((((((int)get_group_id(0)) & 1) *
3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]);"
+ "out[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] =
(convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) +
((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >>
1))]));"
],
),
# 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c)
@@ -1483,8 +1483,8 @@ class TestSimpleTextureToScalarFP16:
["global.texture", (1, 1, 40, 40, 4)],
["global.texture", (1, 1, 40, 40, 4)],
[
- "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)((((((int)get_group_id(0)) * 24) + ((int)get_local_id(0))) % 40),
(((((int)get_group_id(0)) * 8) + (((int)get_local_id(0)) >> 3)) / 5))));",
- "write_imageh(out, (int2)((((((int)get_group_id(0)) * 24) +
((int)get_local_id(0))) % 40), (((((int)get_group_id(0)) * 8) +
(((int)get_local_id(0)) >> 3)) / 5)), (convert_half4(v_)));",
+ "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)(((((convert_int(get_group_id(0))) * 24) +
(convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) +
((convert_int(get_local_id(0))) >> 3)) / 5))));",
+ "write_imageh(out, (int2)(((((convert_int(get_group_id(0))) *
24) + (convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0)))
* 8) + ((convert_int(get_local_id(0))) >> 3)) / 5)), (convert_half4(v_)));",
],
),
)
@@ -1507,8 +1507,8 @@ class TestSimpleTextureToScalarFP32:
["global.texture", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) +
(((int)get_local_id(0)) / 40)))));",
- "out[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = ((float*)&v_)[(((int)get_group_id(0)) >> 1)];",
+ "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)(((convert_int(get_local_id(0))) % 40),
((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0)))
/ 40)))));",
+ "out[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] =
((float*)&v_)[((convert_int(get_group_id(0))) >> 1)];",
],
),
# 2. Buffer (NCHW4c) -> Buffer (NCHW)
@@ -1516,7 +1516,7 @@ class TestSimpleTextureToScalarFP32:
["", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "out[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = p0_comp[((((((int)get_group_id(0)) & 1) * 3200) +
(((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))];"
+ "out[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] = p0_comp[(((((convert_int(get_group_id(0))) &
1) * 3200) + ((convert_int(get_local_id(0))) * 4)) +
((convert_int(get_group_id(0))) >> 1))];"
],
),
)
@@ -1619,8 +1619,8 @@ class TestTextureToScalarReuseSSAFP16:
["global.texture", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) +
(((int)get_local_id(0)) / 40)))));",
- "out_sum[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = (((half)((float*)&v_)[(((int)get_group_id(0)) >>
1)]) + (((half)((float*)&v_)[(((int)get_group_id(0)) >> 1)]) +
((half)((float*)&v_)[(((int)get_group_id(0)) >> 1)])));",
+ "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)(((convert_int(get_local_id(0))) % 40),
((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0)))
/ 40)))));",
+ "out_sum[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] =
((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) +
((convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)])) +
(convert_half(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]))));",
],
),
# 2. Buffer (NCHW4c) -> Cast(FP16) -> Buffer (NCHW)
@@ -1628,7 +1628,7 @@ class TestTextureToScalarReuseSSAFP16:
["", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "out_sum[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = (((half)p0_comp[((((((int)get_group_id(0)) & 1) *
3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]) +
(((half)p0_comp[((((((int)get_group_id(0)) & 1) * 3200) +
(((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]) +
((half)p0_comp[((((((int)get_group_id(0)) & 1) * 3200) +
(((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))])));"
+ " out_sum[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] =
((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) +
((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >>
1))])) + ((convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200)
+ ((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >>
1))])) + (convert_half(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200)
+ ((c [...]
],
),
# 3. Texture (NCHW4c) -> Cast(FP16) -> Texture (NCHW4c)
@@ -1636,8 +1636,8 @@ class TestTextureToScalarReuseSSAFP16:
["global.texture", (1, 1, 40, 40, 4)],
["global.texture", (1, 1, 40, 40, 4)],
[
- "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)((((((int)get_group_id(0)) * 24) + ((int)get_local_id(0))) % 40),
(((((int)get_group_id(0)) * 8) + (((int)get_local_id(0)) >> 3)) / 5))));",
- "write_imageh(out_sum, (int2)((((((int)get_group_id(0)) * 24)
+ ((int)get_local_id(0))) % 40), (((((int)get_group_id(0)) * 8) +
(((int)get_local_id(0)) >> 3)) / 5)), ((convert_half4(v_)) +
((convert_half4(v_)) + (convert_half4(v_)))));",
+ "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)(((((convert_int(get_group_id(0))) * 24) +
(convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) +
((convert_int(get_local_id(0))) >> 3)) / 5))));",
+ "write_imageh(out_sum,
(int2)(((((convert_int(get_group_id(0))) * 24) +
(convert_int(get_local_id(0)))) % 40), ((((convert_int(get_group_id(0))) * 8) +
((convert_int(get_local_id(0))) >> 3)) / 5)), ((convert_half4(v_)) +
((convert_half4(v_)) + (convert_half4(v_)))));",
],
),
)
@@ -1660,8 +1660,8 @@ class TestTextureToScalarReuseSSAFP32:
["global.texture", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)((((int)get_local_id(0)) % 40), (((((int)get_group_id(0)) & 1) * 20) +
(((int)get_local_id(0)) / 40)))));",
- "out_sum[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = (((float*)&v_)[(((int)get_group_id(0)) >> 1)] +
(((float*)&v_)[(((int)get_group_id(0)) >> 1)] +
((float*)&v_)[(((int)get_group_id(0)) >> 1)]));",
+ "float4 v_ = READ_IMAGEF(p0_comp, image_sampler,
((int2)(((convert_int(get_local_id(0))) % 40),
((((convert_int(get_group_id(0))) & 1) * 20) + ((convert_int(get_local_id(0)))
/ 40)))));",
+ "out_sum[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] =
(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] +
(((float*)&v_)[((convert_int(get_group_id(0))) >> 1)] +
((float*)&v_)[((convert_int(get_group_id(0))) >> 1)]));",
],
),
# 2. Buffer (NCHW4c) -> Buffer (NCHW)
@@ -1669,7 +1669,7 @@ class TestTextureToScalarReuseSSAFP32:
["", (1, 1, 40, 40, 4)],
["", (1, 4, 40, 40)],
[
- "out_sum[((((int)get_group_id(0)) * 800) +
((int)get_local_id(0)))] = (p0_comp[((((((int)get_group_id(0)) & 1) * 3200) +
(((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))] +
(p0_comp[((((((int)get_group_id(0)) & 1) * 3200) + (((int)get_local_id(0)) *
4)) + (((int)get_group_id(0)) >> 1))] + p0_comp[((((((int)get_group_id(0)) & 1)
* 3200) + (((int)get_local_id(0)) * 4)) + (((int)get_group_id(0)) >> 1))]));"
+ "out_sum[(((convert_int(get_group_id(0))) * 800) +
(convert_int(get_local_id(0))))] = (p0_comp[(((((convert_int(get_group_id(0)))
& 1) * 3200) + ((convert_int(get_local_id(0))) * 4)) +
((convert_int(get_group_id(0))) >> 1))] +
(p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) +
((convert_int(get_local_id(0))) * 4)) + ((convert_int(get_group_id(0))) >> 1))]
+ p0_comp[(((((convert_int(get_group_id(0))) & 1) * 3200) +
((convert_int(get_local_id(0))) * 4)) + ((convert_ [...]
],
),
)
@@ -1693,10 +1693,10 @@ class TestLocalArrayToTexture:
(1, 2, 38, 38, 4),
[
"float out_local[4];",
- "float4 v_ = READ_IMAGEF(p1_comp, image_sampler,
((int2)((((((int)get_group_id(0)) * 14) + ((int)get_local_id(0))) % 38),
((((((int)get_group_id(0)) * 64) + (((int)get_local_id(0)) >> 1)) % 722) /
19))));",
- "float4 v__1 = READ_IMAGEF(p2_comp, image_sampler, ((int2)(rw,
((((((((int)get_group_id(0)) * 32) + (((int)get_local_id(0)) >> 2)) / 361) *
12) + (rcb * 3)) + rh))));",
+ "float4 v_ = READ_IMAGEF(p1_comp, image_sampler,
((int2)(((((convert_int(get_group_id(0))) * 14) +
(convert_int(get_local_id(0)))) % 38), (((((convert_int(get_group_id(0))) * 64)
+ ((convert_int(get_local_id(0))) >> 1)) % 722) / 19))));",
+ "float4 v__1 = READ_IMAGEF(p2_comp, image_sampler, ((int2)(rw,
(((((((convert_int(get_group_id(0))) * 32) + ((convert_int(get_local_id(0))) >>
2)) / 361) * 12) + (rcb * 3)) + rh))));",
"out_local[cb_c] = (out_local[cb_c] + (((float*)&v_)[rcb] *
((float*)&v__1)[cb_c]));",
- "write_imagef(out, (int2)((((((int)get_group_id(0)) * 14) +
((int)get_local_id(0))) % 38), (((((int)get_group_id(0)) * 64) +
(((int)get_local_id(0)) >> 1)) / 19)), vload4(0, out_local + 0));",
+ "write_imagef(out, (int2)(((((convert_int(get_group_id(0))) *
14) + (convert_int(get_local_id(0)))) % 38), ((((convert_int(get_group_id(0)))
* 64) + ((convert_int(get_local_id(0))) >> 1)) / 19)), vload4(0, out_local +
0));",
],
),
)