================
@@ -4551,6 +4551,56 @@ static bool interp__builtin_ia32_vpdp(InterpState &S,
CodePtr OpPC,
return true;
}
+// Bit Matrix Multiply and Accumulate (AVX512BMM). Each 256-bit lane holds a
+// 16x16 bit matrix as 16 x i16 elements; element i is row i and bit j of that
+// element is entry [i][j]. The accumulator (third argument, src1 in the AMD
+// ISA) provides the initial value of each result bit, into which the bit-matrix
+// product of the first two arguments (src2 * src3) is reduced with OR (vbmacor)
+// or XOR (vbmacxor):
+// for i in 0..15, j in 0..15:
+// bit = C[16*i+j]
+// for k in 0..15: bit OP= A[16*i+k] & B[16*k+j]
+// dest[16*i+j] = bit
+static bool interp__builtin_ia32_bmac(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call, bool IsXor) {
+ assert(Call->getNumArgs() == 3);
+ const Pointer &C = S.Stk.pop<Pointer>();
+ const Pointer &B = S.Stk.pop<Pointer>();
+ const Pointer &A = S.Stk.pop<Pointer>();
+ const Pointer &Dst = S.Stk.peek<Pointer>();
+
+ unsigned NumElems = A.getNumElems();
+ QualType ElemQT = getElemType(A);
+ OptPrimType ElemT = S.getContext().classify(ElemQT);
+ bool DstUnsigned = ElemQT->isUnsignedIntegerOrEnumerationType();
+
+ INT_TYPE_SWITCH_NO_BOOL(*ElemT, {
+ for (unsigned Lane = 0; Lane < NumElems; Lane += 16) {
+ for (unsigned I = 0; I < 16; ++I) {
+ uint16_t AVal = (uint16_t)A.elem<T>(Lane +
I).toAPSInt().getZExtValue();
+ uint16_t DVal = (uint16_t)C.elem<T>(Lane +
I).toAPSInt().getZExtValue();
+ for (unsigned J = 0; J < 16; ++J) {
----------------
tbaederr wrote:
```suggestion
for (unsigned J = 0; J != 16; ++J) {
```
https://github.com/llvm/llvm-project/pull/182556
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits