Jordi Vaquero has uploaded this change for review. ( https://gem5-review.googlesource.com/c/public/gem5/+/27183 )

Change subject: arch-arm: ARMv8.3 CompNum, SIMD complex number support
......................................................................

arch-arm: ARMv8.3 CompNum, SIMD complex number support

This patch implements the CompNum SIMD instructions for ARMv8.3.
These instructions are Fcadd, Fcmla (vector and element) and
Vcadd, Vcmla (vector and element).

+ isa/decoder/thumb.isa: Decoding changes for SIMD instructions in T32
+ isa/formats/fp.isa: Decoding changes for SIMD instructions in A32
+ isa/formats/uncond.isa: Decoding changes for SIMD instructions in A32
+ isa/formats/aarch64.isa: Decoding changes for SIMD instructions in A64
+ isa/formats/neon64.isa: Decoding changes for SIMD instructions in A64
+ isa/insts/neon.isa: Vcadd, Vcmla instruction implementation
+ isa/insts/neon64.isa: Fcadd, Fcmla instruction implementation
+ isa/templates/neon.isa: Modify templates for adding byElement support

Change-Id: I7f11ce88137dad077d2cad698dcaa9a79a3f317b
---
M src/arch/arm/ArmISA.py
M src/arch/arm/isa/decoder/thumb.isa
M src/arch/arm/isa/formats/aarch64.isa
M src/arch/arm/isa/formats/fp.isa
M src/arch/arm/isa/formats/neon64.isa
M src/arch/arm/isa/formats/uncond.isa
M src/arch/arm/isa/insts/neon.isa
M src/arch/arm/isa/insts/neon64.isa
M src/arch/arm/isa/templates/neon.isa
9 files changed, 496 insertions(+), 26 deletions(-)



diff --git a/src/arch/arm/ArmISA.py b/src/arch/arm/ArmISA.py
index 0d91705..2641ec3 100644
--- a/src/arch/arm/ArmISA.py
+++ b/src/arch/arm/ArmISA.py
@@ -79,7 +79,7 @@
     id_isar2 = Param.UInt32(0x21232141, "Instruction Set Attribute Register 2")
     id_isar3 = Param.UInt32(0x01112131, "Instruction Set Attribute Register 3")
     id_isar4 = Param.UInt32(0x10010142, "Instruction Set Attribute Register 4")
-    id_isar5 = Param.UInt32(0x00000000, "Instruction Set Attribute Register 5")
+    id_isar5 = Param.UInt32(0x10000000, "Instruction Set Attribute Register 5")

     fpsid = Param.UInt32(0x410430a0, "Floating-point System ID Register")

@@ -101,8 +101,8 @@
     id_aa64isar0_el1 = Param.UInt64(0x0000000000000000,
         "AArch64 Instruction Set Attribute Register 0")

-    # GPI = 0x0 | GPA = 0x1| API=0x0 | APA=0x1
-    id_aa64isar1_el1 = Param.UInt64(0x0000000001000010,
+    # GPI = 0x0 | GPA = 0x1 | API = 0x0 | APA = 0x1 | FCMA = 0x1
+    id_aa64isar1_el1 = Param.UInt64(0x0000000001010010,
         "AArch64 Instruction Set Attribute Register 1")

     # 4K | 64K | !16K | !BigEndEL0 | !SNSMem | !BigEnd | 8b ASID | 40b PA
diff --git a/src/arch/arm/isa/decoder/thumb.isa b/src/arch/arm/isa/decoder/thumb.isa
index 7f04ef3..c319ec3 100644
--- a/src/arch/arm/isa/decoder/thumb.isa
+++ b/src/arch/arm/isa/decoder/thumb.isa
@@ -138,9 +138,11 @@
0x3: Thumb32LongMulMulAccAndDiv::thumb32LongMulMulAccAndDiv();
                 default: Thumb32DataProcReg::thumb32DataProcReg();
             }
+            0x2: Thumb32NeonSIMD::thumb32NeonSIMD();
             default: decode HTOPCODE_9_8 {
                 0x2: decode LTOPCODE_4 {
                     0x0: decode LTCOPROC {
+                        0x8: Thumb32NeonSIMD::thumb32NeonSIMD();
                         0xa, 0xb: VfpData::vfpData();
                         default: WarnUnimpl::cdp(); // cdp2
                     }
diff --git a/src/arch/arm/isa/formats/aarch64.isa b/src/arch/arm/isa/formats/aarch64.isa
index 7aaca04..4ff5456 100644
--- a/src/arch/arm/isa/formats/aarch64.isa
+++ b/src/arch/arm/isa/formats/aarch64.isa
@@ -2313,10 +2313,8 @@
             } else {
                 return new Unknown64(machInst);
             }
-        } else if (bits(machInst, 24) ||
-                   bits(machInst, 21) ||
-                   bits(machInst, 15)) {
-            return new Unknown64(machInst);
+        } else if (bits(machInst, 15) == 1) {
+            return decodeNeon3SameExtra<DecoderFeatures>(machInst);
         } else if (bits(machInst, 10) == 1) {
             if (bits(machInst, 23, 22))
                 return new Unknown64(machInst);
diff --git a/src/arch/arm/isa/formats/fp.isa b/src/arch/arm/isa/formats/fp.isa
index de0fdd2..f1b387e 100644
--- a/src/arch/arm/isa/formats/fp.isa
+++ b/src/arch/arm/isa/formats/fp.isa
@@ -96,6 +96,9 @@

     StaticInstPtr
     decodeNeonData(ExtMachInst machInst);
+
+    StaticInstPtr
+    decodeAdvancedSIMD(ExtMachInst machInst);
     '''

     decoder_output = '''
@@ -333,6 +336,84 @@
         return new Unknown(machInst);
     }
     '''
+    decoder_output += '''
+    StaticInstPtr
+    decodeAdvancedSIMD(ExtMachInst machInst)
+    {
+        uint8_t op_code = (bits(machInst, 25) << 1)
+                          | bits(machInst, 21);
+
+        IntRegIndex vd = (IntRegIndex)(2 * (bits(machInst, 15, 12) |
+                               (bits(machInst, 22) << 4)));
+        IntRegIndex vn = (IntRegIndex)(2 * (bits(machInst, 19, 16) |
+                               (bits(machInst, 7) << 4)));
+        IntRegIndex vm = (IntRegIndex)(2 * (bits(machInst, 3, 0) |
+                               (bits(machInst, 5) << 4)));
+        bool q = bits (machInst, 6);
+        switch (op_code) {
+          case 0x0:
+          {
+            // VCADD
+            bool s = bits (machInst, 20);
+            if (s) {
+               if (q)
+                   return new VcaddQ<uint32_t>(machInst, vd, vn, vm);
+               else
+                   return new VcaddD<uint32_t>(machInst, vd, vn, vm);
+            } else {
+               if (q)
+                   return new VcaddQ<uint16_t>(machInst, vd, vn, vm);
+               else
+                   return new VcaddD<uint16_t>(machInst, vd, vn, vm);
+            }
+          }
+          case 0x1:
+          {
+            // VCMLA
+            bool s = bits (machInst, 20);
+            if (s) {
+               if (q)
+                   return new VcmlaQ<uint32_t>(machInst, vd, vn, vm);
+               else
+                   return new VcmlaD<uint32_t>(machInst, vd, vn, vm);
+            } else {
+               if (q)
+                   return new VcmlaQ<uint16_t>(machInst, vd, vn, vm);
+               else
+                   return new VcmlaD<uint16_t>(machInst, vd, vn, vm);
+            }
+          }
+          case 0x2:
+          case 0x3:
+          {
+            // VCMLA by element
+            bool s = bits (machInst, 23);
+            if (s) {
+               uint8_t index_fp = 0;
+               if (q)
+                   return new VcmlaElemQ<uint32_t>(machInst, vd, vn, vm,
+                                                   index_fp);
+               else
+                   return new VcmlaElemD<uint32_t>(machInst, vd, vn, vm,
+                                                   index_fp);
+            } else {
+               vm = (IntRegIndex)(uint8_t)(2* bits(machInst, 3, 0));
+               uint8_t index_fp = bits(machInst, 5);
+               if (q)
+                   return new VcmlaElemQ<uint16_t>(machInst, vd, vn, vm,
+                                                   index_fp);
+               else
+                   return new VcmlaElemD<uint16_t>(machInst, vd, vn, vm,
+                                                   index_fp);
+            }
+          }
+          default:
+            return new Unknown64(machInst);
+        }
+
+    }
+    '''
+

     decoder_output += '''
     static StaticInstPtr
@@ -1869,6 +1950,12 @@
     '''
 }};

+def format Thumb32NeonSIMD() {{
+    decode_block = '''
+    return decodeAdvancedSIMD(machInst);
+    '''
+}};
+
 let {{
     header_output = '''
     bool
diff --git a/src/arch/arm/isa/formats/neon64.isa b/src/arch/arm/isa/formats/neon64.isa
index 1bdc97c..6c2b2e0 100644
--- a/src/arch/arm/isa/formats/neon64.isa
+++ b/src/arch/arm/isa/formats/neon64.isa
@@ -39,6 +39,9 @@
     // AdvSIMD three same
     template <typename DecoderFeatures>
     StaticInstPtr decodeNeon3Same(ExtMachInst machInst);
+    // AdvSIMD three same Extra
+    template <typename DecoderFeatures>
+    StaticInstPtr decodeNeon3SameExtra(ExtMachInst machInst);
     // AdvSIMD three different
     inline StaticInstPtr decodeNeon3Diff(ExtMachInst machInst);
     // AdvSIMD two-reg misc
@@ -500,6 +503,48 @@
         }
     }

+    template <typename DecoderFeatures>
+    StaticInstPtr
+    decodeNeon3SameExtra(ExtMachInst machInst)
+    {
+        uint8_t q      = bits(machInst, 30);
+        uint8_t size   = bits(machInst, 23, 22);
+        uint8_t opcode = bits(machInst, 15, 11);
+
+        IntRegIndex vd = (IntRegIndex) (uint8_t) bits(machInst, 4, 0);
+        IntRegIndex vn = (IntRegIndex) (uint8_t) bits(machInst, 9, 5);
+        IntRegIndex vm = (IntRegIndex) (uint8_t) bits(machInst, 20, 16);
+
+        switch (opcode) {
+          case 0x18:
+          case 0x19:
+          case 0x1a:
+          case 0x1b:
+            if (size == 0x1) {
+               if (q)
+                   return new FcmlaQX<uint16_t>(machInst, vd, vn, vm);
+               else
+                   return new FcmlaDX<uint16_t>(machInst, vd, vn, vm);
+            } else
+            return decodeNeonUThreeFpReg<FcmlaDX, FcmlaQX>(
+                                q, size & 0x1, machInst, vd, vn, vm);
+
+          case 0x1c:
+          case 0x1e:
+            if (size == 0x1) {
+               if (q)
+                   return new FcaddQX<uint16_t>(machInst, vd, vn, vm);
+               else
+                   return new FcaddDX<uint16_t>(machInst, vd, vn, vm);
+            } else
+                return decodeNeonUThreeFpReg<FcaddDX, FcaddQX>(
+                                    q, size & 0x1, machInst, vd, vn, vm);
+
+          default:
+            return new Unknown64(machInst);
+        }
+    }
+
     StaticInstPtr
     decodeNeon3Diff(ExtMachInst machInst)
     {
@@ -1324,7 +1369,27 @@
             if (!u && size >= 2 && sz_q != 0x2 && sz_L != 0x3)
                 return decodeNeonUThreeImmFpReg<FmlaElemDX, FmlaElemQX>(
                     q, sz, machInst, vd, vn, vm_fp, index_fp);
-            else
+            else if (u && (size == 1 || size == 2)){
+                // FCMLA by element
+                if (size == 0x2) {
+                    index_fp = H;
+                    if (q)
+                        return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                } else {
+                    index_fp = (H << 1) | L;
+                    if (q)
+                        return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                }
+
+            } else
                 return new Unknown64(machInst);
           case 0x2:
             if (size == 0x0 || size == 0x3)
@@ -1336,7 +1401,26 @@
return decodeNeonSThreeImmHAndWReg<SmlalElemX, SmlalElem2X>(
                     q, size, machInst, vd, vn, vm, index);
           case 0x3:
-            if (u || (size == 0x0 || size == 0x3))
+            if (u && (size == 1 || size == 2)){
+                // FCMLA by element
+                if (size == 0x2) {
+                    index_fp = H;
+                    if (q)
+                        return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                } else {
+                    index_fp = (H << 1) | L;
+                    if (q)
+                        return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                }
+            } else if (u || (size == 0x0 || size == 0x3))
                 return new Unknown64(machInst);
             else
                 return decodeNeonSThreeImmHAndWReg<SqdmlalElemX,
@@ -1352,7 +1436,26 @@
             if (!u && size >= 0x2 && sz_L != 0x3 && sz_q != 0x2)
                 return decodeNeonUThreeImmFpReg<FmlsElemDX, FmlsElemQX>(
                     q, sz, machInst, vd, vn, vm_fp, index_fp);
-            else
+            else if (u && (size == 1 || size == 2)){
+                // FCMLA by element
+                if (size == 0x2) {
+                    index_fp = H;
+                    if (q)
+                        return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                } else {
+                    index_fp = (H << 1) | L;
+                    if (q)
+                        return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                }
+            } else
                 return new Unknown64(machInst);
           case 0x6:
             if (size == 0x0 || size == 0x3)
@@ -1364,7 +1467,26 @@
return decodeNeonSThreeImmHAndWReg<SmlslElemX, SmlslElem2X>(
                     q, size, machInst, vd, vn, vm, index);
           case 0x7:
-            if (u || (size == 0x0 || size == 0x3))
+             if (u && (size == 1 || size == 2)){
+                // FCMLA by element
+                if (size == 0x2) {
+                    index_fp = H;
+                    if (q)
+                        return new FcmlaElemQX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint32_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                } else {
+                    index_fp = (H << 1) | L;
+                    if (q)
+                        return new FcmlaElemQX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                    else
+                        return new FcmlaElemDX<uint16_t>(machInst, vd, vn,
+                                                        vm_fp, index_fp);
+                }
+             } else if (u || (size == 0x0 || size == 0x3))
                 return new Unknown64(machInst);
             else
                 return decodeNeonSThreeImmHAndWReg<SqdmlslElemX,
diff --git a/src/arch/arm/isa/formats/uncond.isa b/src/arch/arm/isa/formats/uncond.isa
index ed5ed04..dfa9214 100644
--- a/src/arch/arm/isa/formats/uncond.isa
+++ b/src/arch/arm/isa/formats/uncond.isa
@@ -237,7 +237,9 @@
                     return new BlxImm(machInst, imm, COND_UC);
                 }
               case 0x2:
-                if (bits(op1, 4, 0) != 0) {
+                if (bits(machInst, 31, 25) == 0x7e){
+                    return decodeAdvancedSIMD(machInst);
+                } else if (bits(op1, 4, 0) != 0) {
                     if (CPNUM == 0xa || CPNUM == 0xb) {
                         return decodeExtensionRegLoadStore(machInst);
                     }
@@ -262,7 +264,9 @@
                 }
                 break;
               case 0x3:
-                if (bits(op1, 4) == 0) {
+                if (bits(machInst, 31, 24) == 0xfe) {
+                    return decodeAdvancedSIMD(machInst);
+                } else if (bits(op1, 4) == 0) {
                     if (CPNUM == 0xa || CPNUM == 0xb) {
                         return decodeShortFpTransfer(machInst);
                     } else if (CPNUM == 0xe) {
diff --git a/src/arch/arm/isa/insts/neon.isa b/src/arch/arm/isa/insts/neon.isa
index facdd16..aa67353 100644
--- a/src/arch/arm/isa/insts/neon.isa
+++ b/src/arch/arm/isa/insts/neon.isa
@@ -1146,12 +1146,21 @@
     allTypes = unsignedTypes + signedTypes

     def threeEqualRegInst(name, Name, opClass, types, rCount, op,
-                          readDest=False, pairwise=False,
-                          standardFpcsr=False):
+                          readDest=False, pairwise=False, byElem=False,
+                          standardFpcsr=False, complex=False):
         global header_output, exec_output
         eWalkCode = simdEnabledCheckCode + '''
-        RegVect srcReg1, srcReg2, destReg;
-        '''
+                    RegVect srcReg1, destReg;
+                    '''
+        if byElem:
+            # 2nd register operand has to be read fully
+            eWalkCode += '''
+                FullRegVect srcReg2;
+                '''
+        else:
+            eWalkCode += '''
+            RegVect srcReg2;
+            '''
         for reg in range(rCount):
             eWalkCode += '''
                 srcReg1.regs[%(reg)d] = htole(FpOp1P%(reg)d_uw);
@@ -1161,6 +1170,13 @@
                 eWalkCode += '''
                     destReg.regs[%(reg)d] = htole(FpDestP%(reg)d_uw);
                 ''' % { "reg" : reg }
+        if byElem:
+            # 2nd operand has to be read fully
+            for reg in range(rCount, 4):
+                eWalkCode += '''
+        srcReg2.regs[%(reg)d] = htole(FpOp2P%(reg)d_uw);
+        ''' % { "reg" : reg }
+
         readDestCode = ''
         if standardFpcsr:
             eWalkCode += '''
@@ -1168,7 +1184,10 @@
             '''
         if readDest:
             readDestCode = 'destElem = letoh(destReg.elements[i]);'
-        if pairwise:
+
+        if complex:
+            eWalkCode += op
+        elif pairwise:
             eWalkCode += '''
             for (unsigned i = 0; i < eCount; i++) {
                 Element srcElem1 = letoh(2 * i < eCount ?
@@ -1203,12 +1222,15 @@
             FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
             ''' % { "reg" : reg }
         iop = InstObjParams(name, Name,
-                            "RegRegRegOp",
-                            { "code": eWalkCode,
-                              "r_count": rCount,
-                              "predicate_test": predicateTest,
-                              "op_class": opClass }, [])
-        header_output += NeonRegRegRegOpDeclare.subst(iop)
+                            "RegRegRegImmOp" if byElem else "RegRegRegOp",
+                             { "code": eWalkCode,
+                               "r_count": rCount,
+                               "predicate_test": predicateTest,
+                               "op_class": opClass }, [])
+        if byElem:
+            header_output += NeonRegRegRegImmOpDeclare.subst(iop)
+        else:
+            header_output += NeonRegRegRegOpDeclare.subst(iop)
         exec_output += NeonEqualRegExecute.subst(iop)
         for type in types:
             substDict = { "targs" : type,
@@ -2186,6 +2208,119 @@
     '''
threeRegNarrowInst("vrsubhn", "Vrsubhn", "SimdAddOp", smallTypes, vrsubhnCode)

+    vcaddCode = '''
+        bool rot = bits(machInst, 24);
+        Element el1;
+        Element el3;
+
+        for (int i = 0; i < eCount/2; ++i) {
+            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+            Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
+            Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
+            Element destElem_1;
+            Element destElem_2;
+            if (rot) {
+                el1 = srcElem2_2;
+                el3 = fplibNeg<Element>(srcElem2_1);
+            } else {
+                el1 = fplibNeg<Element>(srcElem2_2);
+                el3 = srcElem2_1;
+            }
+
+            destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
+            destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
+            destReg.elements[2*i] = htole(destElem_1);
+            destReg.elements[2*i+1] = htole(destElem_2);
+         }
+         '''
+
+    # VCADD
+    threeEqualRegInst("vcadd", "VcaddD", "SimdFloatAddOp",
+                            ("uint16_t", "uint32_t"), 2, vcaddCode,
+                            standardFpcsr=True, complex=True)
+    threeEqualRegInst("vcadd", "VcaddQ", "SimdFloatAddOp",
+                            ("uint16_t", "uint32_t"), 4,
+                           vcaddCode, standardFpcsr=True, complex=True)
+
+    vcmlaCode = '''
+        uint8_t rot = bits(machInst, %(rot)s);
+        Element el1;
+        Element el2;
+        Element el3;
+        Element el4;
+        for (int i = 0; i < eCount/2; ++i) {
+
+            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+            Element srcElem2_1 = letoh(srcReg2.elements[2*%(index)s]);
+            Element srcElem2_2 = letoh(srcReg2.elements[2*%(index)s+1]);
+            Element destElem_1 = letoh(destReg.elements[2*i]);
+            Element destElem_2 = letoh(destReg.elements[2*i+1]);
+
+            switch (rot) {
+              case 0x0:
+                {
+                  el1 = srcElem2_1;
+                  el2 = srcElem1_1;
+                  el3 = srcElem2_2;
+                  el4 = srcElem1_1;
+                  break;
+                }
+              case 0x1:
+                {
+                  el1 = fplibNeg<Element>(srcElem2_2);
+                  el2 = srcElem1_2;
+                  el3 = srcElem2_1;
+                  el4 = srcElem1_2;
+                  break;
+                }
+              case 0x2:
+                {
+                  el1 = fplibNeg<Element>(srcElem2_1);
+                  el2 = srcElem1_1;
+                  el3 = fplibNeg<Element>(srcElem2_2);
+                  el4 = srcElem1_1;
+                  break;
+                }
+              case 0x3:
+                {
+                  el1 = srcElem2_2;
+                  el2 = srcElem1_2;
+                  el3 = fplibNeg<Element>(srcElem2_1);
+                  el4 = srcElem1_2;
+                  break;
+                }
+            }
+
+            destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
+            destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);
+
+            destReg.elements[2*i] = htole(destElem_1);
+            destReg.elements[2*i+1] = htole(destElem_2);
+         }
+         '''
+
+    # VCMLA (by element)
+    vcmla_imm = vcmlaCode % {'rot': '21, 20', 'index': 'imm'}
+    threeEqualRegInst("vcmla", "VcmlaElemD", "SimdFloatMultAccOp",
+                           ("uint16_t", "uint32_t"), 2, vcmla_imm,
+                           readDest=True, byElem=True, standardFpcsr=True,
+                           complex=True)
+    threeEqualRegInst("vcmla", "VcmlaElemQ", "SimdFloatMultAccOp",
+                           ("uint16_t", "uint32_t"), 4, vcmla_imm,
+                           readDest=True, byElem=True, standardFpcsr=True,
+                           complex=True)
+
+    # VCMLA (vector)
+    vcmla_vec = vcmlaCode % {'rot': '24, 23', 'index': 'i'}
+    threeEqualRegInst("vcmla", "VcmlaD", "SimdFloatMultAccOp",
+                            ("uint16_t", "uint32_t"), 2, vcmla_vec,
+ readDest=True, standardFpcsr=True, complex=True)
+    threeEqualRegInst("vcmla", "VcmlaQ", "SimdFloatMultAccOp",
+                            ("uint16_t", "uint32_t"), 4, vcmla_vec,
+ readDest=True, standardFpcsr=True, complex=True)
+
     vqaddSCode = '''
         destElem = srcElem1 + srcElem2;
         FPSCR fpscr = (FPSCR) FpscrQc;
diff --git a/src/arch/arm/isa/insts/neon64.isa b/src/arch/arm/isa/insts/neon64.isa
index 6db9e38..5186de3 100644
--- a/src/arch/arm/isa/insts/neon64.isa
+++ b/src/arch/arm/isa/insts/neon64.isa
@@ -52,7 +52,7 @@

     def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
                            readDest=False, pairwise=False, scalar=False,
-                           byElem=False, decoder='Generic'):
+                           byElem=False, decoder='Generic', complex=False):
         assert (not pairwise) or ((not byElem) and (not scalar))
         global header_output, exec_output, decoders
         eWalkCode = simd64EnabledCheckCode + '''
@@ -85,7 +85,10 @@
         readDestCode = ''
         if readDest:
             readDestCode = 'destElem = letoh(destReg.elements[i]);'
-        if pairwise:
+
+        if complex:
+            eWalkCode += op
+        elif pairwise:
             eWalkCode += '''
         for (unsigned i = 0; i < eCount; i++) {
             Element srcElem1 = letoh(2 * i < eCount ?
@@ -975,6 +978,119 @@
                        True)
threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
                        True)
+
+    # FCADD
+    fcaddCode = '''
+        bool rot = bits(machInst, 12);
+        Element el1;
+        Element el3;
+        for (int i = 0; i < eCount/2; ++i) {
+            FPSCR fpscr = (FPSCR) FpscrExc;
+
+            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+            Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
+            Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
+            Element destElem_1;
+            Element destElem_2;
+            if (rot) {
+                el1 = srcElem2_2;
+                el3 = fplibNeg<Element>(srcElem2_1);
+            } else {
+                el1 = fplibNeg<Element>(srcElem2_2);
+                el3 = srcElem2_1;
+            }
+
+            destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
+            destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);
+
+            FpscrExc = fpscr;
+
+            destReg.elements[2*i] = htole(destElem_1);
+            destReg.elements[2*i+1] = htole(destElem_2);
+         }
+         '''
+
+    threeEqualRegInstX("fcadd", "FcaddDX", "SimdFloatAddOp",
+                            ("uint16_t", "uint32_t"), 2,
+                            fcaddCode, complex=True)
+    threeEqualRegInstX("fcadd", "FcaddQX", "SimdFloatAddOp", floatTypes, 4,
+                       fcaddCode, complex=True)
+
+    fcmlaCode = '''
+        uint8_t rot = bits(machInst, %(rot)s);
+        Element el1;
+        Element el2;
+        Element el3;
+        Element el4;
+        for (int i = 0; i < eCount/2; ++i) {
+            FPSCR fpscr = (FPSCR) FpscrExc;
+
+            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
+            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
+            Element srcElem2_1 = letoh(srcReg2.elements[2* %(index)s]);
+            Element srcElem2_2 = letoh(srcReg2.elements[2* %(index)s +1]);
+            Element destElem_1 = letoh(destReg.elements[2*i]);
+            Element destElem_2 = letoh(destReg.elements[2*i+1]);
+
+            switch (rot) {
+              case 0x0:
+                {
+                  el1 = srcElem2_1;
+                  el2 = srcElem1_1;
+                  el3 = srcElem2_2;
+                  el4 = srcElem1_1;
+                  break;
+                }
+              case 0x1:
+                {
+                  el1 = fplibNeg<Element>(srcElem2_2);
+                  el2 = srcElem1_2;
+                  el3 = srcElem2_1;
+                  el4 = srcElem1_2;
+                  break;
+                }
+              case 0x2:
+                {
+                  el1 = fplibNeg<Element>(srcElem2_1);
+                  el2 = srcElem1_1;
+                  el3 = fplibNeg<Element>(srcElem2_2);
+                  el4 = srcElem1_1;
+                  break;
+                }
+              case 0x3:
+                {
+                  el1 = srcElem2_2;
+                  el2 = srcElem1_2;
+                  el3 = fplibNeg<Element>(srcElem2_1);
+                  el4 = srcElem1_2;
+                  break;
+                }
+            }
+            destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
+            destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);
+
+            FpscrExc = fpscr;
+
+            destReg.elements[2*i] = htole(destElem_1);
+            destReg.elements[2*i+1] = htole(destElem_2);
+         }
+         '''
+    # FCMLA (by element)
+    fcmla_imm = fcmlaCode % {'rot': '14, 13', 'index': 'imm'}
+    threeEqualRegInstX("fcmla", "FcmlaElemDX", "SimdFloatMultAccOp",
+                        ("uint16_t", "uint32_t"), 2, fcmla_imm, True,
+                        byElem=True, complex=True)
+    threeEqualRegInstX("fcmla", "FcmlaElemQX", "SimdFloatMultAccOp",
+                       floatTypes, 4, fcmla_imm, True, byElem=True,
+                       complex=True)
+    # FCMLA (vector)
+    fcmla_vec = fcmlaCode % {'rot': '12, 11', 'index': 'i'}
+    threeEqualRegInstX("fcmla", "FcmlaDX", "SimdFloatMultAccOp",
+                       ("uint16_t", "uint32_t"), 2, fcmla_vec, True,
+                       complex=True)
+    threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp",
+                       floatTypes, 4, fcmla_vec, True, complex=True)
     # CLS
     clsCode = '''
             unsigned count = 0;
diff --git a/src/arch/arm/isa/templates/neon.isa b/src/arch/arm/isa/templates/neon.isa
index ee38c2b..39e6d22 100644
--- a/src/arch/arm/isa/templates/neon.isa
+++ b/src/arch/arm/isa/templates/neon.isa
@@ -215,12 +215,18 @@

         const unsigned rCount = %(r_count)d;
const unsigned eCount = rCount * sizeof(uint32_t) / sizeof(Element);
+        const unsigned eCountFull = 4 * sizeof(uint32_t) / sizeof(Element);

         union RegVect {
             uint32_t regs[rCount];
             Element elements[eCount];
         };

+        union FullRegVect {
+            uint32_t regs[4];
+            Element elements[eCountFull];
+        };
+
         if (%(predicate_test)s)
         {
             %(code)s;

--
To view, visit https://gem5-review.googlesource.com/c/public/gem5/+/27183
To unsubscribe, or for help writing mail filters, visit https://gem5-review.googlesource.com/settings

Gerrit-Project: public/gem5
Gerrit-Branch: develop
Gerrit-Change-Id: I7f11ce88137dad077d2cad698dcaa9a79a3f317b
Gerrit-Change-Number: 27183
Gerrit-PatchSet: 1
Gerrit-Owner: Jordi Vaquero <jordi.vaqu...@metempsy.com>
Gerrit-MessageType: newchange
_______________________________________________
gem5-dev mailing list
gem5-dev@gem5.org
http://m5sim.org/mailman/listinfo/gem5-dev

Reply via email to