Reviewers: Rodolph Perfetta, ulan

Message:
This makes for faster writes to unboxed double arrays.

On the matmul benchmark that I used in
https://chromiumcodereview.appspot.com/11428137, I get a 17% speed-up on
Nexus 10. On kraken-imaging-desaturate-orig, I get about a 5% speed-up.

Description:
ARM: Make DoStoreKeyedFixedDoubleArray faster; don't allow conditional Vmov

This patch makes us generate faster code for DoStoreKeyedFixedDoubleArray
by using a branch rather than a conditional Vmov.

Conditional VFP instructions are rarely a good idea, and a conditional Vmov
was especially bad here because Vmov expands to a sequence of instructions,
every one of which carries the condition. For this reason, the patch also
removes the 'cond' parameter from Vmov.
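
For illustration, the two sequences look roughly like this (a sketch, not
the literal generated code: 'canonical_nan' is shorthand for
FixedDoubleArray::canonical_not_the_hole_nan_as_double(), the immediates
and S registers are placeholders, and the expansion shown assumes the
no-scratch path for a register in D0-D15):

  Before, the whole expanded sequence was predicated on 'vs', so it was
  fetched and decoded even in the common non-NaN case:

    __ VFPCompareAndSetFlags(value, value);
    __ Vmov(value, canonical_nan, no_reg, vs);
    // ... which expands to roughly:
    //   movvs  ip, #lo_bits
    //   vmovvs s14, ip        ; value.low()
    //   movvs  ip, #hi_bits
    //   vmovvs s15, ip        ; value.high()

  After, a single branch skips an unconditional Vmov whenever the value
  is not a NaN:

    __ VFPCompareAndSetFlags(value, value);
    __ b(vc, &after_canonicalization);
    __ Vmov(value, canonical_nan);
    __ bind(&after_canonicalization);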

Thanks to Rodolph for pointing me to this!

BUG=none


Please review this at https://codereview.chromium.org/12316096/

SVN Base: https://v8.googlecode.com/svn/branches/bleeding_edge

Affected files:
  M src/arm/assembler-arm.h
  M src/arm/assembler-arm.cc
  M src/arm/lithium-codegen-arm.cc
  M src/arm/macro-assembler-arm.h
  M src/arm/macro-assembler-arm.cc


Index: src/arm/assembler-arm.cc
diff --git a/src/arm/assembler-arm.cc b/src/arm/assembler-arm.cc
index 0c9a6022fcf900eaade8f1cf79b46f436934b54e..7cd0a1753eff2b9f663a44b35b05bfb9f4b1e2c7 100644
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -2067,8 +2067,7 @@ static bool FitsVMOVDoubleImmediate(double d, uint32_t *encoding) {

 void Assembler::vmov(const DwVfpRegister dst,
                      double imm,
-                     const Register scratch,
-                     const Condition cond) {
+                     const Register scratch) {
   ASSERT(CpuFeatures::IsEnabled(VFP2));

   uint32_t enc;
@@ -2081,7 +2080,7 @@ void Assembler::vmov(const DwVfpRegister dst,
     // Vd(15-12) | 101(11-9) | sz=1(8) | imm4L(3-0)
     int vd, d;
     dst.split_code(&vd, &d);
-    emit(cond | 0x1D*B23 | d*B22 | 0x3*B20 | vd*B12 | 0x5*B9 | B8 | enc);
+    emit(al | 0x1D*B23 | d*B22 | 0x3*B20 | vd*B12 | 0x5*B9 | B8 | enc);
   } else if (FLAG_enable_vldr_imm) {
     // TODO(jfb) Temporarily turned off until we have constant blinding or
     //           some equivalent mitigation: an attacker can otherwise control
@@ -2099,7 +2098,7 @@ void Assembler::vmov(const DwVfpRegister dst,
     //           that's tricky because vldr has a limited reach. Furthermore
     //           it breaks load locality.
     RecordRelocInfo(imm);
-    vldr(dst, MemOperand(pc, 0), cond);
+    vldr(dst, MemOperand(pc, 0));
   } else {
     // Synthesise the double from ARM immediates.
     uint32_t lo, hi;
@@ -2110,27 +2109,27 @@ void Assembler::vmov(const DwVfpRegister dst,
         // Move the low part of the double into the lower of the corresponding S
         // registers of D register dst.
         mov(ip, Operand(lo));
-        vmov(dst.low(), ip, cond);
+        vmov(dst.low(), ip);

         // Move the high part of the double into the higher of the
         // corresponding S registers of D register dst.
         mov(ip, Operand(hi));
-        vmov(dst.high(), ip, cond);
+        vmov(dst.high(), ip);
       } else {
         // D16-D31 do not have S registers, so move the low and high parts
         // directly to the D register using vmov.32.
         // Note: This may be slower, so we only do this when we have to.
         mov(ip, Operand(lo));
-        vmov(dst, VmovIndexLo, ip, cond);
+        vmov(dst, VmovIndexLo, ip);
         mov(ip, Operand(hi));
-        vmov(dst, VmovIndexHi, ip, cond);
+        vmov(dst, VmovIndexHi, ip);
       }
     } else {
       // Move the low and high parts of the double to a D register in one
       // instruction.
       mov(ip, Operand(lo));
       mov(scratch, Operand(hi));
-      vmov(dst, ip, scratch, cond);
+      vmov(dst, ip, scratch);
     }
   }
 }
Index: src/arm/assembler-arm.h
diff --git a/src/arm/assembler-arm.h b/src/arm/assembler-arm.h
index acf4beb87b2b69c6ff13a99f0f33f06ecf829223..b32c0f3c5ad9b291d9e058d345fadba02d497232 100644
--- a/src/arm/assembler-arm.h
+++ b/src/arm/assembler-arm.h
@@ -1066,8 +1066,7 @@ class Assembler : public AssemblerBase {

   void vmov(const DwVfpRegister dst,
             double imm,
-            const Register scratch = no_reg,
-            const Condition cond = al);
+            const Register scratch = no_reg);
   void vmov(const SwVfpRegister dst,
             const SwVfpRegister src,
             const Condition cond = al);
Index: src/arm/lithium-codegen-arm.cc
diff --git a/src/arm/lithium-codegen-arm.cc b/src/arm/lithium-codegen-arm.cc
index faeb841d30c0951c18491830d9175ef4804e470b..da09a5356ca62f9bd557c56cc37e4975189053de 100644
--- a/src/arm/lithium-codegen-arm.cc
+++ b/src/arm/lithium-codegen-arm.cc
@@ -4472,10 +4472,14 @@ void LCodeGen::DoStoreKeyedFixedDoubleArray(LStoreKeyed* instr) {
   if (instr->NeedsCanonicalization()) {
     // Check for NaN. All NaNs must be canonicalized.
     __ VFPCompareAndSetFlags(value, value);
+    Label after_canonicalization;
+
     // Only load canonical NaN if the comparison above set the overflow.
+    __ b(vc, &after_canonicalization);
     __ Vmov(value,
-            FixedDoubleArray::canonical_not_the_hole_nan_as_double(),
-            no_reg, vs);
+            FixedDoubleArray::canonical_not_the_hole_nan_as_double());
+
+    __ bind(&after_canonicalization);
   }

   __ vstr(value, scratch, instr->additional_index() << element_size_shift);
Index: src/arm/macro-assembler-arm.cc
diff --git a/src/arm/macro-assembler-arm.cc b/src/arm/macro-assembler-arm.cc
index 6796f7388abaca9aeff73b3b86b2ed89e4ff7437..2db68325f3ab8d46e5c3f92b568d16bfe7b2602e 100644
--- a/src/arm/macro-assembler-arm.cc
+++ b/src/arm/macro-assembler-arm.cc
@@ -811,19 +811,18 @@ void MacroAssembler::VFPCompareAndLoadFlags(const DwVfpRegister src1,

 void MacroAssembler::Vmov(const DwVfpRegister dst,
                           const double imm,
-                          const Register scratch,
-                          const Condition cond) {
+                          const Register scratch) {
   ASSERT(CpuFeatures::IsEnabled(VFP2));
   static const DoubleRepresentation minus_zero(-0.0);
   static const DoubleRepresentation zero(0.0);
   DoubleRepresentation value(imm);
   // Handle special values first.
   if (value.bits == zero.bits) {
-    vmov(dst, kDoubleRegZero, cond);
+    vmov(dst, kDoubleRegZero);
   } else if (value.bits == minus_zero.bits) {
-    vneg(dst, kDoubleRegZero, cond);
+    vneg(dst, kDoubleRegZero);
   } else {
-    vmov(dst, imm, scratch, cond);
+    vmov(dst, imm, scratch);
   }
 }

Index: src/arm/macro-assembler-arm.h
diff --git a/src/arm/macro-assembler-arm.h b/src/arm/macro-assembler-arm.h
index 93760bd2035c9fd6746f051e1f3244d100d05815..fb2833b1687eb79ed8bbf753cb4014f5a1ff8b47 100644
--- a/src/arm/macro-assembler-arm.h
+++ b/src/arm/macro-assembler-arm.h
@@ -480,8 +480,7 @@ class MacroAssembler: public Assembler {

   void Vmov(const DwVfpRegister dst,
             const double imm,
-            const Register scratch = no_reg,
-            const Condition cond = al);
+            const Register scratch = no_reg);

   // Enter exit frame.
   // stack_space - extra stack space, used for alignment before call to C.

