Revision: 4934
Author: [email protected]
Date: Thu Jun 24 02:03:49 2010
Log: X64: Remove more fpu code. Unroll more local initialization loops.

Review URL: http://codereview.chromium.org/2815028
http://code.google.com/p/v8/source/detail?r=4934

Modified:
 /branches/bleeding_edge/src/x64/assembler-x64.cc
 /branches/bleeding_edge/src/x64/assembler-x64.h
 /branches/bleeding_edge/src/x64/codegen-x64.cc
 /branches/bleeding_edge/src/x64/ic-x64.cc
 /branches/bleeding_edge/src/x64/virtual-frame-x64.cc
 /branches/bleeding_edge/src/x64/virtual-frame-x64.h

=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.cc Wed Jun 23 07:05:18 2010
+++ /branches/bleeding_edge/src/x64/assembler-x64.cc Thu Jun 24 02:03:49 2010
@@ -2736,6 +2736,28 @@
   emit(0x5A);
   emit_sse_operand(dst, src);
 }
+
+
+void Assembler::cvtsd2si(Register dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0xF2);
+  emit_optional_rex_32(dst, src);
+  emit(0x0F);
+  emit(0x2D);
+  emit_sse_operand(dst, src);
+}
+
+
+void Assembler::cvtsd2siq(Register dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  last_pc_ = pc_;
+  emit(0xF2);
+  emit_rex_64(dst, src);
+  emit(0x0F);
+  emit(0x2D);
+  emit_sse_operand(dst, src);
+}


 void Assembler::addsd(XMMRegister dst, XMMRegister src) {
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.h     Wed Jun 23 07:05:18 2010
+++ /branches/bleeding_edge/src/x64/assembler-x64.h     Thu Jun 24 02:03:49 2010
@@ -1128,6 +1128,9 @@
   void cvtss2sd(XMMRegister dst, const Operand& src);
   void cvtsd2ss(XMMRegister dst, XMMRegister src);

+  void cvtsd2si(Register dst, XMMRegister src);
+  void cvtsd2siq(Register dst, XMMRegister src);
+
   void addsd(XMMRegister dst, XMMRegister src);
   void subsd(XMMRegister dst, XMMRegister src);
   void mulsd(XMMRegister dst, XMMRegister src);
=======================================
--- /branches/bleeding_edge/src/x64/codegen-x64.cc      Wed Jun 23 07:05:18 2010
+++ /branches/bleeding_edge/src/x64/codegen-x64.cc      Thu Jun 24 02:03:49 2010
@@ -2641,7 +2641,7 @@

   // Generate code to set the elements in the array that are not
   // literals.
-  for (int i = 0; i < node->values()->length(); i++) {
+  for (int i = 0; i < length; i++) {
     Expression* value = node->values()->at(i);

     // If value is a literal the property value is already set in the
=======================================
--- /branches/bleeding_edge/src/x64/ic-x64.cc   Wed Jun 23 07:05:18 2010
+++ /branches/bleeding_edge/src/x64/ic-x64.cc   Thu Jun 24 02:03:49 2010
@@ -791,7 +791,6 @@

     // Allocate a HeapNumber for the int and perform int-to-double
     // conversion.
-    ASSERT(array_type == kExternalUnsignedIntArray);
     // The value is zero-extended since we loaded the value from memory
     // with movl.
     __ cvtqsi2sd(xmm0, rcx);
@@ -1121,55 +1120,41 @@
   // The WebGL specification leaves the behavior of storing NaN and
   // +/-Infinity into integer arrays basically undefined. For more
   // reproducible behavior, convert these to zero.
-  __ fld_d(FieldOperand(rax, HeapNumber::kValueOffset));
+  __ movsd(xmm0, FieldOperand(rax, HeapNumber::kValueOffset));
   __ movq(rbx, FieldOperand(rbx, ExternalArray::kExternalPointerOffset));
   // rdi: untagged index
   // rbx: base pointer of external storage
   // top of FPU stack: value
   if (array_type == kExternalFloatArray) {
-    __ fstp_s(Operand(rbx, rdi, times_4, 0));
+    __ cvtsd2ss(xmm0, xmm0);
+    __ movss(Operand(rbx, rdi, times_4, 0), xmm0);
     __ ret(0);
   } else {
     // Need to perform float-to-int conversion.
-    // Test the top of the FP stack for NaN.
-    Label is_nan;
-    __ fucomi(0);
-    __ j(parity_even, &is_nan);
-
-    __ push(rdx);  // Make room on the stack.  Receiver is no longer needed.
-    // TODO(lrn): If the rounding of this conversion is not deliberate, maybe
-    // switch to xmm registers.
-    __ fistp_d(Operand(rsp, 0));
-    __ pop(rdx);
+    // Test the value for NaN.
+
+    // Convert to int32 and store the low byte/word.
+    // If the value is NaN or +/-infinity, the result is 0x80000000,
+    // which is automatically zero when taken mod 2^n, n < 32.
     // rdx: value (converted to an untagged integer)
     // rdi: untagged index
     // rbx: base pointer of external storage
     switch (array_type) {
       case kExternalByteArray:
       case kExternalUnsignedByteArray:
+        __ cvtsd2si(rdx, xmm0);
         __ movb(Operand(rbx, rdi, times_1, 0), rdx);
         break;
       case kExternalShortArray:
       case kExternalUnsignedShortArray:
+        __ cvtsd2si(rdx, xmm0);
         __ movw(Operand(rbx, rdi, times_2, 0), rdx);
         break;
       case kExternalIntArray:
       case kExternalUnsignedIntArray: {
-        // We also need to explicitly check for +/-Infinity. These are
-        // converted to MIN_INT, but we need to be careful not to
-        // confuse with legal uses of MIN_INT.  Since MIN_INT truncated
-        // to 8 or 16 bits is zero, we only perform this test when storing
-        // 32-bit ints.
-        Label not_infinity;
-        // This test would apparently detect both NaN and Infinity,
-        // but we've already checked for NaN using the FPU hardware
-        // above.
-        __ movzxwq(rcx, FieldOperand(rax, HeapNumber::kValueOffset + 6));
-        __ and_(rcx, Immediate(0x7FF0));
-        __ cmpw(rcx, Immediate(0x7FF0));
-        __ j(not_equal, &not_infinity);
-        __ movq(rdx, Immediate(0));
-        __ bind(&not_infinity);
+        // Convert to int64, so that NaN and infinities become
+        // 0x8000000000000000, which is zero mod 2^32.
+        __ cvtsd2siq(rdx, xmm0);
         __ movl(Operand(rbx, rdi, times_4, 0), rdx);
         break;
       }
@@ -1177,31 +1162,6 @@
         UNREACHABLE();
         break;
     }
-    __ ret(0);
-
-    __ bind(&is_nan);
-    // rdi: untagged index
-    // rbx: base pointer of external storage
-    __ ffree();
-    __ fincstp();
-    __ Set(rdx, 0);
-    switch (array_type) {
-      case kExternalByteArray:
-      case kExternalUnsignedByteArray:
-        __ movb(Operand(rbx, rdi, times_1, 0), rdx);
-        break;
-      case kExternalShortArray:
-      case kExternalUnsignedShortArray:
-        __ movw(Operand(rbx, rdi, times_2, 0), rdx);
-        break;
-      case kExternalIntArray:
-      case kExternalUnsignedIntArray:
-        __ movl(Operand(rbx, rdi, times_4, 0), rdx);
-        break;
-      default:
-        UNREACHABLE();
-        break;
-    }
     __ ret(0);
   }

=======================================
--- /branches/bleeding_edge/src/x64/virtual-frame-x64.cc Tue Jun 22 03:07:57 2010
+++ /branches/bleeding_edge/src/x64/virtual-frame-x64.cc Thu Jun 24 02:03:49 2010
@@ -115,25 +115,45 @@
     Handle<Object> undefined = Factory::undefined_value();
     FrameElement initial_value =
         FrameElement::ConstantElement(undefined, FrameElement::SYNCED);
-    if (count == 1) {
-      __ Push(undefined);
-    } else if (count < kLocalVarBound) {
-      // For less locals the unrolled loop is more compact.
-      __ movq(kScratchRegister, undefined, RelocInfo::EMBEDDED_OBJECT);
+    if (count < kLocalVarBound) {
+      // For fewer locals the unrolled loop is more compact.
+
+      // Hope for one of the first eight registers, where the push operation
+      // takes only one byte (kScratchRegister needs the REX.W bit).
+      Result tmp = cgen()->allocator()->Allocate();
+      ASSERT(tmp.is_valid());
+      __ movq(tmp.reg(), undefined, RelocInfo::EMBEDDED_OBJECT);
       for (int i = 0; i < count; i++) {
-        __ push(kScratchRegister);
+        __ push(tmp.reg());
       }
     } else {
       // For more locals a loop in generated code is more compact.
       Label alloc_locals_loop;
       Result cnt = cgen()->allocator()->Allocate();
       ASSERT(cnt.is_valid());
-      __ movq(cnt.reg(), Immediate(count));
       __ movq(kScratchRegister, undefined, RelocInfo::EMBEDDED_OBJECT);
+#ifdef DEBUG
+      Label loop_size;
+      __ bind(&loop_size);
+#endif
+      if (is_uint8(count)) {
+        // Loading imm8 is shorter than loading imm32.
+        // Loading only partial byte register, and using decb below.
+        __ movb(cnt.reg(), Immediate(count));
+      } else {
+        __ movl(cnt.reg(), Immediate(count));
+      }
       __ bind(&alloc_locals_loop);
       __ push(kScratchRegister);
-      __ decl(cnt.reg());
+      if (is_uint8(count)) {
+        __ decb(cnt.reg());
+      } else {
+        __ decl(cnt.reg());
+      }
       __ j(not_zero, &alloc_locals_loop);
+#ifdef DEBUG
+      CHECK(masm()->SizeOfCodeGeneratedSince(&loop_size) < kLocalVarBound);
+#endif
     }
     for (int i = 0; i < count; i++) {
       elements_.Add(initial_value);
=======================================
--- /branches/bleeding_edge/src/x64/virtual-frame-x64.h Wed Jun 16 03:03:47 2010
+++ /branches/bleeding_edge/src/x64/virtual-frame-x64.h Thu Jun 24 02:03:49 2010
@@ -200,7 +200,7 @@
   inline void PrepareForReturn();

   // Number of local variables after when we use a loop for allocating.
-  static const int kLocalVarBound = 7;
+  static const int kLocalVarBound = 14;

   // Allocate and initialize the frame-allocated locals.
   void AllocateStackSlots();

--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev

Reply via email to