Revision: 4567
Author: [email protected]
Date: Mon May 3 03:43:49 2010
Log: X64: Port inline transcendental cache to X64.
Review URL: http://codereview.chromium.org/1860001
http://code.google.com/p/v8/source/detail?r=4567
Modified:
/branches/bleeding_edge/src/objects.h
/branches/bleeding_edge/src/x64/assembler-x64-inl.h
/branches/bleeding_edge/src/x64/assembler-x64.cc
/branches/bleeding_edge/src/x64/assembler-x64.h
/branches/bleeding_edge/src/x64/codegen-x64.cc
/branches/bleeding_edge/src/x64/codegen-x64.h
/branches/bleeding_edge/src/x64/disasm-x64.cc
=======================================
--- /branches/bleeding_edge/src/objects.h Wed Apr 28 05:05:40 2010
+++ /branches/bleeding_edge/src/objects.h Mon May 3 03:43:49 2010
@@ -1116,6 +1116,8 @@
static const uint32_t kSignMask = 0x80000000u;
static const uint32_t kExponentMask = 0x7ff00000u;
static const uint32_t kMantissaMask = 0xfffffu;
+ static const int kMantissaBits = 52;
+ static const int KExponentBits = 11;
static const int kExponentBias = 1023;
static const int kExponentShift = 20;
static const int kMantissaBitsInTopWord = 20;
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64-inl.h Wed Nov 11 01:50:06
2009
+++ /branches/bleeding_edge/src/x64/assembler-x64-inl.h Mon May 3 03:43:49
2010
@@ -87,6 +87,11 @@
void Assembler::emit_rex_64(XMMRegister reg, Register rm_reg) {
emit(0x48 | (reg.code() & 0x8) >> 1 | rm_reg.code() >> 3);
}
+
+
+void Assembler::emit_rex_64(Register reg, XMMRegister rm_reg) {
+ emit(0x48 | (reg.code() & 0x8) >> 1 | rm_reg.code() >> 3);
+}
void Assembler::emit_rex_64(Register reg, const Operand& op) {
@@ -158,6 +163,12 @@
byte rex_bits = (reg.code() & 0x8) >> 1 | (base.code() & 0x8) >> 3;
if (rex_bits != 0) emit(0x40 | rex_bits);
}
+
+
+void Assembler::emit_optional_rex_32(Register reg, XMMRegister base) {
+ byte rex_bits = (reg.code() & 0x8) >> 1 | (base.code() & 0x8) >> 3;
+ if (rex_bits != 0) emit(0x40 | rex_bits);
+}
void Assembler::emit_optional_rex_32(Register rm_reg) {
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.cc Mon May 3 02:53:47
2010
+++ /branches/bleeding_edge/src/x64/assembler-x64.cc Mon May 3 03:43:49
2010
@@ -2040,6 +2040,14 @@
emit(0xD9);
emit(0xEE);
}
+
+
+void Assembler::fldpi() {
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ emit(0xD9);
+ emit(0xEB);
+}
void Assembler::fld_s(const Operand& adr) {
@@ -2398,6 +2406,53 @@
emit(0x6E);
emit_sse_operand(dst, src);
}
+
+
+void Assembler::movd(Register dst, XMMRegister src) {
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ emit(0x66);
+ emit_optional_rex_32(dst, src);
+ emit(0x0F);
+ emit(0x7E);
+ emit_sse_operand(dst, src);
+}
+
+
+void Assembler::movq(XMMRegister dst, Register src) {
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ emit(0x66);
+ emit_rex_64(dst, src);
+ emit(0x0F);
+ emit(0x6E);
+ emit_sse_operand(dst, src);
+}
+
+
+void Assembler::movq(Register dst, XMMRegister src) {
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ emit(0x66);
+ emit_rex_64(dst, src);
+ emit(0x0F);
+ emit(0x7E);
+ emit_sse_operand(dst, src);
+}
+
+
+void Assembler::extractps(Register dst, XMMRegister src, byte imm8) {
+ ASSERT(is_uint2(imm8));
+ EnsureSpace ensure_space(this);
+ last_pc_ = pc_;
+ emit(0x66);
+ emit_optional_rex_32(dst, src);
+ emit(0x0F);
+ emit(0x3A);
+ emit(0x17);
+ emit_sse_operand(dst, src);
+ emit(imm8);
+}
void Assembler::movsd(const Operand& dst, XMMRegister src) {
@@ -2600,6 +2655,10 @@
void Assembler::emit_sse_operand(XMMRegister dst, Register src) {
emit(0xC0 | (dst.low_bits() << 3) | src.low_bits());
}
+
+void Assembler::emit_sse_operand(Register dst, XMMRegister src) {
+ emit(0xC0 | (dst.low_bits() << 3) | src.low_bits());
+}
// Relocation information implementations.
=======================================
--- /branches/bleeding_edge/src/x64/assembler-x64.h Wed Apr 28 07:43:51 2010
+++ /branches/bleeding_edge/src/x64/assembler-x64.h Mon May 3 03:43:49 2010
@@ -1019,6 +1019,7 @@
void fld1();
void fldz();
+ void fldpi();
void fld_s(const Operand& adr);
void fld_d(const Operand& adr);
@@ -1080,6 +1081,10 @@
// SSE2 instructions
void movd(XMMRegister dst, Register src);
+ void movd(Register dst, XMMRegister src);
+ void movq(XMMRegister dst, Register src);
+ void movq(Register dst, XMMRegister src);
+ void extractps(Register dst, XMMRegister src, byte imm8);
void movsd(const Operand& dst, XMMRegister src);
void movsd(XMMRegister dst, XMMRegister src);
@@ -1110,6 +1115,7 @@
void emit_sse_operand(XMMRegister dst, XMMRegister src);
void emit_sse_operand(XMMRegister reg, const Operand& adr);
void emit_sse_operand(XMMRegister dst, Register src);
+ void emit_sse_operand(Register dst, XMMRegister src);
// Use either movsd or movlpd.
// void movdbl(XMMRegister dst, const Operand& src);
@@ -1176,8 +1182,9 @@
// the top bit of both register codes.
// High bit of reg goes to REX.R, high bit of rm_reg goes to REX.B.
// REX.W is set.
- inline void emit_rex_64(Register reg, Register rm_reg);
inline void emit_rex_64(XMMRegister reg, Register rm_reg);
+ inline void emit_rex_64(Register reg, XMMRegister rm_reg);
+ inline void emit_rex_64(Register reg, Register rm_reg);
// Emits a REX prefix that encodes a 64-bit operand size and
// the top bit of the destination, index, and base register codes.
@@ -1235,9 +1242,13 @@
inline void emit_optional_rex_32(XMMRegister reg, XMMRegister base);
// As for emit_optional_rex_32(Register, Register), except that
- // the registers are XMM registers.
+ // one of the registers is an XMM registers.
inline void emit_optional_rex_32(XMMRegister reg, Register base);
+ // As for emit_optional_rex_32(Register, Register), except that
+ // one of the registers is an XMM registers.
+ inline void emit_optional_rex_32(Register reg, XMMRegister base);
+
// As for emit_optional_rex_32(Register, const Operand&), except that
// the register is an XMM register.
inline void emit_optional_rex_32(XMMRegister reg, const Operand& op);
=======================================
--- /branches/bleeding_edge/src/x64/codegen-x64.cc Mon May 3 02:54:48 2010
+++ /branches/bleeding_edge/src/x64/codegen-x64.cc Mon May 3 03:43:49 2010
@@ -43,6 +43,8 @@
#define __ ACCESS_MASM(masm_)
+static int64_t kNaNValue = V8_INT64_C(0x7ff8000000000000);
+
//
-------------------------------------------------------------------------
// Platform-specific DeferredCode functions.
@@ -4516,19 +4518,19 @@
void CodeGenerator::GenerateMathSin(ZoneList<Expression*>* args) {
ASSERT_EQ(args->length(), 1);
- // Load the argument on the stack and jump to the runtime.
Load(args->at(0));
- Result answer = frame_->CallRuntime(Runtime::kMath_sin, 1);
- frame_->Push(&answer);
+ TranscendentalCacheStub stub(TranscendentalCache::SIN);
+ Result result = frame_->CallStub(&stub, 1);
+ frame_->Push(&result);
}
void CodeGenerator::GenerateMathCos(ZoneList<Expression*>* args) {
ASSERT_EQ(args->length(), 1);
- // Load the argument on the stack and jump to the runtime.
Load(args->at(0));
- Result answer = frame_->CallRuntime(Runtime::kMath_cos, 1);
- frame_->Push(&answer);
+ TranscendentalCacheStub stub(TranscendentalCache::COS);
+ Result result = frame_->CallStub(&stub, 1);
+ frame_->Push(&result);
}
@@ -7459,6 +7461,216 @@
// End of CodeGenerator implementation.
+void TranscendentalCacheStub::Generate(MacroAssembler* masm) {
+ // Input on stack:
+ // rsp[8]: argument (should be number).
+ // rsp[0]: return address.
+ Label runtime_call;
+ Label runtime_call_clear_stack;
+ Label input_not_smi;
+ Label loaded;
+ // Test that rax is a number.
+ __ movq(rax, Operand(rsp, kPointerSize));
+ __ JumpIfNotSmi(rax, &input_not_smi);
+ // Input is a smi. Untag and load it onto the FPU stack.
+ // Then load the bits of the double into rbx.
+ __ SmiToInteger32(rax, rax);
+ __ subq(rsp, Immediate(kPointerSize));
+ __ cvtlsi2sd(xmm1, rax);
+ __ movsd(Operand(rsp, 0), xmm1);
+ __ movq(rbx, xmm1);
+ __ movq(rdx, xmm1);
+ __ fld_d(Operand(rsp, 0));
+ __ addq(rsp, Immediate(kPointerSize));
+ __ jmp(&loaded);
+
+ __ bind(&input_not_smi);
+ // Check if input is a HeapNumber.
+ __ Move(rbx, Factory::heap_number_map());
+ __ cmpq(rbx, FieldOperand(rax, HeapObject::kMapOffset));
+ __ j(not_equal, &runtime_call);
+ // Input is a HeapNumber. Push it on the FPU stack and load its
+ // bits into rbx.
+ __ fld_d(FieldOperand(rax, HeapNumber::kValueOffset));
+ __ movq(rbx, FieldOperand(rax, HeapNumber::kValueOffset));
+ __ movq(rdx, rbx);
+ __ bind(&loaded);
+ // ST[0] == double value
+ // rbx = bits of double value.
+ // rdx = also bits of double value.
+ // Compute hash (h is 32 bits, bits are 64):
+ // h = h0 = bits ^ (bits >> 32);
+ // h ^= h >> 16;
+ // h ^= h >> 8;
+ // h = h & (cacheSize - 1);
+ // or h = (h0 ^ (h0 >> 8) ^ (h0 >> 16) ^ (h0 >> 24)) & (cacheSize - 1)
+ __ sar(rdx, Immediate(32));
+ __ xorl(rdx, rbx);
+ __ movl(rcx, rdx);
+ __ movl(rax, rdx);
+ __ movl(rdi, rdx);
+ __ sarl(rdx, Immediate(8));
+ __ sarl(rcx, Immediate(16));
+ __ sarl(rax, Immediate(24));
+ __ xorl(rcx, rdx);
+ __ xorl(rax, rdi);
+ __ xorl(rcx, rax);
+ ASSERT(IsPowerOf2(TranscendentalCache::kCacheSize));
+ __ andl(rcx, Immediate(TranscendentalCache::kCacheSize - 1));
+ // ST[0] == double value.
+ // rbx = bits of double value.
+ // rcx = TranscendentalCache::hash(double value).
+ __ movq(rax, ExternalReference::transcendental_cache_array_address());
+ // rax points to cache array.
+ __ movq(rax, Operand(rax, type_ *
sizeof(TranscendentalCache::caches_[0])));
+ // rax points to the cache for the type type_.
+ // If NULL, the cache hasn't been initialized yet, so go through runtime.
+ __ testq(rax, rax);
+ __ j(zero, &runtime_call_clear_stack);
+#ifdef DEBUG
+ // Check that the layout of cache elements match expectations.
+ { // NOLINT - doesn't like a single brace on a line.
+ TranscendentalCache::Element test_elem[2];
+ char* elem_start = reinterpret_cast<char*>(&test_elem[0]);
+ char* elem2_start = reinterpret_cast<char*>(&test_elem[1]);
+ char* elem_in0 = reinterpret_cast<char*>(&(test_elem[0].in[0]));
+ char* elem_in1 = reinterpret_cast<char*>(&(test_elem[0].in[1]));
+ char* elem_out = reinterpret_cast<char*>(&(test_elem[0].output));
+ // Two uint_32's and a pointer per element.
+ CHECK_EQ(16, static_cast<int>(elem2_start - elem_start));
+ CHECK_EQ(0, static_cast<int>(elem_in0 - elem_start));
+ CHECK_EQ(kIntSize, static_cast<int>(elem_in1 - elem_start));
+ CHECK_EQ(2 * kIntSize, static_cast<int>(elem_out - elem_start));
+ }
+#endif
+ // Find the address of the rcx'th entry in the cache, i.e., &rax[rcx*16].
+ __ addl(rcx, rcx);
+ __ lea(rcx, Operand(rax, rcx, times_8, 0));
+ // Check if cache matches: Double value is stored in uint32_t[2] array.
+ Label cache_miss;
+ __ cmpq(rbx, Operand(rcx, 0));
+ __ j(not_equal, &cache_miss);
+ // Cache hit!
+ __ movq(rax, Operand(rcx, 2 * kIntSize));
+ __ fstp(0); // Clear FPU stack.
+ __ ret(kPointerSize);
+
+ __ bind(&cache_miss);
+ // Update cache with new value.
+ __ AllocateHeapNumber(rax, rdi, &runtime_call_clear_stack);
+ GenerateOperation(masm);
+ __ movq(Operand(rcx, 0), rbx);
+ __ movq(Operand(rcx, 2 * kIntSize), rax);
+ __ fstp_d(FieldOperand(rax, HeapNumber::kValueOffset));
+ __ ret(kPointerSize);
+
+ __ bind(&runtime_call_clear_stack);
+ __ fstp(0);
+ __ bind(&runtime_call);
+ __ TailCallExternalReference(ExternalReference(RuntimeFunction()), 1, 1);
+}
+
+
+Runtime::FunctionId TranscendentalCacheStub::RuntimeFunction() {
+ switch (type_) {
+ // Add more cases when necessary.
+ case TranscendentalCache::SIN: return Runtime::kMath_sin;
+ case TranscendentalCache::COS: return Runtime::kMath_cos;
+ default:
+ UNIMPLEMENTED();
+ return Runtime::kAbort;
+ }
+}
+
+
+void TranscendentalCacheStub::GenerateOperation(MacroAssembler* masm) {
+ // Only free register is rdi.
+ Label done;
+ ASSERT(type_ == TranscendentalCache::SIN ||
+ type_ == TranscendentalCache::COS);
+ // More transcendental types can be added later.
+
+ // Both fsin and fcos require arguments in the range +/-2^63 and
+ // return NaN for infinities and NaN. They can share all code except
+ // the actual fsin/fcos operation.
+ Label in_range;
+ // If argument is outside the range -2^63..2^63, fsin/cos doesn't
+ // work. We must reduce it to the appropriate range.
+ __ movq(rdi, rbx);
+ // Move exponent and sign bits to low bits.
+ __ shr(rdi, Immediate(HeapNumber::kMantissaBits));
+ // Remove sign bit.
+ __ andl(rdi, Immediate((1 << HeapNumber::KExponentBits) - 1));
+ int supported_exponent_limit = (63 + HeapNumber::kExponentBias);
+ __ cmpl(rdi, Immediate(supported_exponent_limit));
+ __ j(below, &in_range);
+ // Check for infinity and NaN. Both return NaN for sin.
+ __ cmpl(rdi, Immediate(0x7ff));
+ Label non_nan_result;
+ __ j(not_equal, &non_nan_result);
+ // Input is +/-Infinity or NaN. Result is NaN.
+ __ fstp(0); // Clear fpu stack.
+ // NaN is represented by 0x7ff8000000000000.
+ __ movq(rdi, kNaNValue, RelocInfo::NONE);
+ __ push(rdi);
+ __ fld_d(Operand(rsp, 0));
+ __ addq(rsp, Immediate(kPointerSize));
+ __ jmp(&done);
+
+ __ bind(&non_nan_result);
+
+ // Use fpmod to restrict argument to the range +/-2*PI.
+ __ movq(rdi, rax); // Save rax before using fnstsw_ax.
+ __ fldpi();
+ __ fadd(0);
+ __ fld(1);
+ // FPU Stack: input, 2*pi, input.
+ {
+ Label no_exceptions;
+ __ fwait();
+ __ fnstsw_ax();
+ // Clear if Illegal Operand or Zero Division exceptions are set.
+ __ testl(rax, Immediate(5));
+ __ j(zero, &no_exceptions);
+ __ fnclex();
+ __ bind(&no_exceptions);
+ }
+
+ // Compute st(0) % st(1)
+ {
+ Label partial_remainder_loop;
+ __ bind(&partial_remainder_loop);
+ __ fprem1();
+ __ fwait();
+ __ fnstsw_ax();
+ __ testl(rax, Immediate(0x400)); // Check C2 bit of FPU status word.
+ // If C2 is set, computation only has partial result. Loop to
+ // continue computation.
+ __ j(not_zero, &partial_remainder_loop);
+ }
+ // FPU Stack: input, 2*pi, input % 2*pi
+ __ fstp(2);
+ // FPU Stack: input % 2*pi, 2*pi,
+ __ fstp(0);
+ // FPU Stack: input % 2*pi
+ __ movq(rax, rdi); // Restore rax (allocated HeapNumber pointer).
+
+ // FPU Stack: input % 2*pi
+ __ bind(&in_range);
+ switch (type_) {
+ case TranscendentalCache::SIN:
+ __ fsin();
+ break;
+ case TranscendentalCache::COS:
+ __ fcos();
+ break;
+ default:
+ UNREACHABLE();
+ }
+ __ bind(&done);
+}
+
+
// Get the integer part of a heap number. Surprisingly, all this bit
twiddling
// is faster than using the built-in instructions on floating point
registers.
// Trashes rdi and rbx. Dest is rcx. Source cannot be rcx or one of the
@@ -10995,7 +11207,6 @@
__ testb(rax, Immediate(5));
__ j(zero, &valid_result);
__ fstp(0); // Drop result in st(0).
- int64_t kNaNValue = V8_INT64_C(0x7ff8000000000000);
__ movq(rcx, kNaNValue, RelocInfo::NONE);
__ movq(Operand(rsp, kPointerSize), rcx);
__ movsd(xmm0, Operand(rsp, kPointerSize));
=======================================
--- /branches/bleeding_edge/src/x64/codegen-x64.h Thu Apr 29 08:14:39 2010
+++ /branches/bleeding_edge/src/x64/codegen-x64.h Mon May 3 03:43:49 2010
@@ -675,6 +675,22 @@
};
+// Compute a transcendental math function natively, or call the
+// TranscendentalCache runtime function.
+class TranscendentalCacheStub: public CodeStub {
+ public:
+ explicit TranscendentalCacheStub(TranscendentalCache::Type type)
+ : type_(type) {}
+ void Generate(MacroAssembler* masm);
+ private:
+ TranscendentalCache::Type type_;
+ Major MajorKey() { return TranscendentalCache; }
+ int MinorKey() { return type_; }
+ Runtime::FunctionId RuntimeFunction();
+ void GenerateOperation(MacroAssembler* masm);
+};
+
+
// Flag that indicates how to generate code for the stub
GenericBinaryOpStub.
enum GenericBinaryFlags {
NO_GENERIC_BINARY_FLAGS = 0,
=======================================
--- /branches/bleeding_edge/src/x64/disasm-x64.cc Wed Apr 28 07:43:51 2010
+++ /branches/bleeding_edge/src/x64/disasm-x64.cc Mon May 3 03:43:49 2010
@@ -996,23 +996,44 @@
if (operand_size_ == 0x66) {
// 0x66 0x0F prefix.
int mod, regop, rm;
- get_modrm(*current, &mod, ®op, &rm);
- if (opcode == 0x6E) {
- AppendToBuffer("movd %s,", NameOfXMMRegister(regop));
- current += PrintRightOperand(current);
- } else {
- const char* mnemonic = "?";
- if (opcode == 0x57) {
- mnemonic = "xorpd";
- } else if (opcode == 0x2E) {
- mnemonic = "comisd";
- } else if (opcode == 0x2F) {
- mnemonic = "ucomisd";
+ if (opcode == 0x3A) {
+ byte third_byte = *current;
+ current = data + 3;
+ if (third_byte == 0x17) {
+ get_modrm(*current, &mod, ®op, &rm);
+ AppendToBuffer("extractps "); // reg/m32, xmm, imm8
+ current += PrintRightOperand(current);
+ AppendToBuffer(", %s, %d", NameOfCPURegister(regop), (*current) &
3);
+ current += 1;
} else {
UnimplementedInstruction();
}
- AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop));
- current += PrintRightXMMOperand(current);
+ } else {
+ get_modrm(*current, &mod, ®op, &rm);
+ if (opcode == 0x6E) {
+ AppendToBuffer("mov%c %s,",
+ rex_w() ? 'q' : 'd',
+ NameOfXMMRegister(regop));
+ current += PrintRightOperand(current);
+ } else if (opcode == 0x7E) {
+ AppendToBuffer("mov%c %s,",
+ rex_w() ? 'q' : 'd',
+ NameOfCPURegister(regop));
+ current += PrintRightXMMOperand(current);
+ } else {
+ const char* mnemonic = "?";
+ if (opcode == 0x57) {
+ mnemonic = "xorpd";
+ } else if (opcode == 0x2E) {
+ mnemonic = "comisd";
+ } else if (opcode == 0x2F) {
+ mnemonic = "ucomisd";
+ } else {
+ UnimplementedInstruction();
+ }
+ AppendToBuffer("%s %s,", mnemonic, NameOfXMMRegister(regop));
+ current += PrintRightXMMOperand(current);
+ }
}
} else if (group_1_prefix_ == 0xF2) {
// Beginning of instructions with prefix 0xF2.
--
v8-dev mailing list
[email protected]
http://groups.google.com/group/v8-dev