[llvm-commits] [llvm] r42433 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h test/CodeGen/X86/memcpy.ll

2007-09-28 Thread Rafael Espindola
Author: rafael
Date: Fri Sep 28 07:53:01 2007
New Revision: 42433

URL: http://llvm.org/viewvc/llvm-project?rev=42433&view=rev
Log:
Refactor the memcpy lowering for the x86 target.

The only generated code difference is that now we call memcpy when
the size of the array is unknown. This matches GCC behavior and is
better since the run time value can be arbitrarily large.


Added:
llvm/trunk/test/CodeGen/X86/memcpy.ll
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: 
http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=42433&r1=42432&r2=42433&view=diff

==
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 28 07:53:01 2007
@@ -4188,35 +4188,61 @@
 }
 
 SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG &DAG) {
-  SDOperand Chain = Op.getOperand(0);
-  unsigned Align =
-    (unsigned)cast<ConstantSDNode>(Op.getOperand(4))->getValue();
+  SDOperand ChainOp = Op.getOperand(0);
+  SDOperand DestOp = Op.getOperand(1);
+  SDOperand SourceOp = Op.getOperand(2);
+  SDOperand CountOp = Op.getOperand(3);
+  SDOperand AlignOp = Op.getOperand(4);
+  unsigned Align = (unsigned)cast<ConstantSDNode>(AlignOp)->getValue();
   if (Align == 0) Align = 1;
 
-  ConstantSDNode *I = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-  // If not DWORD aligned or size is more than the threshold, call memcpy.
-  // The libc version is likely to be faster for these cases. It can use the
-  // address value and run time information about the CPU.
+  // The libc version is likely to be faster for the following cases. It can
+  // use the address value and run time information about the CPU.
   // With glibc 2.6.1 on a core 2, coping an array of 100M longs was 30% faster
-  if ((Align & 3) != 0 ||
-      (I && I->getValue() > Subtarget->getMinRepStrSizeThreshold())) {
-    MVT::ValueType IntPtr = getPointerTy();
-    TargetLowering::ArgListTy Args;
-    TargetLowering::ArgListEntry Entry;
-    Entry.Ty = getTargetData()->getIntPtrType();
-    Entry.Node = Op.getOperand(1); Args.push_back(Entry);
-    Entry.Node = Op.getOperand(2); Args.push_back(Entry);
-    Entry.Node = Op.getOperand(3); Args.push_back(Entry);
-    std::pair<SDOperand,SDOperand> CallResult =
+
+  // If not DWORD aligned, call memcpy.
+  if ((Align & 3) != 0)
+    return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
+
+  // If size is unknown, call memcpy.
+  ConstantSDNode *I = dyn_cast<ConstantSDNode>(CountOp);
+  if (!I)
+    return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
+
+  // If size is more than the threshold, call memcpy.
+  unsigned Size = I->getValue();
+  if (Size > Subtarget->getMinRepStrSizeThreshold())
+    return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
+
+  return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align, DAG);
+}
+
+SDOperand X86TargetLowering::LowerMEMCPYCall(SDOperand Chain,
+                                             SDOperand Dest,
+                                             SDOperand Source,
+                                             SDOperand Count,
+                                             SelectionDAG &DAG) {
+  MVT::ValueType IntPtr = getPointerTy();
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  Entry.Ty = getTargetData()->getIntPtrType();
+  Entry.Node = Dest; Args.push_back(Entry);
+  Entry.Node = Source; Args.push_back(Entry);
+  Entry.Node = Count; Args.push_back(Entry);
+  std::pair<SDOperand,SDOperand> CallResult =
       LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C, false,
                   DAG.getExternalSymbol("memcpy", IntPtr), Args, DAG);
-    return CallResult.second;
-  }
+  return CallResult.second;
+}
 
+SDOperand X86TargetLowering::LowerMEMCPYInline(SDOperand Chain,
+                                               SDOperand Dest,
+                                               SDOperand Source,
+                                               unsigned Size,
+                                               unsigned Align,
+                                               SelectionDAG &DAG) {
   MVT::ValueType AVT;
-  SDOperand Count;
   unsigned BytesLeft = 0;
-  bool TwoRepMovs = false;
   switch (Align & 3) {
     case 2:   // WORD aligned
       AVT = MVT::i16;
@@ -4228,33 +4254,22 @@
       break;
     default:  // Byte aligned
       AVT = MVT::i8;
-      Count = Op.getOperand(3);
       break;
   }
 
-  if (AVT > MVT::i8) {
-    if (I) {
-      unsigned UBytes = MVT::getSizeInBits(AVT) / 8;
-      Count = DAG.getConstant(I->getValue() / UBytes, getPointerTy());
-      BytesLeft = I->getValue() % UBytes;
-    } else {
-      assert(AVT <= MVT::i32 &&
-             "Do not use rep;movs if not at least DWORD aligned");
-      Count = 

Re: [llvm-commits] [llvm] r42433 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h test/CodeGen/X86/memcpy.ll

2007-09-28 Thread Evan Cheng
Hi Rafael,

Are you sure this is better? Did you do any measurement? Our goal  
isn't to match gcc output. :)

Perhaps you can add some unit tests to llvm-test?

Thanks,

Evan

On Sep 28, 2007, at 5:53 AM, Rafael Espindola [EMAIL PROTECTED] 
  wrote:

 Author: rafael
 Date: Fri Sep 28 07:53:01 2007
 New Revision: 42433

 URL: http://llvm.org/viewvc/llvm-project?rev=42433view=rev
 Log:
 Refactor the memcpy lowering for the x86 target.

 The only generated code difference is that now we call memcpy when
 the size of the array is unknown. This matches GCC behavior and is
 better since the run time value can be arbitrarily large.


 Added:
llvm/trunk/test/CodeGen/X86/memcpy.ll
 Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h

 Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
 URL: 
 http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=42433r1=42432r2=42433view=diff

 === 
 === 
 === 
 =
 --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
 +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 28  
 07:53:01 2007
 @@ -4188,35 +4188,61 @@
 }

 SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG  
 DAG) {
 -  SDOperand Chain = Op.getOperand(0);
 -  unsigned Align =
 -(unsigned)castConstantSDNode(Op.getOperand(4))-getValue();
 +  SDOperand ChainOp = Op.getOperand(0);
 +  SDOperand DestOp = Op.getOperand(1);
 +  SDOperand SourceOp = Op.getOperand(2);
 +  SDOperand CountOp = Op.getOperand(3);
 +  SDOperand AlignOp = Op.getOperand(4);
 +  unsigned Align = (unsigned)castConstantSDNode(AlignOp)-getValue 
 ();
   if (Align == 0) Align = 1;

 -  ConstantSDNode *I = dyn_castConstantSDNode(Op.getOperand(3));
 -  // If not DWORD aligned or size is more than the threshold, call  
 memcpy.
 -  // The libc version is likely to be faster for these cases. It  
 can use the
 -  // address value and run time information about the CPU.
 +  // The libc version is likely to be faster for the following  
 cases. It can
 +  // use the address value and run time information about the CPU.
   // With glibc 2.6.1 on a core 2, coping an array of 100M longs was  
 30% faster
 -  if ((Align  3) != 0 ||
 -  (I  I-getValue()  Subtarget-getMinRepStrSizeThreshold 
 ())) {
 -MVT::ValueType IntPtr = getPointerTy();
 -TargetLowering::ArgListTy Args;
 -TargetLowering::ArgListEntry Entry;
 -Entry.Ty = getTargetData()-getIntPtrType();
 -Entry.Node = Op.getOperand(1); Args.push_back(Entry);
 -Entry.Node = Op.getOperand(2); Args.push_back(Entry);
 -Entry.Node = Op.getOperand(3); Args.push_back(Entry);
 -std::pairSDOperand,SDOperand CallResult =
 +
 +  // If not DWORD aligned, call memcpy.
 +  if ((Align  3) != 0)
 +return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 +
 +  // If size is unknown, call memcpy.
 +  ConstantSDNode *I = dyn_castConstantSDNode(CountOp);
 +  if (!I)
 +return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 +
 +  // If size is more than the threshold, call memcpy.
 +  unsigned Size = I-getValue();
 +  if (Size  Subtarget-getMinRepStrSizeThreshold())
 +return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 +
 +  return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align,  
 DAG);
 +}
 +
 +SDOperand X86TargetLowering::LowerMEMCPYCall(SDOperand Chain,
 + SDOperand Dest,
 + SDOperand Source,
 + SDOperand Count,
 + SelectionDAG DAG) {
 +  MVT::ValueType IntPtr = getPointerTy();
 +  TargetLowering::ArgListTy Args;
 +  TargetLowering::ArgListEntry Entry;
 +  Entry.Ty = getTargetData()-getIntPtrType();
 +  Entry.Node = Dest; Args.push_back(Entry);
 +  Entry.Node = Source; Args.push_back(Entry);
 +  Entry.Node = Count; Args.push_back(Entry);
 +  std::pairSDOperand,SDOperand CallResult =
   LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C,  
 false,
   DAG.getExternalSymbol(memcpy, IntPtr), Args, DAG);
 -return CallResult.second;
 -  }
 +  return CallResult.second;
 +}

 +SDOperand X86TargetLowering::LowerMEMCPYInline(SDOperand Chain,
 +   SDOperand Dest,
 +   SDOperand Source,
 +   unsigned Size,
 +   unsigned Align,
 +   SelectionDAG DAG) {
   MVT::ValueType AVT;
 -  SDOperand Count;
   unsigned BytesLeft = 0;
 -  bool TwoRepMovs = false;
   switch (Align  3) {
 case 2:   // WORD aligned
   AVT = MVT::i16;
 @@ -4228,33 +4254,22 @@
   break;
 default:  // Byte aligned
   AVT = MVT::i8;
 -  Count = 

Re: [llvm-commits] [llvm] r42433 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h test/CodeGen/X86/memcpy.ll

2007-09-28 Thread Chris Lattner
On Sep 28, 2007, at 8:36 AM, Evan Cheng wrote:
 Are you sure this is better? Did you do any measurement? Our goal
 isn't to match gcc output. :)

I think this behavior makes sense.  If the size is variable, it could  
be arbitrarily large.  We should assume that memcpy (the library  
implementation) is tuned as best as possible for handling the unknown  
size case.

-Chris

 Perhaps you can add some unit tests to llvm-test?

 Thanks,

 Evan

 On Sep 28, 2007, at 5:53 AM, Rafael Espindola  
 [EMAIL PROTECTED]
 wrote:

 Author: rafael
 Date: Fri Sep 28 07:53:01 2007
 New Revision: 42433

 URL: http://llvm.org/viewvc/llvm-project?rev=42433view=rev
 Log:
 Refactor the memcpy lowering for the x86 target.

 The only generated code difference is that now we call memcpy when
 the size of the array is unknown. This matches GCC behavior and is
 better since the run time value can be arbitrarily large.


 Added:
llvm/trunk/test/CodeGen/X86/memcpy.ll
 Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h

 Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
 URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/ 
 X86ISelLowering.cpp?rev=42433r1=42432r2=42433view=diff

 ===
 ===
 ===
 =
 --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
 +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 28
 07:53:01 2007
 @@ -4188,35 +4188,61 @@
 }

 SDOperand X86TargetLowering::LowerMEMCPY(SDOperand Op, SelectionDAG
 DAG) {
 -  SDOperand Chain = Op.getOperand(0);
 -  unsigned Align =
 -(unsigned)castConstantSDNode(Op.getOperand(4))-getValue();
 +  SDOperand ChainOp = Op.getOperand(0);
 +  SDOperand DestOp = Op.getOperand(1);
 +  SDOperand SourceOp = Op.getOperand(2);
 +  SDOperand CountOp = Op.getOperand(3);
 +  SDOperand AlignOp = Op.getOperand(4);
 +  unsigned Align = (unsigned)castConstantSDNode(AlignOp)-getValue
 ();
   if (Align == 0) Align = 1;

 -  ConstantSDNode *I = dyn_castConstantSDNode(Op.getOperand(3));
 -  // If not DWORD aligned or size is more than the threshold, call
 memcpy.
 -  // The libc version is likely to be faster for these cases. It
 can use the
 -  // address value and run time information about the CPU.
 +  // The libc version is likely to be faster for the following
 cases. It can
 +  // use the address value and run time information about the CPU.
   // With glibc 2.6.1 on a core 2, coping an array of 100M longs was
 30% faster
 -  if ((Align  3) != 0 ||
 -  (I  I-getValue()  Subtarget-getMinRepStrSizeThreshold
 ())) {
 -MVT::ValueType IntPtr = getPointerTy();
 -TargetLowering::ArgListTy Args;
 -TargetLowering::ArgListEntry Entry;
 -Entry.Ty = getTargetData()-getIntPtrType();
 -Entry.Node = Op.getOperand(1); Args.push_back(Entry);
 -Entry.Node = Op.getOperand(2); Args.push_back(Entry);
 -Entry.Node = Op.getOperand(3); Args.push_back(Entry);
 -std::pairSDOperand,SDOperand CallResult =
 +
 +  // If not DWORD aligned, call memcpy.
 +  if ((Align  3) != 0)
 +return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 +
 +  // If size is unknown, call memcpy.
 +  ConstantSDNode *I = dyn_castConstantSDNode(CountOp);
 +  if (!I)
 +return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 +
 +  // If size is more than the threshold, call memcpy.
 +  unsigned Size = I-getValue();
 +  if (Size  Subtarget-getMinRepStrSizeThreshold())
 +return LowerMEMCPYCall(ChainOp, DestOp, SourceOp, CountOp, DAG);
 +
 +  return LowerMEMCPYInline(ChainOp, DestOp, SourceOp, Size, Align,
 DAG);
 +}
 +
 +SDOperand X86TargetLowering::LowerMEMCPYCall(SDOperand Chain,
 + SDOperand Dest,
 + SDOperand Source,
 + SDOperand Count,
 + SelectionDAG DAG) {
 +  MVT::ValueType IntPtr = getPointerTy();
 +  TargetLowering::ArgListTy Args;
 +  TargetLowering::ArgListEntry Entry;
 +  Entry.Ty = getTargetData()-getIntPtrType();
 +  Entry.Node = Dest; Args.push_back(Entry);
 +  Entry.Node = Source; Args.push_back(Entry);
 +  Entry.Node = Count; Args.push_back(Entry);
 +  std::pairSDOperand,SDOperand CallResult =
   LowerCallTo(Chain, Type::VoidTy, false, false, CallingConv::C,
 false,
   DAG.getExternalSymbol(memcpy, IntPtr), Args,  
 DAG);
 -return CallResult.second;
 -  }
 +  return CallResult.second;
 +}

 +SDOperand X86TargetLowering::LowerMEMCPYInline(SDOperand Chain,
 +   SDOperand Dest,
 +   SDOperand Source,
 +   unsigned Size,
 +   unsigned Align,
 +   SelectionDAG DAG) {
   MVT::ValueType AVT;
 -  

Re: [llvm-commits] [llvm] r42433 - in /llvm/trunk: lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86ISelLowering.h test/CodeGen/X86/memcpy.ll

2007-09-28 Thread Rafael Espindola
 I think this behavior makes sense.  If the size is variable, it could
 be arbitrarily large.  We should assume that memcpy (the library
 implementation) is tuned as best as possible for handling the unknown
 size case.

Yes, we know that libc memcpy is better for big values and that inline
is better for small values. When the value is not known at compile
time, we can only guess.

If we guess that it will be small, we get it wrong for any size bigger
than a small arch-specific limit (128 for x86-64, I think). If we
guess that it will be large, we add a small overhead for small
values, but get a big saving for big ones.

This might be a good case for profiling-based optimization.

 -Chris

Cheers,
-- 
Rafael Avila de Espindola

Google Ireland Ltd.
Gordon House
Barrow Street
Dublin 4
Ireland

Registered in Dublin, Ireland
Registration Number: 368047
___
llvm-commits mailing list
llvm-commits@cs.uiuc.edu
http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits