[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-16 Thread Phabricator via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rL295319: [OpenMP] Parallel reduction on the NVPTX device. 
(authored by arpith).

Changed prior to commit:
  https://reviews.llvm.org/D29758?vs=88149&id=88726#toc

Repository:
  rL LLVM

https://reviews.llvm.org/D29758

Files:
  cfe/trunk/lib/CodeGen/CGOpenMPRuntime.cpp
  cfe/trunk/lib/CodeGen/CGOpenMPRuntime.h
  cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
  cfe/trunk/lib/CodeGen/CodeGenFunction.h
  cfe/trunk/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

Index: cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===
--- cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ cfe/trunk/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -67,12 +67,6 @@
   /// \brief Signal termination of Spmd mode execution.
   void emitSpmdEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
 
-  /// \brief Returns specified OpenMP runtime function for the current OpenMP
-  /// implementation.  Specialized for the NVPTX device.
-  /// \param Function OpenMP runtime function.
-  /// \return Specified function.
-  llvm::Constant *createNVPTXRuntimeFunction(unsigned Function);
-
   //
   // Base class overrides.
   //
@@ -248,7 +242,32 @@
                        ArrayRef<llvm::Value *> CapturedVars,
                        const Expr *IfCond) override;
 
-public:
+  /// Emit code for the reduction clause.
+  ///
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  /// \param Options List of options for reduction codegen:
+  /// WithNowait true if parent directive also has a nowait clause, false
+  /// otherwise.
+  /// SimpleReduction Emit reduction operation only. Used for omp simd
+  /// directive on the host.
+  /// ReductionKind The kind of reduction to perform.
+  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                             ArrayRef<const Expr *> Privates,
+                             ArrayRef<const Expr *> LHSExprs,
+                             ArrayRef<const Expr *> RHSExprs,
+                             ArrayRef<const Expr *> ReductionOps,
+                             ReductionOptionsTy Options) override;
+
+  /// Returns specified OpenMP runtime function for the current OpenMP
+  /// implementation.  Specialized for the NVPTX device.
+  /// \param Function OpenMP runtime function.
+  /// \return Specified function.
+  llvm::Constant *createNVPTXRuntimeFunction(unsigned Function);
+
   /// Target codegen is specialized based on two programming models: the
   /// 'generic' fork-join model of OpenMP, and a more GPU efficient 'spmd'
   /// model for constructs like 'target parallel' that support it.
Index: cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
===
--- cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
+++ cfe/trunk/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1190,7 +1190,7 @@
 }
 
 void CodeGenFunction::EmitOMPReductionClauseFinal(
-    const OMPExecutableDirective &D) {
+    const OMPExecutableDirective &D, const OpenMPDirectiveKind ReductionKind) {
   if (!HaveInsertPoint())
 return;
  llvm::SmallVector<const Expr *, 8> Privates;
@@ -1206,14 +1206,15 @@
 ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
   }
   if (HasAtLeastOneReduction) {
+    bool WithNowait = D.getSingleClause<OMPNowaitClause>() ||
+                      isOpenMPParallelDirective(D.getDirectiveKind()) ||
+                      D.getDirectiveKind() == OMPD_simd;
+    bool SimpleReduction = D.getDirectiveKind() == OMPD_simd;
 // Emit nowait reduction if nowait clause is present or directive is a
 // parallel directive (it always has implicit barrier).
     CGM.getOpenMPRuntime().emitReduction(
         *this, D.getLocEnd(), Privates, LHSExprs, RHSExprs, ReductionOps,
-        D.getSingleClause<OMPNowaitClause>() ||
-            isOpenMPParallelDirective(D.getDirectiveKind()) ||
-            D.getDirectiveKind() == OMPD_simd,
-        D.getDirectiveKind() == OMPD_simd);
+        {WithNowait, SimpleReduction, ReductionKind});
   }
 }
 
@@ -1295,7 +1296,7 @@
 CGF.EmitOMPReductionClauseInit(S, PrivateScope);
 (void)PrivateScope.Privatize();
    CGF.EmitStmt(cast<CapturedStmt>(S.getAssociatedStmt())->getCapturedStmt());
-CGF.EmitOMPReductionClauseFinal(S);
+CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_parallel);
   };
   emitCommonOMPParallelDirective(*this, S, OMPD_parallel, CodeGen);
   emitPostUpdateForReductionClause(
@@ -1708,7 +1709,7 @@
   // Emit final copy of the lastprivate variables at the end of loops.
   if (HasLastprivateClause)
 CGF.EmitOMPLastprivateClauseFinal(S, 

[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-13 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev accepted this revision.
ABataev added a comment.
This revision is now accepted and ready to land.

LG


https://reviews.llvm.org/D29758





[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-12 Thread Arpith Jacob via Phabricator via cfe-commits
arpith-jacob updated this revision to Diff 88149.
arpith-jacob added a comment.

Minor fixup of comment style on emitInterWarpCopyFunction().
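
For readers skimming the thread: emitInterWarpCopyFunction emits the helper
that moves each warp's partial result through the shared-memory transfer
medium (the [32 x i64] TRANSFER_STORAGE array checked in the test).  A
conceptual CUDA sketch of that step, not the emitted code, with illustrative
names:

  // Each warp master publishes its warp's partial result to shared memory;
  // the first warp then collects the per-warp partials so it can finish
  // the reduction across warps.
  __device__ int64_t interWarpCopy(int64_t partial, int num_warps) {
    __shared__ int64_t transfer_medium[32];   // one slot per warp
    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;
    if (lane_id == 0)
      transfer_medium[warp_id] = partial;     // warp master publishes
    __syncthreads();
    if (warp_id == 0 && lane_id < num_warps)
      partial = transfer_medium[lane_id];     // first warp collects
    return partial;
  }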


https://reviews.llvm.org/D29758

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.h
  test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

Index: test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
===
--- /dev/null
+++ test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -0,0 +1,830 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+
+// Check that the execution mode of all 3 target regions is set to Spmd Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+  int a;
+  short b;
+  tx c;
+  float d;
+  double e;
+
+  #pragma omp target parallel reduction(+: e) map(tofrom: e)
+  {
+e += 5;
+  }
+
+  #pragma omp target parallel reduction(^: c) reduction(*: d) map(tofrom: c,d)
+  {
+c ^= 2;
+d *= 33;
+  }
+
+  #pragma omp target parallel reduction(|: a) reduction(max: b) map(tofrom: a,b)
+  {
+a |= 1;
+b = 99 > b ? 99 : b;
+  }
+
+  return a+b+c+d+e;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<char>(n);
+
+  return a;
+}
+
+  // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
+  //
+  // CHECK: call void @__kmpc_spmd_kernel_init(
+  // CHECK: br label {{%?}}[[EXECUTE:.+]]
+  //
+  // CHECK: [[EXECUTE]]
+  // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  //
+  //
+  // define internal void [[PFN]](
+  // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
+  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
+  // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
+  // CHECK: store double [[ADD]], double* [[E]], align
+  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+  // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
+  // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
+  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+  // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
+  // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+
+  // CHECK: [[REDLABEL]]
+  // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
+  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
+  // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
+  // CHECK: store double [[ADD]], double* [[E_IN]], align
+  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+  // CHECK: br label %[[DEFAULTLABEL]]
+  //
+  // CHECK: [[DEFAULTLABEL]]
+  // CHECK: ret
+
+  //
+  // Reduction function
+  // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
+  // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+  // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+  // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
+  //
+  // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* 

[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-12 Thread Arpith Jacob via Phabricator via cfe-commits
arpith-jacob updated this revision to Diff 88144.
arpith-jacob added a comment.

Updated patch to address Alexey's comments.  Condensed parameters in 
emitReduction() to a struct Options.
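
For reference, the aggregated options land in CGOpenMPRuntime.h roughly as
follows (reconstructed from this diff; the inline comments are added here for
readability and are not part of the patch):

  struct ReductionOptionsTy {
    bool WithNowait;       // parent directive has nowait / an implicit barrier
    bool SimpleReduction;  // emit only the reduction op ('omp simd' on host)
    OpenMPDirectiveKind ReductionKind;  // e.g. OMPD_parallel, OMPD_simd
  };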


https://reviews.llvm.org/D29758

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.h
  test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

Index: test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
===
--- /dev/null
+++ test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -0,0 +1,830 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+
+// Check that the execution mode of all 3 target regions is set to Spmd Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+  int a;
+  short b;
+  tx c;
+  float d;
+  double e;
+
+  #pragma omp target parallel reduction(+: e) map(tofrom: e)
+  {
+e += 5;
+  }
+
+  #pragma omp target parallel reduction(^: c) reduction(*: d) map(tofrom: c,d)
+  {
+c ^= 2;
+d *= 33;
+  }
+
+  #pragma omp target parallel reduction(|: a) reduction(max: b) map(tofrom: a,b)
+  {
+a |= 1;
+b = 99 > b ? 99 : b;
+  }
+
+  return a+b+c+d+e;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<char>(n);
+
+  return a;
+}
+
+  // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
+  //
+  // CHECK: call void @__kmpc_spmd_kernel_init(
+  // CHECK: br label {{%?}}[[EXECUTE:.+]]
+  //
+  // CHECK: [[EXECUTE]]
+  // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  //
+  //
+  // define internal void [[PFN]](
+  // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
+  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
+  // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
+  // CHECK: store double [[ADD]], double* [[E]], align
+  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0
+  // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8*
+  // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align
+  // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8*
+  // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait(i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]])
+  // CHECK: switch i32 [[RET]], label {{%?}}[[DEFAULTLABEL:.+]] [
+  // CHECK: i32 1, label {{%?}}[[REDLABEL:.+]]
+
+  // CHECK: [[REDLABEL]]
+  // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align
+  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
+  // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]]
+  // CHECK: store double [[ADD]], double* [[E_IN]], align
+  // CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
+  // CHECK: br label %[[DEFAULTLABEL]]
+  //
+  // CHECK: [[DEFAULTLABEL]]
+  // CHECK: ret
+
+  //
+  // Reduction function
+  // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*)
+  // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0
+  // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+  // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double*
+  //
+  // CHECK: [[VAR_LHS_REF:%.+]] 

Re: [PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-10 Thread Alexey Bataev via cfe-commits
Arpith, see the comment in CGOpenMPRuntime.cpp

   // if SimpleReduction is true, only the next code is generated:
   //  ...
   //  <LHSExprs>[i] = RedOp<i>(*<LHSExprs>[i], *<RHSExprs>[i]);
   //  ...

and is used for omp simd directive only.
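
In other words, for a host-side simd reduction no runtime call is emitted at
all.  A minimal illustration, not taken from the patch:

  // With 'omp simd' there is a single thread of execution, so finalizing
  // the reduction is just folding the private copy back in -- in effect
  // 's = s + s.private' -- with no __kmpc_reduce* runtime call.
  double sum(const double *a, int n) {
    double s = 0;
  #pragma omp simd reduction(+ : s)
    for (int i = 0; i < n; ++i)
      s += a[i];
    return s;
  }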

-
Best regards,
Alexey Bataev

10.02.2017 18:49, Arpith Jacob via Phabricator wrote:
> arpith-jacob added inline comments.
>
>
> 
> Comment at: lib/CodeGen/CGOpenMPRuntime.h:956-962
> virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
>                            ArrayRef<const Expr *> Privates,
>                            ArrayRef<const Expr *> LHSExprs,
>                            ArrayRef<const Expr *> RHSExprs,
>                            ArrayRef<const Expr *> ReductionOps,
> -                          bool WithNowait, bool SimpleReduction);
> +                          bool WithNowait, bool SimpleReduction,
> +                          OpenMPDirectiveKind ReductionKind);
> 
> ABataev wrote:
>> Number of parameters is getting too big, maybe it is better to aggregate 
>> them into a struct/class?
> Thanks Alexey for your comments.  I can place 'WithNoWait, SimpleReduction, 
> ReductionKind' in a struct.
>
> Can you explain what 'SimpleReduction' stands for?  It isn't clear to me
> when the reduction is simple...
>
> Thanks.
>
>
> https://reviews.llvm.org/D29758
>
>
>



[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-10 Thread Arpith Jacob via Phabricator via cfe-commits
arpith-jacob added inline comments.



Comment at: lib/CodeGen/CGOpenMPRuntime.h:956-962
   virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
                              ArrayRef<const Expr *> Privates,
                              ArrayRef<const Expr *> LHSExprs,
                              ArrayRef<const Expr *> RHSExprs,
                              ArrayRef<const Expr *> ReductionOps,
-                             bool WithNowait, bool SimpleReduction);
+                             bool WithNowait, bool SimpleReduction,
+                             OpenMPDirectiveKind ReductionKind);

ABataev wrote:
> Number of parameters is getting too big, maybe it is better to aggregate them 
> into a struct/class?
Thanks, Alexey, for your comments.  I can place 'WithNowait, SimpleReduction,
ReductionKind' in a struct.

Can you explain what 'SimpleReduction' stands for?  It isn't clear to me when
the reduction is simple...

Thanks.


https://reviews.llvm.org/D29758





[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-10 Thread Alexey Bataev via Phabricator via cfe-commits
ABataev added inline comments.



Comment at: lib/CodeGen/CGOpenMPRuntime.h:956-962
   virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
                              ArrayRef<const Expr *> Privates,
                              ArrayRef<const Expr *> LHSExprs,
                              ArrayRef<const Expr *> RHSExprs,
                              ArrayRef<const Expr *> ReductionOps,
-                             bool WithNowait, bool SimpleReduction);
+                             bool WithNowait, bool SimpleReduction,
+                             OpenMPDirectiveKind ReductionKind);

Number of parameters is getting too big, maybe it is better to aggregate them 
into a struct/class?



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:118-133
+// GPU Configuration:  This information can be derived from cuda registers,
+// however, providing compile time constants helps generate more efficient
+// code.  For all practical purposes this is fine because the configuration
+// is the same for all known NVPTX architectures.
+enum MachineConfiguration : unsigned {
+  WarpSize = 32,
+  // Number of bits required to represent a lane identifier, which is

It's better to use `///` style of comments here



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:653-675
+/// Build int32_t __kmpc_shuffle_int32(int32_t element,
+///                                    int16_t lane_offset, int16_t warp_size);
+llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
+llvm::FunctionType *FnTy =
+    llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
+RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
+break;

Use `//` instead of `///`



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:963-965
+enum CopyAction : unsigned {
+  RemoteLaneToThread,
+  ThreadCopy,

Comments here?



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:969-974
+// Emit instructions to copy a Reduce list, which contains partially
+// aggregated values, in the specified direction.
+//
+// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+//   the warp using shuffle instructions.
+// ThreadCopy: Make a copy of a Reduce list on the thread's stack.

Use `///`



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:1272
+
+// Emit a helper that reduces data across two OpenMP threads (lanes)
+// in the same warp.  It uses shuffle instructions to copy over data from

`///` style here



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp:1488
+
+//
+// Design of OpenMP reductions on the GPU

`///` here



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.h:245
 
-public:
+  /// \brief Emit code for the reduction clause.
+  ///

No \brief



Comment at: lib/CodeGen/CGOpenMPRuntimeNVPTX.h:263
+
+  /// \brief Returns specified OpenMP runtime function for the current OpenMP
+  /// implementation.  Specialized for the NVPTX device.

No \brief
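
For context on the requested style: in LLVM/Clang, '///' doxygen comments
document declarations, while plain '//' comments are used inside function
bodies.  A hypothetical illustration (the function name is not from the patch):

  /// Shuffles \p Val across the warp -- '///' on the declaration.
  llvm::Value *emitShuffle(llvm::Value *Val);

  llvm::Value *emitShuffle(llvm::Value *Val) {
    // Plain '//' inside the body.
    return Val;
  }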


https://reviews.llvm.org/D29758





[PATCH] D29758: [OpenMP] Parallel reduction on the NVPTX device.

2017-02-09 Thread Arpith Jacob via Phabricator via cfe-commits
arpith-jacob created this revision.
Herald added a subscriber: jholewinski.

This patch implements codegen for the reduction clause on
any parallel construct for elementary data types.  An efficient
implementation requires hierarchical reduction within a
warp and a threadblock.  It is complicated by the fact that
variables declared on the stack of a CUDA thread cannot be
shared with other threads.

The patch creates a struct to hold reduction variables and
a number of helper functions.  The OpenMP runtime on the GPU
implements reduction algorithms that use these helper
functions to perform reductions within a team.  Variables are
shared between CUDA threads using shuffle intrinsics.
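
For intuition, the warp-level step can be sketched in CUDA C++ as follows.
This is illustrative only; the patch emits equivalent LLVM IR through the
__kmpc_shuffle_* runtime helpers rather than compiling source like this:

  // Tree-reduce a value across the 32 lanes of a warp with the (pre-CUDA-9)
  // __shfl_down intrinsic; lane 0 ends up holding the warp's partial sum.
  __device__ float warpReduceSum(float val) {
    for (int offset = 16; offset > 0; offset /= 2)   // 16, 8, 4, 2, 1
      val += __shfl_down(val, offset);
    return val;
  }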

An implementation of reductions on the NVPTX device is
substantially different from that on CPUs.  However, this patch
is written so that there are minimal changes to the rest of
OpenMP codegen.

The implemented design allows the compiler and runtime to be
decoupled, i.e., the runtime does not need to know of the
reduction operation(s), the type of the reduction variable(s),
or the number of reductions.  The design also allows reuse of
host codegen, with appropriate specialization for the NVPTX
device.
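
Concretely, the decoupling is visible in the runtime interface: the entry
point receives the reduction data and the type-specific helpers as opaque
pointers.  A C-level view reconstructed from the IR checks in the test below;
the parameter names are illustrative:

  int32_t __kmpc_nvptx_parallel_reduce_nowait(
      int32_t global_tid,    // calling thread
      int32_t num_vars,      // number of reduction variables
      size_t reduce_size,    // size of the reduction-data struct
      void *reduce_data,     // opaque pointer to that struct
      void (*shuffle_reduce_fn)(void *, int16_t, int16_t, int16_t),
      void (*inter_warp_copy_fn)(void *, int32_t));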

While the patch does introduce a number of abstractions, the
expected use case calls for inlining of the GPU OpenMP runtime.
After inlining and optimizations in LLVM, these abstractions
are unwound and performance of OpenMP reductions is comparable
to CUDA-canonical code.

Patch by Tian Jin in collaboration with Arpith Jacob


https://reviews.llvm.org/D29758

Files:
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.h
  test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp

Index: test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
===
--- /dev/null
+++ test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -0,0 +1,830 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
+// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64]
+
+// Check that the execution mode of all 3 target regions is set to Spmd Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+  int a;
+  short b;
+  tx c;
+  float d;
+  double e;
+
+  #pragma omp target parallel reduction(+: e) map(tofrom: e)
+  {
+e += 5;
+  }
+
+  #pragma omp target parallel reduction(^: c) reduction(*: d) map(tofrom: c,d)
+  {
+c ^= 2;
+d *= 33;
+  }
+
+  #pragma omp target parallel reduction(|: a) reduction(max: b) map(tofrom: a,b)
+  {
+a |= 1;
+b = 99 > b ? 99 : b;
+  }
+
+  return a+b+c+d+e;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<char>(n);
+
+  return a;
+}
+
+  // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
+  //
+  // CHECK: call void @__kmpc_spmd_kernel_init(
+  // CHECK: br label {{%?}}[[EXECUTE:.+]]
+  //
+  // CHECK: [[EXECUTE]]
+  // CHECK: {{call|invoke}} void [[PFN:@.+]](i32*
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  //
+  //
+  // define internal void [[PFN]](
+  // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align
+  // CHECK: [[EV:%.+]] = load double, double* [[E]], align
+  // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5
+  // CHECK: store double [[ADD]], double* [[E]], align
+  // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}}