Update after review

http://reviews.llvm.org/D4002

Files:
  include/clang/AST/DeclOpenMP.h
  lib/AST/ASTContext.cpp
  lib/CodeGen/CGDecl.cpp
  lib/CodeGen/CGDeclCXX.cpp
  lib/CodeGen/CGExpr.cpp
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenModule.cpp
  lib/CodeGen/CodeGenModule.h
  lib/CodeGen/ModuleBuilder.cpp
  lib/Parse/Parser.cpp
  lib/Serialization/ASTReaderDecl.cpp
  test/OpenMP/threadprivate_codegen.cpp
Index: include/clang/AST/DeclOpenMP.h
===================================================================
--- include/clang/AST/DeclOpenMP.h
+++ include/clang/AST/DeclOpenMP.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 
 namespace clang {
+class Expr;
 
 /// \brief This represents '#pragma omp threadprivate ...' directive.
 /// For example, in the following, both 'a' and 'A::b' are threadprivate:
Index: lib/AST/ASTContext.cpp
===================================================================
--- lib/AST/ASTContext.cpp
+++ lib/AST/ASTContext.cpp
@@ -7895,7 +7895,9 @@
     // We never need to emit an uninstantiated function template.
     if (FD->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate)
       return false;
-  } else
+  } else if (isa<OMPThreadPrivateDecl>(D))
+    return true;
+  else
     return false;
 
   // If this is a member of a class template, we do not need to emit it.
Index: lib/CodeGen/CGDecl.cpp
===================================================================
--- lib/CodeGen/CGDecl.cpp
+++ lib/CodeGen/CGDecl.cpp
@@ -19,6 +19,7 @@
 #include "clang/AST/CharUnits.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
@@ -86,11 +87,12 @@
   case Decl::StaticAssert: // static_assert(X, ""); [C++0x]
   case Decl::Label:        // __label__ x;
   case Decl::Import:
-  case Decl::OMPThreadPrivate:
   case Decl::Empty:
     // None of these decls require codegen support.
     return;
-
+  case Decl::OMPThreadPrivate:
+    CGM.EmitOMPThreadPrivateDecl(cast<OMPThreadPrivateDecl>(&D));
+    return;
   case Decl::NamespaceAlias:
     if (CGDebugInfo *DI = getDebugInfo())
         DI->EmitNamespaceAlias(cast<NamespaceAliasDecl>(D));
Index: lib/CodeGen/CGDeclCXX.cpp
===================================================================
--- lib/CodeGen/CGDeclCXX.cpp
+++ lib/CodeGen/CGDeclCXX.cpp
@@ -155,12 +155,6 @@
   EmitStoreOfScalar(RV.getScalarVal(), DeclPtr, false, Alignment, T);
 }
 
-static llvm::Function *
-CreateGlobalInitOrDestructFunction(CodeGenModule &CGM,
-                                   llvm::FunctionType *ty,
-                                   const Twine &name,
-                                   bool TLS = false);
-
 /// Create a stub function, suitable for being passed to atexit,
 /// which passes the given address to the given destructor function.
 static llvm::Constant *createAtExitStub(CodeGenModule &CGM, const VarDecl &VD,
@@ -174,7 +168,7 @@
     CGM.getCXXABI().getMangleContext().mangleDynamicAtExitDestructor(&VD, Out);
   }
   llvm::Function *fn =
-      CreateGlobalInitOrDestructFunction(CGM, ty, FnName.str());
+      CGM.CreateGlobalInitOrDestructFunction(ty, FnName.str());
 
   CodeGenFunction CGF(CGM);
 
@@ -226,31 +220,29 @@
   CGM.getCXXABI().EmitGuardedInit(*this, D, DeclPtr, PerformInit);
 }
 
-static llvm::Function *
-CreateGlobalInitOrDestructFunction(CodeGenModule &CGM,
-                                   llvm::FunctionType *FTy,
-                                   const Twine &Name, bool TLS) {
+llvm::Function *CodeGenModule::CreateGlobalInitOrDestructFunction(
+    llvm::FunctionType *FTy, const Twine &Name, bool TLS) {
   llvm::Function *Fn =
     llvm::Function::Create(FTy, llvm::GlobalValue::InternalLinkage,
-                           Name, &CGM.getModule());
-  if (!CGM.getLangOpts().AppleKext && !TLS) {
+                           Name, &getModule());
+  if (!getLangOpts().AppleKext && !TLS) {
     // Set the section if needed.
     if (const char *Section = 
-          CGM.getTarget().getStaticInitSectionSpecifier())
+          getTarget().getStaticInitSectionSpecifier())
       Fn->setSection(Section);
   }
 
-  Fn->setCallingConv(CGM.getRuntimeCC());
+  Fn->setCallingConv(getRuntimeCC());
 
-  if (!CGM.getLangOpts().Exceptions)
+  if (!getLangOpts().Exceptions)
     Fn->setDoesNotThrow();
 
-  if (!CGM.getSanitizerBlacklist().isIn(*Fn)) {
-    if (CGM.getLangOpts().Sanitize.Address)
+  if (!getSanitizerBlacklist().isIn(*Fn)) {
+    if (getLangOpts().Sanitize.Address)
       Fn->addFnAttr(llvm::Attribute::SanitizeAddress);
-    if (CGM.getLangOpts().Sanitize.Thread)
+    if (getLangOpts().Sanitize.Thread)
       Fn->addFnAttr(llvm::Attribute::SanitizeThread);
-    if (CGM.getLangOpts().Sanitize.Memory)
+    if (getLangOpts().Sanitize.Memory)
       Fn->addFnAttr(llvm::Attribute::SanitizeMemory);
   }
 
@@ -296,7 +288,7 @@
 
   // Create a variable initialization function.
   llvm::Function *Fn =
-      CreateGlobalInitOrDestructFunction(*this, FTy, FnName.str());
+      CreateGlobalInitOrDestructFunction(FTy, FnName.str());
 
   auto *ISA = D->getAttr<InitSegAttr>();
   CodeGenFunction(*this).GenerateCXXGlobalVarDeclInitFunc(Fn, D, Addr,
@@ -350,7 +342,7 @@
   if (!CXXThreadLocalInits.empty()) {
     // Generate a guarded initialization function.
     llvm::FunctionType *FTy = llvm::FunctionType::get(VoidTy, false);
-    InitFn = CreateGlobalInitOrDestructFunction(*this, FTy, "__tls_init",
+    InitFn = CreateGlobalInitOrDestructFunction(FTy, "__tls_init",
                                                 /*TLS*/ true);
     llvm::GlobalVariable *Guard = new llvm::GlobalVariable(
         getModule(), Int8Ty, false, llvm::GlobalVariable::InternalLinkage,
@@ -399,7 +391,7 @@
       // Priority is always <= 65535 (enforced by sema).
       PrioritySuffix = std::string(6-PrioritySuffix.size(), '0')+PrioritySuffix;
       llvm::Function *Fn = 
-        CreateGlobalInitOrDestructFunction(*this, FTy,
+        CreateGlobalInitOrDestructFunction(FTy,
                                            "_GLOBAL__I_" + PrioritySuffix);
       
       for (; I < PrioE; ++I)
@@ -423,7 +415,7 @@
       FileName[i] = '_';
   }
   llvm::Function *Fn = CreateGlobalInitOrDestructFunction(
-      *this, FTy, llvm::Twine("_GLOBAL__sub_I_", FileName));
+      FTy, llvm::Twine("_GLOBAL__sub_I_", FileName));
 
   CodeGenFunction(*this).GenerateCXXGlobalInitFunc(Fn, CXXGlobalInits);
   AddGlobalCtor(Fn);
@@ -440,7 +432,7 @@
 
   // Create our global destructor function.
   llvm::Function *Fn =
-    CreateGlobalInitOrDestructFunction(*this, FTy, "_GLOBAL__D_a");
+    CreateGlobalInitOrDestructFunction(FTy, "_GLOBAL__D_a");
 
   CodeGenFunction(*this).GenerateCXXGlobalDtorsFunc(Fn, CXXGlobalDtors);
   AddGlobalDtor(Fn);
@@ -562,7 +554,7 @@
       getContext().VoidTy, args, FunctionType::ExtInfo(), /*variadic=*/false);
   llvm::FunctionType *FTy = CGM.getTypes().GetFunctionType(FI);
   llvm::Function *fn = 
-    CreateGlobalInitOrDestructFunction(CGM, FTy, "__cxx_global_array_dtor");
+    CGM.CreateGlobalInitOrDestructFunction(FTy, "__cxx_global_array_dtor");
 
   StartFunction(VD, getContext().VoidTy, fn, FI, args);
 
Index: lib/CodeGen/CGExpr.cpp
===================================================================
--- lib/CodeGen/CGExpr.cpp
+++ lib/CodeGen/CGExpr.cpp
@@ -16,6 +16,7 @@
 #include "CGCall.h"
 #include "CGDebugInfo.h"
 #include "CGObjCRuntime.h"
+#include "CGOpenMPRuntime.h"
 #include "CGRecordLayout.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
@@ -1751,6 +1752,14 @@
   return CGF.Builder.CreateBitCast(V, IRType->getPointerTo(AS), Name);
 }
 
+static LValue EmitThreadPrivateVarDeclLValue(
+    CodeGenFunction &CGF, const VarDecl *VD, QualType T, llvm::Value *V,
+    llvm::Type *RealVarTy, CharUnits Alignment, SourceLocation Loc) {
+  V = CGF.CGM.getOpenMPRuntime().getOMPAddrOfThreadPrivate(CGF, V, VD, Loc);
+  V = EmitBitCastOfLValueToProperType(CGF, V, RealVarTy);
+  return CGF.MakeAddrLValue(V, T, Alignment);
+}
+
 static LValue EmitGlobalVarDeclLValue(CodeGenFunction &CGF,
                                       const Expr *E, const VarDecl *VD) {
   QualType T = E->getType();
@@ -1764,6 +1773,11 @@
   V = EmitBitCastOfLValueToProperType(CGF, V, RealVarTy);
   CharUnits Alignment = CGF.getContext().getDeclAlign(VD);
   LValue LV;
+  if (CGF.getLangOpts().OpenMP &&
+      CGF.CGM.getOpenMPRuntime().isOMPThreadPrivateDecl(VD)) {
+    return EmitThreadPrivateVarDeclLValue(CGF, VD, T, V, RealVarTy, Alignment,
+                                          E->getExprLoc());
+  }
   if (VD->getType()->isReferenceType()) {
     llvm::LoadInst *LI = CGF.Builder.CreateLoad(V);
     LI->setAlignment(Alignment.getQuantity());
@@ -1876,6 +1890,13 @@
     if (!V && VD->isStaticLocal())
       V = CGM.getStaticLocalDeclAddress(VD);
 
+    // Check if variable is threadprivate.
+    if (V && getLangOpts().OpenMP &&
+        CGM.getOpenMPRuntime().isOMPThreadPrivateDecl(VD))
+      return EmitThreadPrivateVarDeclLValue(
+          *this, VD, T, V, getTypes().ConvertTypeForMem(VD->getType()),
+          Alignment, E->getExprLoc());
+
     // Use special handling for lambdas.
     if (!V) {
       if (FieldDecl *FD = LambdaCaptureFields.lookup(VD)) {
Index: lib/CodeGen/CGOpenMPRuntime.cpp
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.cpp
+++ lib/CodeGen/CGOpenMPRuntime.cpp
@@ -123,10 +123,10 @@
                                                        SourceLocation Loc) {
   assert(CGF.CurFn && "No function in current CodeGenFunction.");
 
-  llvm::Value *GTid = nullptr;
-  OpenMPGtidMapTy::iterator I = OpenMPGtidMap.find(CGF.CurFn);
-  if (I != OpenMPGtidMap.end()) {
-    GTid = I->second;
+  llvm::Value *ThreadID = nullptr;
+  OpenMPThreadIDMapTy::iterator I = OpenMPThreadIDMap.find(CGF.CurFn);
+  if (I != OpenMPThreadIDMap.end()) {
+    ThreadID = I->second;
   } else {
     // Check if current function is a function which has first parameter
     // with type int32 and name ".global_tid.".
@@ -144,24 +144,24 @@
         CGF.CurFn->arg_begin()->getName() == ".global_tid.") {
       CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
       CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
-      GTid = CGF.Builder.CreateLoad(CGF.CurFn->arg_begin());
+      ThreadID = CGF.Builder.CreateLoad(CGF.CurFn->arg_begin());
     } else {
       // Generate "int32 .kmpc_global_thread_num.addr;"
       CGBuilderTy::InsertPointGuard IPG(CGF.Builder);
       CGF.Builder.SetInsertPoint(CGF.AllocaInsertPt);
       llvm::Value *Args[] = {EmitOpenMPUpdateLocation(CGF, Loc)};
-      GTid = CGF.EmitRuntimeCall(
+      ThreadID = CGF.EmitRuntimeCall(
           CreateRuntimeFunction(OMPRTL__kmpc_global_thread_num), Args);
     }
-    OpenMPGtidMap[CGF.CurFn] = GTid;
+    OpenMPThreadIDMap[CGF.CurFn] = ThreadID;
   }
-  return GTid;
+  return ThreadID;
 }
 
 void CGOpenMPRuntime::FunctionFinished(CodeGenFunction &CGF) {
   assert(CGF.CurFn && "No function in current CodeGenFunction.");
-  if (OpenMPGtidMap.count(CGF.CurFn))
-    OpenMPGtidMap.erase(CGF.CurFn);
+  if (OpenMPThreadIDMap.count(CGF.CurFn))
+    OpenMPThreadIDMap.erase(CGF.CurFn);
   if (OpenMPLocMap.count(CGF.CurFn))
     OpenMPLocMap.erase(CGF.CurFn);
 }
@@ -184,18 +184,214 @@
     llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
                                 getKmpc_MicroPointerTy()};
     llvm::FunctionType *FnTy =
-        llvm::FunctionType::get(CGM.VoidTy, TypeParams, true);
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ true);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_fork_call");
     break;
   }
   case OMPRTL__kmpc_global_thread_num: {
     // Build kmp_int32 __kmpc_global_thread_num(ident_t *loc);
     llvm::Type *TypeParams[] = {getIdentTyPointerTy()};
     llvm::FunctionType *FnTy =
-        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, false);
+        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
     RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_global_thread_num");
     break;
   }
+  case OMPRTL__kmpc_threadprivate_cached: {
+    // Build void *__kmpc_threadprivate_cached(ident_t *loc,
+    // kmp_int32 global_tid, void *data, size_t size, void ***cache);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty,
+                                CGM.VoidPtrTy, CGM.SizeTy,
+                                CGM.VoidPtrTy->getPointerTo()->getPointerTo()};
+    llvm::FunctionType *FnTy =
+        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_cached");
+    break;
+  }
+  case OMPRTL__kmpc_threadprivate_register: {
+    // Build void __kmpc_threadprivate_register(ident_t *, void *data,
+    // kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
+    // typedef void *(*kmpc_ctor)(void *);
+    auto KmpcCtorTy =
+        llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
+                                /*isVarArg*/ false)->getPointerTo();
+    // typedef void *(*kmpc_cctor)(void *, void *);
+    llvm::Type *KmpcCCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
+    auto KmpcCCtorTy =
+        llvm::FunctionType::get(CGM.VoidPtrTy, KmpcCCtorTyArgs,
+                                /*isVarArg*/ false)->getPointerTo();
+    // typedef void (*kmpc_dtor)(void *);
+    auto KmpcDtorTy =
+        llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy, /*isVarArg*/ false)
+            ->getPointerTo();
+    llvm::Type *FnTyArgs[] = {getIdentTyPointerTy(), CGM.VoidPtrTy, KmpcCtorTy,
+                              KmpcCCtorTy, KmpcDtorTy};
+    auto FnTy = llvm::FunctionType::get(CGM.VoidTy, FnTyArgs,
+                                        /*isVarArg*/ false);
+    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_threadprivate_register");
+    break;
+  }
   }
   return RTLFn;
 }
+
+void CGOpenMPRuntime::EmitOMPParallelCall(CodeGenFunction &CGF,
+                                          SourceLocation Loc,
+                                          llvm::Value *OutlinedFn,
+                                          llvm::Value *CapturedStruct) {
+  // Build call __kmpc_fork_call(loc, 1, microtask, captured_struct/*context*/)
+  llvm::Value *Args[] = {
+      EmitOpenMPUpdateLocation(CGF, Loc),
+      CGF.Builder.getInt32(1), // Number of arguments after 'microtask' argument
+      // (there is only one additional argument - 'context')
+      CGF.Builder.CreateBitCast(OutlinedFn, getKmpc_MicroPointerTy()),
+      CGF.EmitCastToVoidPtr(CapturedStruct)};
+  auto RTLFn = CreateRuntimeFunction(CGOpenMPRuntime::OMPRTL__kmpc_fork_call);
+  CGF.EmitRuntimeCall(RTLFn, Args);
+}
+
+void CGOpenMPRuntime::addOMPThreadPrivateDecl(const VarDecl *VD,
+                                              SourceLocation Loc) {
+  OpenMPThreadPrivateVars[VD] = Loc;
+}
+
+bool CGOpenMPRuntime::isOMPThreadPrivateDecl(const VarDecl *VD) const {
+  auto CurDecl = VD->getMostRecentDecl();
+  while (CurDecl) {
+    if (OpenMPThreadPrivateVars.count(CurDecl) > 0)
+      return true;
+    CurDecl = CurDecl->getPreviousDecl();
+  }
+  return false;
+}
+
+SourceLocation
+CGOpenMPRuntime::getOMPThreadPrivateDeclLoc(const VarDecl *VD) const {
+  auto CurDecl = VD->getMostRecentDecl();
+  while (CurDecl) {
+    if (OpenMPThreadPrivateVars.count(CurDecl) > 0)
+      return OpenMPThreadPrivateVars.lookup(CurDecl);
+    CurDecl = CurDecl->getPreviousDecl();
+  }
+  return SourceLocation();
+}
+
+llvm::Constant *
+CGOpenMPRuntime::getOrCreateThreadPrivateCache(const VarDecl *VD,
+                                               llvm::Value *Addr) {
+  StringRef MangledName = CGM.getMangledName(VD);
+  // Lookup the entry, lazily creating it if necessary.
+  auto &Entry = OpenMPThreadPrivateMap[MangledName];
+  if (Entry == nullptr) {
+    // Create cache memory for threadprivate variable void **Var.cache;
+    Entry = new llvm::GlobalVariable(
+        CGM.getModule(), CGM.Int8PtrPtrTy, /*IsConstant*/ false,
+        llvm::GlobalValue::CommonLinkage,
+        llvm::Constant::getNullValue(CGM.Int8PtrPtrTy), ".cache.",
+        dyn_cast<llvm::GlobalVariable>(Addr));
+  }
+
+  return Entry;
+}
+
+llvm::Value *CGOpenMPRuntime::getOMPAddrOfThreadPrivate(CodeGenFunction &CGF,
+                                                        llvm::Value *Addr,
+                                                        const VarDecl *VD,
+                                                        SourceLocation Loc) {
+  auto VarTy = Addr->getType()->getPointerElementType();
+  llvm::Value *Args[] = {EmitOpenMPUpdateLocation(CGF, Loc),
+                         GetOpenMPGlobalThreadNum(CGF, Loc),
+                         CGF.Builder.CreatePointerCast(Addr, CGM.Int8PtrTy),
+                         CGM.getSize(CGM.GetTargetTypeStoreSize(VarTy)),
+                         getOrCreateThreadPrivateCache(VD, Addr)};
+  return CGF.EmitRuntimeCall(
+      CreateRuntimeFunction(OMPRTL__kmpc_threadprivate_cached), Args);
+}
+
+llvm::Constant *CGOpenMPRuntime::EmitOMPCXXThreadPrivateInitFunction(
+    const VarDecl &VD, llvm::Constant *Addr, bool PerformInit,
+    bool PerformDestroy, SourceLocation Loc) {
+  QualType ASTTy = VD.getType();
+
+  llvm::Value *Ctor = nullptr, *CCtor = nullptr, *Dtor = nullptr;
+  if (PerformInit) {
+    CodeGenFunction CtorCGF(CGM);
+    FunctionArgList Args;
+    ImplicitParamDecl Dst(CGM.getContext(), /*DC*/ nullptr, SourceLocation(),
+                          /*Id*/ nullptr, CGM.getContext().VoidPtrTy);
+    Args.push_back(&Dst);
+
+    const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+        CGM.getContext().VoidPtrTy, Args, FunctionType::ExtInfo(),
+        /*isVariadic*/ false);
+    auto FTy = CGM.getTypes().GetFunctionType(FI);
+    auto Fn =
+        CGM.CreateGlobalInitOrDestructFunction(FTy, ".__kmpc_global_ctor_.");
+    CtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidPtrTy, Fn, FI,
+                          Args, SourceLocation());
+    auto Arg = CtorCGF.EmitScalarConversion(
+        Fn->arg_begin(), CGM.getContext().VoidPtrTy,
+        CGM.getContext().getPointerType(ASTTy));
+    CtorCGF.EmitAnyExprToMem(VD.getAnyInitializer(), Arg,
+                             VD.getAnyInitializer()->getType().getQualifiers(),
+                             /*IsInitializer*/ true);
+    CtorCGF.Builder.CreateStore(Fn->arg_begin(), CtorCGF.ReturnValue);
+    CtorCGF.FinishFunction();
+    Ctor = Fn;
+  }
+  if (PerformDestroy) {
+    CodeGenFunction DtorCGF(CGM);
+    FunctionArgList Args;
+    ImplicitParamDecl Dst(CGM.getContext(), /*DC*/ nullptr, SourceLocation(),
+                          /*Id*/ nullptr, CGM.getContext().VoidPtrTy);
+    Args.push_back(&Dst);
+
+    const CGFunctionInfo &FI = CGM.getTypes().arrangeFreeFunctionDeclaration(
+        CGM.getContext().VoidTy, Args, FunctionType::ExtInfo(), false);
+    auto FTy = CGM.getTypes().GetFunctionType(FI);
+    auto Fn =
+        CGM.CreateGlobalInitOrDestructFunction(FTy, ".__kmpc_global_dtor_.");
+    DtorCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, Fn, FI, Args,
+                          SourceLocation());
+    DtorCGF.emitDestroy(Fn->arg_begin(), ASTTy,
+                        DtorCGF.getDestroyer(ASTTy.isDestructedType()),
+                        DtorCGF.needsEHCleanup(ASTTy.isDestructedType()));
+    DtorCGF.FinishFunction();
+    Dtor = Fn;
+  }
+  llvm::Type *CCtorTyArgs[] = {CGM.VoidPtrTy, CGM.VoidPtrTy};
+  auto CCtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CCtorTyArgs,
+                                         /*isVarArg*/ false)->getPointerTo();
+  CCtor = llvm::Constant::getNullValue(CCtorTy);
+  if (Ctor == nullptr) {
+    auto CtorTy = llvm::FunctionType::get(CGM.VoidPtrTy, CGM.VoidPtrTy,
+                                          /*isVarArg*/ false)->getPointerTo();
+    Ctor = llvm::Constant::getNullValue(CtorTy);
+  }
+  if (Dtor == nullptr) {
+    auto DtorTy = llvm::FunctionType::get(CGM.VoidTy, CGM.VoidPtrTy,
+                                          /*isVarArg*/ false)->getPointerTo();
+    Dtor = llvm::Constant::getNullValue(DtorTy);
+  }
+  auto InitFunctionTy = llvm::FunctionType::get(CGM.VoidTy, /*isVarArg*/ false);
+  auto InitFunction = CGM.CreateGlobalInitOrDestructFunction(
+      InitFunctionTy, ".__omp_threadprivate_init_.");
+  CodeGenFunction InitCGF(CGM);
+  FunctionArgList ArgList;
+  InitCGF.StartFunction(GlobalDecl(), CGM.getContext().VoidTy, InitFunction,
+                        CGM.getTypes().arrangeNullaryFunction(), ArgList, Loc);
+  // Call kmp_int32 __kmpc_global_thread_num(&loc) to init OpenMP runtime
+  // library.
+  InitCGF.EmitRuntimeCall(CreateRuntimeFunction(OMPRTL__kmpc_global_thread_num),
+                          EmitOpenMPUpdateLocation(InitCGF, VD.getLocation()));
+  // Call __kmpc_threadprivate_register(&loc, &var, ctor, cctor/*NULL*/, dtor)
+  // to register constructor/destructor for variable.
+  llvm::Value *Args[] = {EmitOpenMPUpdateLocation(InitCGF, VD.getLocation()),
+                         InitCGF.Builder.CreatePointerCast(Addr, CGM.VoidPtrTy),
+                         Ctor, CCtor, Dtor};
+  InitCGF.EmitRuntimeCall(
+      CreateRuntimeFunction(
+          CGOpenMPRuntime::OMPRTL__kmpc_threadprivate_register),
+      Args);
+  InitCGF.FinishFunction();
+  return InitFunction;
+}
Index: lib/CodeGen/CGOpenMPRuntime.h
===================================================================
--- lib/CodeGen/CGOpenMPRuntime.h
+++ lib/CodeGen/CGOpenMPRuntime.h
@@ -16,6 +16,7 @@
 
 #include "clang/AST/Type.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 
@@ -34,6 +35,7 @@
 } // namespace llvm
 
 namespace clang {
+class VarDecl;
 
 namespace CodeGen {
 
@@ -64,11 +66,17 @@
     OMP_IDENT_BARRIER_IMPL_SINGLE = 0x140
   };
   enum OpenMPRTLFunction {
-    // Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro
-    // microtask, ...);
+    /// \brief Call to void __kmpc_fork_call(ident_t *loc, kmp_int32 argc,
+    /// kmpc_micro microtask, ...);
     OMPRTL__kmpc_fork_call,
-    // Call to kmp_int32 kmpc_global_thread_num(ident_t *loc);
-    OMPRTL__kmpc_global_thread_num
+    /// \brief Call to kmp_int32 __kmpc_global_thread_num(ident_t *loc);
+    OMPRTL__kmpc_global_thread_num,
+    /// \brief Call to void *__kmpc_threadprivate_cached(ident_t *loc,
+    /// kmp_int32 global_tid, void *data, size_t size, void ***cache);
+    OMPRTL__kmpc_threadprivate_cached,
+    /// \brief Call to void __kmpc_threadprivate_register(ident_t *,
+    /// void *data, kmpc_ctor ctor, kmpc_cctor cctor, kmpc_dtor dtor);
+    OMPRTL__kmpc_threadprivate_register
   };
 
 private:
@@ -132,17 +140,15 @@
   typedef llvm::DenseMap<llvm::Function *, llvm::Value *> OpenMPLocMapTy;
   OpenMPLocMapTy OpenMPLocMap;
   /// \brief Map of local gtid and functions.
-  typedef llvm::DenseMap<llvm::Function *, llvm::Value *> OpenMPGtidMapTy;
-  OpenMPGtidMapTy OpenMPGtidMap;
-
-public:
-  explicit CGOpenMPRuntime(CodeGenModule &CGM);
-  ~CGOpenMPRuntime() {}
-
-  /// \brief Cleans up references to the objects in finished function.
-  /// \param CGF Reference to finished CodeGenFunction.
-  ///
-  void FunctionFinished(CodeGenFunction &CGF);
+  typedef llvm::DenseMap<llvm::Function *, llvm::Value *> OpenMPThreadIDMapTy;
+  OpenMPThreadIDMapTy OpenMPThreadIDMap;
+  /// \brief Map of declarations marked as threadprivate along with locations.
+  typedef llvm::DenseMap<const VarDecl *, SourceLocation>
+      OpenMPThreadPrivateVarsTy;
+  OpenMPThreadPrivateVarsTy OpenMPThreadPrivateVars;
+  /// \brief Map of threadprivate vars and corresponding cache storages.
+  typedef llvm::StringMap<llvm::Constant *> OpenMPThreadPrivateMapTy;
+  OpenMPThreadPrivateMapTy OpenMPThreadPrivateMap;
 
   /// \brief Emits object of ident_t type with info for source location.
   /// \param CGF Reference to current CodeGenFunction.
@@ -170,6 +176,79 @@
   /// \param Function OpenMP runtime function.
   /// \return Specified function.
   llvm::Constant *CreateRuntimeFunction(OpenMPRTLFunction Function);
+
+  /// \brief Returns the threadprivate cache object for the variable, creating
+  /// it if none is recorded yet under the variable's mangled name.
+  /// \param VD Threadprivate variable.
+  /// \param Addr Original global variable.
+  /// \return Cache variable for the specified threadprivate.
+  llvm::Constant *getOrCreateThreadPrivateCache(const VarDecl *VD,
+                                                llvm::Value *Addr);
+
+public:
+  explicit CGOpenMPRuntime(CodeGenModule &CGM);
+  virtual ~CGOpenMPRuntime() {}
+
+  /// \brief Cleans up references to the objects in finished function.
+  /// \param CGF Reference to finished CodeGenFunction.
+  ///
+  void FunctionFinished(CodeGenFunction &CGF);
+
+  /// \brief Emits code for parallel call of the \a OutlinedFn with variables
+  /// captured in a record whose address is stored in \a CapturedStruct.
+  /// \param CGF Reference to current CodeGenFunction.
+  /// \param Loc Clang source location.
+  /// \param OutlinedFn Outlined function to be run in parallel threads.
+  /// \param CapturedStruct A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  virtual void EmitOMPParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                                   llvm::Value *OutlinedFn,
+                                   llvm::Value *CapturedStruct);
+
+  /// \brief Registers variable as threadprivate.
+  /// \param VD Threadprivate variable.
+  /// \param Loc Location of threadprivate variable.
+  void addOMPThreadPrivateDecl(const VarDecl *VD, SourceLocation Loc);
+
+  /// \brief Checks if the specified declaration or any redeclarations are
+  /// marked as threadprivate.
+  /// \param VD Variable declaration to be checked.
+  /// \return true if any redeclarations of the specified declaration are marked
+  /// as threadprivate.
+  bool isOMPThreadPrivateDecl(const VarDecl *VD) const;
+
+  /// \brief Gets the source location for the variable if it is marked as
+  /// threadprivate.
+  /// \param VD Variable declaration to be checked.
+  /// \return Real location of threadprivate declaration if any redeclarations
+  /// of the specified declaration are marked as threadprivate, SourceLocation()
+  /// otherwise.
+  SourceLocation getOMPThreadPrivateDeclLoc(const VarDecl *VD) const;
+
+  /// \brief Returns address of the threadprivate variable for the current
+  /// thread.
+  /// \param CGF Reference to current CodeGenFunction.
+  /// \param Addr Address of the original variable.
+  /// \param VD Threadprivate variable.
+  /// \param Loc Location of the reference to threadprivate var.
+  /// \return Address of the threadprivate variable for the current thread.
+  virtual llvm::Value *getOMPAddrOfThreadPrivate(CodeGenFunction &CGF,
+                                                 llvm::Value *Addr,
+                                                 const VarDecl *VD,
+                                                 SourceLocation Loc);
+
+  /// \brief Emits the function which registers constructor/destructor for
+  /// the specified threadprivate variable.
+  /// \param VD A threadprivate variable.
+  /// \param Addr The address of the original variable.
+  /// \param PerformInit Need to create initialization function.
+  /// \param PerformDestroy Need to create destroy function.
+  /// \param Loc Threadprivate declaration location.
+  virtual llvm::Constant *
+  EmitOMPCXXThreadPrivateInitFunction(const VarDecl &VD, llvm::Constant *Addr,
+                                      bool PerformInit, bool PerformDestroy,
+                                      SourceLocation Loc);
 };
 } // namespace CodeGen
 } // namespace clang
Index: lib/CodeGen/CGStmtOpenMP.cpp
===================================================================
--- lib/CodeGen/CGStmtOpenMP.cpp
+++ lib/CodeGen/CGStmtOpenMP.cpp
@@ -35,17 +35,8 @@
     OutlinedFn = CGF.GenerateCapturedStmtFunction(*CS);
   }
 
-  // Build call __kmpc_fork_call(loc, 1, microtask, captured_struct/*context*/)
-  llvm::Value *Args[] = {
-      CGM.getOpenMPRuntime().EmitOpenMPUpdateLocation(*this, S.getLocStart()),
-      Builder.getInt32(1), // Number of arguments after 'microtask' argument
-      // (there is only one additional argument - 'context')
-      Builder.CreateBitCast(OutlinedFn,
-                            CGM.getOpenMPRuntime().getKmpc_MicroPointerTy()),
-      EmitCastToVoidPtr(CapturedStruct)};
-  llvm::Constant *RTLFn = CGM.getOpenMPRuntime().CreateRuntimeFunction(
-      CGOpenMPRuntime::OMPRTL__kmpc_fork_call);
-  EmitRuntimeCall(RTLFn, Args);
+  CGM.getOpenMPRuntime().EmitOMPParallelCall(*this, S.getLocStart(), OutlinedFn,
+                                             CapturedStruct);
 }
 
 void CodeGenFunction::EmitOMPSimdDirective(const OMPSimdDirective &S) {
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -1409,8 +1409,13 @@
     return EmitGlobalFunctionDefinition(GD, GV);
   }
 
-  if (const auto *VD = dyn_cast<VarDecl>(D))
-    return EmitGlobalVarDefinition(VD);
+  if (const auto *VD = dyn_cast<VarDecl>(D)) {
+    EmitGlobalVarDefinition(VD);
+    if (getLangOpts().OpenMP && getOpenMPRuntime().isOMPThreadPrivateDecl(VD))
+      EmitOMPThreadPrivateVarDecl(
+          VD, getOpenMPRuntime().getOMPThreadPrivateDeclLoc(VD));
+    return;
+  }
   
   llvm_unreachable("Invalid argument to EmitGlobalDefinition()");
 }
@@ -3213,6 +3218,10 @@
     break;
   }
 
+  case Decl::OMPThreadPrivate:
+    EmitOMPThreadPrivateDecl(cast<OMPThreadPrivateDecl>(D));
+    break;
+
   case Decl::ClassTemplateSpecialization: {
     const auto *Spec = cast<ClassTemplateSpecializationDecl>(D);
     if (DebugInfo &&
@@ -3383,6 +3392,33 @@
   return llvm::ConstantStruct::getAnon(Fields);
 }
 
+void CodeGenModule::EmitOMPThreadPrivateVarDecl(const VarDecl *VD,
+                                                SourceLocation Loc) {
+  if ((VD = VD->getDefinition(Context)) != nullptr) {
+    QualType ASTTy = VD->getType();
+    CXXRecordDecl *RD = ASTTy->getBaseElementTypeUnsafe()->getAsCXXRecordDecl();
+    bool PerformInit = RD && VD->getAnyInitializer() != nullptr;
+    bool PerformDestroy = RD && !RD->hasTrivialDestructor();
+    auto DeclPtr = VD->isStaticLocal() ? getStaticLocalDeclAddress(VD)
+                                       : GetAddrOfGlobalVar(VD);
+
+    if (DeclPtr && (PerformInit || PerformDestroy)) {
+      auto InitFunction =
+          getOpenMPRuntime().EmitOMPCXXThreadPrivateInitFunction(
+              *VD, DeclPtr, PerformInit, PerformDestroy, Loc);
+      CXXGlobalInits.push_back(InitFunction);
+    }
+  }
+}
+
+void CodeGenModule::EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D) {
+  for (auto RefExpr : D->varlists()) {
+    const VarDecl *VD = cast<VarDecl>(cast<DeclRefExpr>(RefExpr)->getDecl());
+    EmitOMPThreadPrivateVarDecl(VD, RefExpr->getExprLoc());
+    getOpenMPRuntime().addOMPThreadPrivateDecl(VD, RefExpr->getExprLoc());
+  }
+}
+
 llvm::Constant *CodeGenModule::GetAddrOfRTTIDescriptor(QualType Ty,
                                                        bool ForEH) {
   // Return a bogus pointer if RTTI is disabled, unless it's for EH.
@@ -3397,4 +3433,3 @@
 
   return getCXXABI().getAddrOfRTTIDescriptor(Ty);
 }
-
Index: lib/CodeGen/CodeGenModule.h
===================================================================
--- lib/CodeGen/CodeGenModule.h
+++ lib/CodeGen/CodeGenModule.h
@@ -660,6 +660,14 @@
       return GetAddrOfGlobalVar(cast<VarDecl>(GD.getDecl()));
   }
 
+  /// \brief Creates a global initialization or destructor function.
+  /// \param FTy Type of the function.
+  /// \param Name Name of the function.
+  /// \param TLS Whether this function is for TLS initialization.
+  llvm::Function *CreateGlobalInitOrDestructFunction(llvm::FunctionType *FTy,
+                                                     const Twine &Name,
+                                                     bool TLS = false);
+
   /// Will return a global variable of the given type. If a variable with a
   /// different type already exists then a new  variable with the right type
   /// will be created and all uses of the old variable will be replaced with a
@@ -1032,7 +1040,16 @@
   /// are emitted lazily.
   void EmitGlobal(GlobalDecl D);
 
+  /// \brief Emit code for a threadprivate directive.
+  /// \param D Threadprivate declaration.
+  void EmitOMPThreadPrivateDecl(const OMPThreadPrivateDecl *D);
+
 private:
+  /// \brief Emit code for a threadprivate variable.
+  /// \param VD Threadprivate variable.
+  /// \param Loc Location of threadprivate declaration.
+  void EmitOMPThreadPrivateVarDecl(const VarDecl *VD, SourceLocation Loc);
+
   llvm::GlobalValue *GetGlobalValue(StringRef Ref);
 
   llvm::Constant *
Index: lib/CodeGen/ModuleBuilder.cpp
===================================================================
--- lib/CodeGen/ModuleBuilder.cpp
+++ lib/CodeGen/ModuleBuilder.cpp
@@ -16,6 +16,7 @@
 #include "CodeGenModule.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/AST/DeclOpenMP.h"
 #include "clang/AST/Expr.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/TargetInfo.h"
@@ -128,6 +129,13 @@
         return;
 
       Builder->UpdateCompletedType(D);
+      // In C++, we may have member threadprivate decls that need to be emitted
+      // at this point.
+      if (Ctx->getLangOpts().CPlusPlus && !D->isDependentContext()) {
+        for (auto *M : D->decls())
+          if (isa<OMPThreadPrivateDecl>(M))
+            Builder->EmitTopLevelDecl(M);
+      }
 
       // For MSVC compatibility, treat declarations of static data members with
       // inline initializers as definitions.
Index: lib/Parse/Parser.cpp
===================================================================
--- lib/Parse/Parser.cpp
+++ lib/Parse/Parser.cpp
@@ -624,8 +624,7 @@
     HandlePragmaOpenCLExtension();
     return DeclGroupPtrTy();
   case tok::annot_pragma_openmp:
-    ParseOpenMPDeclarativeDirective();
-    return DeclGroupPtrTy();
+    return ParseOpenMPDeclarativeDirective();
   case tok::annot_pragma_ms_pointers_to_members:
     HandlePragmaMSPointersToMembers();
     return DeclGroupPtrTy();
Index: lib/Serialization/ASTReaderDecl.cpp
===================================================================
--- lib/Serialization/ASTReaderDecl.cpp
+++ lib/Serialization/ASTReaderDecl.cpp
@@ -2199,7 +2199,7 @@
   if (isa<FileScopeAsmDecl>(D) || 
       isa<ObjCProtocolDecl>(D) || 
       isa<ObjCImplDecl>(D) ||
-      isa<ImportDecl>(D))
+      isa<ImportDecl>(D) || isa<OMPThreadPrivateDecl>(D))
     return true;
   if (VarDecl *Var = dyn_cast<VarDecl>(D))
     return Var->isFileVarDecl() &&
Index: test/OpenMP/threadprivate_codegen.cpp
===================================================================
--- test/OpenMP/threadprivate_codegen.cpp
+++ test/OpenMP/threadprivate_codegen.cpp
@@ -0,0 +1,558 @@
+// RUN: %clang_cc1 -verify -DBODY -fopenmp=libiomp5 -triple x86_64-unknown-unknown -x c++ -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp=libiomp5 -x c++ -DBODY -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -g -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK-DEBUG %s
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// CHECK-DAG: [[IDENT:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[S1:%.+]] = type { [[INT:i[0-9]+]] }
+// CHECK-DAG: [[S2:%.+]] = type { [[INT]], double }
+// CHECK-DAG: [[S3:%.+]] = type { [[INT]], float }
+// CHECK-DAG: [[S4:%.+]] = type { [[INT]], [[INT]] }
+// CHECK-DAG: [[S5:%.+]] = type { [[INT]], [[INT]], [[INT]] }
+// CHECK-DAG: [[SMAIN:%.+]] = type { [[INT]], double, double }
+// CHECK-DEBUG-DAG: [[IDENT:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DEBUG-DAG: [[S1:%.+]] = type { [[INT:i[0-9]+]] }
+// CHECK-DEBUG-DAG: [[S2:%.+]] = type { [[INT]], double }
+// CHECK-DEBUG-DAG: [[S3:%.+]] = type { [[INT]], float }
+// CHECK-DEBUG-DAG: [[S4:%.+]] = type { [[INT]], [[INT]] }
+// CHECK-DEBUG-DAG: [[S5:%.+]] = type { [[INT]], [[INT]], [[INT]] }
+// CHECK-DEBUG-DAG: [[SMAIN:%.+]] = type { [[INT]], double, double }
+
+struct S1 {
+  int a;
+  S1()
+      : a(0) {
+  }
+  S1(int a)
+      : a(a) {
+  }
+  S1(const S1 &s) {
+    a = 12 + s.a;
+  }
+  ~S1() {
+    a = 0;
+  }
+};
+
+struct S2 {
+  int a;
+  double b;
+  S2()
+      : a(0) {
+  }
+  S2(int a)
+      : a(a) {
+  }
+  S2(const S2 &s) {
+    a = 12 + s.a;
+  }
+  ~S2() {
+    a = 0;
+  }
+};
+
+struct S3 {
+  int a;
+  float b;
+  S3()
+      : a(0) {
+  }
+  S3(int a)
+      : a(a) {
+  }
+  S3(const S3 &s) {
+    a = 12 + s.a;
+  }
+  ~S3() {
+    a = 0;
+  }
+};
+
+struct S4 {
+  int a, b;
+  S4()
+      : a(0) {
+  }
+  S4(int a)
+      : a(a) {
+  }
+  S4(const S4 &s) {
+    a = 12 + s.a;
+  }
+  ~S4() {
+    a = 0;
+  }
+};
+
+struct S5 {
+  int a, b, c;
+  S5()
+      : a(0) {
+  }
+  S5(int a)
+      : a(a) {
+  }
+  S5(const S5 &s) {
+    a = 12 + s.a;
+  }
+  ~S5() {
+    a = 0;
+  }
+};
+
+// CHECK:      [[CACHE1:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[GS1:@.+]] = internal global [[S1]] zeroinitializer
+// CHECK:      [[DEFAULT_LOC:@.+]] = private unnamed_addr constant [[IDENT]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([{{[0-9]+}} x i8]* {{@.+}}, i32 0, i32 0) }
+// CHECK-NOT:  {{@.+}} = common global i8** null
+// CHECK:      [[GS2:@.+]] = internal global [[S2]] zeroinitializer
+// CHECK:      [[CACHE2:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[ARR_X:@.+]] = global [2 x [3 x [[S1]]]] zeroinitializer
+// CHECK:      [[CACHE3:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[SM:@.+]] = internal global [[SMAIN]] zeroinitializer
+// CHECK:      [[CACHE4:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[STATIC_S:@.+]] = external global [[S3]]
+// CHECK:      [[CACHE5:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[GS3:@.+]] = external global [[S5]]
+// CHECK:      [[CACHE6:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[ST_INT_ST:@.+]] = linkonce_odr global i32 23
+// CHECK:      [[CACHE7:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[ST_FLOAT_ST:@.+]] = linkonce_odr global float 2.300000e+01
+// CHECK:      [[CACHE8:@[_.0-9a-zA-Z]+]] = common global i8** null
+// CHECK-NEXT: [[ST_S4_ST:@.+]] = linkonce_odr global %struct.S4 zeroinitializer
+// CHECK-NOT: {{@.+}} = common global i8** null
+// There is no cache for gs2 - it is not threadprivate. Check that only 8
+// caches are created (for Static::s, gs1, gs3, arr_x, main::sm, ST<int>::st,
+// ST<float>::st, ST<S4>::st)
+// CHECK-DEBUG-DAG: [[GS1:@.+]] = internal global [[S1]] zeroinitializer
+// CHECK-DEBUG-DAG: [[GS2:@.+]] = internal global [[S2]] zeroinitializer
+// CHECK-DEBUG-DAG: [[ARR_X:@.+]] = global [2 x [3 x [[S1]]]] zeroinitializer
+// CHECK-DEBUG-DAG: [[SM:@.+]] = internal global [[SMAIN]] zeroinitializer
+// CHECK-DEBUG-DAG: [[STATIC_S:@.+]] = external global [[S3]]
+// CHECK-DEBUG-DAG: [[GS3:@.+]] = external global [[S5]]
+// CHECK-DEBUG-DAG: [[ST_INT_ST:@.+]] = linkonce_odr global i32 23
+// CHECK-DEBUG-DAG: [[ST_FLOAT_ST:@.+]] = linkonce_odr global float 2.300000e+01
+// CHECK-DEBUG-DAG: [[ST_S4_ST:@.+]] = linkonce_odr global %struct.S4 zeroinitializer
+// CHECK-DEBUG-DAG: [[LOC1:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;155;11;;\00"
+// CHECK-DEBUG-DAG: [[LOC2:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;202;4;;\00"
+// CHECK-DEBUG-DAG: [[LOC3:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;292;19;;\00"
+// CHECK-DEBUG-DAG: [[LOC4:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;292;16;;\00"
+// CHECK-DEBUG-DAG: [[LOC5:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;326;9;;\00"
+// CHECK-DEBUG-DAG: [[LOC6:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;343;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC7:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;360;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC8:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;386;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC9:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;407;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC10:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;422;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC11:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;439;27;;\00"
+// CHECK-DEBUG-DAG: [[LOC12:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;main;456;10;;\00"
+// CHECK-DEBUG-DAG: [[LOC13:@.*]] = private unnamed_addr constant [{{[0-9]+}} x i8] c";{{.*}}threadprivate_codegen.cpp;;264;10;;\00"
+
+struct Static {
+  static S3 s;
+#pragma omp threadprivate(s)
+};
+
+static S1 gs1(5);
+#pragma omp threadprivate(gs1)
+// CHECK:      define {{.*}} void [[S1_CTOR:@.*]]([[S1]]* %{{.*}},
+// CHECK:      define {{.*}} void [[S1_DTOR:@.*]]([[S1]]* %{{.*}})
+// CHECK:      define internal i8* [[GS1_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [[S1]]*
+// CHECK-NEXT: call void [[S1_CTOR]]([[S1]]* [[RES]], {{.*}} 5)
+// CHECK-NEXT: ret i8* %0
+// CHECK-NEXT: }
+// CHECK:      define internal void [[GS1_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [[S1]]*
+// CHECK-NEXT: call void [[S1_DTOR]]([[S1]]* [[RES]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK:      define internal void [[GS1_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK:      call void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i8* (i8*)* [[GS1_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[GS1_DTOR]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-DEBUG:      define {{.*}} void [[S1_CTOR:@.*]]([[S1]]* %{{.*}},
+// CHECK-DEBUG:      define {{.*}} void [[S1_DTOR:@.*]]([[S1]]* %{{.*}})
+// CHECK-DEBUG:      define internal i8* [[GS1_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* %0 to [[S1]]*
+// CHECK-DEBUG-NEXT: call void [[S1_CTOR]]([[S1]]* [[RES]], {{.*}} 5)
+// CHECK-DEBUG-NEXT: ret i8* %0
+// CHECK-DEBUG-NEXT: }
+// CHECK-DEBUG:      define internal void [[GS1_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* %0 to [[S1]]*
+// CHECK-DEBUG-NEXT: call void [[S1_DTOR]]([[S1]]* [[RES]])
+// CHECK-DEBUG-NEXT: ret void
+// CHECK-DEBUG-NEXT: }
+// CHECK-DEBUG:      define internal void [[GS1_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+// CHECK-DEBUG:      @__kmpc_global_thread_num
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC1]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      call void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i8* (i8*)* [[GS1_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[GS1_DTOR]])
+// CHECK-DEBUG-NEXT: ret void
+// CHECK-DEBUG-NEXT: }
+static S2 gs2(27);
+// CHECK:      define {{.*}} void [[S2_CTOR:@.*]]([[S2]]* %{{.*}},
+// CHECK:      define {{.*}} void [[S2_DTOR:@.*]]([[S2]]* %{{.*}})
+// No other call to the S2 constructor because gs2 is not threadprivate
+// CHECK-NOT:  call void [[S2_CTOR]]([[S2]]*
+// CHECK-DEBUG:      define {{.*}} void [[S2_CTOR:@.*]]([[S2]]* %{{.*}},
+// CHECK-DEBUG:      define {{.*}} void [[S2_DTOR:@.*]]([[S2]]* %{{.*}})
+// No other call to the S2 constructor because gs2 is not threadprivate
+// CHECK-DEBUG-NOT:  call void [[S2_CTOR]]([[S2]]*
+S1 arr_x[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
+#pragma omp threadprivate(arr_x)
+// CHECK:      define {{.*}} i8* [[ARR_X_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [2 x [3 x [[S1]]]]*
+// CHECK:      [[ARR1:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[RES]], i{{.*}} 0, i{{.*}} 0
+// CHECK:      [[ARR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR1]], i{{.*}} 0, i{{.*}} 0
+// CHECK:      invoke void [[S1_CTOR]]([[S1]]* [[ARR]], [[INT]] 1)
+// CHECK:      [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]]* [[ARR]], i{{.*}} 1
+// CHECK:      invoke void [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT]], [[INT]] 2)
+// CHECK:      [[ARR_ELEMENT2:%.*]] = getelementptr inbounds [[S1]]* [[ARR_ELEMENT]], i{{.*}} 1
+// CHECK:      invoke void [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT2]], [[INT]] 3)
+// CHECK:      [[ARR_ELEMENT3:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR1]], i{{.*}} 1
+// CHECK:      [[ARR_:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_ELEMENT3]], i{{.*}} 0, i{{.*}} 0
+// CHECK:      invoke void [[S1_CTOR]]([[S1]]* [[ARR_]], [[INT]] 4)
+// CHECK:      [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]]* [[ARR_]], i{{.*}} 1
+// CHECK:      invoke void [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT]], [[INT]] 5)
+// CHECK:      [[ARR_ELEMENT2:%.*]] = getelementptr inbounds [[S1]]* [[ARR_ELEMENT]], i{{.*}} 1
+// CHECK:      invoke void [[S1_CTOR]]([[S1]]* [[ARR_ELEMENT2]], [[INT]] 6)
+// CHECK:      ret i8* %0
+// CHECK:      }
+// CHECK:      define {{.*}} void [[ARR_X_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK:      [[ARR_BEGIN:%.*]] = bitcast i8* %0 to [[S1]]*
+// CHECK-NEXT: [[ARR_CUR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_BEGIN]], i{{.*}} 6
+// CHECK-NEXT: br label %[[ARR_LOOP:.*]]
+// CHECK:      {{.*}}[[ARR_LOOP]]{{.*}}
+// CHECK-NEXT: [[ARR_ELEMENTPAST:%.*]] = phi [[S1]]* [ [[ARR_CUR]], {{.*}} ], [ [[ARR_ELEMENT:%.*]], {{.*}} ]
+// CHECK-NEXT: [[ARR_ELEMENT:%.*]] = getelementptr inbounds [[S1]]* [[ARR_ELEMENTPAST]], i{{.*}} -1
+// CHECK-NEXT: invoke void [[S1_DTOR]]([[S1]]* [[ARR_ELEMENT]])
+// CHECK:      [[ARR_DONE:%.*]] = icmp eq [[S1]]* [[ARR_ELEMENT]], [[ARR_BEGIN]]
+// CHECK-NEXT: br i1 [[ARR_DONE]], label %[[ARR_EXIT:.*]], label %[[ARR_LOOP]]
+// CHECK:      {{.*}}[[ARR_EXIT]]{{.*}}
+// CHECK-NEXT: ret void
+// CHECK:      }
+// CHECK:      define {{.*}} void [[ARR_X_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK:      call void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i8* (i8*)* [[ARR_X_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[ARR_X_DTOR]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-DEBUG:      define {{.*}} i8* [[ARR_X_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK-DEBUG:      }
+// CHECK-DEBUG:      define {{.*}} void [[ARR_X_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK-DEBUG:      }
+// CHECK-DEBUG:      define {{.*}} void [[ARR_X_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+// CHECK-DEBUG:      @__kmpc_global_thread_num
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG:      store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC2]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      call void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i8* (i8*)* [[ARR_X_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[ARR_X_DTOR]])
+// CHECK-DEBUG-NEXT: ret void
+// CHECK-DEBUG-NEXT: }
+extern S5 gs3;
+#pragma omp threadprivate(gs3)
+// No call to the S5 constructor because gs3 is only declared, not defined.
+// CHECK-NOT:  call void {{.*}}([[S5]]*
+// CHECK-DEBUG-NOT:  call void {{.*}}([[S5]]*
+
+template <class T>
+struct ST {
+  static T st;
+#pragma omp threadprivate(st)
+};
+
+template <class T>
+T ST<T>::st(23);
+
+#endif
+
+#ifdef BODY
+
+// CHECK: define i32 @main()
+// CHECK-DEBUG: define i32 @main()
+int main() {
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+  int Res;
+  struct Smain {
+    int a;
+    double b, c;
+    Smain()
+        : a(0) {
+    }
+    Smain(int a)
+        : a(a) {
+    }
+    Smain(const Smain &s) {
+      a = 12 + s.a;
+    }
+    ~Smain() {
+      a = 0;
+    }
+  };
+
+  static Smain sm(gs1.a);
+// CHECK:      [[THREAD_NUM:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT]]* [[DEFAULT_LOC]])
+// CHECK:      call i{{.*}} @__cxa_guard_acquire
+// CHECK:      [[GS1_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE1]])
+// CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
+// CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-NEXT: invoke void [[SMAIN_CTOR:.*]]([[SMAIN]]* [[SM]], [[INT]] [[GS1_A]])
+// CHECK:      call void @__cxa_guard_release
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
+// CHECK-DEBUG:      call i{{.*}} @__cxa_guard_acquire
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+// CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
+// CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-DEBUG-NEXT: invoke void [[SMAIN_CTOR:.*]]([[SMAIN]]* [[SM]], [[INT]] [[GS1_A]])
+// CHECK-DEBUG:      call void @__cxa_guard_release
+#pragma omp threadprivate(sm)
+  // CHECK:      [[STATIC_S_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE4]])
+  // CHECK-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
+  // CHECK-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[STATIC_S_A:%.*]] = load [[INT]]* [[STATIC_S_A_ADDR]]
+  // CHECK-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC5]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[STATIC_S_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S3]]* [[STATIC_S]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[STATIC_S_ADDR:%.*]] = bitcast i8* [[STATIC_S_TEMP_ADDR]] to [[S3]]*
+  // CHECK-DEBUG-NEXT: [[STATIC_S_A_ADDR:%.*]] = getelementptr inbounds [[S3]]* [[STATIC_S_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[STATIC_S_A:%.*]] = load [[INT]]* [[STATIC_S_A_ADDR]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[STATIC_S_A]], [[INT]]* [[RES_ADDR:[^,]+]]
+  Res = Static::s.a;
+  // CHECK:      [[SM_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE3]])
+  // CHECK-NEXT: [[SM_ADDR:%.*]] = bitcast i8* [[SM_TEMP_ADDR]] to [[SMAIN]]*
+  // CHECK-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[SM_A:%.*]] = load [[INT]]* [[SM_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[SM_A]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC6]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[SM_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[SM_ADDR:%.*]] = bitcast i8* [[SM_TEMP_ADDR]] to [[SMAIN]]*
+  // CHECK-DEBUG-NEXT: [[SM_A_ADDR:%.*]] = getelementptr inbounds [[SMAIN]]* [[SM_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[SM_A:%.*]] = load [[INT]]* [[SM_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[SM_A]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += sm.a;
+  // CHECK:      [[GS1_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE1]])
+  // CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
+  // CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC7]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[GS1_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
+  // CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS1_A]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += gs1.a;
+  // CHECK:      [[GS2_A:%.*]] = load [[INT]]* getelementptr inbounds ([[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[GS2_A:%.*]] = load [[INT]]* getelementptr inbounds ([[S2]]* [[GS2]], i{{.*}} 0, i{{.*}} 0)
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS2_A]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += gs2.a;
+  // CHECK:      [[GS3_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE5]])
+  // CHECK-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
+  // CHECK-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[GS3_A:%.*]] = load [[INT]]* [[GS3_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC8]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[GS3_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S5]]* [[GS3]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[GS3_ADDR:%.*]] = bitcast i8* [[GS3_TEMP_ADDR]] to [[S5]]*
+  // CHECK-DEBUG-NEXT: [[GS3_A_ADDR:%.*]] = getelementptr inbounds [[S5]]* [[GS3_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[GS3_A:%.*]] = load [[INT]]* [[GS3_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[GS3_A]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += gs3.a;
+  // CHECK:      [[ARR_X_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE2]])
+  // CHECK-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
+  // CHECK-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]]* [[ARR_X_1_1_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC9]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT:      [[ARR_X_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([2 x [3 x [[S1]]]]* [[ARR_X]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ARR_X_ADDR:%.*]] = bitcast i8* [[ARR_X_TEMP_ADDR]] to [2 x [3 x [[S1]]]]*
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_ADDR:%.*]] = getelementptr inbounds [2 x [3 x [[S1]]]]* [[ARR_X_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_ADDR:%.*]] = getelementptr inbounds [3 x [[S1]]]* [[ARR_X_1_ADDR]], i{{.*}} 0, i{{.*}} 1
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[ARR_X_1_1_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[ARR_X_1_1_A:%.*]] = load [[INT]]* [[ARR_X_1_1_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ARR_X_1_1_A]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += arr_x[1][1].a;
+  // CHECK:      [[ST_INT_ST_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE6]])
+  // CHECK-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
+  // CHECK-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]]* [[ST_INT_ST_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC10]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[INT]]* [[ST_INT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_ADDR:%.*]] = bitcast i8* [[ST_INT_ST_TEMP_ADDR]] to [[INT]]*
+  // CHECK-DEBUG-NEXT: [[ST_INT_ST_VAL:%.*]] = load [[INT]]* [[ST_INT_ST_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_INT_ST_VAL]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += ST<int>::st;
+  // CHECK:      [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE7]])
+  // CHECK-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
+  // CHECK-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float* [[ST_FLOAT_ST_ADDR]]
+  // CHECK-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC11]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast (float* [[ST_FLOAT_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_ADDR:%.*]] = bitcast i8* [[ST_FLOAT_ST_TEMP_ADDR]] to float*
+  // CHECK-DEBUG-NEXT: [[ST_FLOAT_ST_VAL:%.*]] = load float* [[ST_FLOAT_ST_ADDR]]
+  // CHECK-DEBUG-NEXT: [[FLOAT_TO_INT_CONV:%.*]] = fptosi float [[ST_FLOAT_ST_VAL]] to [[INT]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[FLOAT_TO_INT_CONV]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += static_cast<int>(ST<float>::st);
+  // CHECK:      [[ST_S4_ST_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE8]])
+  // CHECK-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
+  // CHECK-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]]* [[ST_S4_ST_A_ADDR]]
+  // CHECK-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
+  // CHECK-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  // CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+  // CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC12]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_ADDR:%.*]] = bitcast i8* [[ST_S4_ST_TEMP_ADDR]] to [[S4]]*
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A_ADDR:%.*]] = getelementptr inbounds [[S4]]* [[ST_S4_ST_ADDR]], i{{.*}} 0, i{{.*}} 0
+  // CHECK-DEBUG-NEXT: [[ST_S4_ST_A:%.*]] = load [[INT]]* [[ST_S4_ST_A_ADDR]]
+  // CHECK-DEBUG-NEXT: [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: [[ADD:%.*]] = add {{.*}} [[INT]] [[RES]], [[ST_S4_ST_A]]
+  // CHECK-DEBUG-NEXT: store [[INT]] [[ADD]], [[INT]]* [[RES:.+]]
+  Res += ST<S4>::st.a;
+  // CHECK:      [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-NEXT: ret [[INT]] [[RES]]
+  // CHECK-DEBUG:      [[RES:%.*]] = load [[INT]]* [[RES_ADDR]]
+  // CHECK-DEBUG-NEXT: ret [[INT]] [[RES]]
+  return Res;
+}
+// CHECK: }
+
+// CHECK:      define {{.*}} void [[SMAIN_CTOR]]([[SMAIN]]* %{{.*}},
+// CHECK:      define {{.*}} void [[SMAIN_DTOR:@.*]]([[SMAIN]]* %{{.*}})
+// CHECK:      define internal i8* [[SM_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK:      [[THREAD_NUM:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT]]* [[DEFAULT_LOC]])
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [[SMAIN]]*
+// CHECK:      [[GS1_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[DEFAULT_LOC]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8*** [[CACHE1]])
+// CHECK-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
+// CHECK-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-NEXT: call void [[SMAIN_CTOR]]([[SMAIN]]* [[RES]], [[INT]] [[GS1_A]])
+// CHECK-NEXT: ret i8* %0
+// CHECK-NEXT: }
+// CHECK:      define internal void [[SM_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [[SMAIN]]*
+// CHECK-NEXT: call void [[SMAIN_DTOR]]([[SMAIN]]* [[RES]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK:      define internal void [[SM_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK:      call void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-DEBUG:      define {{.*}} void [[SMAIN_CTOR]]([[SMAIN]]* %{{.*}},
+// CHECK-DEBUG:      define {{.*}} void [[SMAIN_DTOR:@.*]]([[SMAIN]]* %{{.*}})
+// CHECK-DEBUG:      define internal i8* [[SM_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG-NEXT: [[THREAD_NUM:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT]]* [[KMPC_LOC_ADDR]])
+// CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* %0 to [[SMAIN]]*
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC3]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      [[GS1_TEMP_ADDR:%.*]] = call i8* @__kmpc_threadprivate_cached([[IDENT]]* [[KMPC_LOC_ADDR]], i32 [[THREAD_NUM]], i8* bitcast ([[S1]]* [[GS1]] to i8*), i{{.*}} {{[0-9]+}}, i8***
+// CHECK-DEBUG-NEXT: [[GS1_ADDR:%.*]] = bitcast i8* [[GS1_TEMP_ADDR]] to [[S1]]*
+// CHECK-DEBUG-NEXT: [[GS1_A_ADDR:%.*]] = getelementptr inbounds [[S1]]* [[GS1_ADDR]], i{{.*}} 0, i{{.*}} 0
+// CHECK-DEBUG-NEXT: [[GS1_A:%.*]] = load [[INT]]* [[GS1_A_ADDR]]
+// CHECK-DEBUG-NEXT: call void [[SMAIN_CTOR]]([[SMAIN]]* [[RES]], [[INT]] [[GS1_A]])
+// CHECK-DEBUG-NEXT: ret i8* %0
+// CHECK-DEBUG-NEXT: }
+// CHECK-DEBUG:      define internal void [[SM_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK-DEBUG:      }
+// CHECK-DEBUG:      define internal void [[SM_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+// CHECK-DEBUG:      @__kmpc_global_thread_num
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC4]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      call void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[SMAIN]]* [[SM]] to i8*), i8* (i8*)* [[SM_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[SM_DTOR]])
+// CHECK-DEBUG-NEXT: ret void
+// CHECK-DEBUG-NEXT: }
+
+// CHECK:      define {{.*}} void [[S4_CTOR:@.*]]([[S4]]* %{{.*}},
+// CHECK:      define {{.*}} void [[S4_DTOR:@.*]]([[S4]]* %{{.*}})
+// CHECK:      define internal i8* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [[S4]]*
+// CHECK-NEXT: call void [[S4_CTOR]]([[S4]]* [[RES]], {{.*}} 23)
+// CHECK-NEXT: ret i8* %0
+// CHECK-NEXT: }
+// CHECK:      define internal void [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK:      [[RES:%.*]] = bitcast i8* %0 to [[S4]]*
+// CHECK-NEXT: call void [[S4_DTOR]]([[S4]]* [[RES]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK:      define internal void [[ST_S4_ST_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK:      call void @__kmpc_threadprivate_register([[IDENT]]* [[DEFAULT_LOC]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR]])
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-DEBUG:      define {{.*}} void [[S4_CTOR:@.*]]([[S4]]* %{{.*}},
+// CHECK-DEBUG:      define {{.*}} void [[S4_DTOR:@.*]]([[S4]]* %{{.*}})
+// CHECK-DEBUG:      define internal i8* [[ST_S4_ST_CTOR:@\.__kmpc_global_ctor_\..*]](i8*)
+// CHECK-DEBUG:      }
+// CHECK-DEBUG:      define internal void [[ST_S4_ST_DTOR:@\.__kmpc_global_dtor_\..*]](i8*)
+// CHECK-DEBUG:      }
+// CHECK-DEBUG:      define internal void [[ST_S4_ST_INIT:@\.__omp_threadprivate_init_\..*]]()
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR:%.*]] = alloca [[IDENT]]
+// CHECK-DEBUG:      @__kmpc_global_thread_num
+// CHECK-DEBUG:      [[KMPC_LOC_ADDR_PSOURCE:%.*]] = getelementptr inbounds [[IDENT]]* [[KMPC_LOC_ADDR]], i{{.*}} 0, i{{.*}} 4
+// CHECK-DEBUG-NEXT: store i8* getelementptr inbounds ([{{.*}} x i8]* [[LOC13]], i{{.*}} 0, i{{.*}} 0), i8** [[KMPC_LOC_ADDR_PSOURCE]]
+// CHECK-DEBUG:      call void @__kmpc_threadprivate_register([[IDENT]]* [[KMPC_LOC_ADDR]], i8* bitcast ([[S4]]* [[ST_S4_ST]] to i8*), i8* (i8*)* [[ST_S4_ST_CTOR]], i8* (i8*, i8*)* null, void (i8*)* [[ST_S4_ST_DTOR]])
+// CHECK-DEBUG-NEXT: ret void
+// CHECK-DEBUG-NEXT: }
+
+// CHECK:      define internal void {{@.*}}()
+// CHECK-DAG:  call void [[GS1_INIT]]()
+// CHECK-DAG:  call void [[ARR_X_INIT]]()
+// CHECK-DAG:  call void [[SM_INIT]]()
+// CHECK-DAG:  call void [[ST_S4_ST_INIT]]()
+// CHECK:      ret void
+// CHECK-DEBUG:      define internal void {{@.*}}()
+// CHECK-DEBUG-DAG:  call void [[GS1_INIT]]()
+// CHECK-DEBUG-DAG:  call void [[ARR_X_INIT]]()
+// CHECK-DEBUG-DAG:  call void [[SM_INIT]]()
+// CHECK-DEBUG-DAG:  call void [[ST_S4_ST_INIT]]()
+// CHECK-DEBUG:      ret void
+#endif
+
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits

Reply via email to