russell.gallop created this revision.
russell.gallop added reviewers: ruiu, pcc, anton-afanasyev.
Herald added subscribers: cfe-commits, dexonsmith, MaskRay, hiraditya, 
arichardson, mehdi_amini, emaste.
Herald added a reviewer: espindola.
Herald added projects: clang, LLVM.

The clang -ftime-trace feature is useful for analysing build time in compiles. 
I've been experimenting with adding time-trace support to LLD for analysing 
link times.

Adding the option -time-trace produces a .json file alongside the output ELF 
file (just ELF so far). This works for LTO and ThinLTO, picking up the relevant 
markers from LLVM. Example ThinLTO build:
F10284608: ThinLTOTimeTrace.PNG <https://reviews.llvm.org/F10284608>

This is still work in progress so shouldn't be submitted in its present state. 
There are a few things that still need doing:

- Improve how events are collected. (The current method combines multiple 
threads (ThinLTO) in one stack with a mutex which won't scale well. Making this 
thread local would probably be better.)
- Adding test(s)
- Re-enable the assertion for monotonically increasing times which I don't have 
working yet.

Do people think that this will be useful/worth pursuing?


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D69043

Files:
  clang/tools/driver/cc1_main.cpp
  lld/ELF/Driver.cpp
  lld/ELF/ICF.cpp
  lld/ELF/MarkLive.cpp
  lld/ELF/Options.td
  lld/ELF/Writer.cpp
  llvm/include/llvm/Support/TimeProfiler.h
  llvm/lib/Support/TimeProfiler.cpp

Index: llvm/lib/Support/TimeProfiler.cpp
===================================================================
--- llvm/lib/Support/TimeProfiler.cpp
+++ llvm/lib/Support/TimeProfiler.cpp
@@ -13,8 +13,9 @@
 #include "llvm/Support/TimeProfiler.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/JSON.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Threading.h"
 #include <cassert>
 #include <chrono>
 #include <string>
@@ -37,10 +38,11 @@
   TimePointType End;
   std::string Name;
   std::string Detail;
+  uint64_t Tid;
 
   Entry(TimePointType &&S, TimePointType &&E, std::string &&N, std::string &&Dt)
       : Start(std::move(S)), End(std::move(E)), Name(std::move(N)),
-        Detail(std::move(Dt)){};
+        Detail(std::move(Dt)), Tid(llvm::get_threadid()){};
 
   // Calculate timings for FlameGraph. Cast time points to microsecond precision
   // rather than casting duration. This avoid truncation issues causing inner
@@ -59,26 +61,47 @@
 };
 
 struct TimeTraceProfiler {
-  TimeTraceProfiler() {
+  TimeTraceProfiler(StringRef ProgName) {
     StartTime = steady_clock::now();
+    this->ProgName = ProgName;
   }
 
   void begin(std::string Name, llvm::function_ref<std::string()> Detail) {
+    std::lock_guard<std::mutex> Lock(Mu);
     Stack.emplace_back(steady_clock::now(), TimePointType(), std::move(Name),
                        Detail());
   }
 
   void end() {
+    std::lock_guard<std::mutex> Lock(Mu);
+    uint64_t Tid = llvm::get_threadid();
+
     assert(!Stack.empty() && "Must call begin() first");
-    auto &E = Stack.back();
+
+    // Find the latest entry from this thread on the stack.
+    auto LatestEvent =
+        std::find_if(Stack.rbegin(), Stack.rend(),
+                     [&](const Entry &Val) { return Val.Tid == Tid; });
+    // Should always be able to find a stack entry on this thread.
+    assert(LatestEvent != Stack.rend() &&
+           "end() without begin() found on thread");
+
+    // Get reference from iterator.
+    auto &E = *LatestEvent;
+
     E.End = steady_clock::now();
 
     // Check that end times monotonically increase.
-    assert((Entries.empty() ||
-            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
-             Entries.back().getFlameGraphStartUs(StartTime) +
-                 Entries.back().getFlameGraphDurUs())) &&
-           "TimeProfiler scope ended earlier than previous scope");
+    //    auto LastEntryOnThreadIter = std::find_if(Entries.rbegin(),
+    //    Entries.rend(), [&](const Entry &Val) {
+    //          return Val.Tid == Tid;
+    //        });
+    //    auto &LastEntryOnThread = *LastEntryOnThreadIter;
+    //    assert((Entries.empty() ||
+    //            (E.getFlameGraphStartUs(StartTime) + E.getFlameGraphDurUs() >=
+    //             LastEntryOnThread.getFlameGraphStartUs(StartTime) +
+    //                 LastEntryOnThread.getFlameGraphDurUs())) &&
+    //           "TimeProfiler scope ended earlier than previous scope");
 
     // Calculate duration at full precision for overall counts.
     DurationType Duration = E.End - E.Start;
@@ -92,18 +115,21 @@
     // templates from within, we only want to add the topmost one. "topmost"
     // happens to be the ones that don't have any currently open entries above
     // itself.
-    if (std::find_if(++Stack.rbegin(), Stack.rend(), [&](const Entry &Val) {
-          return Val.Name == E.Name;
-        }) == Stack.rend()) {
+    if (std::find_if(std::next(LatestEvent), Stack.rend(),
+                     [&](const Entry &Val) {
+                       return (Val.Name == E.Name) && (Val.Tid == Tid);
+                     }) == Stack.rend()) {
       auto &CountAndTotal = CountAndTotalPerName[E.Name];
       CountAndTotal.first++;
       CountAndTotal.second += Duration;
     }
 
-    Stack.pop_back();
+    // Remove the last entry in the stack that matches this Tid.
+    Stack.erase(std::next(LatestEvent).base());
   }
 
   void Write(raw_pwrite_stream &OS) {
+    std::lock_guard<std::mutex> Lock(Mu);
     assert(Stack.empty() &&
            "All profiler sections should be ended when calling Write");
     json::OStream J(OS);
@@ -118,7 +144,7 @@
 
       J.object([&]{
         J.attribute("pid", 1);
-        J.attribute("tid", 0);
+        J.attribute("tid", int64_t(E.Tid));
         J.attribute("ph", "X");
         J.attribute("ts", StartUs);
         J.attribute("dur", DurUs);
@@ -129,7 +155,12 @@
 
     // Emit totals by section name as additional "thread" events, sorted from
     // longest one.
-    int Tid = 1;
+    // Find highest used thread id.
+    uint64_t MaxTid = 0;
+    for (const auto &E : Entries) {
+      MaxTid = std::max(MaxTid, E.Tid);
+    }
+    uint64_t Tid = MaxTid + 1;
     std::vector<NameAndCountAndDurationType> SortedTotals;
     SortedTotals.reserve(CountAndTotalPerName.size());
     for (const auto &E : CountAndTotalPerName)
@@ -146,7 +177,7 @@
 
       J.object([&]{
         J.attribute("pid", 1);
-        J.attribute("tid", Tid);
+        J.attribute("tid", int64_t(Tid));
         J.attribute("ph", "X");
         J.attribute("ts", 0);
         J.attribute("dur", DurUs);
@@ -168,7 +199,7 @@
       J.attribute("ts", 0);
       J.attribute("ph", "M");
       J.attribute("name", "process_name");
-      J.attributeObject("args", [&] { J.attribute("name", "clang"); });
+      J.attributeObject("args", [&] { J.attribute("name", ProgName); });
     });
 
     J.arrayEnd();
@@ -176,19 +207,23 @@
     J.objectEnd();
   }
 
-  SmallVector<Entry, 16> Stack;
-  SmallVector<Entry, 128> Entries;
-  StringMap<CountAndDurationType> CountAndTotalPerName;
+  std::mutex Mu;
+  SmallVector<Entry, 16> Stack /*GUARDED_BY(Mu)*/;
+  SmallVector<Entry, 128> Entries /*GUARDED_BY(Mu)*/;
+  StringMap<CountAndDurationType> CountAndTotalPerName /*GUARDED_BY(Mu)*/;
   TimePointType StartTime;
+  StringRef ProgName;
 
   // Minimum time granularity (in microseconds)
   unsigned TimeTraceGranularity;
 };
 
-void timeTraceProfilerInitialize(unsigned TimeTraceGranularity) {
+void timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
+                                 StringRef ProgName) {
   assert(TimeTraceProfilerInstance == nullptr &&
          "Profiler should not be initialized");
-  TimeTraceProfilerInstance = new TimeTraceProfiler();
+  TimeTraceProfilerInstance =
+      new TimeTraceProfiler(llvm::sys::path::filename(ProgName));
   TimeTraceProfilerInstance->TimeTraceGranularity = TimeTraceGranularity;
 }
 
Index: llvm/include/llvm/Support/TimeProfiler.h
===================================================================
--- llvm/include/llvm/Support/TimeProfiler.h
+++ llvm/include/llvm/Support/TimeProfiler.h
@@ -19,7 +19,8 @@
 /// Initialize the time trace profiler.
 /// This sets up the global \p TimeTraceProfilerInstance
 /// variable to be the profiler instance.
-void timeTraceProfilerInitialize(unsigned TimeTraceGranularity);
+void timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
+                                 StringRef ProgName);
 
 /// Cleanup the time trace profiler, if it was initialized.
 void timeTraceProfilerCleanup();
Index: lld/ELF/Writer.cpp
===================================================================
--- lld/ELF/Writer.cpp
+++ lld/ELF/Writer.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/RandomNumberGenerator.h"
 #include "llvm/Support/SHA1.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/xxhash.h"
 #include <climits>
 
@@ -139,7 +140,10 @@
          script->needsInterpSection();
 }
 
-template <class ELFT> void writeResult() { Writer<ELFT>().run(); }
+template <class ELFT> void writeResult() {
+  llvm::TimeTraceScope timeScope("Write output file", StringRef(""));
+  Writer<ELFT>().run();
+}
 
 static void removeEmptyPTLoad(std::vector<PhdrEntry *> &phdrs) {
   llvm::erase_if(phdrs, [&](const PhdrEntry *p) {
Index: lld/ELF/Options.td
===================================================================
--- lld/ELF/Options.td
+++ lld/ELF/Options.td
@@ -358,6 +358,11 @@
     "Run the linker multi-threaded (default)",
     "Do not run the linker multi-threaded">;
 
+def time_trace: F<"time-trace">, HelpText<"Record time trace">;
+
+defm time_trace_granularity: Eq<"time-trace-granularity",
+  "Minimum time granularity (in microseconds) traced by time profiler">;
+
 defm toc_optimize : B<"toc-optimize",
     "(PowerPC64) Enable TOC related optimizations (default)",
     "(PowerPC64) Disable TOC related optimizations">;
Index: lld/ELF/MarkLive.cpp
===================================================================
--- lld/ELF/MarkLive.cpp
+++ lld/ELF/MarkLive.cpp
@@ -31,6 +31,7 @@
 #include "lld/Common/Strings.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Object/ELF.h"
+#include "llvm/Support/TimeProfiler.h"
 #include <functional>
 #include <vector>
 
@@ -318,6 +319,7 @@
 // input sections. This function make some or all of them on
 // so that they are emitted to the output file.
 template <class ELFT> void markLive() {
+  llvm::TimeTraceScope timeScope("GC", StringRef(""));
   // If -gc-sections is not given, no sections are removed.
   if (!config->gcSections) {
     for (InputSectionBase *sec : inputSections)
Index: lld/ELF/ICF.cpp
===================================================================
--- lld/ELF/ICF.cpp
+++ lld/ELF/ICF.cpp
@@ -84,6 +84,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/Object/ELF.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/xxhash.h"
 #include <algorithm>
 #include <atomic>
@@ -445,6 +446,7 @@
 
 // The main function of ICF.
 template <class ELFT> void ICF<ELFT>::run() {
+  llvm::TimeTraceScope timeScope("ICF", StringRef(""));
   // Collect sections to merge.
   for (InputSectionBase *sec : inputSections) {
     auto *s = cast<InputSection>(sec);
Index: lld/ELF/Driver.cpp
===================================================================
--- lld/ELF/Driver.cpp
+++ lld/ELF/Driver.cpp
@@ -56,6 +56,7 @@
 #include "llvm/Support/Path.h"
 #include "llvm/Support/TarWriter.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/TimeProfiler.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdlib>
 #include <utility>
@@ -483,6 +484,12 @@
   if (errorCount())
     return;
 
+  // Initialize time trace.
+  if (args.hasArg(OPT_time_trace))
+    llvm::timeTraceProfilerInitialize(
+        args::getInteger(args, OPT_time_trace_granularity, 500u),
+        config->progName);
+
   // The Target instance handles target-specific stuff, such as applying
   // relocations or writing a PLT section. It also contains target-dependent
   // values such as a default image base address.
@@ -491,19 +498,33 @@
   switch (config->ekind) {
   case ELF32LEKind:
     link<ELF32LE>(args);
-    return;
+    break;
   case ELF32BEKind:
     link<ELF32BE>(args);
-    return;
+    break;
   case ELF64LEKind:
     link<ELF64LE>(args);
-    return;
+    break;
   case ELF64BEKind:
     link<ELF64BE>(args);
-    return;
+    break;
   default:
     llvm_unreachable("unknown Config->EKind");
   }
+
+  if (llvm::timeTraceProfilerEnabled()) {
+    SmallString<128> path(config->outputFile);
+    llvm::sys::path::replace_extension(path, "json");
+    std::error_code errorCode;
+    raw_fd_ostream profilerOS(path, errorCode, sys::fs::F_Text);
+    if (errorCode) {
+      error("cannot open " + path + ": " + errorCode.message());
+      return;
+    }
+    llvm::timeTraceProfilerWrite(profilerOS);
+    llvm::timeTraceProfilerCleanup();
+  }
+  return;
 }
 
 static std::string getRpath(opt::InputArgList &args) {
@@ -1570,6 +1591,7 @@
 // Because all bitcode files that the program consists of are passed to
 // the compiler at once, it can do a whole-program optimization.
 template <class ELFT> void LinkerDriver::compileBitcodeFiles() {
+  llvm::TimeTraceScope timeScope("LTO", StringRef(""));
   // Compile bitcode files and replace bitcode symbols.
   lto.reset(new BitcodeCompiler);
   for (BitcodeFile *file : bitcodeFiles)
@@ -1696,6 +1718,8 @@
 // Do actual linking. Note that when this function is called,
 // all linker scripts have already been parsed.
 template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
+  llvm::TimeTraceScope timeScope("ExecuteLinker", StringRef(""));
+
   // If a -hash-style option was not given, set to a default value,
   // which varies depending on the target.
   if (!args.hasArg(OPT_hash_style)) {
@@ -1735,8 +1759,11 @@
   // symbols that we need to the symbol table. This process might
   // add files to the link, via autolinking, these files are always
   // appended to the Files vector.
-  for (size_t i = 0; i < files.size(); ++i)
-    parseFile(files[i]);
+  {
+    llvm::TimeTraceScope timeScope("Parse input files", StringRef(""));
+    for (size_t i = 0; i < files.size(); ++i)
+      parseFile(files[i]);
+  }
 
   // Now that we have every file, we can decide if we will need a
   // dynamic symbol table.
Index: clang/tools/driver/cc1_main.cpp
===================================================================
--- clang/tools/driver/cc1_main.cpp
+++ clang/tools/driver/cc1_main.cpp
@@ -218,7 +218,7 @@
 
   if (Clang->getFrontendOpts().TimeTrace) {
     llvm::timeTraceProfilerInitialize(
-        Clang->getFrontendOpts().TimeTraceGranularity);
+        Clang->getFrontendOpts().TimeTraceGranularity, Argv0);
   }
   // --print-supported-cpus takes priority over the actual compilation.
   if (Clang->getFrontendOpts().PrintSupportedCPUs)
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to