https://github.com/ilovepi updated 
https://github.com/llvm/llvm-project/pull/182621

>From 92a078b388681bbd0f785005519b4120ee2007f1 Mon Sep 17 00:00:00 2001
From: Paul Kirth <[email protected]>
Date: Thu, 12 Jun 2025 10:50:39 -0700
Subject: [PATCH 1/2] [clang-doc] Improve complexity of Index construction

The existing implementation ends up with an O(N^2) algorithm due to
repeated linear scans during index construction. Switching to a
StringMap allows us to reduce this to O(N), since we no longer need to
search the vector.

The `BM_Index_Insertion` benchmark measures the time taken to insert N
unique records into the index.

| Scale (N Items) | Baseline (ns) | Patched (ns) | Speedup | Change |
|----------------:|--------------:|-------------:|--------:|-------:|
| 10              | 9,977         | 11,004       | 0.91x   | +10.3% |
| 64              | 69,249        | 69,166       | 1.00x   | -0.1%  |
| 512             | 1,932,714     | 525,877      | 3.68x   | -72.8% |
| 4,096           | 92,411,535    | 4,589,030    | 20.1x   | -95.0% |
| 10,000          | 577,384,945   | 12,998,039   | 44.4x   | -97.7% |

The patch delivers significant improvements to scalability. At 10,000
items, index construction is **~44 times faster**, confirming the
complexity reduction from O(N^2) to O(N). The crossover point where the
new map-based approach beats the vector-based approach appears to be
around N=64.

Since the index is typically larger than 64 for files of non trivial
complexity, and users will typically be building documentation for an
entire project with many files, all normal usage should benefit from
this change.

Other benchmarks show minor regressions, though in a typical build of
LLVM documentation index construction takes up a larger amount of
runtime than any of these other components.
---
 clang-tools-extra/clang-doc/Generators.cpp    | 24 ++---
 clang-tools-extra/clang-doc/JSONGenerator.cpp | 16 ++--
 clang-tools-extra/clang-doc/MDGenerator.cpp   | 31 ++++---
 .../clang-doc/Representation.cpp              |  3 +-
 clang-tools-extra/clang-doc/Representation.h  |  3 +-
 clang-tools-extra/clang-doc/YAMLGenerator.cpp |  6 +-
 .../unittests/clang-doc/ClangDocTest.cpp      |  4 +-
 .../unittests/clang-doc/GeneratorTest.cpp     | 88 +++++++++++++++----
 8 files changed, 124 insertions(+), 51 deletions(-)

diff --git a/clang-tools-extra/clang-doc/Generators.cpp 
b/clang-tools-extra/clang-doc/Generators.cpp
index eca1f288d5ba1..f69d666757b18 100644
--- a/clang-tools-extra/clang-doc/Generators.cpp
+++ b/clang-tools-extra/clang-doc/Generators.cpp
@@ -212,33 +212,35 @@ void Generator::addInfoToIndex(Index &Idx, const 
doc::Info *Info) {
   for (const auto &R : llvm::reverse(Info->Namespace)) {
     // Look for the current namespace in the children of the index I is
     // pointing.
-    auto It = llvm::find(I->Children, R.USR);
+    auto It = I->Children.find(llvm::toStringRef(R.USR));
     if (It != I->Children.end()) {
       // If it is found, just change I to point the namespace reference found.
-      I = &*It;
+      I = &It->second;
     } else {
       // If it is not found a new reference is created
-      I->Children.emplace_back(R.USR, R.Name, R.RefType, R.Path);
+      auto [NewInfo, success] = I->Children.try_emplace(
+          llvm::toStringRef(R.USR), R.USR, R.Name, R.RefType, R.Path);
       // I is updated with the reference of the new namespace reference
-      I = &I->Children.back();
+      if (success)
+        I = &NewInfo->second;
     }
   }
   // Look for Info in the vector where it is supposed to be; it could already
   // exist if it is a parent namespace of an Info already passed to this
   // function.
-  auto It = llvm::find(I->Children, Info->USR);
+  auto It = I->Children.find(llvm::toStringRef(Info->USR));
   if (It == I->Children.end()) {
     // If it is not in the vector it is inserted
-    I->Children.emplace_back(Info->USR, Info->extractName(), Info->IT,
-                             Info->Path);
+    I->Children.try_emplace(llvm::toStringRef(Info->USR), Info->USR,
+                            Info->extractName(), Info->IT, Info->Path);
   } else {
     // If it not in the vector we only check if Path and Name are not empty
     // because if the Info was included by a namespace it may not have those
     // values.
-    if (It->Path.empty())
-      It->Path = Info->Path;
-    if (It->Name.empty())
-      It->Name = Info->extractName();
+    if (It->second.Path.empty())
+      It->second.Path = Info->Path;
+    if (It->second.Name.empty())
+      It->second.Name = Info->extractName();
   }
 }
 
diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp 
b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 9a8f51b808cab..00b862cc5f281 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -805,19 +805,25 @@ static Error serializeIndex(const ClangDocContext &CDCtx, 
StringRef RootDir) {
 
   if (IndexCopy.Children.empty()) {
     // If the index is empty, default to displaying the global namespace.
-    IndexCopy.Children.emplace_back(GlobalNamespaceID, "",
+    IndexCopy.Children.try_emplace(toStringRef(GlobalNamespaceID), 
GlobalNamespaceID, "",
                                     InfoType::IT_namespace, "GlobalNamespace");
   } else {
     IndexArrayRef.reserve(CDCtx.Idx.Children.size());
   }
 
-  for (auto &Idx : IndexCopy.Children) {
-    if (Idx.Children.empty())
+  llvm::SmallVector<const Index *> Children;
+  Children.reserve(IndexCopy.Children.size());
+  for (const auto &[_, Idx] : IndexCopy.Children)
+    Children.push_back(&Idx);
+  llvm::sort(Children, [](const Index *A, const Index *B) { return *A < *B; });
+
+  for (const auto *Idx : Children) {
+    if (Idx->Children.empty())
       continue;
-    std::string TypeStr = infoTypeToString(Idx.RefType);
+    std::string TypeStr = infoTypeToString(Idx->RefType);
     json::Value IdxVal = Object();
     auto &IdxObj = *IdxVal.getAsObject();
-    serializeReference(Idx, IdxObj);
+    serializeReference(*Idx, IdxObj);
     IndexArrayRef.push_back(IdxVal);
   }
   Obj["Index"] = IndexArray;
diff --git a/clang-tools-extra/clang-doc/MDGenerator.cpp 
b/clang-tools-extra/clang-doc/MDGenerator.cpp
index 851b4e084ef23..f01a3e4673bcf 100644
--- a/clang-tools-extra/clang-doc/MDGenerator.cpp
+++ b/clang-tools-extra/clang-doc/MDGenerator.cpp
@@ -314,7 +314,7 @@ static void genMarkdown(const ClangDocContext &CDCtx, const 
TypedefInfo &I,
   // TODO support typedefs in markdown.
 }
 
-static void serializeReference(llvm::raw_fd_ostream &OS, Index &I, int Level) {
+static void serializeReference(llvm::raw_fd_ostream &OS, const Index &I, int 
Level) {
   // Write out the heading level starting at ##
   OS << "##" << std::string(Level, '#') << " ";
   writeNameLink("", I, OS);
@@ -338,8 +338,14 @@ static llvm::Error serializeIndex(ClangDocContext &CDCtx) {
     OS << " for " << CDCtx.ProjectName;
   OS << "\n\n";
 
-  for (auto C : CDCtx.Idx.Children)
-    serializeReference(OS, C, 0);
+  std::vector<const Index *> Children;
+  Children.reserve(CDCtx.Idx.Children.size());
+  for (const auto &[_, C] : CDCtx.Idx.Children)
+    Children.push_back(&C);
+  llvm::sort(Children, [](const Index *A, const Index *B) { return *A < *B; });
+
+  for (const auto *C : Children)
+    serializeReference(OS, *C, 0);
 
   return llvm::Error::success();
 }
@@ -356,10 +362,15 @@ static llvm::Error genIndex(ClangDocContext &CDCtx) {
                                        FileErr.message());
   CDCtx.Idx.sort();
   OS << "# " << CDCtx.ProjectName << " C/C++ Reference\n\n";
-  for (auto C : CDCtx.Idx.Children) {
-    if (!C.Children.empty()) {
+  std::vector<const Index *> Children;
+  Children.reserve(CDCtx.Idx.Children.size());
+  for (const auto &[_, C] : CDCtx.Idx.Children)
+    Children.push_back(&C);
+  llvm::sort(Children, [](const Index *A, const Index *B) { return *A < *B; });
+  for (const auto *C : Children) {
+    if (!C->Children.empty()) {
       const char *Type;
-      switch (C.RefType) {
+      switch (C->RefType) {
       case InfoType::IT_namespace:
         Type = "Namespace";
         break;
@@ -387,10 +398,10 @@ static llvm::Error genIndex(ClangDocContext &CDCtx) {
       case InfoType::IT_default:
         Type = "Other";
       }
-      OS << "* " << Type << ": [" << C.Name << "](";
-      if (!C.Path.empty())
-        OS << C.Path << "/";
-      OS << C.Name << ")\n";
+      OS << "* " << Type << ": [" << C->Name << "](";
+      if (!C->Path.empty())
+        OS << C->Path << "/";
+      OS << C->Name << ")\n";
     }
   }
   return llvm::Error::success();
diff --git a/clang-tools-extra/clang-doc/Representation.cpp 
b/clang-tools-extra/clang-doc/Representation.cpp
index a69129041516f..e1b1c96342666 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -473,8 +473,7 @@ bool Index::operator<(const Index &Other) const {
 }
 
 void Index::sort() {
-  llvm::sort(Children);
-  for (auto &C : Children)
+  for (auto &[_,C] : Children)
     C.sort();
 }
 
diff --git a/clang-tools-extra/clang-doc/Representation.h 
b/clang-tools-extra/clang-doc/Representation.h
index 297e564ea7866..9da65105e48c0 100644
--- a/clang-tools-extra/clang-doc/Representation.h
+++ b/clang-tools-extra/clang-doc/Representation.h
@@ -18,6 +18,7 @@
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/Specifiers.h"
 #include "clang/Tooling/Execution.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include <array>
 #include <optional>
@@ -611,7 +612,7 @@ struct Index : public Reference {
   bool operator<(const Index &Other) const;
 
   std::optional<SmallString<16>> JumpToSection;
-  std::vector<Index> Children;
+  llvm::StringMap<Index> Children;
 
   void sort();
 };
diff --git a/clang-tools-extra/clang-doc/YAMLGenerator.cpp 
b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
index ce4ef58e582e5..d81fcafddbc1f 100644
--- a/clang-tools-extra/clang-doc/YAMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
@@ -109,15 +109,15 @@ template <unsigned U> struct ScalarTraits<SmallString<U>> 
{
   static QuotingType mustQuote(StringRef) { return QuotingType::Single; }
 };
 
-template <> struct ScalarTraits<std::array<unsigned char, 20>> {
+template <> struct ScalarTraits<SymbolID> {
 
-  static void output(const std::array<unsigned char, 20> &S, void *,
+  static void output(const SymbolID &S, void *,
                      llvm::raw_ostream &OS) {
     OS << toHex(toStringRef(S));
   }
 
   static StringRef input(StringRef Scalar, void *,
-                         std::array<unsigned char, 20> &Value) {
+                         SymbolID &Value) {
     if (Scalar.size() != 40)
       return "Error: Incorrect scalar size for USR.";
     Value = stringToSymbol(Scalar);
diff --git a/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp 
b/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp
index 73891abd53f99..169dd598f65ef 100644
--- a/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/ClangDocTest.cpp
@@ -247,8 +247,8 @@ void CheckBaseRecordInfo(BaseRecordInfo *Expected, 
BaseRecordInfo *Actual) {
 void CheckIndex(Index &Expected, Index &Actual) {
   CheckReference(Expected, Actual);
   ASSERT_EQ(Expected.Children.size(), Actual.Children.size());
-  for (size_t Idx = 0; Idx < Actual.Children.size(); ++Idx)
-    CheckIndex(Expected.Children[Idx], Actual.Children[Idx]);
+  for (auto &[_, C] : Expected.Children)
+    CheckIndex(C, Actual.Children.find(llvm::toStringRef(C.USR))->second);
 }
 
 } // namespace doc
diff --git a/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp 
b/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp
index e7f68b405284f..b6c4793aef3d9 100644
--- a/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/GeneratorTest.cpp
@@ -46,45 +46,99 @@ TEST(GeneratorTest, emitIndex) {
   Index ExpectedIdx;
   Index IndexA;
   IndexA.Name = "A";
-  ExpectedIdx.Children.emplace_back(std::move(IndexA));
+  IndexA.USR = serialize::hashUSR("1");
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexA.USR),
+                                   std::move(IndexA));
   Index IndexB;
   IndexB.Name = "B";
+  IndexB.USR = serialize::hashUSR("2");
   Index IndexC;
   IndexC.Name = "C";
-  IndexB.Children.emplace_back(std::move(IndexC));
-  ExpectedIdx.Children.emplace_back(std::move(IndexB));
+  IndexC.USR = serialize::hashUSR("3");
+  IndexB.Children.try_emplace(llvm::toStringRef(IndexC.USR), 
std::move(IndexC));
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexB.USR),
+                                   std::move(IndexB));
   Index IndexD;
   IndexD.Name = "D";
+  IndexD.USR = serialize::hashUSR("4");
   Index IndexE;
   IndexE.Name = "E";
+  IndexE.USR = serialize::hashUSR("5");
   Index IndexF;
   IndexF.Name = "F";
-  IndexE.Children.emplace_back(std::move(IndexF));
-  IndexD.Children.emplace_back(std::move(IndexE));
-  ExpectedIdx.Children.emplace_back(std::move(IndexD));
+  IndexF.USR = serialize::hashUSR("6");
+  IndexE.Children.try_emplace(llvm::toStringRef(IndexF.USR), 
std::move(IndexF));
+  IndexD.Children.try_emplace(llvm::toStringRef(IndexE.USR), 
std::move(IndexE));
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexD.USR),
+                                   std::move(IndexD));
   Index IndexG;
   IndexG.Name = "GlobalNamespace";
   IndexG.RefType = InfoType::IT_namespace;
-  ExpectedIdx.Children.emplace_back(std::move(IndexG));
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexG.USR),
+                                   std::move(IndexG));
 
   CheckIndex(ExpectedIdx, Idx);
 }
 
 TEST(GeneratorTest, sortIndex) {
   Index Idx;
-  Idx.Children.emplace_back("b");
-  Idx.Children.emplace_back("aA");
-  Idx.Children.emplace_back("aa");
-  Idx.Children.emplace_back("A");
-  Idx.Children.emplace_back("a");
+  Index IndexA;
+  IndexA.Name = "a";
+  IndexA.USR = serialize::hashUSR("1");
+  Idx.Children.try_emplace(llvm::toStringRef(IndexA.USR), std::move(IndexA));
+
+  Index IndexB;
+  IndexB.Name = "A";
+  IndexB.USR = serialize::hashUSR("2");
+  Idx.Children.try_emplace(llvm::toStringRef(IndexB.USR), std::move(IndexB));
+
+  Index IndexC;
+  IndexC.Name = "aa";
+  IndexC.USR = serialize::hashUSR("3");
+  Idx.Children.try_emplace(llvm::toStringRef(IndexC.USR), std::move(IndexC));
+
+  Index IndexD;
+  IndexD.Name = "aA";
+  IndexD.USR = serialize::hashUSR("4");
+  Idx.Children.try_emplace(llvm::toStringRef(IndexD.USR), std::move(IndexD));
+
+  Index IndexE;
+  IndexE.Name = "b";
+  IndexE.USR = serialize::hashUSR("5");
+  Idx.Children.try_emplace(llvm::toStringRef(IndexE.USR), std::move(IndexE));
+
   Idx.sort();
 
   Index ExpectedIdx;
-  ExpectedIdx.Children.emplace_back("a");
-  ExpectedIdx.Children.emplace_back("A");
-  ExpectedIdx.Children.emplace_back("aa");
-  ExpectedIdx.Children.emplace_back("aA");
-  ExpectedIdx.Children.emplace_back("b");
+  Index IndexAExp;
+  IndexAExp.Name = "a";
+  IndexAExp.USR = serialize::hashUSR("1");
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexAExp.USR),
+                                   std::move(IndexAExp));
+
+  Index IndexBExp;
+  IndexBExp.Name = "A";
+  IndexBExp.USR = serialize::hashUSR("2");
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexBExp.USR),
+                                   std::move(IndexBExp));
+
+  Index IndexCExp;
+  IndexCExp.Name = "aa";
+  IndexCExp.USR = serialize::hashUSR("3");
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexCExp.USR),
+                                   std::move(IndexCExp));
+
+  Index IndexDExp;
+  IndexDExp.Name = "aA";
+  IndexDExp.USR = serialize::hashUSR("4");
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexDExp.USR),
+                                   std::move(IndexDExp));
+
+  Index IndexEExp;
+  IndexEExp.Name = "b";
+  IndexEExp.USR = serialize::hashUSR("5");
+  ExpectedIdx.Children.try_emplace(llvm::toStringRef(IndexEExp.USR),
+                                   std::move(IndexEExp));
 
   CheckIndex(ExpectedIdx, Idx);
 }

>From 5936c9136091b1be169a567c1e774e12ffcb8fa6 Mon Sep 17 00:00:00 2001
From: Paul Kirth <[email protected]>
Date: Mon, 23 Feb 2026 15:55:06 -0800
Subject: [PATCH 2/2] Format

---
 clang-tools-extra/clang-doc/JSONGenerator.cpp  | 5 +++--
 clang-tools-extra/clang-doc/MDGenerator.cpp    | 3 ++-
 clang-tools-extra/clang-doc/Representation.cpp | 2 +-
 clang-tools-extra/clang-doc/YAMLGenerator.cpp  | 6 ++----
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp 
b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 00b862cc5f281..773dc3a205e1c 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -805,8 +805,9 @@ static Error serializeIndex(const ClangDocContext &CDCtx, 
StringRef RootDir) {
 
   if (IndexCopy.Children.empty()) {
     // If the index is empty, default to displaying the global namespace.
-    IndexCopy.Children.try_emplace(toStringRef(GlobalNamespaceID), 
GlobalNamespaceID, "",
-                                    InfoType::IT_namespace, "GlobalNamespace");
+    IndexCopy.Children.try_emplace(toStringRef(GlobalNamespaceID),
+                                   GlobalNamespaceID, "",
+                                   InfoType::IT_namespace, "GlobalNamespace");
   } else {
     IndexArrayRef.reserve(CDCtx.Idx.Children.size());
   }
diff --git a/clang-tools-extra/clang-doc/MDGenerator.cpp 
b/clang-tools-extra/clang-doc/MDGenerator.cpp
index f01a3e4673bcf..f94690632895c 100644
--- a/clang-tools-extra/clang-doc/MDGenerator.cpp
+++ b/clang-tools-extra/clang-doc/MDGenerator.cpp
@@ -314,7 +314,8 @@ static void genMarkdown(const ClangDocContext &CDCtx, const 
TypedefInfo &I,
   // TODO support typedefs in markdown.
 }
 
-static void serializeReference(llvm::raw_fd_ostream &OS, const Index &I, int 
Level) {
+static void serializeReference(llvm::raw_fd_ostream &OS, const Index &I,
+                               int Level) {
   // Write out the heading level starting at ##
   OS << "##" << std::string(Level, '#') << " ";
   writeNameLink("", I, OS);
diff --git a/clang-tools-extra/clang-doc/Representation.cpp 
b/clang-tools-extra/clang-doc/Representation.cpp
index e1b1c96342666..e3ccc43b9072b 100644
--- a/clang-tools-extra/clang-doc/Representation.cpp
+++ b/clang-tools-extra/clang-doc/Representation.cpp
@@ -473,7 +473,7 @@ bool Index::operator<(const Index &Other) const {
 }
 
 void Index::sort() {
-  for (auto &[_,C] : Children)
+  for (auto &[_, C] : Children)
     C.sort();
 }
 
diff --git a/clang-tools-extra/clang-doc/YAMLGenerator.cpp 
b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
index d81fcafddbc1f..6034131f5db35 100644
--- a/clang-tools-extra/clang-doc/YAMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/YAMLGenerator.cpp
@@ -111,13 +111,11 @@ template <unsigned U> struct ScalarTraits<SmallString<U>> 
{
 
 template <> struct ScalarTraits<SymbolID> {
 
-  static void output(const SymbolID &S, void *,
-                     llvm::raw_ostream &OS) {
+  static void output(const SymbolID &S, void *, llvm::raw_ostream &OS) {
     OS << toHex(toStringRef(S));
   }
 
-  static StringRef input(StringRef Scalar, void *,
-                         SymbolID &Value) {
+  static StringRef input(StringRef Scalar, void *, SymbolID &Value) {
     if (Scalar.size() != 40)
       return "Error: Incorrect scalar size for USR.";
     Value = stringToSymbol(Scalar);

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to