llvmbot wrote:

<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang

@llvm/pr-subscribers-clangd

Author: Jan Svoboda (jansvoboda11)

<details>
<summary>Changes</summary>

Clang uses the `HeaderFileInfo` struct to track bits of information on header 
files, which gets used throughout the compiler. We also use this to compute the 
set of affecting module maps in `ASTWriter` and in the end serialize the 
information into the `HEADER_SEARCH_TABLE` record of a PCM file, allowing 
clients to learn about headers from the module. In doing so, Clang asks for 
existing `HeaderFileInfo` for all known `FileEntries`. Note that this asks the 
loaded PCM files for the information they have on each header file in question. 
This seems unnecessary: we only want to serialize information on header files 
that either belong to the current module or that got included textually. Loaded 
PCM files can't provide us with any useful information.

For explicit modules with lazy loading (using `-fmodule-map-file=&lt;path&gt;` 
with `-fmodule-file=&lt;name&gt;=&lt;path&gt;`) the compiler knows about header 
files listed in the module map files on the command-line. This can be a large 
number.

Asking for existing `HeaderFileInfo` can trigger deserialization of 
`HEADER_SEARCH_TABLE` from loaded PCM files. Keys of the on-disk hash table 
consist of the header file size and modification time. However, with explicit 
modules Clang zeroes out the modification time. Moreover, if you import lots of 
modules, some of their header files end up having identical sizes. This means 
lots of hash collisions that can only be resolved by running the serialized 
filename through `FileManager` and comparing equality of the `FileEntry`. This 
ends up being super expensive, essentially re-stating lots of the transitively 
loaded SDK header files.

This patch cleans up the API for getting `HeaderFileInfo` and makes sure 
`ASTWriter` uses the version that doesn't ask loaded PCM files for more 
information. This removes the excessive stat traffic coming from `ASTWriter` 
hopefully without changing observable behavior.

---
Full diff: https://github.com/llvm/llvm-project/pull/87848.diff


5 Files Affected:

- (modified) clang-tools-extra/clangd/index/SymbolCollector.cpp (+1-1) 
- (modified) clang/include/clang/Lex/HeaderSearch.h (+13-11) 
- (modified) clang/lib/Lex/HeaderSearch.cpp (+42-35) 
- (modified) clang/lib/Serialization/ASTWriter.cpp (+4-7) 
- (added) clang/test/Modules/foo.c (+48) 


``````````diff
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp 
b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index 85b8fc549b016e..5c4e2150cf3123 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -409,7 +409,7 @@ class SymbolCollector::HeaderFileURICache {
     // Framework headers are spelled as <FrameworkName/Foo.h>, not
     // "path/FrameworkName.framework/Headers/Foo.h".
     auto &HS = PP->getHeaderSearchInfo();
-    if (const auto *HFI = HS.getExistingFileInfo(*FE, /*WantExternal*/ false))
+    if (const auto *HFI = HS.getExistingFileInfo(*FE))
       if (!HFI->Framework.empty())
         if (auto Spelling =
                 getFrameworkHeaderIncludeSpelling(*FE, HFI->Framework, HS))
diff --git a/clang/include/clang/Lex/HeaderSearch.h 
b/clang/include/clang/Lex/HeaderSearch.h
index 705dcfa8aacc3f..b8724e9aa3c92e 100644
--- a/clang/include/clang/Lex/HeaderSearch.h
+++ b/clang/include/clang/Lex/HeaderSearch.h
@@ -529,14 +529,15 @@ class HeaderSearch {
   /// Return whether the specified file is a normal header,
   /// a system header, or a C++ friendly system header.
   SrcMgr::CharacteristicKind getFileDirFlavor(FileEntryRef File) {
-    return (SrcMgr::CharacteristicKind)getFileInfo(File).DirInfo;
+    if (const HeaderFileInfo *HFI = getExistingFileInfo(File))
+      return (SrcMgr::CharacteristicKind)HFI->DirInfo;
+    return (SrcMgr::CharacteristicKind)HeaderFileInfo().DirInfo;
   }
 
   /// Mark the specified file as a "once only" file due to
   /// \#pragma once.
   void MarkFileIncludeOnce(FileEntryRef File) {
-    HeaderFileInfo &FI = getFileInfo(File);
-    FI.isPragmaOnce = true;
+    getFileInfo(File).isPragmaOnce = true;
   }
 
   /// Mark the specified file as a system header, e.g. due to
@@ -816,16 +817,17 @@ class HeaderSearch {
 
   unsigned header_file_size() const { return FileInfo.size(); }
 
-  /// Return the HeaderFileInfo structure for the specified FileEntry,
-  /// in preparation for updating it in some way.
+  /// Return the HeaderFileInfo structure for the specified FileEntry, in
+  /// preparation for updating it in some way.
   HeaderFileInfo &getFileInfo(FileEntryRef FE);
 
-  /// Return the HeaderFileInfo structure for the specified FileEntry,
-  /// if it has ever been filled in.
-  /// \param WantExternal Whether the caller wants purely-external header file
-  ///        info (where \p External is true).
-  const HeaderFileInfo *getExistingFileInfo(FileEntryRef FE,
-                                            bool WantExternal = true) const;
+  /// Return the HeaderFileInfo structure for the specified FileEntry, if it 
has
+  /// ever been filled in (either locally or externally).
+  const HeaderFileInfo *getExistingFileInfo(FileEntryRef FE) const;
+
+  /// Return the headerFileInfo structure for the specified FileEntry, if it 
has
+  /// ever been filled in locally.
+  const HeaderFileInfo *getExistingLocalFileInfo(FileEntryRef FE) const;
 
   SearchDirIterator search_dir_begin() { return {*this, 0}; }
   SearchDirIterator search_dir_end() { return {*this, SearchDirs.size()}; }
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index fcc2b56df166b8..9c3d8626cfabae 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -947,9 +947,13 @@ OptionalFileEntryRef HeaderSearch::LookupFile(
       // If we have no includer, that means we're processing a #include
       // from a module build. We should treat this as a system header if we're
       // building a [system] module.
-      bool IncluderIsSystemHeader =
-          Includer ? getFileInfo(*Includer).DirInfo != SrcMgr::C_User :
-          BuildSystemModule;
+      bool IncluderIsSystemHeader = [&]() {
+        if (!Includer)
+          return BuildSystemModule;
+        const HeaderFileInfo *HFI = getExistingFileInfo(*Includer);
+        assert(HFI && "includer without file info");
+        return HFI->DirInfo != SrcMgr::C_User;
+      }();
       if (OptionalFileEntryRef FE = getFileAndSuggestModule(
               TmpDir, IncludeLoc, IncluderAndDir.second, 
IncluderIsSystemHeader,
               RequestingModule, SuggestedModule)) {
@@ -964,10 +968,11 @@ OptionalFileEntryRef HeaderSearch::LookupFile(
         // Note that we only use one of FromHFI/ToHFI at once, due to potential
         // reallocation of the underlying vector potentially making the first
         // reference binding dangling.
-        HeaderFileInfo &FromHFI = getFileInfo(*Includer);
-        unsigned DirInfo = FromHFI.DirInfo;
-        bool IndexHeaderMapHeader = FromHFI.IndexHeaderMapHeader;
-        StringRef Framework = FromHFI.Framework;
+        const HeaderFileInfo *FromHFI = getExistingFileInfo(*Includer);
+        assert(FromHFI && "includer without file info");
+        unsigned DirInfo = FromHFI->DirInfo;
+        bool IndexHeaderMapHeader = FromHFI->IndexHeaderMapHeader;
+        StringRef Framework = FromHFI->Framework;
 
         HeaderFileInfo &ToHFI = getFileInfo(*FE);
         ToHFI.DirInfo = DirInfo;
@@ -1154,10 +1159,12 @@ OptionalFileEntryRef HeaderSearch::LookupFile(
   // "Foo" is the name of the framework in which the including header was 
found.
   if (!Includers.empty() && Includers.front().first && !isAngled &&
       !Filename.contains('/')) {
-    HeaderFileInfo &IncludingHFI = getFileInfo(*Includers.front().first);
-    if (IncludingHFI.IndexHeaderMapHeader) {
+    const HeaderFileInfo *IncludingHFI =
+        getExistingFileInfo(*Includers.front().first);
+    assert(IncludingHFI && "includer without file info");
+    if (IncludingHFI->IndexHeaderMapHeader) {
       SmallString<128> ScratchFilename;
-      ScratchFilename += IncludingHFI.Framework;
+      ScratchFilename += IncludingHFI->Framework;
       ScratchFilename += '/';
       ScratchFilename += Filename;
 
@@ -1287,11 +1294,11 @@ OptionalFileEntryRef 
HeaderSearch::LookupSubframeworkHeader(
   }
 
   // This file is a system header or C++ unfriendly if the old file is.
-  //
-  // Note that the temporary 'DirInfo' is required here, as either call to
-  // getFileInfo could resize the vector and we don't want to rely on order
-  // of evaluation.
-  unsigned DirInfo = getFileInfo(ContextFileEnt).DirInfo;
+  const HeaderFileInfo *ContextHFI = getExistingFileInfo(ContextFileEnt);
+  assert(ContextHFI && "context file without file info");
+  // Note that the temporary 'DirInfo' is required here, as the call to
+  // getFileInfo could resize the vector and might invalidate 'ContextHFI'.
+  unsigned DirInfo = ContextHFI->DirInfo;
   getFileInfo(*File).DirInfo = DirInfo;
 
   FrameworkName.pop_back(); // remove the trailing '/'
@@ -1331,8 +1338,6 @@ static void mergeHeaderFileInfo(HeaderFileInfo &HFI,
     HFI.Framework = OtherHFI.Framework;
 }
 
-/// getFileInfo - Return the HeaderFileInfo structure for the specified
-/// FileEntry.
 HeaderFileInfo &HeaderSearch::getFileInfo(FileEntryRef FE) {
   if (FE.getUID() >= FileInfo.size())
     FileInfo.resize(FE.getUID() + 1);
@@ -1349,27 +1354,20 @@ HeaderFileInfo &HeaderSearch::getFileInfo(FileEntryRef 
FE) {
   }
 
   HFI->IsValid = true;
-  // We have local information about this header file, so it's no longer
-  // strictly external.
+  // We assume the caller has local information about this header file, so it's
+  // no longer strictly external.
   HFI->External = false;
   return *HFI;
 }
 
-const HeaderFileInfo *
-HeaderSearch::getExistingFileInfo(FileEntryRef FE, bool WantExternal) const {
-  // If we have an external source, ensure we have the latest information.
-  // FIXME: Use a generation count to check whether this is really up to date.
+const HeaderFileInfo *HeaderSearch::getExistingFileInfo(FileEntryRef FE) const 
{
   HeaderFileInfo *HFI;
   if (ExternalSource) {
-    if (FE.getUID() >= FileInfo.size()) {
-      if (!WantExternal)
-        return nullptr;
+    if (FE.getUID() >= FileInfo.size())
       FileInfo.resize(FE.getUID() + 1);
-    }
 
     HFI = &FileInfo[FE.getUID()];
-    if (!WantExternal && (!HFI->IsValid || HFI->External))
-      return nullptr;
+    // FIXME: Use a generation count to check whether this is really up to 
date.
     if (!HFI->Resolved) {
       auto ExternalHFI = ExternalSource->GetHeaderFileInfo(FE);
       if (ExternalHFI.IsValid) {
@@ -1378,16 +1376,25 @@ HeaderSearch::getExistingFileInfo(FileEntryRef FE, bool 
WantExternal) const {
           mergeHeaderFileInfo(*HFI, ExternalHFI);
       }
     }
-  } else if (FE.getUID() >= FileInfo.size()) {
-    return nullptr;
-  } else {
+  } else if (FE.getUID() < FileInfo.size()) {
     HFI = &FileInfo[FE.getUID()];
+  } else {
+    HFI = nullptr;
   }
 
-  if (!HFI->IsValid || (HFI->External && !WantExternal))
-    return nullptr;
+  return (HFI && HFI->IsValid) ? HFI : nullptr;
+}
+
+const HeaderFileInfo *
+HeaderSearch::getExistingLocalFileInfo(FileEntryRef FE) const {
+  HeaderFileInfo *HFI;
+  if (FE.getUID() < FileInfo.size()) {
+    HFI = &FileInfo[FE.getUID()];
+  } else {
+    HFI = nullptr;
+  }
 
-  return HFI;
+  return (HFI && HFI->IsValid && !HFI->External) ? HFI : nullptr;
 }
 
 bool HeaderSearch::isFileMultipleIncludeGuarded(FileEntryRef File) const {
diff --git a/clang/lib/Serialization/ASTWriter.cpp 
b/clang/lib/Serialization/ASTWriter.cpp
index ba6a8a5e16e4e7..a05caca1b5a6fb 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -178,8 +178,7 @@ std::set<const FileEntry *> GetAffectingModuleMaps(const 
Preprocessor &PP,
     if (!File)
       continue;
 
-    const HeaderFileInfo *HFI =
-        HS.getExistingFileInfo(*File, /*WantExternal*/ false);
+    const HeaderFileInfo *HFI = HS.getExistingLocalFileInfo(*File);
     if (!HFI || (HFI->isModuleHeader && !HFI->isCompilingModuleHeader))
       continue;
 
@@ -2045,14 +2044,12 @@ void ASTWriter::WriteHeaderSearch(const HeaderSearch 
&HS) {
     if (!File)
       continue;
 
-    // Get the file info. This will load info from the external source if
-    // necessary. Skip emitting this file if we have no information on it
-    // as a header file (in which case HFI will be null) or if it hasn't
+    // Get the file info. Skip emitting this file if we have no information on
+    // it as a header file (in which case HFI will be null) or if it hasn't
     // changed since it was loaded. Also skip it if it's for a modular header
     // from a different module; in that case, we rely on the module(s)
     // containing the header to provide this information.
-    const HeaderFileInfo *HFI =
-        HS.getExistingFileInfo(*File, /*WantExternal*/!Chain);
+    const HeaderFileInfo *HFI = HS.getExistingLocalFileInfo(*File);
     if (!HFI || (HFI->isModuleHeader && !HFI->isCompilingModuleHeader))
       continue;
 
diff --git a/clang/test/Modules/foo.c b/clang/test/Modules/foo.c
new file mode 100644
index 00000000000000..8f6860be1bff27
--- /dev/null
+++ b/clang/test/Modules/foo.c
@@ -0,0 +1,48 @@
+// RUN: rm -rf %t
+// RUN: split-file %s %t
+
+// Not to be committed, just here as a demonstration.
+
+//--- frameworks/A.framework/Modules/module.modulemap
+framework module A {
+  header "A1.h"
+  header "A2.h"
+  header "A3.h"
+}
+//--- frameworks/A.framework/Headers/A1.h
+//--- frameworks/A.framework/Headers/A2.h
+//--- frameworks/A.framework/Headers/A3.h
+//--- EndA
+// RUN: %clang_cc1 -fmodules -F %t/frameworks -emit-module -fmodule-name=A 
%t/frameworks/A.framework/Modules/module.modulemap -o %t/A.pcm
+
+//--- frameworks/B.framework/Modules/module.modulemap
+framework module B {
+  header "B1.h"
+  header "B2.h"
+  header "B3.h"
+}
+//--- frameworks/B.framework/Headers/B1.h
+//--- frameworks/B.framework/Headers/B2.h
+//--- frameworks/B.framework/Headers/B3.h
+//--- EndB
+// RUN: %clang_cc1 -fmodules -F %t/frameworks -emit-module -fmodule-name=B 
%t/frameworks/B.framework/Modules/module.modulemap -o %t/B.pcm
+
+//--- frameworks/X.framework/Modules/module.modulemap
+framework module X { header "X.h" }
+//--- frameworks/X.framework/Headers/X.h
+#import <A/A1.h>
+#import <B/B1.h>
+// RUN: %clang_cc1 -fmodules -F %t/frameworks -emit-module -fmodule-name=X 
%t/frameworks/X.framework/Modules/module.modulemap -o %t/X.pcm \
+// RUN:   -fmodule-map-file=%t/frameworks/A.framework/Modules/module.modulemap 
-fmodule-file=A=%t/A.pcm \
+// RUN:   -fmodule-map-file=%t/frameworks/B.framework/Modules/module.modulemap 
-fmodule-file=B=%t/B.pcm
+
+// Without this patch, ASTWriter would go looking for:
+// * "X.h" in A.pcm and B.pcm and not comparing it to any of their files due 
to size difference
+// * "A2.h" and compare it to "A1.h", "A2.h"                                   
       in A.pcm
+// * "A3.h" and compare it to "A1.h", "A2.h", "A3.h"                           
       in A.pcm
+// * "B2.h" and compare it to "A1.h", "A2.h", "A3.h" in A.pcm; "B1.h", "B2.h"  
       in B.pcm
+// * "B3.h" and compare it to "A1.h", "A2.h", "A3.h" in A.pcm; "B1.h", "B2.h", 
"B3.h" in B.pcm
+
+// With this patch, ASTWriter doesn't go looking for anything of the above.
+// * Clang already knows that "X.h" belongs to the current module and needs to 
be serialized,
+//   while the other headers belong to A or B and do not need to be serialized.

``````````

</details>


https://github.com/llvm/llvm-project/pull/87848
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to