https://github.com/azhan92 updated https://github.com/llvm/llvm-project/pull/204668
>From 785f4a51a8e7696a273ac12a6ae1fbbf30f04c1c Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 12:00:14 -0400 Subject: [PATCH 01/11] Changes from fexec-charset PR --- clang/include/clang/Lex/Preprocessor.h | 4 ++++ clang/include/clang/Lex/TextEncoding.h | 23 +++++++++++++++++++++++ clang/lib/Lex/CMakeLists.txt | 1 + clang/lib/Lex/TextEncoding.cpp | 18 ++++++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 clang/include/clang/Lex/TextEncoding.h create mode 100644 clang/lib/Lex/TextEncoding.cpp diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8b684e85eb1c1..28a14eb76f8ef 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -30,6 +30,7 @@ #include "clang/Lex/ModuleMap.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/PPEmbedParameters.h" +#include "clang/Lex/TextEncoding.h" #include "clang/Lex/Token.h" #include "clang/Lex/TokenLexer.h" #include "clang/Support/Compiler.h" @@ -198,6 +199,7 @@ class Preprocessor { std::unique_ptr<ScratchBuffer> ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; + TextEncoding TE; /// External source of macros. ExternalPreprocessorSource *ExternalSource; @@ -1265,6 +1267,8 @@ class Preprocessor { Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + TextEncoding &getTextEncoding() { return TE; } + void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; } diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h new file mode 100644 index 0000000000000..ef718aec4d6d6 --- /dev/null +++ b/clang/include/clang/Lex/TextEncoding.h @@ -0,0 +1,23 @@ +//===-- clang/Lex/TextEncoding.h - Text Conversion Config -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_TEXTENCODING_H +#define LLVM_CLANG_LEX_TEXTENCODING_H + +#include "clang/Basic/LangOptions.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/TextEncoding.h" + +enum ConversionAction { CA_NoConversion }; + +class TextEncoding { +public: + llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; +}; + +#endif diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..7b0be7249cd99 100644 --- a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangLex Preprocessor.cpp PreprocessorLexer.cpp ScratchBuffer.cpp + TextEncoding.cpp TokenConcatenation.cpp TokenLexer.cpp diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp new file mode 100644 index 0000000000000..33e5436367014 --- /dev/null +++ b/clang/lib/Lex/TextEncoding.cpp @@ -0,0 +1,18 @@ +//===--- TextEncoding.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/TextEncoding.h" +#include "clang/Basic/DiagnosticDriver.h" + +llvm::TextEncodingConverter * +TextEncoding::getConverter(ConversionAction Action) const { + switch (Action) { + default: + return nullptr; + } +} >From e8270826b0c143247bf62459e20e9643d7468c29 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 12:05:21 -0400 Subject: [PATCH 02/11] Changes from finput-charset PR --- .../clang/Basic/DiagnosticCommonKinds.td | 3 + clang/include/clang/Basic/SourceManager.h | 10 +- .../include/clang/Frontend/CompilerInstance.h | 1 + clang/include/clang/Lex/TextEncoding.h | 4 +- clang/lib/Basic/SourceManager.cpp | 104 ++++++++++++++---- clang/lib/Frontend/CompilerInstance.cpp | 20 ++-- .../lib/Frontend/VerifyDiagnosticConsumer.cpp | 4 +- clang/lib/Lex/ModuleMap.cpp | 5 +- clang/lib/Lex/PPDirectives.cpp | 7 +- clang/lib/Lex/Preprocessor.cpp | 5 +- clang/lib/Lex/TextEncoding.cpp | 2 + clang/lib/Serialization/ASTReader.cpp | 7 +- 12 files changed, 132 insertions(+), 40 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index f2ed2f4698b8d..8ebac3908b465 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -417,6 +417,9 @@ def note_file_sloc_usage : Note< "%plural{0:|: plus %2B (%human2B) for macro expansions}2">; def note_file_misc_sloc_usage : Note< "%0 additional files entered using a total of %1B (%human1B) of space">; +def warn_charset_conversion_failed : Warning< + "conversion from source encoding failed for '%0': %1; interpreting as IBM-1047">, + InGroup<DiagGroup<"charset-conversion-failed">>; // Modules def err_module_format_unhandled : Error< diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h index 4217b8683da1e..f7d91d612e4ab 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -50,6 +50,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TextEncoding.h" #include <cassert> #include <cstddef> #include <map> @@ -156,6 +157,11 @@ class alignas(8) ContentCache { /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name. StringRef Filename; + /// Information on whether this is associated with a FileID for a file (as + /// opposed to a buffer) and, if so, what conversion (if any) was requested. + llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool> + FileIDConverterInfo; + /// A bump pointer allocated array of offsets for each source line. /// /// This is lazily computed. The lines are owned by the SourceManager @@ -918,6 +924,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// being \#included from the specified IncludePosition. FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter = nullptr, int LoadedID = 0, SourceLocation::UIntTy LoadedOffset = 0); @@ -942,7 +949,8 @@ class SourceManager : public RefCountedBase<SourceManager> { /// Get the FileID for \p SourceFile if it exists. Otherwise, create a /// new FileID for the \p SourceFile. FileID getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter); + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter = nullptr); /// Creates an expansion SLocEntry for the substitution of an argument into a /// function-like macro's body. Returns the start of the expansion. 
diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index bb0eddb918623..89a0d066afd6f 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader { /// /// \return True on success. static bool InitializeSourceManager(const FrontendInputFile &Input, + llvm::TextEncodingConverter *, DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr); diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h index ef718aec4d6d6..3e7653580e994 100644 --- a/clang/include/clang/Lex/TextEncoding.h +++ b/clang/include/clang/Lex/TextEncoding.h @@ -13,9 +13,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/TextEncoding.h" -enum ConversionAction { CA_NoConversion }; +enum ConversionAction { CA_NoConversion, CA_FromInputEncoding }; class TextEncoding { +std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter; + public: llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; }; diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b6cc6ec9365f5..950186866c86d 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -16,6 +16,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManagerInternals.h" +#include "clang/Lex/TextEncoding.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" @@ -31,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -136,7 +138,51 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, Buffer = std::move(*BufferOrError); - // 
Check that the file's size fits in an 'unsigned' (with room for a + // Unless this is a named pipe (in which case we can handle a mismatch), + // check that the file's size is the same as in the file entry (which may + // have come from a stat cache). + assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { + Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); + + return std::nullopt; + } + + // Convert source from the input charset to UTF-8 if necessary. + llvm::TextEncodingConverter *Converter = FileIDConverterInfo.getPointer(); + if (Converter) { + StringRef OriginalBuf = Buffer->getBuffer(); + llvm::SmallString<0> UTF8Buf; + UTF8Buf.reserve(OriginalBuf.size() + 1); + + std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf); + if (EC) { + // If conversion fails, emit a warning and fall back to interpreting the + // file as UTF-8 without conversion. + // + // This allows the compiler to accept system or third-party headers that + // are encoded in UTF-8 even if conversion to the option-specified input + // charset failed. + // + // Diagnostics already exist when files are not well-formed UTF-8. + // + // TODO: Add input byte offset information. + // + // TODO: Consider adjusting the message to omit the "interpreting as + // UTF-8" recovery description if the warning has been upgraded to an + // error. + Diag.Report(Loc, diag::warn_charset_conversion_failed) + << ContentsEntry->getName() << EC.message(); + } else { + // TODO: Reclaim memory if the buffer size exceeds the content. + auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( + std::move(UTF8Buf), Buffer->getBufferIdentifier()); + Buffer = std::move(NewBuf); + } + } + + // Check that the buffer's size fits in an 'unsigned' (with room for a // past-the-end value). This is deeply regrettable, but various parts of // Clang (including elsewhere in this file!) 
use 'unsigned' to represent file // offsets, line numbers, string literal lengths, and so on, and fail @@ -151,22 +197,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, return std::nullopt; } - // Unless this is a named pipe (in which case we can handle a mismatch), - // check that the file's size is the same as in the file entry (which may - // have come from a stat cache). - // The buffer will always be larger than the file size on z/OS in the presence - // of characters outside the base character set. - assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); - if (!ContentsEntry->isNamedPipe() && - Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { - Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); - - return std::nullopt; - } - - // If the buffer is valid, check to see if it has a UTF Byte Order Mark - // (BOM). We only support UTF-8 with and without a BOM right now. See - // http://en.wikipedia.org/wiki/Byte_order_mark for more information. + // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM) + // Note that any conversion requested using `-finput-charset` (if successful) + // has already occurred, so we are expecting UTF-8 with or without a BOM. + // + // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate + // conversion was not supplied via `-finput-charset` and we could try to + // convert based on the BOM. + // + // See http://en.wikipedia.org/wiki/Byte_order_mark for more information. StringRef BufStr = Buffer->getBuffer(); const char *InvalidBOM = getInvalidBOM(BufStr); @@ -537,15 +576,30 @@ FileID SourceManager::getNextFileID(FileID FID) const { /// being \#included from the specified IncludePosition. 
FileID SourceManager::createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, - SrcMgr::CharacteristicKind FileCharacter, + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter, int LoadedID, SourceLocation::UIntTy LoadedOffset) { SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + #ifndef NDEBUG + // Either the content cache has never been used for a FileID (and, if we are + // being asked to use a converter, there should be no valid buffer set up for + // it) or the conversion (or lack thereof) should be the same as that used + // previously. + auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo; + if (CacheUsedByFileID) + assert(CacheConverter == Converter); + else + assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); +#endif + IR.FileIDConverterInfo.setPointerAndInt(Converter, true); + // If this is a named pipe, immediately load the buffer to ensure subsequent // calls to ContentCache::getSize() are accurate. - if (IR.ContentsEntry->isNamedPipe()) + // Do the same if character-encoding conversion was requested. + if (IR.ContentsEntry->isNamedPipe() || Converter) (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation()); return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter, @@ -583,10 +637,12 @@ FileID SourceManager::createFileID(const llvm::MemoryBufferRef &Buffer, /// new FileID for the \p SourceFile. FileID SourceManager::getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter) { + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter) { FileID ID = translateFile(SourceFile); - return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(), - FileCharacter); + return ID.isValid() ? 
ID + : createFileID(SourceFile, SourceLocation(), + FileCharacter, Converter); } /// createFileID - Create a new FileID for the specified ContentCache and @@ -2340,8 +2396,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef FileName, std::make_unique<DiagnosticsEngine>(DiagnosticIDs::create(), *DiagOpts); SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr); FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName)); - FileID ID = - SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User); + FileID ID = SourceMgr->createFileID( + FE, SourceLocation(), clang::SrcMgr::C_User, /*Converter=*/nullptr); assert(ID.isValid()); SourceMgr->setMainFileID(ID); } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 8aee45b5dc644..008bdb5bdeb0d 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -34,6 +34,7 @@ #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/TextEncoding.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" @@ -912,15 +913,20 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary, // Initialization Utilities bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){ - return InitializeSourceManager(Input, getDiagnostics(), getFileManager(), - getSourceManager()); + // Retrieve the converter to the internal charset if it exists. + llvm::TextEncodingConverter *Converter = + hasPreprocessor() ? 
getPreprocessor().getTextEncoding().getConverter( + CA_FromInputEncoding) + : nullptr; + + return InitializeSourceManager(Input, Converter, getDiagnostics(), + getFileManager(), getSourceManager()); } // static -bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, - DiagnosticsEngine &Diags, - FileManager &FileMgr, - SourceManager &SourceMgr) { +bool CompilerInstance::InitializeSourceManager( + const FrontendInputFile &Input, llvm::TextEncodingConverter *Converter, + DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr) { SrcMgr::CharacteristicKind Kind = Input.getKind().getFormat() == InputKind::ModuleMap ? Input.isSystem() ? SrcMgr::C_System_ModuleMap @@ -950,7 +956,7 @@ bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, } SourceMgr.setMainFileID( - SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind)); + SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind, Converter)); assert(SourceMgr.getMainFileID().isValid() && "Couldn't establish MainFileID!"); diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp index 1bfe644b2525a..01e3b20e0c7cb 100644 --- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp +++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp @@ -610,8 +610,10 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, SourceManager &SM, } FileID FID = SM.translateFile(*File); + // FIXME: Figure out character-encoding converter treatment. 
if (FID.isInvalid()) - FID = SM.createFileID(*File, Pos, SrcMgr::C_User); + FID = SM.createFileID(*File, Pos, SrcMgr::C_User, + /*Converter=*/nullptr); if (PH.Next(Line) && Line > 0) ExpectedLoc = SM.translateLineCol(FID, Line, 1); diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 6c07386f89010..c7c1e04b76ea1 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1473,7 +1473,10 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, bool IsSystem, if (LocalFID.isInvalid()) { auto FileCharacter = IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap; - LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter); + // FIXME: Module map files are also textual "source files". For consistency, + // conversion should occur. + LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter, + /*Converter=*/nullptr); } ID = LocalFID; } diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index eb21a510dcf83..9f42ad12655e1 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2796,7 +2796,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // position on the file where it will be included and after the expansions. if (IncludePos.isMacroID()) IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd(); - FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter); + // Retrieve the converter to the internal charset if it exists. 
+ llvm::TextEncodingConverter *Converter = + getTextEncoding().getConverter(CA_FromInputEncoding); + + FileID FID = + SourceMgr.createFileID(*File, IncludePos, FileCharacter, Converter); if (!FID.isValid()) { TheModuleLoader.HadFatalFailure = true; return ImportAction::Failure; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 1e21b4a94cea3..c5e32468bcc7e 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -649,8 +649,9 @@ void Preprocessor::EnterMainSourceFile() { << PPOpts.PCHThroughHeader; return; } - setPCHThroughHeaderFileID( - SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User)); + // FIXME: Figure out character-encoding converter treatment. + setPCHThroughHeaderFileID(SourceMgr.createFileID( + *File, SourceLocation(), SrcMgr::C_User, /*Converter=*/nullptr)); } // Skip tokens from the Predefines and if needed the main file. diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp index 33e5436367014..7a0fea3798277 100644 --- a/clang/lib/Lex/TextEncoding.cpp +++ b/clang/lib/Lex/TextEncoding.cpp @@ -12,6 +12,8 @@ llvm::TextEncodingConverter * TextEncoding::getConverter(ConversionAction Action) const { switch (Action) { + case CA_FromInputEncoding: + return FromInputEncodingConverter.get(); default: return nullptr; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index f8a6a38bb9b5c..379622e92984b 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -2002,8 +2002,11 @@ bool ASTReader::ReadSLocEntry(int ID) { } SrcMgr::CharacteristicKind FileCharacter = (SrcMgr::CharacteristicKind)Record[2]; - FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID, - BaseOffset + Record[0]); + // Note: If conversion was originally necessary, OverriddenBuffer should be + // true and the associated handling will trigger. 
+ FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, + /*Converter=*/nullptr, ID, + BaseOffset + Record[0]); SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile(); FileInfo.NumCreatedFIDs = Record[5]; if (Record[3]) >From 569cd4065cdf76720c6636f9ff4af975f25f5d7d Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 11:18:31 -0400 Subject: [PATCH 03/11] Add getEncodingNameFromFileTag function --- llvm/include/llvm/Support/AutoConvert.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index d68b0e8b515e0..337befec1b352 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -105,6 +105,25 @@ inline ErrorOr<bool> needConversion(const Twine &FileName, const int FD = -1) { return false; } +inline ErrorOr<std::string> +getEncodingNameFromFileTag(const Twine &FileName, const int FD = -1) { +#ifdef __MVS__ + ErrorOr<__ccsid_t> TagOrErr = getzOSFileTag(FileName, FD); + if (!TagOrErr) + return TagOrErr.getError(); + + __ccsid_t Tag = *TagOrErr; + if (Tag == 0) + return std::string(); // Return empty string for no tag + + char Buffer[16]; + snprintf(Buffer, sizeof(Buffer), "%03d", Tag); + return std::string(Buffer); +#else + return std::string(); // Return empty string for non-MVS platforms +#endif +} + } /* namespace llvm */ #endif /* __cplusplus */ >From 97ec4133c8977d91cad6ff26bda05f5672fdc1a7 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 12:17:48 -0400 Subject: [PATCH 04/11] Add TextEncodingConverter cache --- llvm/include/llvm/Support/TextEncoding.h | 15 ++++++++ llvm/lib/Support/TextEncoding.cpp | 45 ++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..6ca37ce6e7a4e 100644 --- 
a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -21,8 +21,10 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorOr.h" +#include <memory> #include <string> #include <system_error> +#include <utility> namespace llvm { @@ -137,6 +139,19 @@ class TextEncodingConverter { } }; +/// Cache for TextEncodingConverter instances. +class TextEncodingConverterCache { + public: + /// Get or create a cached TextEncodingConverter. + /// If the converter exists in the cache, returns it. Otherwise, creates a new + /// converter, caches it, and returns it. + /// \param[in] SourceEncoding the source character encoding name + /// \param[in] TargetEncoding the target character encoding name + /// \return pointer to the converter or an error code + LLVM_ABI static ErrorOr<TextEncodingConverter *> + getOrCreateConverter(StringRef SourceEncoding, StringRef TargetEncoding); +}; + } // namespace llvm #endif diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..7797624ab278b 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -16,8 +16,11 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/ConvertEBCDIC.h" +#include "llvm/Support/ManagedStatic.h" #include <system_error> +#include <utility> #if HAVE_ICU #if HAVE_WINDOWS_ICU @@ -356,3 +359,45 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, return std::make_error_code(std::errc::invalid_argument); #endif } + +namespace { +// Global cache for TextEncodingConverter instances +// Use StringMap which is designed for string keys +using ConverterCache = StringMap<std::unique_ptr<TextEncodingConverter>>; + +static ManagedStatic<ConverterCache> GlobalConverterCache; +} // namespace + +ErrorOr<TextEncodingConverter *> 
+TextEncodingConverterCache::getOrCreateConverter(StringRef SourceEncoding, + StringRef TargetEncoding) { + // Don't create a converter if source and target are the same + if (SourceEncoding == TargetEncoding) + return nullptr; + + // Create cache key by concatenating source and target with a separator + SmallString<64> Key; + Key = SourceEncoding; + Key += " -> "; + Key += TargetEncoding; + + // Check if converter already exists + auto Iter = GlobalConverterCache->find(Key); + if (Iter != GlobalConverterCache->end()) { + return Iter->second.get(); + } + + // Create a new converter + ErrorOr<TextEncodingConverter> ErrorOrConverter = + TextEncodingConverter::create(SourceEncoding, TargetEncoding); + if (!ErrorOrConverter) { + return ErrorOrConverter.getError(); + } + + // Insert into cache and return pointer + auto NewConverter = + std::make_unique<TextEncodingConverter>(std::move(*ErrorOrConverter)); + TextEncodingConverter *Result = NewConverter.get(); + GlobalConverterCache->try_emplace(Key, std::move(NewConverter)); + return Result; +} >From 2de55673f253d89bba98bd99fbff8fcc5a166105 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 11:31:00 -0400 Subject: [PATCH 05/11] Get filetag and create converter --- clang/lib/Basic/SourceManager.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 950186866c86d..945bc3e4177e2 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -583,6 +583,20 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile, SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + llvm::ErrorOr<std::string> Ccsid = + llvm::getEncodingNameFromFileTag(SourceFile.getName()); + if (!Ccsid) { + Diag.Report(SourceLocation(), diag::err_cannot_open_file) + << SourceFile.getName() << Ccsid.getError().message(); + return FileID(); + } + if (!Ccsid->empty()) 
{ + llvm::ErrorOr<llvm::TextEncodingConverter *> FileTagConverter = + llvm::TextEncodingConverterCache::getOrCreateConverter(*Ccsid, "UTF-8"); + if (FileTagConverter) + Converter = *FileTagConverter; + } + #ifndef NDEBUG // Either the content cache has never been used for a FileID (and, if we are // being asked to use a converter, there should be no valid buffer set up for >From f46342c689c8a87fd0b12d38320a1ff149af3051 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 15:44:13 -0400 Subject: [PATCH 06/11] Disable autoconversion for tagged files --- clang/lib/Basic/SourceManager.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 945bc3e4177e2..7af5b48481a1a 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -122,7 +122,16 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // return paths. IsBufferInvalid = true; - auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile); + // If a converter is set, open the file in binary mode to get raw bytes + // and avoid platform-specific text-mode auto-conversion (e.g., EBCDIC->ASCII + // on z/OS). The explicit converter will handle all transformations. 
+ bool NeedsExplicitConversion = FileIDConverterInfo.getPointer() != nullptr; + bool IsText = !NeedsExplicitConversion; + + auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile, + /*RequiresNullTerminator=*/true, + /*MaybeLimit=*/std::nullopt, + IsText); // If we were unable to open the file, then we are in an inconsistent // situation where the content cache referenced a file which no longer >From 067516a4f6fa9f27aa7ce9b3b3fdb79605b385ca Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 16:23:33 -0400 Subject: [PATCH 07/11] Debug output from bob --- clang/lib/Basic/FileManager.cpp | 2 ++ clang/lib/Basic/SourceManager.cpp | 27 +++++++++++++++++++++++++- llvm/lib/Support/VirtualFileSystem.cpp | 6 ++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 8fb3ba0a27aad..50471b43ef96a 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -601,6 +601,8 @@ std::error_code FileManager::getStatValue(StringRef Path, // // Because of this, check to see if the file exists with 'open'. If the // open succeeds, use fstat to get the stat info. + llvm::errs() << "[DEBUG] FileManager: Opening file '" << Path << "' in " + << (IsText ? "TEXT" : "BINARY") << " mode\n"; auto OwnedFile = IsText ? FS->openFileForRead(Path) : FS->openFileForReadBinary(Path); diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 7af5b48481a1a..b91abc4c928eb 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -128,6 +128,11 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, bool NeedsExplicitConversion = FileIDConverterInfo.getPointer() != nullptr; bool IsText = !NeedsExplicitConversion; + llvm::errs() << "[DEBUG] ContentCache::getBufferOrNone: Opening file '" + << ContentsEntry->getName() << "' - " + << (IsText ? 
"TEXT mode (no converter)" : "BINARY mode (converter present)") + << "\n"; + auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile, /*RequiresNullTerminator=*/true, /*MaybeLimit=*/std::nullopt, @@ -161,11 +166,18 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // Convert source from the input charset to UTF-8 if necessary. llvm::TextEncodingConverter *Converter = FileIDConverterInfo.getPointer(); if (Converter) { + llvm::errs() << "[DEBUG] SourceManager: Using converter for file '" + << ContentsEntry->getName() << "'\n"; StringRef OriginalBuf = Buffer->getBuffer(); + llvm::errs() << "[DEBUG] SourceManager: Original buffer size: " + << OriginalBuf.size() << " bytes\n"; llvm::SmallString<0> UTF8Buf; UTF8Buf.reserve(OriginalBuf.size() + 1); std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf); + llvm::errs() << "[DEBUG] SourceManager: Conversion " + << (EC ? "FAILED" : "succeeded") + << ", UTF8 buffer size: " << UTF8Buf.size() << " bytes\n"; if (EC) { // If conversion fails, emit a warning and fall back to interpreting the // file as UTF-8 without conversion. 
@@ -600,10 +612,18 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile, return FileID(); } if (!Ccsid->empty()) { + llvm::errs() << "[DEBUG] SourceManager::createFileID: File '" << SourceFile.getName() + << "' has encoding tag: '" << *Ccsid << "'\n"; llvm::ErrorOr<llvm::TextEncodingConverter *> FileTagConverter = llvm::TextEncodingConverterCache::getOrCreateConverter(*Ccsid, "UTF-8"); - if (FileTagConverter) + if (FileTagConverter) { Converter = *FileTagConverter; + llvm::errs() << "[DEBUG] SourceManager::createFileID: Converter obtained for '" + << *Ccsid << "' -> UTF-8\n"; + } else { + llvm::errs() << "[DEBUG] SourceManager::createFileID: Failed to get converter: " + << FileTagConverter.getError().message() << "\n"; + } } #ifndef NDEBUG @@ -617,6 +637,11 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile, else assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); #endif + + if (Converter) { + llvm::errs() << "[DEBUG] SourceManager::createFileID: Setting converter for file '" + << SourceFile.getName() << "'\n"; + } IR.FileIDConverterInfo.setPointerAndInt(Converter, true); // If this is a named pipe, immediately load the buffer to ensure subsequent diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 42e8bb4f9958e..219fbd2097543 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -119,6 +119,8 @@ ErrorOr<std::unique_ptr<MemoryBuffer>> FileSystem::getBufferForFile(const llvm::Twine &Name, int64_t FileSize, bool RequiresNullTerminator, bool IsVolatile, bool IsText) { + llvm::errs() << "[DEBUG] FileSystem::getBufferForFile: Opening '" << Name + << "' in " << (IsText ? "TEXT" : "BINARY") << " mode\n"; auto F = IsText ? 
openFileForRead(Name) : openFileForReadBinary(Name); if (!F) return F.getError(); @@ -348,6 +350,8 @@ ErrorOr<Status> RealFileSystem::status(const Twine &Path) { ErrorOr<std::unique_ptr<File>> RealFileSystem::openFileForRead(const Twine &Name) { + llvm::errs() << "[DEBUG] RealFileSystem::openFileForRead: Opening '" + << Name << "' in TEXT mode\n"; auto BypassSandbox = sys::sandbox::scopedDisable(); return openFileForReadWithFlags(Name, sys::fs::OF_Text); @@ -355,6 +359,8 @@ RealFileSystem::openFileForRead(const Twine &Name) { ErrorOr<std::unique_ptr<File>> RealFileSystem::openFileForReadBinary(const Twine &Name) { + llvm::errs() << "[DEBUG] RealFileSystem::openFileForReadBinary: Opening '" + << Name << "' in BINARY mode\n"; auto BypassSandbox = sys::sandbox::scopedDisable(); return openFileForReadWithFlags(Name, sys::fs::OF_None); >From ed912a41a25ceab606c8ebbf7be8f93a0eb8213d Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 16:45:59 -0400 Subject: [PATCH 08/11] Changes from Bob --- llvm/lib/Support/TextEncoding.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index 7797624ab278b..f1801a9c9b69e 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -365,7 +365,12 @@ namespace { // Use StringMap which is designed for string keys using ConverterCache = StringMap<std::unique_ptr<TextEncodingConverter>>; -static ManagedStatic<ConverterCache> GlobalConverterCache; +struct ConverterCacheData { + ConverterCache Cache; + std::shared_mutex Mutex; +}; + +static ManagedStatic<ConverterCacheData> GlobalConverterCache; } // namespace ErrorOr<TextEncodingConverter *> @@ -381,23 +386,32 @@ TextEncodingConverterCache::getOrCreateConverter(StringRef SourceEncoding, Key += " -> "; Key += TargetEncoding; - // Check if converter already exists - auto Iter = GlobalConverterCache->find(Key); - 
if (Iter != GlobalConverterCache->end()) { - return Iter->second.get(); + // First, try to find existing converter with shared lock (allows concurrent reads) + { + std::shared_lock<std::shared_mutex> ReadLock(GlobalConverterCache->Mutex); + auto Iter = GlobalConverterCache->Cache.find(Key); + if (Iter != GlobalConverterCache->Cache.end()) + return Iter->second.get(); } + // Not found, need to create - acquire unique lock for writing + std::unique_lock<std::shared_mutex> WriteLock(GlobalConverterCache->Mutex); + + // Double-check: another thread might have created it while we were waiting + auto Iter = GlobalConverterCache->Cache.find(Key); + if (Iter != GlobalConverterCache->Cache.end()) + return Iter->second.get(); + // Create a new converter ErrorOr<TextEncodingConverter> ErrorOrConverter = TextEncodingConverter::create(SourceEncoding, TargetEncoding); - if (!ErrorOrConverter) { + if (!ErrorOrConverter) return ErrorOrConverter.getError(); - } // Insert into cache and return pointer auto NewConverter = std::make_unique<TextEncodingConverter>(std::move(*ErrorOrConverter)); TextEncodingConverter *Result = NewConverter.get(); - GlobalConverterCache->try_emplace(Key, std::move(NewConverter)); + GlobalConverterCache->Cache.try_emplace(Key, std::move(NewConverter)); return Result; } >From c2a8d9e90746baf5ff2c038f8e8cb41c33fa55d7 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Thu, 18 Jun 2026 23:23:52 -0400 Subject: [PATCH 09/11] Changes from bob --- clang/lib/Basic/FileManager.cpp | 16 +++++++++++----- llvm/include/llvm/Support/VirtualFileSystem.h | 5 +++++ llvm/lib/Support/VirtualFileSystem.cpp | 10 +++++++--- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 50471b43ef96a..3e12a123ea73d 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -539,15 +539,21 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, FileSize = 
-1; StringRef Filename = FE.getName(); - // If the file is already open, use the open file descriptor. + // If the file is already open, check if the mode matches. if (Entry->File) { - auto Result = Entry->File->getBuffer(Filename, FileSize, - RequiresNullTerminator, isVolatile); + // Check if the cached file's mode matches the requested mode + if (Entry->File->isText() == IsText) { + // Mode matches, use the cached file descriptor + auto Result = Entry->File->getBuffer(Filename, FileSize, + RequiresNullTerminator, isVolatile); + Entry->closeFile(); + return Result; + } + // Mode mismatch - close the cached file and reopen with correct mode Entry->closeFile(); - return Result; } - // Otherwise, open the file. + // Open the file with the requested mode. return getBufferForFileImpl(Filename, FileSize, isVolatile, RequiresNullTerminator, IsText); } diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index d22c534228331..0622f5916fdda 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -137,6 +137,11 @@ class LLVM_ABI File { /// Closes the file. virtual std::error_code close() = 0; + /// Returns true if this file was opened in text mode (with potential + /// encoding conversions), false if opened in binary mode. + /// Default implementation returns true for backward compatibility. + virtual bool isText() const { return true; } + // Get the same file with a different path. 
static ErrorOr<std::unique_ptr<File>> getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P); diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 219fbd2097543..87ffa6be7408c 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -196,11 +196,13 @@ class RealFile : public File { file_t FD; Status S; std::string RealName; + bool IsTextMode; - RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName) + RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName, + bool IsText) : FD(RawFD), S(NewName, {}, {}, {}, {}, {}, llvm::sys::fs::file_type::status_error, {}), - RealName(NewRealPathName.str()) { + RealName(NewRealPathName.str()), IsTextMode(IsText) { assert(FD != kInvalidFile && "Invalid or inactive file descriptor"); } @@ -215,6 +217,7 @@ class RealFile : public File { bool IsVolatile) override; std::error_code close() override; void setPath(const Twine &Path) override; + bool isText() const override { return IsTextMode; } }; } // namespace @@ -322,8 +325,9 @@ class RealFileSystem : public FileSystem { adjustPath(Name, Storage), Flags, &RealName); if (!FDOrErr) return errorToErrorCode(FDOrErr.takeError()); + bool IsText = (Flags & sys::fs::OF_Text) != sys::fs::OF_None; return std::unique_ptr<File>( - new RealFile(*FDOrErr, Name.str(), RealName.str())); + new RealFile(*FDOrErr, Name.str(), RealName.str(), IsText)); } struct WorkingDirectory { >From cb3ab2d74b085cae34de1e0df65f030b68d093b4 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Fri, 19 Jun 2026 00:01:30 -0400 Subject: [PATCH 10/11] Fix bob code --- llvm/lib/Support/TextEncoding.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index f1801a9c9b69e..7ec3ed037dbc7 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -19,6 +19,7 @@ 
#include "llvm/ADT/StringMap.h" #include "llvm/Support/ConvertEBCDIC.h" #include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/RWMutex.h" #include <system_error> #include <utility> @@ -367,7 +368,7 @@ using ConverterCache = StringMap<std::unique_ptr<TextEncodingConverter>>; struct ConverterCacheData { ConverterCache Cache; - std::shared_mutex Mutex; + llvm::sys::RWMutex Mutex; }; static ManagedStatic<ConverterCacheData> GlobalConverterCache; @@ -388,14 +389,14 @@ TextEncodingConverterCache::getOrCreateConverter(StringRef SourceEncoding, // First, try to find existing converter with shared lock (allows concurrent reads) { - std::shared_lock<std::shared_mutex> ReadLock(GlobalConverterCache->Mutex); + llvm::sys::ScopedReader ReadLock(GlobalConverterCache->Mutex); auto Iter = GlobalConverterCache->Cache.find(Key); if (Iter != GlobalConverterCache->Cache.end()) return Iter->second.get(); } // Not found, need to create - acquire unique lock for writing - std::unique_lock<std::shared_mutex> WriteLock(GlobalConverterCache->Mutex); + llvm::sys::ScopedWriter WriteLock(GlobalConverterCache->Mutex); // Double-check: another thread might have created it while we were waiting auto Iter = GlobalConverterCache->Cache.find(Key); >From 812cf2b40cd3344f65e8ebdd9b95db55a93ade99 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Fri, 19 Jun 2026 15:22:00 -0400 Subject: [PATCH 11/11] update file mismatch checking from bob --- clang/lib/Basic/FileManager.cpp | 3 ++- llvm/include/llvm/Support/VirtualFileSystem.h | 7 +++++++ llvm/lib/Support/VirtualFileSystem.cpp | 3 +++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 3e12a123ea73d..ba5f56e1e7dca 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -542,7 +542,8 @@ FileManager::getBufferForFile(FileEntryRef FE, bool isVolatile, // If the file is already open, check if the mode matches. 
if (Entry->File) { // Check if the cached file's mode matches the requested mode - if (Entry->File->isText() == IsText) { + // Only perform mismatch recovery for real files + if (!Entry->File->realFileTextMismatch(IsText)) { // Mode matches, use the cached file descriptor auto Result = Entry->File->getBuffer(Filename, FileSize, RequiresNullTerminator, isVolatile); diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index 0622f5916fdda..a3ef38fe552a7 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -142,6 +142,13 @@ class LLVM_ABI File { /// Default implementation returns true for backward compatibility. virtual bool isText() const { return true; } + /// Returns true if this is a real file and the requested text mode differs + /// from the current mode. Always returns false for non-real files. + /// Default implementation returns false for non-real files. + virtual bool realFileTextMismatch(bool RequestedIsText) const { + return false; + } + // Get the same file with a different path. static ErrorOr<std::unique_ptr<File>> getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P); diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index 87ffa6be7408c..d56a552eafd7a 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -218,6 +218,9 @@ class RealFile : public File { std::error_code close() override; void setPath(const Twine &Path) override; bool isText() const override { return IsTextMode; } + bool realFileTextMismatch(bool RequestedIsText) const override { + return IsTextMode != RequestedIsText; + } }; } // namespace _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
