https://github.com/azhan92 created https://github.com/llvm/llvm-project/pull/204233
None >From 72b7bfae7524dbb6a2261de2703217d1c4e565c9 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 16 Jun 2026 15:01:43 -0400 Subject: [PATCH 1/4] Changes from fexec-charset PR --- clang/include/clang/Lex/Preprocessor.h | 4 ++++ clang/include/clang/Lex/TextEncodingConfig.h | 23 ++++++++++++++++++++ clang/lib/Frontend/CompilerInstance.cpp | 1 + clang/lib/Lex/CMakeLists.txt | 1 + clang/lib/Lex/TextEncodingConfig.cpp | 20 +++++++++++++++++ 5 files changed, 49 insertions(+) create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8b684e85eb1c1..5f8d5caaafcac 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -30,6 +30,7 @@ #include "clang/Lex/ModuleMap.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/PPEmbedParameters.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Lex/Token.h" #include "clang/Lex/TokenLexer.h" #include "clang/Support/Compiler.h" @@ -198,6 +199,7 @@ class Preprocessor { std::unique_ptr<ScratchBuffer> ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; + TextEncodingConfig TEC; /// External source of macros. 
ExternalPreprocessorSource *ExternalSource; @@ -1265,6 +1267,8 @@ class Preprocessor { Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + TextEncodingConfig &getTextEncodingConfig() { return TEC; } + void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; } diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h new file mode 100644 index 0000000000000..a810b9ab61b1a --- /dev/null +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -0,0 +1,23 @@ +//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H +#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H + +#include "clang/Basic/LangOptions.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/TextEncoding.h" + +enum ConversionAction { CA_NoConversion }; + +class TextEncodingConfig { +public: + llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; +}; + +#endif diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 9e88abbece7f2..09607e6de8ce7 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -34,6 +34,7 @@ #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..106a5d3b126be 100644 --- 
a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangLex Preprocessor.cpp PreprocessorLexer.cpp ScratchBuffer.cpp + TextEncodingConfig.cpp TokenConcatenation.cpp TokenLexer.cpp diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp new file mode 100644 index 0000000000000..bb3f0d4b4abec --- /dev/null +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -0,0 +1,20 @@ +//===--- TextEncodingConfig.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/TextEncodingConfig.h" +#include "clang/Basic/DiagnosticDriver.h" + +using namespace llvm; + +llvm::TextEncodingConverter * +TextEncodingConfig::getConverter(ConversionAction Action) const { + switch (Action) { + default: + return nullptr; + } +} >From 660b75df3e808eb772982ad7cd14dd81855ac840 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 16 Jun 2026 15:34:36 -0400 Subject: [PATCH 2/4] Changes from finput-charset PR --- .../clang/Basic/DiagnosticCommonKinds.td | 3 + clang/include/clang/Basic/SourceManager.h | 10 +- .../include/clang/Frontend/CompilerInstance.h | 1 + clang/include/clang/Lex/TextEncodingConfig.h | 4 +- clang/lib/Basic/SourceManager.cpp | 108 ++++++++++++++---- clang/lib/Frontend/CompilerInstance.cpp | 19 +-- .../lib/Frontend/VerifyDiagnosticConsumer.cpp | 4 +- clang/lib/Lex/ModuleMap.cpp | 5 +- clang/lib/Lex/PPDirectives.cpp | 7 +- clang/lib/Lex/Preprocessor.cpp | 5 +- clang/lib/Lex/TextEncodingConfig.cpp | 2 + clang/lib/Serialization/ASTReader.cpp | 7 +- 12 files changed, 136 insertions(+), 39 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td 
b/clang/include/clang/Basic/DiagnosticCommonKinds.td index f2ed2f4698b8d..8ebac3908b465 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -417,6 +417,9 @@ def note_file_sloc_usage : Note< "%plural{0:|: plus %2B (%human2B) for macro expansions}2">; def note_file_misc_sloc_usage : Note< "%0 additional files entered using a total of %1B (%human1B) of space">; +def warn_charset_conversion_failed : Warning< + "conversion from source encoding failed for '%0': %1; interpreting as IBM-1047">, + InGroup<DiagGroup<"charset-conversion-failed">>; // Modules def err_module_format_unhandled : Error< diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index 4217b8683da1e..f7d91d612e4ab 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -50,6 +50,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TextEncoding.h" #include <cassert> #include <cstddef> #include <map> @@ -156,6 +157,11 @@ class alignas(8) ContentCache { /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name. StringRef Filename; + /// Information on whether this is associated with a FileID for a file (as + /// opposed to a buffer) and, if so, what conversion (if any) was requested. + llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool> + FileIDConverterInfo; + /// A bump pointer allocated array of offsets for each source line. /// /// This is lazily computed. The lines are owned by the SourceManager @@ -918,6 +924,7 @@ class SourceManager : public RefCountedBase<SourceManager> { /// being \#included from the specified IncludePosition. 
FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter = nullptr, int LoadedID = 0, SourceLocation::UIntTy LoadedOffset = 0); @@ -942,7 +949,8 @@ class SourceManager : public RefCountedBase<SourceManager> { /// Get the FileID for \p SourceFile if it exists. Otherwise, create a /// new FileID for the \p SourceFile. FileID getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter); + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter = nullptr); /// Creates an expansion SLocEntry for the substitution of an argument into a /// function-like macro's body. Returns the start of the expansion. diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index bb0eddb918623..89a0d066afd6f 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader { /// /// \return True on success. 
static bool InitializeSourceManager(const FrontendInputFile &Input, + llvm::TextEncodingConverter *, DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr); diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h index a810b9ab61b1a..30e0fcf2ac919 100644 --- a/clang/include/clang/Lex/TextEncodingConfig.h +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -13,9 +13,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/TextEncoding.h" -enum ConversionAction { CA_NoConversion }; +enum ConversionAction { CA_NoConversion, CA_FromInputEncoding }; class TextEncodingConfig { +std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter; + public: llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; }; diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b6cc6ec9365f5..c33ee69962864 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -16,6 +16,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManagerInternals.h" +#include "clang/Lex/TextEncodingConfig.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" @@ -31,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -136,7 +138,57 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, Buffer = std::move(*BufferOrError); - // Check that the file's size fits in an 'unsigned' (with room for a + // Unless this is a named pipe (in which case we can handle a mismatch), + // check that the file's size is the same as in the file entry (which may + // have come from a stat cache). 
+ // The buffer will always be larger than the file size on z/OS in the presence + // of characters outside the base character set. + assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { + Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); + + return std::nullopt; + } + + // Convert source from the input charset to UTF-8 if necessary. + if (Converter) { + StringRef OriginalBuf = Buffer->getBuffer(); + + llvm::SmallString<0> UTF8Buf; + UTF8Buf.reserve(OriginalBuf.size() + 1); + + std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf); + if (EC) { + Diag.Report(Loc, diag::warn_charset_conversion_failed) + << ContentsEntry->getName() << EC.message(); +#ifdef __MVS__ + // On z/OS, if conversion fails, try converting from IBM-1047 to UTF-8 + std::unique_ptr<llvm::TextEncodingConverter> FallbackConverter = + TextEncodingConfig::createInputConverterFromFiletag(1047, Diag); + + if (FallbackConverter) { + // Try converting with IBM-1047 converter + UTF8Buf.clear(); + UTF8Buf.reserve(OriginalBuf.size() + 1); + EC = FallbackConverter->convert(OriginalBuf, UTF8Buf); + + if (!EC) { + auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( + std::move(UTF8Buf), Buffer->getBufferIdentifier()); + Buffer = std::move(NewBuf); + } else { + // TODO: Reclaim memory if the buffer size exceeds the content. + auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( + std::move(UTF8Buf), Buffer->getBufferIdentifier()); + Buffer = std::move(NewBuf); + } + } +#endif + } + } + + // Check that the buffer's size fits in an 'unsigned' (with room for a // past-the-end value). This is deeply regrettable, but various parts of // Clang (including elsewhere in this file!) 
use 'unsigned' to represent file // offsets, line numbers, string literal lengths, and so on, and fail @@ -151,22 +203,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, return std::nullopt; } - // Unless this is a named pipe (in which case we can handle a mismatch), - // check that the file's size is the same as in the file entry (which may - // have come from a stat cache). - // The buffer will always be larger than the file size on z/OS in the presence - // of characters outside the base character set. - assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); - if (!ContentsEntry->isNamedPipe() && - Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { - Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); - - return std::nullopt; - } - - // If the buffer is valid, check to see if it has a UTF Byte Order Mark - // (BOM). We only support UTF-8 with and without a BOM right now. See - // http://en.wikipedia.org/wiki/Byte_order_mark for more information. + // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM) + // Note that any conversion requested using `-finput-charset` (if successful) + // has already occurred, so we are expecting UTF-8 with or without a BOM. + // + // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate + // conversion was not supplied via `-finput-charset` and we could try to + // convert based on the BOM. + // + // See http://en.wikipedia.org/wiki/Byte_order_mark for more information. StringRef BufStr = Buffer->getBuffer(); const char *InvalidBOM = getInvalidBOM(BufStr); @@ -537,15 +582,30 @@ FileID SourceManager::getNextFileID(FileID FID) const { /// being \#included from the specified IncludePosition. 
FileID SourceManager::createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter, int LoadedID, SourceLocation::UIntTy LoadedOffset) { SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + #ifndef NDEBUG + // Either the content cache has never been used for a FileID (and, if we are + // being asked to use a converter, there should be no valid buffer set up for + // it) or the conversion (or lack thereof) should be the same as that used + // previously. + auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo; + if (CacheUsedByFileID) + assert(CacheConverter == Converter); + else + assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); +#endif + IR.FileIDConverterInfo.setPointerAndInt(Converter, true); + // If this is a named pipe, immediately load the buffer to ensure subsequent // calls to ContentCache::getSize() are accurate. - if (IR.ContentsEntry->isNamedPipe()) + // Do the same if character-encoding conversion was requested. + if (IR.ContentsEntry->isNamedPipe() || Converter) (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation()); return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter, @@ -583,10 +643,12 @@ FileID SourceManager::createFileID(const llvm::MemoryBufferRef &Buffer, /// new FileID for the \p SourceFile. FileID SourceManager::getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter) { + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter) { FileID ID = translateFile(SourceFile); - return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(), - FileCharacter); + return ID.isValid() ? 
ID + : createFileID(SourceFile, SourceLocation(), + FileCharacter, Converter); } /// createFileID - Create a new FileID for the specified ContentCache and @@ -2340,8 +2402,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef FileName, std::make_unique<DiagnosticsEngine>(DiagnosticIDs::create(), *DiagOpts); SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr); FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName)); - FileID ID = - SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User); + FileID ID = SourceMgr->createFileID( + FE, SourceLocation(), clang::SrcMgr::C_User, /*Converter=*/nullptr); assert(ID.isValid()); SourceMgr->setMainFileID(ID); } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 09607e6de8ce7..e94d05ba33fbb 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -907,15 +907,20 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary, // Initialization Utilities bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){ - return InitializeSourceManager(Input, getDiagnostics(), getFileManager(), - getSourceManager()); + // Retrieve the converter to the internal charset if it exists. + llvm::TextEncodingConverter *Converter = + hasPreprocessor() ? 
getPreprocessor().getTextEncodingConfig().getConverter( + CA_FromInputEncoding) + : nullptr; + + return InitializeSourceManager(Input, Converter, getDiagnostics(), + getFileManager(), getSourceManager()); } // static -bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, - DiagnosticsEngine &Diags, - FileManager &FileMgr, - SourceManager &SourceMgr) { +bool CompilerInstance::InitializeSourceManager( + const FrontendInputFile &Input, llvm::TextEncodingConverter *Converter, + DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr) { SrcMgr::CharacteristicKind Kind = Input.getKind().getFormat() == InputKind::ModuleMap ? Input.isSystem() ? SrcMgr::C_System_ModuleMap @@ -923,7 +928,7 @@ bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, : Input.isSystem() ? SrcMgr::C_System : SrcMgr::C_User; if (Input.isBuffer()) { - SourceMgr.setMainFileID(SourceMgr.createFileID(Input.getBuffer(), Kind)); + SourceMgr.setMainFileID(SourceMgr.createFileID(Input.getBuffer(), Kind, Converter)); assert(SourceMgr.getMainFileID().isValid() && "Couldn't establish MainFileID!"); return true; diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp index 1bfe644b2525a..01e3b20e0c7cb 100644 --- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp +++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp @@ -610,8 +610,10 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, SourceManager &SM, } FileID FID = SM.translateFile(*File); + // FIXME: Figure out character-encoding converter treatment. 
if (FID.isInvalid()) - FID = SM.createFileID(*File, Pos, SrcMgr::C_User); + FID = SM.createFileID(*File, Pos, SrcMgr::C_User, + /*Converter=*/nullptr); if (PH.Next(Line) && Line > 0) ExpectedLoc = SM.translateLineCol(FID, Line, 1); diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 6c07386f89010..c7c1e04b76ea1 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1473,7 +1473,10 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, bool IsSystem, if (LocalFID.isInvalid()) { auto FileCharacter = IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap; - LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter); + // FIXME: Module map files are also textual "source files". For consistency, + // conversion should occur. + LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter, + /*Converter=*/nullptr); } ID = LocalFID; } diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index eb21a510dcf83..2e095fce02c0f 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2796,7 +2796,12 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // position on the file where it will be included and after the expansions. if (IncludePos.isMacroID()) IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd(); - FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter); + // Retrieve the converter to the internal charset if it exists. 
+ llvm::TextEncodingConverter *Converter = + getTextEncodingConfig().getConverter(CA_FromInputEncoding); + + FileID FID = + SourceMgr.createFileID(*File, IncludePos, FileCharacter, Converter); if (!FID.isValid()) { TheModuleLoader.HadFatalFailure = true; return ImportAction::Failure; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 1e21b4a94cea3..c5e32468bcc7e 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -649,8 +649,9 @@ void Preprocessor::EnterMainSourceFile() { << PPOpts.PCHThroughHeader; return; } - setPCHThroughHeaderFileID( - SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User)); + // FIXME: Figure out character-encoding converter treatment. + setPCHThroughHeaderFileID(SourceMgr.createFileID( + *File, SourceLocation(), SrcMgr::C_User, /*Converter=*/nullptr)); } // Skip tokens from the Predefines and if needed the main file. diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp index bb3f0d4b4abec..ed426c304423b 100644 --- a/clang/lib/Lex/TextEncodingConfig.cpp +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -14,6 +14,8 @@ using namespace llvm; llvm::TextEncodingConverter * TextEncodingConfig::getConverter(ConversionAction Action) const { switch (Action) { + case CA_FromInputEncoding: + return FromInputEncodingConverter.get(); default: return nullptr; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 74a7b51368c28..7bc28a2791067 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -2002,8 +2002,11 @@ bool ASTReader::ReadSLocEntry(int ID) { } SrcMgr::CharacteristicKind FileCharacter = (SrcMgr::CharacteristicKind)Record[2]; - FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID, - BaseOffset + Record[0]); + // Note: If conversion was originally necessary, OverriddenBuffer should be + // true and the associated handling will trigger. 
+ FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, + /*Converter=*/nullptr, ID, + BaseOffset + Record[0]); SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile(); FileInfo.NumCreatedFIDs = Record[5]; if (Record[3]) >From 51adf4f5b68b461d667507bb1e568d87cab8e23e Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 16 Jun 2026 15:34:51 -0400 Subject: [PATCH 3/4] Add wrapper function for getzOSFileTag --- llvm/include/llvm/Support/AutoConvert.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index d68b0e8b515e0..9586d41e10c2f 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -105,6 +105,23 @@ inline ErrorOr<bool> needConversion(const Twine &FileName, const int FD = -1) { return false; } +#ifdef __MVS__ +inline ErrorOr<__ccsid_t> getFileTag(const Twine &FileName, const int FD = -1) { + ErrorOr<__ccsid_t> Ccsid = getzOSFileTag(FileName, FD); + if (!Ccsid) + return Ccsid; + // Assume untagged files to be IBM-1047 encoded + if (*Ccsid == 0) + return CCSID_IBM_1047; + return Ccsid; +} +#else +inline ErrorOr<int> getFileTag(const Twine &FileName, const int FD = -1) { + // On non-z/OS platforms, return 0 to indicate no file tag + return 0; +} +#endif + } /* namespace llvm */ #endif /* __cplusplus */ >From b8b8c2945182007ee9f02ef74358acb4b40ea0f3 Mon Sep 17 00:00:00 2001 From: alisonzhang <[email protected]> Date: Tue, 16 Jun 2026 15:52:52 -0400 Subject: [PATCH 4/4] Changes from Bob --- clang/include/clang/Lex/TextEncodingConfig.h | 8 +++++ clang/lib/Basic/SourceManager.cpp | 7 +++- clang/lib/Lex/TextEncodingConfig.cpp | 38 ++++++++++++++++++++ llvm/include/llvm/Support/TextEncoding.h | 22 ++++++++++++ llvm/lib/Support/TextEncoding.cpp | 36 +++++++++++++++++++ 5 files changed, 110 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Lex/TextEncodingConfig.h 
b/clang/include/clang/Lex/TextEncodingConfig.h index 30e0fcf2ac919..e4f6595997888 100644 --- a/clang/include/clang/Lex/TextEncodingConfig.h +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -17,9 +17,17 @@ enum ConversionAction { CA_NoConversion, CA_FromInputEncoding }; class TextEncodingConfig { std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter; +llvm::StringMap<std::unique_ptr<llvm::TextEncodingConverter>> FromFiletagEncodingConverters; public: llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; + static llvm::TextEncodingConverter * + getFromFiletagEncodingConverter(TextEncodingConfig &TEC, + llvm::StringRef FiletagEncoding); + static llvm::TextEncodingConverter * + createAndInsertFromFiletagEncodingConverter(TextEncodingConfig &TEC, + llvm::StringRef FiletagEncoding, + clang::DiagnosticsEngine &Diag); }; #endif diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index c33ee69962864..25a7023b7db85 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -122,7 +122,10 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // return paths. 
IsBufferInvalid = true; - auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile); + auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile, + /*RequiresNullTerminator=*/true, + /*MaybeLimit=*/std::nullopt, + /*IsText=*/false); // If we were unable to open the file, then we are in an inconsistent // situation where the content cache referenced a file which no longer @@ -589,6 +592,8 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile, SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + llvm::ErrorOr<int> Ccsid = llvm::getFileTag(SourceFile.getName()); + if (Ccsid && *Ccsid > 0) { #ifndef NDEBUG // Either the content cache has never been used for a FileID (and, if we are // being asked to use a converter, there should be no valid buffer set up for diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp index ed426c304423b..01c9bfcf607db 100644 --- a/clang/lib/Lex/TextEncodingConfig.cpp +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -20,3 +20,41 @@ TextEncodingConfig::getConverter(ConversionAction Action) const { return nullptr; } } + +llvm::TextEncodingConverter * +TextEncodingConfig::getFromFiletagEncodingConverter(TextEncodingConfig &TEC, + StringRef FiletagEncoding) { + auto Iter = TEC.FromFiletagEncodingConverters.find(FiletagEncoding); + if (Iter != TEC.FromFiletagEncodingConverters.end()) { + return Iter->second.get(); + } + return nullptr; +} + +llvm::TextEncodingConverter * +TextEncodingConfig::createAndInsertFromFiletagEncodingConverter(TextEncodingConfig &TEC, + StringRef FiletagEncoding, + clang::DiagnosticsEngine &Diag) { + llvm::TextEncodingConverter *Converter = getFromFiletagEncodingConverter(TEC, FiletagEncoding); + if (Converter) + return Converter; + + const char *UTF8 = "UTF-8"; + // Create a converter between the input and internal encodings + if (FiletagEncoding != UTF8) { + ErrorOr<TextEncodingConverter> ErrorOrConverter = 
llvm::TextEncodingConverter::create(FiletagEncoding, UTF8); + if (!ErrorOrConverter) { + Diag.Report(clang::diag::err_drv_invalid_value) + << "Filetag encoding" << FiletagEncoding; + return nullptr; + } else { + auto FromFiletagEncodingConverter = std::make_unique<llvm::TextEncodingConverter>( + std::move(*ErrorOrConverter)); + llvm::TextEncodingConverter *Result = FromFiletagEncodingConverter.get(); + TEC.FromFiletagEncodingConverters.insert_or_assign(FiletagEncoding, std::move(FromFiletagEncodingConverter)); + return Result; + } + } + return nullptr; +} diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..24a2e026ebc82 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -16,11 +16,13 @@ #define LLVM_SUPPORT_TEXT_ENCODING_H #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Config/config.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorOr.h" +#include <memory> #include <string> #include <system_error> @@ -137,6 +139,26 @@ class TextEncodingConverter { } }; +/// Utility class to manage a cache of TextEncodingConverter instances. +/// This is useful when you need to convert from multiple source encodings +/// to a common target encoding (e.g., UTF-8). +class TextEncodingConverterCache { + StringMap<std::unique_ptr<TextEncodingConverter>> Converters; + +public: + /// Get a converter from the cache, or nullptr if not found. + /// \param[in] SourceEncoding the source character encoding name + /// \return pointer to the converter or nullptr + TextEncodingConverter *getConverter(StringRef SourceEncoding) const; + + /// Create and insert a converter into the cache. 
+ /// \param[in] SourceEncoding the source character encoding name + /// \param[in] TargetEncoding the target character encoding name + /// \return pointer to the converter or nullptr on error + TextEncodingConverter *createAndInsertConverter(StringRef SourceEncoding, + StringRef TargetEncoding); +}; + } // namespace llvm #endif diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..2a1d19935699e 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -356,3 +356,39 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, return std::make_error_code(std::errc::invalid_argument); #endif } + +TextEncodingConverter * +TextEncodingConverterCache::getConverter(StringRef SourceEncoding) const { + auto Iter = Converters.find(SourceEncoding); + if (Iter != Converters.end()) { + return Iter->second.get(); + } + return nullptr; +} + +TextEncodingConverter * +TextEncodingConverterCache::createAndInsertConverter(StringRef SourceEncoding, + StringRef TargetEncoding) { + // Check if converter already exists + TextEncodingConverter *Converter = getConverter(SourceEncoding); + if (Converter) + return Converter; + + // Don't create a converter if source and target are the same + if (SourceEncoding == TargetEncoding) + return nullptr; + + // Create a new converter + ErrorOr<TextEncodingConverter> ErrorOrConverter = + TextEncodingConverter::create(SourceEncoding, TargetEncoding); + if (!ErrorOrConverter) { + return nullptr; + } + + // Insert into cache and return pointer + auto NewConverter = + std::make_unique<TextEncodingConverter>(std::move(*ErrorOrConverter)); + TextEncodingConverter *Result = NewConverter.get(); + Converters.insert_or_assign(SourceEncoding, std::move(NewConverter)); + return Result; +} _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
