https://github.com/azhan92 updated https://github.com/llvm/llvm-project/pull/201187
>From 06f309fbf5b9e3361c355fb9d5f801c6f61cb934 Mon Sep 17 00:00:00 2001 From: Alison Zhang <[email protected]> Date: Tue, 2 Jun 2026 15:08:29 -0400 Subject: [PATCH 1/3] Updates from -fexec-charset PR --- clang/include/clang/Lex/Preprocessor.h | 6 ++++- clang/include/clang/Lex/TextEncodingConfig.h | 23 ++++++++++++++++++++ clang/lib/Frontend/CompilerInstance.cpp | 1 + clang/lib/Lex/CMakeLists.txt | 1 + clang/lib/Lex/TextEncodingConfig.cpp | 20 +++++++++++++++++ 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8b684e85eb1c1..d3d32130f6e3b 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -30,6 +30,7 @@ #include "clang/Lex/ModuleMap.h" #include "clang/Lex/PPCallbacks.h" #include "clang/Lex/PPEmbedParameters.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Lex/Token.h" #include "clang/Lex/TokenLexer.h" #include "clang/Support/Compiler.h" @@ -198,7 +199,8 @@ class Preprocessor { std::unique_ptr<ScratchBuffer> ScratchBuf; HeaderSearch &HeaderInfo; ModuleLoader &TheModuleLoader; - + TextEncodingConfig TEC; + /// External source of macros. ExternalPreprocessorSource *ExternalSource; @@ -1265,6 +1267,8 @@ class Preprocessor { Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; } llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; } + TextEncodingConfig &getTextEncodingConfig() { return TEC; } + void setExternalSource(ExternalPreprocessorSource *Source) { ExternalSource = Source; } diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h new file mode 100644 index 0000000000000..a810b9ab61b1a --- /dev/null +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -0,0 +1,23 @@ +//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H +#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H + +#include "clang/Basic/LangOptions.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/TextEncoding.h" + +enum ConversionAction { CA_NoConversion }; + +class TextEncodingConfig { +public: + llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; +}; + +#endif diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 9e88abbece7f2..09607e6de8ce7 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -34,6 +34,7 @@ #include "clang/Lex/HeaderSearch.h" #include "clang/Lex/Preprocessor.h" #include "clang/Lex/PreprocessorOptions.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Sema/CodeCompleteConsumer.h" #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Sema.h" diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt index f61737cd68021..106a5d3b126be 100644 --- a/clang/lib/Lex/CMakeLists.txt +++ b/clang/lib/Lex/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangLex Preprocessor.cpp PreprocessorLexer.cpp ScratchBuffer.cpp + TextEncodingConfig.cpp TokenConcatenation.cpp TokenLexer.cpp diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp new file mode 100644 index 0000000000000..bb3f0d4b4abec --- /dev/null +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -0,0 +1,20 @@ +//===--- TextEncodingConfig.cpp -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Lex/TextEncodingConfig.h" +#include "clang/Basic/DiagnosticDriver.h" + +using namespace llvm; + +llvm::TextEncodingConverter * +TextEncodingConfig::getConverter(ConversionAction Action) const { + switch (Action) { + default: + return nullptr; + } +} >From a3b003af2fc9104896f3cd98d1e47982945669b5 Mon Sep 17 00:00:00 2001 From: Alison Zhang <[email protected]> Date: Tue, 2 Jun 2026 15:46:40 -0400 Subject: [PATCH 2/3] Filetag support --- .../clang/Basic/DiagnosticCommonKinds.td | 3 + clang/include/clang/Basic/SourceManager.h | 18 ++- .../include/clang/Frontend/CompilerInstance.h | 1 + clang/include/clang/Lex/TextEncodingConfig.h | 17 ++- clang/lib/Basic/SourceManager.cpp | 115 ++++++++++++++---- clang/lib/Frontend/CompilerInstance.cpp | 45 +++++-- .../lib/Frontend/VerifyDiagnosticConsumer.cpp | 3 +- clang/lib/Lex/ModuleMap.cpp | 5 +- clang/lib/Lex/PPDirectives.cpp | 8 +- clang/lib/Lex/Preprocessor.cpp | 5 +- clang/lib/Lex/TextEncodingConfig.cpp | 38 ++++++ clang/lib/Serialization/ASTReader.cpp | 7 +- llvm/include/llvm/Support/TextEncoding.h | 4 + llvm/lib/Support/TextEncoding.cpp | 3 +- 14 files changed, 230 insertions(+), 42 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index f2ed2f4698b8d..e4e8e079a31f3 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -417,6 +417,9 @@ def note_file_sloc_usage : Note< "%plural{0:|: plus %2B (%human2B) for macro expansions}2">; def note_file_misc_sloc_usage : Note< "%0 additional files entered using a total of %1B (%human1B) of space">; +def warn_charset_conversion_failed : Warning< + "conversion from source encoding failed for '%0': %1; interpreting as %2">, + InGroup<DiagGroup<"charset-conversion-failed">>; // Modules def err_module_format_unhandled : Error< diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index 4217b8683da1e..33517b9df59f5 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -50,6 +50,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/TextEncoding.h" #include <cassert> #include <cstddef> #include <map> @@ -156,6 +157,11 @@ class alignas(8) ContentCache { /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name. StringRef Filename; + /// Information on whether this is associated with a FileID for a file (as + /// opposed to a buffer) and, if so, what conversion (if any) was requested. + llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool> + FileIDConverterInfo; + /// A bump pointer allocated array of offsets for each source line. /// /// This is lazily computed. The lines are owned by the SourceManager @@ -919,8 +925,17 @@ class SourceManager : public RefCountedBase<SourceManager> { FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, SrcMgr::CharacteristicKind FileCharacter, int LoadedID = 0, + llvm::TextEncodingConverter *Converter, int LoadedID = 0, SourceLocation::UIntTy LoadedOffset = 0); + FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, + SrcMgr::CharacteristicKind FileCharacter, + int LoadedID = 0, + SourceLocation::UIntTy LoadedOffset = 0) { + return createFileID(SourceFile, IncludePos, FileCharacter, + /*Converter=*/nullptr, LoadedID, LoadedOffset); + } + /// Create a new FileID that represents the specified memory buffer. /// /// This does no caching of the buffer and takes ownership of the @@ -942,7 +957,8 @@ class SourceManager : public RefCountedBase<SourceManager> { /// Get the FileID for \p SourceFile if it exists. Otherwise, create a /// new FileID for the \p SourceFile. FileID getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter); + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter = nullptr); /// Creates an expansion SLocEntry for the substitution of an argument into a /// function-like macro's body. Returns the start of the expansion. diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index bb0eddb918623..6010b4bd900e9 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader { /// /// \return True on success. static bool InitializeSourceManager(const FrontendInputFile &Input, + llvm::TextEncodingConverter *, DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr); diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h index a810b9ab61b1a..5cf077d6134a8 100644 --- a/clang/include/clang/Lex/TextEncodingConfig.h +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -9,15 +9,28 @@ #ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H #define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H -#include "clang/Basic/LangOptions.h" +#include "clang/Basic/Diagnostic.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/TextEncoding.h" -enum ConversionAction { CA_NoConversion }; +enum ConversionAction { CA_NoConversion, CA_FromInputEncoding }; class TextEncodingConfig { + llvm::StringRef InputEncoding; + std::string FileTagEncoding; + std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter; + public: llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; + static std::unique_ptr<llvm::TextEncodingConverter> +#ifdef __MVS__ + createInputConverterFromFiletag(__ccsid_t Ccsid, + clang::DiagnosticsEngine &Diags); +#endif + static std::error_code + setFromInputConverter(TextEncodingConfig &TEC, + std::unique_ptr<llvm::TextEncodingConverter> Converter); + llvm::StringRef getInputEncoding() { return InputEncoding; } }; #endif diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index b6cc6ec9365f5..e36050847d40e 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -16,6 +16,7 @@ #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManagerInternals.h" +#include "clang/Lex/TextEncodingConfig.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" @@ -31,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/SmallVectorMemoryBuffer.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -120,6 +122,13 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, // return paths. IsBufferInvalid = true; + // If we have a converter, open the file in binary mode to prevent autoconversion. + llvm::TextEncodingConverter *Converter = FileIDConverterInfo.getPointer(); + bool IsText = (Converter == nullptr); + auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile, + /*RequiresNullTerminator=*/true, + /*MaybeLimit=*/std::nullopt, IsText); + auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile); // If we were unable to open the file, then we are in an inconsistent @@ -136,7 +145,57 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, Buffer = std::move(*BufferOrError); - // Check that the file's size fits in an 'unsigned' (with room for a + // Unless this is a named pipe (in which case we can handle a mismatch), + // check that the file's size is the same as in the file entry (which may + // have come from a stat cache). + // The buffer will always be larger than the file size on z/OS in the presence + // of characters outside the base character set. + assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); + if (!ContentsEntry->isNamedPipe() && + Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { + Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); + + return std::nullopt; + } + + // Convert source from the input charset to UTF-8 if necessary. + if (Converter) { + StringRef OriginalBuf = Buffer->getBuffer(); + + llvm::SmallString<0> UTF8Buf; + UTF8Buf.reserve(OriginalBuf.size() + 1); + + std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf); + if (EC) { + Diag.Report(Loc, diag::warn_charset_conversion_failed) + << ContentsEntry->getName() << EC.message(); +#ifdef __MVS__ + // On z/OS, if conversion fails, try converting from IBM-1047 to UTF-8 + std::unique_ptr<llvm::TextEncodingConverter> FallbackConverter = + TextEncodingConfig::createInputConverterFromFiletag(1047, Diag); + + if (FallbackConverter) { + // Try converting with IBM-1047 converter + UTF8Buf.clear(); + UTF8Buf.reserve(OriginalBuf.size() + 1); + EC = FallbackConverter->convert(OriginalBuf, UTF8Buf); + + if (!EC) { + auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( + std::move(UTF8Buf), Buffer->getBufferIdentifier()); + Buffer = std::move(NewBuf); + } else { + // TODO: Reclaim memory if the buffer size exceeds the content. + auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>( + std::move(UTF8Buf), Buffer->getBufferIdentifier()); + Buffer = std::move(NewBuf); + } + } +#endif + } + } + + // Check that the buffer's size fits in an 'unsigned' (with room for a // past-the-end value). This is deeply regrettable, but various parts of // Clang (including elsewhere in this file!) use 'unsigned' to represent file // offsets, line numbers, string literal lengths, and so on, and fail @@ -151,22 +210,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, FileManager &FM, return std::nullopt; } - // Unless this is a named pipe (in which case we can handle a mismatch), - // check that the file's size is the same as in the file entry (which may - // have come from a stat cache). - // The buffer will always be larger than the file size on z/OS in the presence - // of characters outside the base character set. - assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize()); - if (!ContentsEntry->isNamedPipe() && - Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) { - Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName(); - - return std::nullopt; - } - - // If the buffer is valid, check to see if it has a UTF Byte Order Mark - // (BOM). We only support UTF-8 with and without a BOM right now. See - // http://en.wikipedia.org/wiki/Byte_order_mark for more information. + // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM) + // Note that any conversion requested using `-finput-charset` (if successful) + // has already occurred, so we are expecting UTF-8 with or without a BOM. + // + // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate + // conversion was not supplied via `-finput-charset` and we could try to + // convert based on the BOM. + // + // See http://en.wikipedia.org/wiki/Byte_order_mark for more information. StringRef BufStr = Buffer->getBuffer(); const char *InvalidBOM = getInvalidBOM(BufStr); @@ -537,15 +589,30 @@ FileID SourceManager::getNextFileID(FileID FID) const { /// being \#included from the specified IncludePosition. FileID SourceManager::createFileID(FileEntryRef SourceFile, SourceLocation IncludePos, + llvm::TextEncodingConverter *Converter, SrcMgr::CharacteristicKind FileCharacter, int LoadedID, SourceLocation::UIntTy LoadedOffset) { SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile, isSystem(FileCharacter)); + #ifndef NDEBUG + // Either the content cache has never been used for a FileID (and, if we are + // being asked to use a converter, there should be no valid buffer set up for + // it) or the conversion (or lack thereof) should be the same as that used + // previously. + auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo; + if (CacheUsedByFileID) + assert(CacheConverter == Converter); + else + assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded()); +#endif + IR.FileIDConverterInfo.setPointerAndInt(Converter, true); + // If this is a named pipe, immediately load the buffer to ensure subsequent // calls to ContentCache::getSize() are accurate. - if (IR.ContentsEntry->isNamedPipe()) + // Do the same if character-encoding conversion was requested. + if (IR.ContentsEntry->isNamedPipe() || Converter) (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation()); return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter, @@ -583,10 +650,12 @@ FileID SourceManager::createFileID(const llvm::MemoryBufferRef &Buffer, /// new FileID for the \p SourceFile. FileID SourceManager::getOrCreateFileID(FileEntryRef SourceFile, - SrcMgr::CharacteristicKind FileCharacter) { + SrcMgr::CharacteristicKind FileCharacter, + llvm::TextEncodingConverter *Converter) { FileID ID = translateFile(SourceFile); - return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(), - FileCharacter); + return ID.isValid() ? ID + : createFileID(SourceFile, SourceLocation(), + FileCharacter, Converter); } /// createFileID - Create a new FileID for the specified ContentCache and @@ -2340,8 +2409,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef FileName, std::make_unique<DiagnosticsEngine>(DiagnosticIDs::create(), *DiagOpts); SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr); FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName)); - FileID ID = - SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User); + FileID ID = SourceMgr->createFileID( + FE, SourceLocation(), clang::SrcMgr::C_User, /*Converter=*/nullptr); assert(ID.isValid()); SourceMgr->setMainFileID(ID); } diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 09607e6de8ce7..fc04f32910ea1 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -51,6 +51,9 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Plugins/PassPlugin.h" #include "llvm/Support/AdvisoryLock.h" +#ifdef __MVS__ +#include "llvm/Support/AutoConvert.h" +#endif #include "llvm/Support/BuryPointer.h" #include "llvm/Support/CrashRecoveryContext.h" #include "llvm/Support/Errc.h" @@ -907,15 +910,37 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary, // Initialization Utilities bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){ - return InitializeSourceManager(Input, getDiagnostics(), getFileManager(), - getSourceManager()); + llvm::TextEncodingConverter *Converter = nullptr; + if (hasPreprocessor() && !Input.isBuffer()) { + Preprocessor &PP = getPreprocessor(); + StringRef InputFile = Input.getFile(); + +#ifdef __MVS__ + // Check for system filetag if we are on z/OS. + llvm::ErrorOr<__ccsid_t> Ccsid = llvm::getzOSFileTag(InputFile); + if (!Ccsid.getError() && *Ccsid > 0) { + // Create converter from filetag if it exists + std::unique_ptr<llvm::TextEncodingConverter> InputConverter = + TextEncodingConfig::createInputConverterFromFiletag(*Ccsid, getDiagnostics()); + + if (InputConverter) + TextEncodingConfig::setFromInputConverter( + PP.getTextEncodingConfig(), std::move(InputConverter)); + } +#endif + + // Retrieve the converter to the internal charset if it exists. + Converter = PP.getTextEncodingConfig().getConverter(CA_FromInputEncoding); + } + + return InitializeSourceManager(Input, Converter, getDiagnostics(), + getFileManager(), getSourceManager()); } // static -bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, - DiagnosticsEngine &Diags, - FileManager &FileMgr, - SourceManager &SourceMgr) { +bool CompilerInstance::InitializeSourceManager( + const FrontendInputFile &Input, llvm::TextEncodingConverter *Converter, + DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr) { SrcMgr::CharacteristicKind Kind = Input.getKind().getFormat() == InputKind::ModuleMap ? Input.isSystem() ? SrcMgr::C_System_ModuleMap @@ -931,10 +956,14 @@ bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, StringRef InputFile = Input.getFile(); + // If we have a converter, open the file in binary mode to avoid autoconversion. + bool IsText = (Converter == nullptr); + // Figure out where to get and map in the main file. auto FileOrErr = InputFile == "-" ? FileMgr.getSTDIN() - : FileMgr.getFileRef(InputFile, /*OpenFile=*/true); + : FileMgr.getFileRef(InputFile, /*OpenFile=*/true, + /*CacheFailure=*/true, IsText); if (!FileOrErr) { auto EC = llvm::errorToErrorCode(FileOrErr.takeError()); if (InputFile != "-") @@ -945,7 +974,7 @@ bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input, } SourceMgr.setMainFileID( - SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind)); + SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind, Converter)); assert(SourceMgr.getMainFileID().isValid() && "Couldn't establish MainFileID!"); diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp index 1bfe644b2525a..2afe990761267 100644 --- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp +++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp @@ -611,7 +611,8 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, SourceManager &SM, FileID FID = SM.translateFile(*File); if (FID.isInvalid()) - FID = SM.createFileID(*File, Pos, SrcMgr::C_User); + FID = SM.createFileID(*File, Pos, SrcMgr::C_User, + /*Converter=*/nullptr); if (PH.Next(Line) && Line > 0) ExpectedLoc = SM.translateLineCol(FID, Line, 1); diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 6c07386f89010..c7c1e04b76ea1 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -1473,7 +1473,10 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, bool IsSystem, if (LocalFID.isInvalid()) { auto FileCharacter = IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap; - LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter); + // FIXME: Module map files are also textual "source files". For consistency, + // conversion should occur. + LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter, + /*Converter=*/nullptr); } ID = LocalFID; } diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index eb21a510dcf83..f9636f2e61ae5 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -2796,7 +2796,13 @@ Preprocessor::ImportAction Preprocessor::HandleHeaderIncludeOrImport( // position on the file where it will be included and after the expansions. if (IncludePos.isMacroID()) IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd(); - FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter); + // Retrieve the converter to the internal charset if it exists. + llvm::TextEncodingConverter *Converter = + getTextEncodingConfig().getConverter(CA_FromInputEncoding); + + FileID FID = + SourceMgr.createFileID(*File, IncludePos, FileCharacter, Converter); + if (!FID.isValid()) { TheModuleLoader.HadFatalFailure = true; return ImportAction::Failure; diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 1e21b4a94cea3..8173723fe9bfe 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -649,8 +649,9 @@ void Preprocessor::EnterMainSourceFile() { << PPOpts.PCHThroughHeader; return; } - setPCHThroughHeaderFileID( - SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User)); + // FIXME: Figure out character-encoding converter treatment. + setPCHThroughHeaderFileID(SourceMgr.createFileID( + *File, SourceLocation(), SrcMgr::C_User, /*Converter=*/nullptr)); } // Skip tokens from the Predefines and if needed the main file. diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp index bb3f0d4b4abec..a0cf1e62c7ec1 100644 --- a/clang/lib/Lex/TextEncodingConfig.cpp +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -8,13 +8,51 @@ #include "clang/Lex/TextEncodingConfig.h" #include "clang/Basic/DiagnosticDriver.h" +#include "llvm/Support/AutoConvert.h" using namespace llvm; llvm::TextEncodingConverter * TextEncodingConfig::getConverter(ConversionAction Action) const { switch (Action) { + case CA_FromInputEncoding: + return FromInputEncodingConverter.get(); default: return nullptr; } } + +std::unique_ptr<llvm::TextEncodingConverter> +TextEncodingConfig::createInputConverterFromFiletag( + __ccsid_t Ccsid, clang::DiagnosticsEngine &Diags) { + using namespace llvm; + + std::string FileTagEncoding = std::to_string(Ccsid); + + llvm::StringRef InputEncoding = FileTagEncoding; + const char *UTF8 = "UTF-8"; + + // Create a converter between the input and internal encodings + if (llvm::TextEncodingConverter::getKnownEncoding(InputEncoding) != + llvm::TextEncodingConverter::getKnownEncoding(UTF8)) { + ErrorOr<TextEncodingConverter> ErrorOrConverter = + llvm::TextEncodingConverter::create(InputEncoding, UTF8); + if (!ErrorOrConverter) { + Diags.Report(clang::diag::err_drv_invalid_value) + << "filetag" << InputEncoding; + return nullptr; + } else { + return std::make_unique<llvm::TextEncodingConverter>( + std::move(*ErrorOrConverter)); + } + } + return nullptr; +} + +std::error_code +TextEncodingConfig::setFromInputConverter( + TextEncodingConfig &TEC, + std::unique_ptr<llvm::TextEncodingConverter> Converter) { + TEC.FromInputEncodingConverter = std::move(Converter); + return std::error_code(); +} diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 74a7b51368c28..7bc28a2791067 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -2002,8 +2002,11 @@ bool ASTReader::ReadSLocEntry(int ID) { } SrcMgr::CharacteristicKind FileCharacter = (SrcMgr::CharacteristicKind)Record[2]; - FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID, - BaseOffset + Record[0]); + // Note: If conversion was originally necessary, OverriddenBuffer should be + // true and the associated handling will trigger. + FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, + /*Converter=*/nullptr, ID, + BaseOffset + Record[0]); SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile(); FileInfo.NumCreatedFIDs = Record[5]; if (Record[3]) diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..75d4e40bfc786 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -137,6 +137,10 @@ class TextEncodingConverter { } }; + + // Maps the encoding name to enum constant if possible. + static std::optional<TextEncoding> getKnownEncoding(StringRef Name); + } // namespace llvm #endif diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..bca1d9c94b057 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -48,7 +48,8 @@ static void normalizeCharSetName(StringRef CSName, } // Maps the encoding name to enum constant if possible. -static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { +std::optional<TextEncoding> +TextEncodingConverter::getKnownEncoding(StringRef Name) { SmallString<16> Normalized; normalizeCharSetName(Name, Normalized); if (Normalized.equals("utf8")) >From 91ad4393346c94ae0baca578ba5dae1a2695cf50 Mon Sep 17 00:00:00 2001 From: Alison Zhang <[email protected]> Date: Tue, 2 Jun 2026 15:50:51 -0400 Subject: [PATCH 3/3] Fix warning message --- clang/include/clang/Basic/DiagnosticCommonKinds.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index e4e8e079a31f3..8ebac3908b465 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -418,7 +418,7 @@ def note_file_sloc_usage : Note< def note_file_misc_sloc_usage : Note< "%0 additional files entered using a total of %1B (%human1B) of space">; def warn_charset_conversion_failed : Warning< - "conversion from source encoding failed for '%0': %1; interpreting as %2">, + "conversion from source encoding failed for '%0': %1; interpreting as IBM-1047">, InGroup<DiagGroup<"charset-conversion-failed">>; // Modules _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
