https://github.com/azhan92 created 
https://github.com/llvm/llvm-project/pull/204233

None

>From 72b7bfae7524dbb6a2261de2703217d1c4e565c9 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 16 Jun 2026 15:01:43 -0400
Subject: [PATCH 1/4] Changes from fexec-charset PR

---
 clang/include/clang/Lex/Preprocessor.h       |  4 ++++
 clang/include/clang/Lex/TextEncodingConfig.h | 23 ++++++++++++++++++++
 clang/lib/Frontend/CompilerInstance.cpp      |  1 +
 clang/lib/Lex/CMakeLists.txt                 |  1 +
 clang/lib/Lex/TextEncodingConfig.cpp         | 20 +++++++++++++++++
 5 files changed, 49 insertions(+)
 create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h
 create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp

diff --git a/clang/include/clang/Lex/Preprocessor.h 
b/clang/include/clang/Lex/Preprocessor.h
index 8b684e85eb1c1..5f8d5caaafcac 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -30,6 +30,7 @@
 #include "clang/Lex/ModuleMap.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/PPEmbedParameters.h"
+#include "clang/Lex/TextEncodingConfig.h"
 #include "clang/Lex/Token.h"
 #include "clang/Lex/TokenLexer.h"
 #include "clang/Support/Compiler.h"
@@ -198,6 +199,7 @@ class Preprocessor {
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
+  TextEncodingConfig TEC;
 
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
@@ -1265,6 +1267,8 @@ class Preprocessor {
   Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
   llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
 
+  TextEncodingConfig &getTextEncodingConfig() { return TEC; }
+
   void setExternalSource(ExternalPreprocessorSource *Source) {
     ExternalSource = Source;
   }
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h 
b/clang/include/clang/Lex/TextEncodingConfig.h
new file mode 100644
index 0000000000000..a810b9ab61b1a
--- /dev/null
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -0,0 +1,23 @@
+//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+
+#include "clang/Basic/LangOptions.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/TextEncoding.h"
+
+enum ConversionAction { CA_NoConversion };
+
+class TextEncodingConfig {
+public:
+  llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
+};
+
+#endif
diff --git a/clang/lib/Frontend/CompilerInstance.cpp 
b/clang/lib/Frontend/CompilerInstance.cpp
index 9e88abbece7f2..09607e6de8ce7 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -34,6 +34,7 @@
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/TextEncodingConfig.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "clang/Sema/ParsedAttr.h"
 #include "clang/Sema/Sema.h"
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index f61737cd68021..106a5d3b126be 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -29,6 +29,7 @@ add_clang_library(clangLex
   Preprocessor.cpp
   PreprocessorLexer.cpp
   ScratchBuffer.cpp
+  TextEncodingConfig.cpp
   TokenConcatenation.cpp
   TokenLexer.cpp
 
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp 
b/clang/lib/Lex/TextEncodingConfig.cpp
new file mode 100644
index 0000000000000..bb3f0d4b4abec
--- /dev/null
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -0,0 +1,20 @@
+//===--- TextEncodingConfig.cpp 
-------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+llvm::TextEncodingConverter *
+TextEncodingConfig::getConverter(ConversionAction Action) const {
+  switch (Action) {
+  default:
+    return nullptr;
+  }
+}

>From 660b75df3e808eb772982ad7cd14dd81855ac840 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 16 Jun 2026 15:34:36 -0400
Subject: [PATCH 2/4] Changes from finput-charset PR

---
 .../clang/Basic/DiagnosticCommonKinds.td      |   3 +
 clang/include/clang/Basic/SourceManager.h     |  10 +-
 .../include/clang/Frontend/CompilerInstance.h |   1 +
 clang/include/clang/Lex/TextEncodingConfig.h  |   4 +-
 clang/lib/Basic/SourceManager.cpp             | 108 ++++++++++++++----
 clang/lib/Frontend/CompilerInstance.cpp       |  19 +--
 .../lib/Frontend/VerifyDiagnosticConsumer.cpp |   4 +-
 clang/lib/Lex/ModuleMap.cpp                   |   5 +-
 clang/lib/Lex/PPDirectives.cpp                |   7 +-
 clang/lib/Lex/Preprocessor.cpp                |   5 +-
 clang/lib/Lex/TextEncodingConfig.cpp          |   2 +
 clang/lib/Serialization/ASTReader.cpp         |   7 +-
 12 files changed, 136 insertions(+), 39 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td 
b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..8ebac3908b465 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -417,6 +417,9 @@ def note_file_sloc_usage : Note<
   "%plural{0:|: plus %2B (%human2B) for macro expansions}2">;
 def note_file_misc_sloc_usage : Note<
   "%0 additional files entered using a total of %1B (%human1B) of space">;
+def warn_charset_conversion_failed : Warning<
+  "conversion from source encoding failed for '%0': %1; interpreting as 
IBM-1047">,
+   InGroup<DiagGroup<"charset-conversion-failed">>;
 
 // Modules
 def err_module_format_unhandled : Error<
diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h
index 4217b8683da1e..f7d91d612e4ab 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -50,6 +50,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TextEncoding.h"
 #include <cassert>
 #include <cstddef>
 #include <map>
@@ -156,6 +157,11 @@ class alignas(8) ContentCache {
   /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name.
   StringRef Filename;
 
+  /// Information on whether this is associated with a FileID for a file (as
+  /// opposed to a buffer) and, if so, what conversion (if any) was requested.
+  llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool>
+      FileIDConverterInfo;
+
   /// A bump pointer allocated array of offsets for each source line.
   ///
   /// This is lazily computed.  The lines are owned by the SourceManager
@@ -918,6 +924,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// being \#included from the specified IncludePosition.
   FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos,
                       SrcMgr::CharacteristicKind FileCharacter,
+                     llvm::TextEncodingConverter *Converter = nullptr, 
                       int LoadedID = 0,
                       SourceLocation::UIntTy LoadedOffset = 0);
 
@@ -942,7 +949,8 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// Get the FileID for \p SourceFile if it exists. Otherwise, create a
   /// new FileID for the \p SourceFile.
   FileID getOrCreateFileID(FileEntryRef SourceFile,
-                           SrcMgr::CharacteristicKind FileCharacter);
+                           SrcMgr::CharacteristicKind FileCharacter,
+                          llvm::TextEncodingConverter *Converter = nullptr);
 
   /// Creates an expansion SLocEntry for the substitution of an argument into a
   /// function-like macro's body. Returns the start of the expansion.
diff --git a/clang/include/clang/Frontend/CompilerInstance.h 
b/clang/include/clang/Frontend/CompilerInstance.h
index bb0eddb918623..89a0d066afd6f 100644
--- a/clang/include/clang/Frontend/CompilerInstance.h
+++ b/clang/include/clang/Frontend/CompilerInstance.h
@@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader {
   ///
   /// \return True on success.
   static bool InitializeSourceManager(const FrontendInputFile &Input,
+                                     llvm::TextEncodingConverter *,    
                                       DiagnosticsEngine &Diags,
                                       FileManager &FileMgr,
                                       SourceManager &SourceMgr);
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h 
b/clang/include/clang/Lex/TextEncodingConfig.h
index a810b9ab61b1a..30e0fcf2ac919 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -13,9 +13,11 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/TextEncoding.h"
 
-enum ConversionAction { CA_NoConversion };
+enum ConversionAction { CA_NoConversion, CA_FromInputEncoding };
 
 class TextEncodingConfig {
+std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter;
+
 public:
   llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
 };
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index b6cc6ec9365f5..c33ee69962864 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManagerInternals.h"
+#include "clang/Lex/TextEncodingConfig.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
@@ -31,6 +32,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -136,7 +138,57 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
 
   Buffer = std::move(*BufferOrError);
 
-  // Check that the file's size fits in an 'unsigned' (with room for a
+  // Unless this is a named pipe (in which case we can handle a mismatch),
+  // check that the file's size is the same as in the file entry (which may
+  // have come from a stat cache).
+  // The buffer will always be larger than the file size on z/OS in the 
presence
+  // of characters outside the base character set.
+  assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
+  if (!ContentsEntry->isNamedPipe() &&
+      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
+    Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
+
+    return std::nullopt;
+  }
+
+  // Convert source from the input charset to UTF-8 if necessary.
+  if (Converter) {
+    StringRef OriginalBuf = Buffer->getBuffer();
+
+    llvm::SmallString<0> UTF8Buf;
+    UTF8Buf.reserve(OriginalBuf.size() + 1);
+
+    std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf);
+    if (EC) {
+      Diag.Report(Loc, diag::warn_charset_conversion_failed)
+          << ContentsEntry->getName() << EC.message();
+#ifdef __MVS__
+      // On z/OS, if conversion fails, try converting from IBM-1047 to UTF-8.
+      // TextEncodingConfig declares no createInputConverterFromFiletag(), so
+      // build the fallback converter directly with TextEncodingConverter.
+      llvm::ErrorOr<llvm::TextEncodingConverter> FallbackConverter =
+          llvm::TextEncodingConverter::create("IBM-1047", "UTF-8");
+
+      if (FallbackConverter) {
+        // Drop any partial output of the failed attempt before retrying.
+        UTF8Buf.clear();
+        UTF8Buf.reserve(OriginalBuf.size() + 1);
+        EC = FallbackConverter->convert(OriginalBuf, UTF8Buf);
+      }
+#endif
+    }
+
+    // Install the converted text only when a conversion succeeded. On failure
+    // the original buffer is kept instead of a partially converted one.
+    if (!EC) {
+      // TODO: Reclaim memory if the buffer size exceeds the content.
+      auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>(
+          std::move(UTF8Buf), Buffer->getBufferIdentifier());
+      Buffer = std::move(NewBuf);
+    }
+  }
+
+  // Check that the buffer's size fits in an 'unsigned' (with room for a
   // past-the-end value). This is deeply regrettable, but various parts of
   // Clang (including elsewhere in this file!) use 'unsigned' to represent file
   // offsets, line numbers, string literal lengths, and so on, and fail
@@ -151,22 +203,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
     return std::nullopt;
   }
 
-  // Unless this is a named pipe (in which case we can handle a mismatch),
-  // check that the file's size is the same as in the file entry (which may
-  // have come from a stat cache).
-  // The buffer will always be larger than the file size on z/OS in the 
presence
-  // of characters outside the base character set.
-  assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
-  if (!ContentsEntry->isNamedPipe() &&
-      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
-    Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
-
-    return std::nullopt;
-  }
-
-  // If the buffer is valid, check to see if it has a UTF Byte Order Mark
-  // (BOM).  We only support UTF-8 with and without a BOM right now.  See
-  // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
+  // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM)
+  // Note that any conversion requested using `-finput-charset` (if successful)
+  // has already occurred, so we are expecting UTF-8 with or without a BOM.
+  //
+  // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate
+  // conversion was not supplied via `-finput-charset` and we could try to
+  // convert based on the BOM.
+  //
+  // See http://en.wikipedia.org/wiki/Byte_order_mark for more information.
   StringRef BufStr = Buffer->getBuffer();
   const char *InvalidBOM = getInvalidBOM(BufStr);
 
@@ -537,15 +582,30 @@ FileID SourceManager::getNextFileID(FileID FID) const {
 /// being \#included from the specified IncludePosition.
 FileID SourceManager::createFileID(FileEntryRef SourceFile,
                                    SourceLocation IncludePos,
                                    SrcMgr::CharacteristicKind FileCharacter,
+                                   llvm::TextEncodingConverter *Converter,
                                    int LoadedID,
                                    SourceLocation::UIntTy LoadedOffset) {
   SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile,
                                                      isSystem(FileCharacter));
 
+  #ifndef NDEBUG
+  // Either the content cache has never been used for a FileID (and, if we are
+  // being asked to use a converter, there should be no valid buffer set up for
+  // it) or the conversion (or lack thereof) should be the same as that used
+  // previously.
+  auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo;
+  if (CacheUsedByFileID)
+    assert(CacheConverter == Converter);
+  else
+    assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded());
+#endif
+  IR.FileIDConverterInfo.setPointerAndInt(Converter, true);
+
   // If this is a named pipe, immediately load the buffer to ensure subsequent
   // calls to ContentCache::getSize() are accurate.
-  if (IR.ContentsEntry->isNamedPipe())
+  // Do the same if character-encoding conversion was requested.
+  if (IR.ContentsEntry->isNamedPipe() || Converter) 
     (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation());
 
   return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter,
@@ -583,10 +643,12 @@ FileID SourceManager::createFileID(const 
llvm::MemoryBufferRef &Buffer,
 /// new FileID for the \p SourceFile.
 FileID
 SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
-                                 SrcMgr::CharacteristicKind FileCharacter) {
+                                 SrcMgr::CharacteristicKind FileCharacter,
+                                 llvm::TextEncodingConverter *Converter) {
   FileID ID = translateFile(SourceFile);
-  return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(),
-                                         FileCharacter);
+  return ID.isValid() ? ID
+                      : createFileID(SourceFile, SourceLocation(),
+                                     FileCharacter, Converter);
 }
 
 /// createFileID - Create a new FileID for the specified ContentCache and
@@ -2340,8 +2402,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef 
FileName,
       std::make_unique<DiagnosticsEngine>(DiagnosticIDs::create(), *DiagOpts);
   SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr);
   FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName));
-  FileID ID =
-      SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User);
+  FileID ID = SourceMgr->createFileID(
+      FE, SourceLocation(), clang::SrcMgr::C_User, /*Converter=*/nullptr);
   assert(ID.isValid());
   SourceMgr->setMainFileID(ID);
 }
diff --git a/clang/lib/Frontend/CompilerInstance.cpp 
b/clang/lib/Frontend/CompilerInstance.cpp
index 09607e6de8ce7..e94d05ba33fbb 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -907,15 +907,20 @@ CompilerInstance::createOutputFileImpl(StringRef 
OutputPath, bool Binary,
 // Initialization Utilities
 
 bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){
-  return InitializeSourceManager(Input, getDiagnostics(), getFileManager(),
-                                 getSourceManager());
+  // Retrieve the converter to the internal charset if it exists.
+  llvm::TextEncodingConverter *Converter =
+      hasPreprocessor() ? 
getPreprocessor().getTextEncodingConfig().getConverter(
+                              CA_FromInputEncoding)
+                        : nullptr;
+
+  return InitializeSourceManager(Input, Converter, getDiagnostics(),
+                                 getFileManager(), getSourceManager());
 }
 
 // static
-bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input,
-                                               DiagnosticsEngine &Diags,
-                                               FileManager &FileMgr,
-                                               SourceManager &SourceMgr) {
+bool CompilerInstance::InitializeSourceManager(
+    const FrontendInputFile &Input, llvm::TextEncodingConverter *Converter,
+    DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr) {
   SrcMgr::CharacteristicKind Kind =
       Input.getKind().getFormat() == InputKind::ModuleMap
           ? Input.isSystem() ? SrcMgr::C_System_ModuleMap
@@ -923,7 +928,7 @@ bool CompilerInstance::InitializeSourceManager(const 
FrontendInputFile &Input,
           : Input.isSystem() ? SrcMgr::C_System : SrcMgr::C_User;
 
   if (Input.isBuffer()) {
-    SourceMgr.setMainFileID(SourceMgr.createFileID(Input.getBuffer(), Kind));
+    SourceMgr.setMainFileID(SourceMgr.createFileID(Input.getBuffer(), Kind, 
Converter));
     assert(SourceMgr.getMainFileID().isValid() &&
            "Couldn't establish MainFileID!");
     return true;
diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp 
b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
index 1bfe644b2525a..01e3b20e0c7cb 100644
--- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
+++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
@@ -610,8 +610,10 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, 
SourceManager &SM,
           }
 
           FileID FID = SM.translateFile(*File);
+         // FIXME: Figure out character-encoding converter treatment.
           if (FID.isInvalid())
-            FID = SM.createFileID(*File, Pos, SrcMgr::C_User);
+            FID = SM.createFileID(*File, Pos, SrcMgr::C_User,
+                                 /*Converter=*/nullptr);
 
           if (PH.Next(Line) && Line > 0)
             ExpectedLoc = SM.translateLineCol(FID, Line, 1);
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index 6c07386f89010..c7c1e04b76ea1 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -1473,7 +1473,10 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, 
bool IsSystem,
     if (LocalFID.isInvalid()) {
       auto FileCharacter =
           IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap;
-      LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter);
+      // FIXME: Module map files are also textual "source files". For 
consistency,
+      // conversion should occur.
+      LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter,
+                                /*Converter=*/nullptr);
     }
     ID = LocalFID;
   }
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index eb21a510dcf83..2e095fce02c0f 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -2796,7 +2796,12 @@ Preprocessor::ImportAction 
Preprocessor::HandleHeaderIncludeOrImport(
   // position on the file where it will be included and after the expansions.
   if (IncludePos.isMacroID())
     IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd();
-  FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter);
+  // Retrieve the converter to the internal charset if it exists.
+  llvm::TextEncodingConverter *Converter =
+      getTextEncodingConfig().getConverter(CA_FromInputEncoding);
+
+  FileID FID =
+      SourceMgr.createFileID(*File, IncludePos, FileCharacter, Converter);
   if (!FID.isValid()) {
     TheModuleLoader.HadFatalFailure = true;
     return ImportAction::Failure;
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 1e21b4a94cea3..c5e32468bcc7e 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -649,8 +649,9 @@ void Preprocessor::EnterMainSourceFile() {
           << PPOpts.PCHThroughHeader;
       return;
     }
-    setPCHThroughHeaderFileID(
-        SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User));
+    // FIXME: Figure out character-encoding converter treatment.
+    setPCHThroughHeaderFileID(SourceMgr.createFileID(
+        *File, SourceLocation(), SrcMgr::C_User, /*Converter=*/nullptr));    
   }
 
   // Skip tokens from the Predefines and if needed the main file.
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp 
b/clang/lib/Lex/TextEncodingConfig.cpp
index bb3f0d4b4abec..ed426c304423b 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -14,6 +14,8 @@ using namespace llvm;
 llvm::TextEncodingConverter *
 TextEncodingConfig::getConverter(ConversionAction Action) const {
   switch (Action) {
+  case CA_FromInputEncoding:
+    return FromInputEncodingConverter.get();
   default:
     return nullptr;
   }
diff --git a/clang/lib/Serialization/ASTReader.cpp 
b/clang/lib/Serialization/ASTReader.cpp
index 74a7b51368c28..7bc28a2791067 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2002,8 +2002,11 @@ bool ASTReader::ReadSLocEntry(int ID) {
     }
     SrcMgr::CharacteristicKind
       FileCharacter = (SrcMgr::CharacteristicKind)Record[2];
-    FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID,
-                                        BaseOffset + Record[0]);
+    // Note: If conversion was originally necessary, OverriddenBuffer should be
+    // true and the associated handling will trigger.
+    FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter,
+                                        /*Converter=*/nullptr, ID,
+                                       BaseOffset + Record[0]);
     SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile();
     FileInfo.NumCreatedFIDs = Record[5];
     if (Record[3])

>From 51adf4f5b68b461d667507bb1e568d87cab8e23e Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 16 Jun 2026 15:34:51 -0400
Subject: [PATCH 3/4] Add wrapper function for getzOSFileTag

---
 llvm/include/llvm/Support/AutoConvert.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/llvm/include/llvm/Support/AutoConvert.h 
b/llvm/include/llvm/Support/AutoConvert.h
index d68b0e8b515e0..9586d41e10c2f 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -105,6 +105,23 @@ inline ErrorOr<bool> needConversion(const Twine &FileName, 
const int FD = -1) {
   return false;
 }
 
+#ifdef __MVS__
+/// Query the z/OS file tag of \p FileName, mapping untagged files to IBM-1047.
+inline ErrorOr<__ccsid_t> getFileTag(const Twine &FileName,
+                                     const int FD = -1) {
+  ErrorOr<__ccsid_t> Tag = getzOSFileTag(FileName, FD);
+  // A CCSID of 0 means the file is untagged; assume IBM-1047 in that case.
+  if (Tag && *Tag == 0)
+    return CCSID_IBM_1047;
+  return Tag; // Either the lookup error or the file's actual tag.
+}
+#else
+/// On non-z/OS platforms there are no file tags, so this always returns 0.
+inline ErrorOr<int> getFileTag(const Twine &FileName, const int FD = -1) {
+  return 0;
+}
+#endif
+
 } /* namespace llvm */
 #endif /* __cplusplus */
 

>From b8b8c2945182007ee9f02ef74358acb4b40ea0f3 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 16 Jun 2026 15:52:52 -0400
Subject: [PATCH 4/4] Changes from Bob

---
 clang/include/clang/Lex/TextEncodingConfig.h |  8 +++++
 clang/lib/Basic/SourceManager.cpp            |  7 +++-
 clang/lib/Lex/TextEncodingConfig.cpp         | 38 ++++++++++++++++++++
 llvm/include/llvm/Support/TextEncoding.h     | 22 ++++++++++++
 llvm/lib/Support/TextEncoding.cpp            | 36 +++++++++++++++++++
 5 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Lex/TextEncodingConfig.h 
b/clang/include/clang/Lex/TextEncodingConfig.h
index 30e0fcf2ac919..e4f6595997888 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -17,9 +17,17 @@ enum ConversionAction { CA_NoConversion, 
CA_FromInputEncoding };
 
 class TextEncodingConfig {
 std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter;
+llvm::StringMap<std::unique_ptr<llvm::TextEncodingConverter>> 
FromFiletagEncodingConverters;
 
 public:
   llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
+  static llvm::TextEncodingConverter *
+      getFromFiletagEncodingConverter(TextEncodingConfig &TEC,
+                                      llvm::StringRef FiletagEncoding);
+  static llvm::TextEncodingConverter *
+      createAndInsertFromFiletagEncodingConverter(TextEncodingConfig &TEC,
+                                                  llvm::StringRef 
FiletagEncoding,
+                                                  clang::DiagnosticsEngine 
&Diag);
 };
 
 #endif
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index c33ee69962864..25a7023b7db85 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -122,7 +122,10 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
   // return paths.
   IsBufferInvalid = true;
 
-  auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile);
+  auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile,
+                                          /*RequiresNullTerminator=*/true,
+                                          /*MaybeLimit=*/std::nullopt,
+                                          /*IsText=*/false);
 
   // If we were unable to open the file, then we are in an inconsistent
   // situation where the content cache referenced a file which no longer
@@ -589,6 +592,8 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile,
   SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile,
                                                      isSystem(FileCharacter));
 
+  llvm::ErrorOr<int> Ccsid = llvm::getFileTag(SourceFile.getName());
+  if (Ccsid && *Ccsid > 0) {
   #ifndef NDEBUG
   // Either the content cache has never been used for a FileID (and, if we are
   // being asked to use a converter, there should be no valid buffer set up for
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp 
b/clang/lib/Lex/TextEncodingConfig.cpp
index ed426c304423b..01c9bfcf607db 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -20,3 +20,41 @@ TextEncodingConfig::getConverter(ConversionAction Action) 
const {
     return nullptr;
   }
 }
+
+/// Look up the converter cached in \p TEC for \p FiletagEncoding.
+llvm::TextEncodingConverter *
+TextEncodingConfig::getFromFiletagEncodingConverter(
+    TextEncodingConfig &TEC, StringRef FiletagEncoding) {
+  // A missing entry means no converter has been cached for this encoding.
+  const auto &Cache = TEC.FromFiletagEncodingConverters;
+  const auto Entry = Cache.find(FiletagEncoding);
+  return Entry == Cache.end() ? nullptr : Entry->second.get();
+}
+
+llvm::TextEncodingConverter *
+TextEncodingConfig::createAndInsertFromFiletagEncodingConverter(
+    TextEncodingConfig &TEC, StringRef FiletagEncoding,
+    clang::DiagnosticsEngine &Diag) {
+  // Reuse the converter cached for this file tag encoding, if any.
+  if (llvm::TextEncodingConverter *Cached =
+          getFromFiletagEncodingConverter(TEC, FiletagEncoding))
+    return Cached;
+
+  // UTF-8 input needs no conversion, so there is nothing to create or cache.
+  const char *UTF8 = "UTF-8";
+  if (FiletagEncoding == UTF8)
+    return nullptr;
+
+  ErrorOr<TextEncodingConverter> ErrorOrConverter =
+      llvm::TextEncodingConverter::create(FiletagEncoding, UTF8);
+  if (!ErrorOrConverter) {
+    Diag.Report(clang::diag::err_drv_invalid_value)
+        << "Filetag encoding" << FiletagEncoding;
+    return nullptr;
+  }
+  // The cache owns the new converter; callers only borrow the pointer.
+  auto &Slot = TEC.FromFiletagEncodingConverters[FiletagEncoding];
+  Slot = std::make_unique<llvm::TextEncodingConverter>(
+      std::move(*ErrorOrConverter));
+  return Slot.get();
+}
diff --git a/llvm/include/llvm/Support/TextEncoding.h 
b/llvm/include/llvm/Support/TextEncoding.h
index 8a304910aa5dd..24a2e026ebc82 100644
--- a/llvm/include/llvm/Support/TextEncoding.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -16,11 +16,13 @@
 #define LLVM_SUPPORT_TEXT_ENCODING_H
 
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorOr.h"
 
+#include <memory>
 #include <string>
 #include <system_error>
 
@@ -137,6 +139,26 @@ class TextEncodingConverter {
   }
 };
 
+/// Utility class to manage a cache of TextEncodingConverter instances, keyed
+/// by source encoding name. This is useful when you need to convert from
+/// multiple source encodings to a common target encoding (e.g., UTF-8).
+class TextEncodingConverterCache {
+  StringMap<std::unique_ptr<TextEncodingConverter>> Converters;
+
+public:
+  /// Get a converter from the cache, or nullptr if not found.
+  /// \param[in] SourceEncoding the source character encoding name
+  /// \return pointer to the cached converter; the cache retains ownership
+  TextEncodingConverter *getConverter(StringRef SourceEncoding) const;
+
+  /// Get the converter cached for SourceEncoding, creating it if absent.
+  /// \param[in] SourceEncoding the source character encoding name (cache key)
+  /// \param[in] TargetEncoding target encoding, used only on a cache miss
+  /// \return the converter, or nullptr on error or when the encodings match
+  TextEncodingConverter *createAndInsertConverter(StringRef SourceEncoding,
+                                                   StringRef TargetEncoding);
+};
+
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Support/TextEncoding.cpp 
b/llvm/lib/Support/TextEncoding.cpp
index d36f02c1300b9..2a1d19935699e 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -356,3 +356,39 @@ ErrorOr<TextEncodingConverter> 
TextEncodingConverter::create(StringRef From,
   return std::make_error_code(std::errc::invalid_argument);
 #endif
 }
+
+TextEncodingConverter *
+TextEncodingConverterCache::getConverter(StringRef SourceEncoding) const {
+  // Only the source encoding is used as the cache key.
+  const auto Entry = Converters.find(SourceEncoding);
+  if (Entry == Converters.end())
+    return nullptr;
+  return Entry->second.get();
+}
+
+/// Return the converter cached for \p SourceEncoding, creating and caching a
+/// new one that converts to \p TargetEncoding when there is no cache hit.
+TextEncodingConverter *
+TextEncodingConverterCache::createAndInsertConverter(
+    StringRef SourceEncoding, StringRef TargetEncoding) {
+  // A cache hit is keyed on the source encoding alone, so the converter
+  // returned here may have been created for a different target encoding.
+  if (TextEncodingConverter *Cached = getConverter(SourceEncoding))
+    return Cached;
+
+  // Identical encodings need no conversion; nothing is cached for them.
+  if (SourceEncoding == TargetEncoding)
+    return nullptr;
+
+  // Creation failures are reported to the caller as nullptr.
+  ErrorOr<TextEncodingConverter> Created =
+      TextEncodingConverter::create(SourceEncoding, TargetEncoding);
+  if (!Created)
+    return nullptr;
+
+  // Hand ownership to the cache and return a borrowed pointer.
+  auto Owned = std::make_unique<TextEncodingConverter>(std::move(*Created));
+  TextEncodingConverter *Borrowed = Owned.get();
+  Converters.insert_or_assign(SourceEncoding, std::move(Owned));
+  return Borrowed;
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to