https://github.com/azhan92 updated 
https://github.com/llvm/llvm-project/pull/204233

>From 5698fb7f4167d504a56db66c34f10becb3e91a5f Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 13:34:38 -0400
Subject: [PATCH 01/14] Add changes from finput-charset PR

---
 .../clang/Basic/DiagnosticCommonKinds.td      |  3 +
 clang/include/clang/Basic/LangOptions.h       |  3 +
 clang/include/clang/Basic/SourceManager.h     |  6 ++
 clang/lib/Basic/SourceManager.cpp             | 94 ++++++++++++++-----
 4 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td 
b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..8ebac3908b465 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -417,6 +417,9 @@ def note_file_sloc_usage : Note<
   "%plural{0:|: plus %2B (%human2B) for macro expansions}2">;
 def note_file_misc_sloc_usage : Note<
   "%0 additional files entered using a total of %1B (%human1B) of space">;
+def warn_charset_conversion_failed : Warning<
+  "conversion from source encoding failed for '%0': %1; interpreting as 
IBM-1047">,
+   InGroup<DiagGroup<"charset-conversion-failed">>;
 
 // Modules
 def err_module_format_unhandled : Error<
diff --git a/clang/include/clang/Basic/LangOptions.h 
b/clang/include/clang/Basic/LangOptions.h
index 9af036156b1ad..31f34207707c8 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -618,6 +618,9 @@ class LangOptions : public LangOptionsBase {
   /// The allocation token mode.
   std::optional<llvm::AllocTokenMode> AllocTokenMode;
 
+  /// Name of the input encoding to convert to the internal encoding.
+  std::string InputEncoding; 
+
   LangOptions();
 
   /// Set language defaults for the given input language and
diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h
index 4217b8683da1e..395fcfc9f71e8 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -50,6 +50,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TextEncoding.h"
 #include <cassert>
 #include <cstddef>
 #include <map>
@@ -156,6 +157,11 @@ class alignas(8) ContentCache {
   /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name.
   StringRef Filename;
 
+  /// Information on whether this is associated with a FileID for a file (as
+  /// opposed to a buffer) and, if so, what conversion (if any) was requested.
+  llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool>
+      FileIDConverterInfo;
+
   /// A bump pointer allocated array of offsets for each source line.
   ///
   /// This is lazily computed.  The lines are owned by the SourceManager
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index b6cc6ec9365f5..8b9ee14c476a7 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -136,7 +137,51 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
 
   Buffer = std::move(*BufferOrError);
 
-  // Check that the file's size fits in an 'unsigned' (with room for a
+  // Unless this is a named pipe (in which case we can handle a mismatch),
+  // check that the file's size is the same as in the file entry (which may
+  // have come from a stat cache).
+  assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
+  if (!ContentsEntry->isNamedPipe() &&
+      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
+    Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
+
+    return std::nullopt;
+  }
+
+  // Convert source from the input charset to UTF-8 if necessary.
+  llvm::TextEncodingConverter *Converter = FileIDConverterInfo.getPointer();
+  if (Converter) {
+    StringRef OriginalBuf = Buffer->getBuffer();
+    llvm::SmallString<0> UTF8Buf;
+    UTF8Buf.reserve(OriginalBuf.size() + 1);
+
+    std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf);
+    if (EC) {
+      // If conversion fails, emit a warning and fall back to interpreting the
+      // file as UTF-8 without conversion.
+      // NOTE(review): the warning says "interpreting as IBM-1047"; reconcile.
+      // This allows the compiler to accept system or third-party headers that
+      // are encoded in UTF-8 even if conversion to the option-specified input
+      // charset failed.
+      //
+      // Diagnostics already exist when files are not well-formed UTF-8.
+      //
+      // TODO: Add input byte offset information.
+      //
+      // TODO: Consider adjusting the message to omit the "interpreting as
+      // UTF-8" recovery description if the warning has been upgraded to an
+      // error.
+      Diag.Report(Loc, diag::warn_charset_conversion_failed)
+          << ContentsEntry->getName() << EC.message();
+    } else {
+      // TODO: Reclaim memory if the buffer size exceeds the content.
+      auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>(
+          std::move(UTF8Buf), Buffer->getBufferIdentifier());
+      Buffer = std::move(NewBuf);  
+    }
+  }
+
+  // Check that the buffer's size fits in an 'unsigned' (with room for a
   // past-the-end value). This is deeply regrettable, but various parts of
   // Clang (including elsewhere in this file!) use 'unsigned' to represent file
   // offsets, line numbers, string literal lengths, and so on, and fail
@@ -151,22 +196,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
     return std::nullopt;
   }
 
-  // Unless this is a named pipe (in which case we can handle a mismatch),
-  // check that the file's size is the same as in the file entry (which may
-  // have come from a stat cache).
-  // The buffer will always be larger than the file size on z/OS in the 
presence
-  // of characters outside the base character set.
-  assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
-  if (!ContentsEntry->isNamedPipe() &&
-      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
-    Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
-
-    return std::nullopt;
-  }
-
-  // If the buffer is valid, check to see if it has a UTF Byte Order Mark
-  // (BOM).  We only support UTF-8 with and without a BOM right now.  See
-  // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
+  // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM)
+  // Note that any conversion requested using `-finput-charset` (if successful)
+  // has already occurred, so we are expecting UTF-8 with or without a BOM.
+  //
+  // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate
+  // conversion was not supplied via `-finput-charset` and we could try to
+  // convert based on the BOM.
+  //
+  // See http://en.wikipedia.org/wiki/Byte_order_mark for more information.
   StringRef BufStr = Buffer->getBuffer();
   const char *InvalidBOM = getInvalidBOM(BufStr);
 
@@ -537,15 +575,29 @@ FileID SourceManager::getNextFileID(FileID FID) const {
 /// being \#included from the specified IncludePosition.
 FileID SourceManager::createFileID(FileEntryRef SourceFile,
                                    SourceLocation IncludePos,
-                                   SrcMgr::CharacteristicKind FileCharacter,
+                                  SrcMgr::CharacteristicKind FileCharacter,
                                    int LoadedID,
                                    SourceLocation::UIntTy LoadedOffset) {
   SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile,
                                                      isSystem(FileCharacter));
 
+  #ifndef NDEBUG
+  // Either the content cache has never been used for a FileID (and, if we are
+  // being asked to use a converter, there should be no valid buffer set up for
+  // it) or the conversion (or lack thereof) should be the same as that used
+  // previously.
+  auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo;
+  if (CacheUsedByFileID)
+    assert(CacheConverter == Converter);
+  else
+    assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded());
+#endif
+  IR.FileIDConverterInfo.setPointerAndInt(Converter, true);
+
   // If this is a named pipe, immediately load the buffer to ensure subsequent
   // calls to ContentCache::getSize() are accurate.
-  if (IR.ContentsEntry->isNamedPipe())
+  // Do the same if character-encoding conversion was requested.
+  if (IR.ContentsEntry->isNamedPipe() || Converter) 
     (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation());
 
   return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter,
@@ -585,8 +637,8 @@ FileID
 SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
                                  SrcMgr::CharacteristicKind FileCharacter) {
   FileID ID = translateFile(SourceFile);
-  return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(),
-                                         FileCharacter);
+  return ID.isValid() ? ID
+                      : createFileID(SourceFile, SourceLocation(), 
FileCharacter);
 }
 
 /// createFileID - Create a new FileID for the specified ContentCache and

>From 0991f6ebe08d9b3d4b5d12915fe531f1efce0974 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 13:34:54 -0400
Subject: [PATCH 02/14] getEncodingNameFromFileTag

---
 llvm/include/llvm/Support/AutoConvert.h | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/llvm/include/llvm/Support/AutoConvert.h 
b/llvm/include/llvm/Support/AutoConvert.h
index d68b0e8b515e0..8797664b1b337 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -105,6 +105,31 @@ inline ErrorOr<bool> needConversion(const Twine &FileName, 
const int FD = -1) {
   return false;
 }
 
+/// Maps the z/OS file tag (CCSID) of \p FileName / \p FD to an encoding name.
+/// Untagged files, and every file on non-z/OS hosts, yield an empty name.
+inline ErrorOr<SmallString<32>>
+getEncodingNameFromFileTag(const Twine &FileName, const int FD = -1) {
+  SmallString<32> Name; // Stays empty unless a tag is found below.
+#ifdef __MVS__
+  ErrorOr<__ccsid_t> TagOrErr = getzOSFileTag(FileName, FD);
+  if (!TagOrErr)
+    return TagOrErr.getError();
+  switch (*TagOrErr) {
+  case 0: // Untagged.
+    break;
+  case 1208:
+    Name = "utf-8";
+    break;
+  case 1047:
+    Name = "ibm-1047";
+    break;
+  default: // Any other CCSID is spelled as its decimal number.
+    raw_svector_ostream(Name) << *TagOrErr;
+  }
+#endif
+  return Name;
+}
+
 } /* namespace llvm */
 #endif /* __cplusplus */
 

>From 56a351c4495e0447618b75934a253dc173f29ced Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 13:35:07 -0400
Subject: [PATCH 03/14] Add text encoding cache

---
 clang/include/clang/Basic/SourceManager.h | 14 ++++++++++
 clang/lib/Basic/SourceManager.cpp         | 34 +++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h
index 395fcfc9f71e8..4c90e4a538c52 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -846,6 +846,11 @@ class SourceManager : public RefCountedBase<SourceManager> 
{
   /// we can add a cc1-level option to do so.
   SmallVector<std::pair<std::string, FullSourceLoc>, 2> StoredModuleBuildStack;
 
+  /// Cache of all text encoding converters used by this SourceManager.
+  /// This includes both the input charset converter and file tag converters.
+  /// Keyed by the source encoding name (canonical when known); target is UTF-8.
+  llvm::StringMap<std::unique_ptr<llvm::TextEncodingConverter>> ConverterCache;
+
 public:
   SourceManager(DiagnosticsEngine &Diag, FileManager &FileMgr,
                 bool UserFilesAreVolatile = false);
@@ -863,6 +868,15 @@ class SourceManager : public RefCountedBase<SourceManager> 
{
 
   FileManager &getFileManager() const { return FileMgr; }
 
+  /// Get or create a text encoding converter from the cache.
+  /// This method manages all converters (input charset and file tag
+  /// converters) in a single cache owned by SourceManager.
+  /// \param SourceEncoding the source character encoding name
+  /// \return pointer to the converter (null when the source is already UTF-8)
+  /// or an error code. The target encoding is always UTF-8.
+  llvm::ErrorOr<llvm::TextEncodingConverter *>
+  getOrCreateConverter(llvm::StringRef SourceEncoding);
+
   /// Set true if the SourceManager should report the original file name
   /// for contents of files that were overridden by other files. Defaults to
   /// true.
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index 8b9ee14c476a7..46a7b8b85e2dd 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -467,6 +467,40 @@ ContentCache &SourceManager::createMemBufferContentCache(
   return *Entry;
 }
 
+llvm::ErrorOr<llvm::TextEncodingConverter *>
+SourceManager::getOrCreateConverter(llvm::StringRef SourceEncoding) {
+  using llvm::TextEncodingConverter;
+
+  // Map the requested spelling onto a known encoding, when there is one.
+  const std::optional<llvm::TextEncoding> Known =
+      TextEncodingConverter::getKnownEncoding(SourceEncoding);
+
+  // Source text that is already UTF-8 is used as-is: no converter.
+  if (Known == llvm::TextEncoding::UTF8)
+    return nullptr;
+
+  // Known encodings are cached under their canonical name; anything else is
+  // cached under the spelling that was passed in.
+  llvm::StringRef Key = SourceEncoding;
+  if (Known)
+    Key = TextEncodingConverter::getKnownEncodingName(*Known);
+
+  // Hand back the cached converter if this key was seen before.
+  if (auto Cached = ConverterCache.find(Key); Cached != ConverterCache.end())
+    return Cached->second.get();
+
+  // Otherwise build a converter targeting UTF-8, forwarding any failure.
+  llvm::ErrorOr<TextEncodingConverter> Created =
+      TextEncodingConverter::create(SourceEncoding, "UTF-8");
+  if (std::error_code EC = Created.getError())
+    return EC;
+
+  // Move the new converter into the cache, which owns it from here on.
+  auto &Slot = ConverterCache[Key];
+  Slot = std::make_unique<TextEncodingConverter>(std::move(*Created));
+  return Slot.get();
+}
+
 const SrcMgr::SLocEntry &SourceManager::loadSLocEntry(unsigned Index,
                                                       bool *Invalid) const {
   return const_cast<SourceManager *>(this)->loadSLocEntry(Index, Invalid);

>From 46220a9d135a11bc0c36171cb8fad0e0cd654264 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 14:06:59 -0400
Subject: [PATCH 04/14] Get canonical names for encodings

---
 llvm/include/llvm/Support/TextEncoding.h | 11 +++++++++++
 llvm/lib/Support/TextEncoding.cpp        | 13 ++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/TextEncoding.h 
b/llvm/include/llvm/Support/TextEncoding.h
index 8a304910aa5dd..09e24000594db 100644
--- a/llvm/include/llvm/Support/TextEncoding.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -105,6 +105,17 @@ class TextEncodingConverter {
   LLVM_ABI static ErrorOr<TextEncodingConverter> create(StringRef From,
                                                         StringRef To);
 
+  /// Maps the encoding name to enum constant if possible.
+  /// Uses normalized charset name matching.
+  /// \param[in] Name the character encoding name
+  /// \return the TextEncoding enum value if known, std::nullopt otherwise
+  LLVM_ABI static std::optional<TextEncoding> getKnownEncoding(StringRef Name);
+
+  /// Returns the canonical name for a known encoding.
+  /// \param[in] Encoding the TextEncoding enum value
+  /// \return the canonical name for the encoding (e.g., "UTF-8" or "IBM-1047")
+  LLVM_ABI static StringRef getKnownEncodingName(TextEncoding Encoding);
+
   TextEncodingConverter(const TextEncodingConverter &) = delete;
   TextEncodingConverter &operator=(const TextEncodingConverter &) = delete;
 
diff --git a/llvm/lib/Support/TextEncoding.cpp 
b/llvm/lib/Support/TextEncoding.cpp
index d36f02c1300b9..8e9653dab38ec 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -48,7 +48,7 @@ static void normalizeCharSetName(StringRef CSName,
 }
 
 // Maps the encoding name to enum constant if possible.
-static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
+std::optional<TextEncoding> TextEncodingConverter::getKnownEncoding(StringRef 
Name) {
   SmallString<16> Normalized;
   normalizeCharSetName(Name, Normalized);
   if (Normalized.equals("utf8"))
@@ -58,6 +58,17 @@ static std::optional<TextEncoding> 
getKnownEncoding(StringRef Name) {
   return std::nullopt;
 }
 
+// Returns the canonical spelling ("UTF-8" or "IBM-1047") of a known encoding.
+StringRef TextEncodingConverter::getKnownEncodingName(TextEncoding Encoding) {
+  switch (Encoding) {
+  case TextEncoding::UTF8:
+    return "UTF-8";
+  case TextEncoding::IBM1047:
+    return "IBM-1047";
+  }
+  llvm_unreachable("Invalid TextEncoding value");
+}
+
 [[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output,
                                             size_t &OutputLength,
                                             SmallVectorImpl<char> &Result) {

>From 24c1fd677f48da1127e292053e02b92971158a14 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Mon, 22 Jun 2026 11:17:29 -0400
Subject: [PATCH 05/14] File mismatch checking

---
 clang/lib/Basic/FileManager.cpp               | 17 ++++++++++++-----
 llvm/include/llvm/Support/VirtualFileSystem.h | 12 ++++++++++++
 llvm/lib/Support/VirtualFileSystem.cpp        | 13 ++++++++++---
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index 8fb3ba0a27aad..717a692661588 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -539,15 +539,22 @@ FileManager::getBufferForFile(FileEntryRef FE, bool 
isVolatile,
     FileSize = -1;
 
   StringRef Filename = FE.getName();
-  // If the file is already open, use the open file descriptor.
+  // If the file is already open, check if the mode matches.
   if (Entry->File) {
-    auto Result = Entry->File->getBuffer(Filename, FileSize,
-                                         RequiresNullTerminator, isVolatile);
+    // Check if the cached file's mode matches the requested mode
+    // Only perform mismatch recovery for real files
+    if (!Entry->File->realFileTextMismatch(IsText)) {
+      // Mode matches, use the cached file descriptor
+      auto Result = Entry->File->getBuffer(Filename, FileSize,
+                                           RequiresNullTerminator, isVolatile);
+      Entry->closeFile();
+      return Result;
+    }
+    // Mode mismatch - close the cached file and reopen with correct mode
     Entry->closeFile();
-    return Result;
   }
 
-  // Otherwise, open the file.
+  // Open the file with the requested mode.
   return getBufferForFileImpl(Filename, FileSize, isVolatile,
                               RequiresNullTerminator, IsText);
 }
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h 
b/llvm/include/llvm/Support/VirtualFileSystem.h
index d22c534228331..a3ef38fe552a7 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -137,6 +137,18 @@ class LLVM_ABI File {
   /// Closes the file.
   virtual std::error_code close() = 0;
 
+  /// Returns true if this file was opened in text mode (with potential
+  /// encoding conversions), false if opened in binary mode.
+  /// Default implementation returns true for backward compatibility.
+  virtual bool isText() const { return true; }
+
+  /// Returns true if this is a real file and the requested text mode differs
+  /// from the current mode. Always returns false for non-real files.
+  /// Default implementation returns false for non-real files.
+  virtual bool realFileTextMismatch(bool RequestedIsText) const {
+    return false;
+  }
+
   // Get the same file with a different path.
   static ErrorOr<std::unique_ptr<File>>
   getWithPath(ErrorOr<std::unique_ptr<File>> Result, const Twine &P);
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp 
b/llvm/lib/Support/VirtualFileSystem.cpp
index 42e8bb4f9958e..2def668e63cb3 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -194,11 +194,13 @@ class RealFile : public File {
   file_t FD;
   Status S;
   std::string RealName;
+  bool IsTextMode;
 
-  RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName)
+  RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName,
+           bool IsText)
       : FD(RawFD), S(NewName, {}, {}, {}, {}, {},
                      llvm::sys::fs::file_type::status_error, {}),
-        RealName(NewRealPathName.str()) {
+        RealName(NewRealPathName.str()), IsTextMode(IsText) {
     assert(FD != kInvalidFile && "Invalid or inactive file descriptor");
   }
 
@@ -213,6 +215,10 @@ class RealFile : public File {
                                                    bool IsVolatile) override;
   std::error_code close() override;
   void setPath(const Twine &Path) override;
+  bool isText() const override { return IsTextMode; }
+  bool realFileTextMismatch(bool RequestedIsText) const override {
+    return IsTextMode != RequestedIsText;
+  }
 };
 
 } // namespace
@@ -320,8 +326,9 @@ class RealFileSystem : public FileSystem {
         adjustPath(Name, Storage), Flags, &RealName);
     if (!FDOrErr)
       return errorToErrorCode(FDOrErr.takeError());
+    bool IsText = (Flags & sys::fs::OF_Text) != sys::fs::OF_None;
     return std::unique_ptr<File>(
-        new RealFile(*FDOrErr, Name.str(), RealName.str()));
+        new RealFile(*FDOrErr, Name.str(), RealName.str(), IsText));
   }
 
   struct WorkingDirectory {

>From a33e9ee56ea4a99e8a9420f6097443d762ce6240 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Mon, 22 Jun 2026 17:04:01 -0400
Subject: [PATCH 06/14] Remove isText function

---
 llvm/include/llvm/Support/VirtualFileSystem.h | 5 -----
 llvm/lib/Support/VirtualFileSystem.cpp        | 1 -
 2 files changed, 6 deletions(-)

diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h 
b/llvm/include/llvm/Support/VirtualFileSystem.h
index a3ef38fe552a7..8e5ffc0f051a6 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -137,11 +137,6 @@ class LLVM_ABI File {
   /// Closes the file.
   virtual std::error_code close() = 0;
 
-  /// Returns true if this file was opened in text mode (with potential
-  /// encoding conversions), false if opened in binary mode.
-  /// Default implementation returns true for backward compatibility.
-  virtual bool isText() const { return true; }
-
   /// Returns true if this is a real file and the requested text mode differs
   /// from the current mode. Always returns false for non-real files.
   /// Default implementation returns false for non-real files.
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp 
b/llvm/lib/Support/VirtualFileSystem.cpp
index 2def668e63cb3..a46abbef127b4 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -215,7 +215,6 @@ class RealFile : public File {
                                                    bool IsVolatile) override;
   std::error_code close() override;
   void setPath(const Twine &Path) override;
-  bool isText() const override { return IsTextMode; }
   bool realFileTextMismatch(bool RequestedIsText) const override {
     return IsTextMode != RequestedIsText;
   }

>From 901c8ee08a62522622578029fc1dbf012ab42ce5 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 15:14:07 -0400
Subject: [PATCH 07/14] Bob changes

---
 clang/lib/Basic/FileManager.cpp               |  2 +-
 llvm/include/llvm/Support/VirtualFileSystem.h |  8 +++++---
 llvm/lib/Support/VirtualFileSystem.cpp        | 16 +++++++++++++---
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index 717a692661588..dd0f7dc0a7053 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -543,7 +543,7 @@ FileManager::getBufferForFile(FileEntryRef FE, bool 
isVolatile,
   if (Entry->File) {
     // Check if the cached file's mode matches the requested mode
     // Only perform mismatch recovery for real files
-    if (!Entry->File->realFileTextMismatch(IsText)) {
+    if (!Entry->File->checkTextModeMismatch(IsText)) {
       // Mode matches, use the cached file descriptor
       auto Result = Entry->File->getBuffer(Filename, FileSize,
                                            RequiresNullTerminator, isVolatile);
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h 
b/llvm/include/llvm/Support/VirtualFileSystem.h
index 8e5ffc0f051a6..2bdf97e3721fc 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -137,10 +137,12 @@ class LLVM_ABI File {
   /// Closes the file.
   virtual std::error_code close() = 0;
 
-  /// Returns true if this is a real file and the requested text mode differs
-  /// from the current mode. Always returns false for non-real files.
+  /// Checks if this is a real file and the requested text mode differs
+  /// from the current mode. For real files with a text mode mismatch where
+  /// the buffer was previously requested, this will call 
llvm::report_fatal_error.
+  /// Always returns false for non-real files.
   /// Default implementation returns false for non-real files.
-  virtual bool realFileTextMismatch(bool RequestedIsText) const {
+  virtual bool checkTextModeMismatch(bool RequestedIsText) const {
     return false;
   }
 
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp 
b/llvm/lib/Support/VirtualFileSystem.cpp
index a46abbef127b4..743edb336a19f 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -195,12 +195,14 @@ class RealFile : public File {
   Status S;
   std::string RealName;
   bool IsTextMode;
+  bool BufferWasRequested;
 
   RealFile(file_t RawFD, StringRef NewName, StringRef NewRealPathName,
            bool IsText)
       : FD(RawFD), S(NewName, {}, {}, {}, {}, {},
                      llvm::sys::fs::file_type::status_error, {}),
-        RealName(NewRealPathName.str()), IsTextMode(IsText) {
+        RealName(NewRealPathName.str()), IsTextMode(IsText),
+        BufferWasRequested(false) {
     assert(FD != kInvalidFile && "Invalid or inactive file descriptor");
   }
 
@@ -215,8 +217,15 @@ class RealFile : public File {
                                                    bool IsVolatile) override;
   std::error_code close() override;
   void setPath(const Twine &Path) override;
-  bool realFileTextMismatch(bool RequestedIsText) const override {
-    return IsTextMode != RequestedIsText;
+  bool checkTextModeMismatch(bool RequestedIsText) const override {
+    bool HasMismatch = IsTextMode != RequestedIsText;
+    if (HasMismatch && BufferWasRequested) {
+      llvm::report_fatal_error(
+          "Text mode mismatch: file was previously opened with " +
+          Twine(IsTextMode ? "text" : "binary") + " mode, now requested with " 
+
+          Twine(RequestedIsText ? "text" : "binary") + " mode");
+    }
+    return HasMismatch;
   }
 };
 
@@ -247,6 +256,7 @@ RealFile::getBuffer(const Twine &Name, int64_t FileSize,
   auto BypassSandbox = sys::sandbox::scopedDisable();
 
   assert(FD != kInvalidFile && "cannot get buffer for closed file");
+  BufferWasRequested = true;
   return MemoryBuffer::getOpenFile(FD, Name, FileSize, RequiresNullTerminator,
                                    IsVolatile);
 }

>From ca78018eb1dc2e7be521809c1aa32148cbe65df5 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 15:24:28 -0400
Subject: [PATCH 08/14] update name

---
 clang/lib/Basic/FileManager.cpp               | 2 +-
 llvm/include/llvm/Support/VirtualFileSystem.h | 2 +-
 llvm/lib/Support/VirtualFileSystem.cpp        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index dd0f7dc0a7053..94fc4c15f51d2 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -543,7 +543,7 @@ FileManager::getBufferForFile(FileEntryRef FE, bool 
isVolatile,
   if (Entry->File) {
     // Check if the cached file's mode matches the requested mode
     // Only perform mismatch recovery for real files
-    if (!Entry->File->checkTextModeMismatch(IsText)) {
+    if (!Entry->File->realFileCheckTextModeMismatch(IsText)) {
       // Mode matches, use the cached file descriptor
       auto Result = Entry->File->getBuffer(Filename, FileSize,
                                            RequiresNullTerminator, isVolatile);
diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h 
b/llvm/include/llvm/Support/VirtualFileSystem.h
index 2bdf97e3721fc..9cc35b6f5fb6d 100644
--- a/llvm/include/llvm/Support/VirtualFileSystem.h
+++ b/llvm/include/llvm/Support/VirtualFileSystem.h
@@ -142,7 +142,7 @@ class LLVM_ABI File {
   /// the buffer was previously requested, this will call 
llvm::report_fatal_error.
   /// Always returns false for non-real files.
   /// Default implementation returns false for non-real files.
-  virtual bool checkTextModeMismatch(bool RequestedIsText) const {
+  virtual bool realFileCheckTextModeMismatch(bool RequestedIsText) const {
     return false;
   }
 
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp 
b/llvm/lib/Support/VirtualFileSystem.cpp
index 743edb336a19f..1c1f9e9dd15b6 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -217,7 +217,7 @@ class RealFile : public File {
                                                    bool IsVolatile) override;
   std::error_code close() override;
   void setPath(const Twine &Path) override;
-  bool checkTextModeMismatch(bool RequestedIsText) const override {
+  bool realFileCheckTextModeMismatch(bool RequestedIsText) const override {
     bool HasMismatch = IsTextMode != RequestedIsText;
     if (HasMismatch && BufferWasRequested) {
       llvm::report_fatal_error(

>From 54f2ba7aba3fbe0a9cd52315762209393f1e5857 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 16:02:43 -0400
Subject: [PATCH 09/14] Mode mismatch checking

---
 clang/lib/Basic/FileManager.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp
index 94fc4c15f51d2..edaeea08b20c1 100644
--- a/clang/lib/Basic/FileManager.cpp
+++ b/clang/lib/Basic/FileManager.cpp
@@ -550,7 +550,8 @@ FileManager::getBufferForFile(FileEntryRef FE, bool 
isVolatile,
       Entry->closeFile();
       return Result;
     }
-    // Mode mismatch - close the cached file and reopen with correct mode
+    // Mode mismatch - close the cached file and reopen with correct mode by
+    // falling through.
     Entry->closeFile();
   }
 

>From 5f867136428c01cc115884044f6e3ae90fe53788 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Tue, 23 Jun 2026 16:07:07 -0400
Subject: [PATCH 10/14] Create converters in createFileID

---
 clang/include/clang/Basic/SourceManager.h     |  4 +-
 .../include/clang/Frontend/CompilerInstance.h |  1 +
 clang/lib/Basic/SourceManager.cpp             | 56 ++++++++++++++++---
 clang/lib/Frontend/CompilerInstance.cpp       | 10 +++-
 .../lib/Frontend/VerifyDiagnosticConsumer.cpp |  4 +-
 clang/lib/Lex/ModuleMap.cpp                   |  7 ++-
 clang/lib/Lex/PPDirectives.cpp                |  6 +-
 clang/lib/Lex/Preprocessor.cpp                |  4 +-
 clang/lib/Serialization/ASTReader.cpp         |  5 +-
 9 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h
index 4c90e4a538c52..347dff62b2c38 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -938,6 +938,7 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// being \#included from the specified IncludePosition.
   FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos,
                       SrcMgr::CharacteristicKind FileCharacter,
+                     llvm::StringRef InputEncodingName = {},
                       int LoadedID = 0,
                       SourceLocation::UIntTy LoadedOffset = 0);
 
@@ -962,7 +963,8 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// Get the FileID for \p SourceFile if it exists. Otherwise, create a
   /// new FileID for the \p SourceFile.
   FileID getOrCreateFileID(FileEntryRef SourceFile,
-                           SrcMgr::CharacteristicKind FileCharacter);
+                           SrcMgr::CharacteristicKind FileCharacter,
+                          llvm::StringRef InputEncodingName = {});
 
   /// Creates an expansion SLocEntry for the substitution of an argument into a
   /// function-like macro's body. Returns the start of the expansion.
diff --git a/clang/include/clang/Frontend/CompilerInstance.h 
b/clang/include/clang/Frontend/CompilerInstance.h
index bb0eddb918623..522fecfcab35e 100644
--- a/clang/include/clang/Frontend/CompilerInstance.h
+++ b/clang/include/clang/Frontend/CompilerInstance.h
@@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader {
   ///
   /// \return True on success.
   static bool InitializeSourceManager(const FrontendInputFile &Input,
+                                     llvm::StringRef InputEncodingName = {}, 
                                       DiagnosticsEngine &Diags,
                                       FileManager &FileMgr,
                                       SourceManager &SourceMgr);
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index 46a7b8b85e2dd..2e71d39dc232c 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -121,7 +121,16 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
   // return paths.
   IsBufferInvalid = true;
 
-  auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile);
+  // If a converter is set, open the file in binary mode to get raw bytes
+  // and avoid platform-specific auto-conversion (e.g., EBCDIC->ASCII on z/OS,
+  // CRLF->LF on Windows). The explicit converter will handle all 
transformations.
+  bool NeedsExplicitConversion = FileIDConverterInfo.getPointer() != nullptr;
+  bool IsText = !NeedsExplicitConversion;
+
+  auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile,
+                                           /*RequiresNullTerminator=*/true,
+                                           /*MaybeLimit=*/std::nullopt,
+                                           IsText);
 
   // If we were unable to open the file, then we are in an inconsistent
   // situation where the content cache referenced a file which no longer
@@ -610,23 +619,53 @@ FileID SourceManager::getNextFileID(FileID FID) const {
 FileID SourceManager::createFileID(FileEntryRef SourceFile,
                                    SourceLocation IncludePos,
                                   SrcMgr::CharacteristicKind FileCharacter,
+                                  llvm::StringRef InputEncodingName,
                                    int LoadedID,
                                    SourceLocation::UIntTy LoadedOffset) {
   SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile,
                                                      isSystem(FileCharacter));
 
+  llvm::ErrorOr<llvm::TextEncodingConverter *> Converter = nullptr;
+  llvm::ErrorOr<llvm::SmallString<32>> Ccsid =
+      llvm::getEncodingNameFromFileTag(SourceFile.getName());
+  if (!Ccsid) {
+    Diag.Report(SourceLocation(), diag::err_cannot_open_file)
+        << SourceFile.getName() << Ccsid.getError().message();
+    return FileID();
+  }
+  if (!Ccsid->empty()) {
+    // File has a tag, use the converter from SourceManager's cache
+    Converter = getOrCreateConverter(*Ccsid);
+    if (!Converter) {
+      Diag.Report(SourceLocation(), diag::err_cannot_open_file)
+          << SourceFile.getName()
+   << (llvm::Twine("cannot create converter from encoding '") + *Ccsid + "'");
+      return FileID();
+    }
+  } else if (!InputEncodingName.empty()) {
+    // No file tag but -finput-charset conversion is desired.
+    // Get the converter from the cache using the input encoding name.
+    Converter = getOrCreateConverter(InputEncodingName);
+    if (!Converter) {
+      llvm::report_fatal_error(
+          "Cannot create converter for file '" + SourceFile.getName() + "': " +
+          Converter.getError().message());
+    }
+  }
+
   #ifndef NDEBUG
   // Either the content cache has never been used for a FileID (and, if we are
   // being asked to use a converter, there should be no valid buffer set up for
   // it) or the conversion (or lack thereof) should be the same as that used
   // previously.
   auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo;
+  llvm::TextEncodingConverter *ConverterPtr = Converter ? *Converter : nullptr;
   if (CacheUsedByFileID)
-    assert(CacheConverter == Converter);
+    assert(CacheConverter == ConverterPtr);
   else
-    assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded());
+    assert(!ConverterPtr || IR.IsBufferInvalid || !IR.getBufferIfLoaded());
 #endif
-  IR.FileIDConverterInfo.setPointerAndInt(Converter, true);
+  IR.FileIDConverterInfo.setPointerAndInt(Converter ? *Converter : nullptr, 
true);
 
   // If this is a named pipe, immediately load the buffer to ensure subsequent
   // calls to ContentCache::getSize() are accurate.
@@ -669,10 +708,12 @@ FileID SourceManager::createFileID(const 
llvm::MemoryBufferRef &Buffer,
 /// new FileID for the \p SourceFile.
 FileID
 SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
-                                 SrcMgr::CharacteristicKind FileCharacter) {
+                                 SrcMgr::CharacteristicKind FileCharacter,
+                                llvm::StringRef InputEncodingName) {
   FileID ID = translateFile(SourceFile);
   return ID.isValid() ? ID
-                      : createFileID(SourceFile, SourceLocation(), 
FileCharacter);
+                      : createFileID(SourceFile, SourceLocation(), 
FileCharacter,
+                                    InputEncodingName);
 }
 
 /// createFileID - Create a new FileID for the specified ContentCache and
@@ -2427,7 +2468,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef 
FileName,
   SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr);
   FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName));
   FileID ID =
-      SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User);
+      SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User,
+                             /*InputEncodingName=*/{});
   assert(ID.isValid());
   SourceMgr->setMainFileID(ID);
 }
diff --git a/clang/lib/Frontend/CompilerInstance.cpp 
b/clang/lib/Frontend/CompilerInstance.cpp
index 8aee45b5dc644..42956fb18c104 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -912,12 +912,15 @@ CompilerInstance::createOutputFileImpl(StringRef 
OutputPath, bool Binary,
 // Initialization Utilities
 
 bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){
-  return InitializeSourceManager(Input, getDiagnostics(), getFileManager(),
-                                 getSourceManager());
+  StringRef InputEncodingName =
+      hasPreprocessor() ? llvm::StringRef(getLangOpts().InputEncoding) : 
llvm::StringRef()
+  return InitializeSourceManager(Input, InputEncodingName, getDiagnostics(), 
+                                getFileManager(), getSourceManager());
 }
 
 // static
 bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input,
+                                              llvm::StringRef 
InputEncodingName,
                                                DiagnosticsEngine &Diags,
                                                FileManager &FileMgr,
                                                SourceManager &SourceMgr) {
@@ -950,7 +953,8 @@ bool CompilerInstance::InitializeSourceManager(const 
FrontendInputFile &Input,
   }
 
   SourceMgr.setMainFileID(
-      SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind));
+      SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind,
+                            InputEncodingName));
 
   assert(SourceMgr.getMainFileID().isValid() &&
          "Couldn't establish MainFileID!");
diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp 
b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
index 1bfe644b2525a..691bc5a5fd31d 100644
--- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
+++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
@@ -610,8 +610,10 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, 
SourceManager &SM,
           }
 
           FileID FID = SM.translateFile(*File);
+         // FIXME: Figure out character-encoding converter treatment.
           if (FID.isInvalid())
-            FID = SM.createFileID(*File, Pos, SrcMgr::C_User);
+            FID = SM.createFileID(*File, Pos, SrcMgr::C_User,
+                                 /*InputEncodingName=*/{});
 
           if (PH.Next(Line) && Line > 0)
             ExpectedLoc = SM.translateLineCol(FID, Line, 1);
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index 6c07386f89010..3e5c5b317d580 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -1473,7 +1473,12 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, 
bool IsSystem,
     if (LocalFID.isInvalid()) {
       auto FileCharacter =
           IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap;
-      LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter);
+      // Module map files are textual "source files". Use input charset 
converter
+      // if available, and file tag converters are handled by SourceManager's 
cache.
+      // Get input encoding from LangOptions for charset conversion
+      llvm::StringRef InputEncodingName = LangOpts.InputEncoding;
+      LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter,
+                                       InputEncodingName);
     }
     ID = LocalFID;
   }
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index eb21a510dcf83..f989c2d1d4b96 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -2796,7 +2796,11 @@ Preprocessor::ImportAction 
Preprocessor::HandleHeaderIncludeOrImport(
   // position on the file where it will be included and after the expansions.
   if (IncludePos.isMacroID())
     IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd();
-  FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter);
+  // Use the SourceManager's input charset converter for non-tagged files
+  // by passing the input encoding name
+  llvm::StringRef InputEncodingName = getLangOpts().InputEncoding;
+  FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter,
+                                      InputEncodingName);
   if (!FID.isValid()) {
     TheModuleLoader.HadFatalFailure = true;
     return ImportAction::Failure;
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 1e21b4a94cea3..0b07f7de8675a 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -649,8 +649,10 @@ void Preprocessor::EnterMainSourceFile() {
           << PPOpts.PCHThroughHeader;
       return;
     }
+    // FIXME: Figure out character-encoding converter treatment.
     setPCHThroughHeaderFileID(
-        SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User));
+        SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User,
+                              /*InputEncodingName=*/{}));
   }
 
   // Skip tokens from the Predefines and if needed the main file.
diff --git a/clang/lib/Serialization/ASTReader.cpp 
b/clang/lib/Serialization/ASTReader.cpp
index f8a6a38bb9b5c..52b60df62977d 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2002,7 +2002,10 @@ bool ASTReader::ReadSLocEntry(int ID) {
     }
     SrcMgr::CharacteristicKind
       FileCharacter = (SrcMgr::CharacteristicKind)Record[2];
-    FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID,
+    // Note: If conversion was originally necessary, OverriddenBuffer should be
+    // true and the associated handling will trigger.
+    FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, 
+                                       /*InputEncodingName=*/{}, ID,
                                         BaseOffset + Record[0]);
     SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile();
     FileInfo.NumCreatedFIDs = Record[5];

>From ec7463b27999ce40a92abf767eecba70eac713ca Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Wed, 24 Jun 2026 21:35:09 -0400
Subject: [PATCH 11/14] fix fallback

---
 clang/CMakeLists.txt                          |  3 ++
 .../clang/Basic/DiagnosticCommonKinds.td      |  4 ++-
 clang/include/clang/Basic/SourceManager.h     | 34 ++++++++++++++++++-
 clang/include/clang/Config/config.h.cmake     |  3 ++
 clang/lib/Basic/SourceManager.cpp             | 33 +++++++++++++-----
 clang/lib/Driver/ToolChains/Clang.cpp         | 15 +++++++-
 6 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index cd7ba53b03061..b73a7218c3e81 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -275,6 +275,9 @@ set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL
 set(PPC_LINUX_DEFAULT_IEEELONGDOUBLE OFF CACHE BOOL
     "Enable IEEE binary128 as default long double format on PowerPC Linux.")
 
+set(CLANG_DEFAULT_INPUT_ENCODING_IBM1047 OFF CACHE BOOL
+    "Set IBM-1047 as the default input encoding")
+
 set(CLANG_SPAWN_CC1 OFF CACHE BOOL
     "Whether clang should use a new process for the CC1 invocation")
 
diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td 
b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index 8ebac3908b465..4dc958cad59ce 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -418,8 +418,10 @@ def note_file_sloc_usage : Note<
 def note_file_misc_sloc_usage : Note<
   "%0 additional files entered using a total of %1B (%human1B) of space">;
 def warn_charset_conversion_failed : Warning<
-  "conversion from source encoding failed for '%0': %1; interpreting as 
IBM-1047">,
+  "conversion from source encoding failed for '%0': %1; interpreting as %2">,
    InGroup<DiagGroup<"charset-conversion-failed">>;
+def err_charset_conversion_failed : Error<
+  "conversion from source encoding failed for '%0': %1">;
 
 // Modules
 def err_module_format_unhandled : Error<
diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h
index 347dff62b2c38..1c74d7d34d6f6 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -159,7 +159,9 @@ class alignas(8) ContentCache {
 
   /// Information on whether this is associated with a FileID for a file (as
   /// opposed to a buffer) and, if so, what conversion (if any) was requested.
-  llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool>
+  /// The integer part uses 2 bits: bit 0 indicates if used by FileID,
+  /// bit 1 indicates if the file was tagged.
+  llvm::PointerIntPair<llvm::TextEncodingConverter *, 2u, unsigned>
       FileIDConverterInfo;
 
   /// A bump pointer allocated array of offsets for each source line.
@@ -277,6 +279,36 @@ class alignas(8) ContentCache {
 
   // If BufStr has an invalid BOM, returns the BOM name; otherwise, returns
   // nullptr
+
+  /// Helper methods for FileIDConverterInfo bit manipulation.
+  /// Bit 0: Used by FileID flag
+  /// Bit 1: File tagged flag
+  
+  bool isUsedByFileID() const {
+    return FileIDConverterInfo.getInt() & 0x1;
+  }
+  
+  void setUsedByFileID(bool Used) {
+    unsigned Flags = FileIDConverterInfo.getInt();
+    if (Used)
+      Flags |= 0x1;
+    else
+      Flags &= ~0x1;
+    FileIDConverterInfo.setInt(Flags);
+  }
+  
+  bool isFileTagged() const {
+    return FileIDConverterInfo.getInt() & 0x2;
+  }
+  
+  void setFileTagged(bool Tagged) {
+    unsigned Flags = FileIDConverterInfo.getInt();
+    if (Tagged)
+      Flags |= 0x2;
+    else
+      Flags &= ~0x2;
+    FileIDConverterInfo.setInt(Flags);
+  }
   static const char *getInvalidBOM(StringRef BufStr);
 };
 
diff --git a/clang/include/clang/Config/config.h.cmake 
b/clang/include/clang/Config/config.h.cmake
index 11b4096726f67..fbafafc710afe 100644
--- a/clang/include/clang/Config/config.h.cmake
+++ b/clang/include/clang/Config/config.h.cmake
@@ -75,6 +75,9 @@
 /* Enable IEEE binary128 as default long double format on PowerPC Linux. */
 #cmakedefine01 PPC_LINUX_DEFAULT_IEEELONGDOUBLE
 
+/* Set IBM-1047 as the default input encoding */
+#cmakedefine01 CLANG_DEFAULT_INPUT_ENCODING_IBM1047
+
 /* Enable each functionality of modules */
 #cmakedefine01 CLANG_ENABLE_OBJC_REWRITER
 #cmakedefine01 CLANG_ENABLE_STATIC_ANALYZER
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index 2e71d39dc232c..f8100aa7b2e32 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -166,22 +166,27 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
 
     std::error_code EC = Converter->convert(OriginalBuf, UTF8Buf);
     if (EC) {
+      // For tagged files, conversion failure is an error and we don't fall 
back
+      if (isFileTagged()) {
+        Diag.Report(Loc, diag::err_charset_conversion_failed)
+            << ContentsEntry->getName() << EC.message();
+        return std::nullopt;
+      }
+      
       // If conversion fails, emit a warning and fall back to interpreting the
-      // file as UTF-8 without conversion.
+      // file as the default charset.
       //
       // This allows the compiler to accept system or third-party headers that
-      // are encoded in UTF-8 even if conversion to the option-specified input
-      // charset failed.
-      //
-      // Diagnostics already exist when files are not well-formed UTF-8.
+      // are encoded in the default charset even if conversion to the
+      // option-specified input charset failed.
       //
       // TODO: Add input byte offset information.
       //
-      // TODO: Consider adjusting the message to omit the "interpreting as
-      // UTF-8" recovery description if the warning has been upgraded to an
-      // error.
+      // TODO: Consider adjusting the message to omit the recovery description
+      // if the warning has been upgraded to an error.
+      const char *FallbackEncoding = CLANG_DEFAULT_INPUT_ENCODING_IBM1047 ? 
"IBM-1047" : "UTF-8";
       Diag.Report(Loc, diag::warn_charset_conversion_failed)
-          << ContentsEntry->getName() << EC.message();
+          << ContentsEntry->getName() << EC.message() << FallbackEncoding;
     } else {
       // TODO: Reclaim memory if the buffer size exceeds the content.
       auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>(
@@ -642,6 +647,7 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile,
    << (llvm::Twine("cannot create converter from encoding '") + *Ccsid + "'");
       return FileID();
     }
+    IR.setFileTagged(true);
   } else if (!InputEncodingName.empty()) {
     // No file tag but -finput-charset conversion is desired.
     // Get the converter from the cache using the input encoding name.
@@ -651,6 +657,15 @@ FileID SourceManager::createFileID(FileEntryRef SourceFile,
           "Cannot create converter for file '" + SourceFile.getName() + "': " +
           Converter.getError().message());
     }
+  } else if (CLANG_DEFAULT_INPUT_ENCODING_IBM1047) {
+    // When IBM-1047 is the default and no file tag or explicit -finput-charset
+    // is provided, use IBM-1047 as the default source encoding
+    Converter = getOrCreateConverter("IBM-1047");
+    if (!Converter) {
+      llvm::report_fatal_error(
+          "Cannot create IBM-1047 converter for file '" + SourceFile.getName() 
+ "': " +
+          Converter.getError().message());
+    }
   }
 
   #ifndef NDEBUG
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp 
b/clang/lib/Driver/ToolChains/Clang.cpp
index 323417e294d5a..5cd60fca92d44 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7839,10 +7839,23 @@ void Clang::ConstructJob(Compilation &C, const 
JobAction &JA,
   // -finput_charset=UTF-8 is default. Reject others
   if (Arg *inputCharset = Args.getLastArg(options::OPT_finput_charset_EQ)) {
     StringRef value = inputCharset->getValue();
-    if (!value.equals_insensitive("utf-8"))
+    bool isValid = value.equals_insensitive("utf-8");
+#if CLANG_DEFAULT_INPUT_ENCODING_IBM1047
+    // When IBM-1047 default is enabled, also accept IBM-1047
+    isValid = isValid || value.equals_insensitive("ibm-1047") ||
+              value.equals_insensitive("ibm1047");
+#endif
+    if (!isValid)
       D.Diag(diag::err_drv_invalid_value) << inputCharset->getAsString(Args)
                                           << value;
   }
+#if CLANG_DEFAULT_INPUT_ENCODING_IBM1047
+  else {
+    // When IBM-1047 default is enabled and no explicit charset is specified,
+    // set IBM-1047 as the default
+    CmdArgs.push_back("-finput-charset=IBM-1047");
+  }
+#endif
 
   // -fexec_charset=UTF-8 is default. Reject others
   if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {

>From 4d9787f33c4092709094db4f509b2d977804b975 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Wed, 24 Jun 2026 21:48:42 -0400
Subject: [PATCH 12/14] return type

---
 llvm/include/llvm/Support/AutoConvert.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Support/AutoConvert.h 
b/llvm/include/llvm/Support/AutoConvert.h
index 8797664b1b337..5d3849f765b0f 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -114,19 +114,19 @@ getEncodingNameFromFileTag(const Twine &FileName, const 
int FD = -1) {
 
   __ccsid_t Tag = *TagOrErr;
   if (Tag == 0)
-    return {}; // Return empty string for no tag
+    return SmallString<32>(); // Return empty string for no tag
 
   if (Tag == 1208)
-    return {"utf-8"};
+    return SmallString<32>("utf-8");
 
   if (Tag == 1047)
-    return {"ibm-1047"};
+    return SmallString<32>("ibm-1047");
 
   SmallString<32> Result;
   raw_svector_ostream(Result) << Tag;
   return Result;
 #else
-  return {}; // Return empty string for non-MVS platforms
+  return SmallString<32>(); // Return empty string for non-MVS platforms
 #endif
 }
 

>From e0061c18fb442f3556c5e1329f08e2fef7732512 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Wed, 24 Jun 2026 21:53:28 -0400
Subject: [PATCH 13/14] fix error for smallstring

---
 llvm/include/llvm/Support/AutoConvert.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/include/llvm/Support/AutoConvert.h 
b/llvm/include/llvm/Support/AutoConvert.h
index 5d3849f765b0f..b437b157b7725 100644
--- a/llvm/include/llvm/Support/AutoConvert.h
+++ b/llvm/include/llvm/Support/AutoConvert.h
@@ -18,6 +18,7 @@
 #include <_Ccsid.h>
 #endif
 #ifdef __cplusplus
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/Error.h"
 #include <system_error>

>From a2cc01c801024743a6e3b0f883d8b618cb734c66 Mon Sep 17 00:00:00 2001
From: alisonzhang <[email protected]>
Date: Wed, 24 Jun 2026 22:02:37 -0400
Subject: [PATCH 14/14] add flag

---
 clang/lib/Basic/SourceManager.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index f8100aa7b2e32..8866861ca3a5d 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -10,6 +10,7 @@
 //
 
//===----------------------------------------------------------------------===//
 
+#include "clang/Config/config.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/FileManager.h"

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to