https://github.com/azhan92 created 
https://github.com/llvm/llvm-project/pull/201187

None

>From 06f309fbf5b9e3361c355fb9d5f801c6f61cb934 Mon Sep 17 00:00:00 2001
From: Alison Zhang <[email protected]>
Date: Tue, 2 Jun 2026 15:08:29 -0400
Subject: [PATCH 1/2] Updates from -fexec-charset PR

---
 clang/include/clang/Lex/Preprocessor.h       |  6 ++++-
 clang/include/clang/Lex/TextEncodingConfig.h | 23 ++++++++++++++++++++
 clang/lib/Frontend/CompilerInstance.cpp      |  1 +
 clang/lib/Lex/CMakeLists.txt                 |  1 +
 clang/lib/Lex/TextEncodingConfig.cpp         | 20 +++++++++++++++++
 5 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 clang/include/clang/Lex/TextEncodingConfig.h
 create mode 100644 clang/lib/Lex/TextEncodingConfig.cpp

diff --git a/clang/include/clang/Lex/Preprocessor.h 
b/clang/include/clang/Lex/Preprocessor.h
index 8b684e85eb1c1..d3d32130f6e3b 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -30,6 +30,7 @@
 #include "clang/Lex/ModuleMap.h"
 #include "clang/Lex/PPCallbacks.h"
 #include "clang/Lex/PPEmbedParameters.h"
+#include "clang/Lex/TextEncodingConfig.h"
 #include "clang/Lex/Token.h"
 #include "clang/Lex/TokenLexer.h"
 #include "clang/Support/Compiler.h"
@@ -198,7 +199,8 @@ class Preprocessor {
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
-
+  TextEncodingConfig TEC;
+  
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
 
@@ -1265,6 +1267,8 @@ class Preprocessor {
   Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
   llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
 
+  TextEncodingConfig &getTextEncodingConfig() { return TEC; }
+  
   void setExternalSource(ExternalPreprocessorSource *Source) {
     ExternalSource = Source;
   }
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h 
b/clang/include/clang/Lex/TextEncodingConfig.h
new file mode 100644
index 0000000000000..a810b9ab61b1a
--- /dev/null
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -0,0 +1,23 @@
+//===-- clang/Lex/TextEncodingConfig.h - Text Conversion Config -*- C++ 
-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+#define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
+
+#include "clang/Basic/LangOptions.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/TextEncoding.h"
+
+enum ConversionAction { CA_NoConversion };
+
+class TextEncodingConfig {
+public:
+  llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
+};
+
+#endif
diff --git a/clang/lib/Frontend/CompilerInstance.cpp 
b/clang/lib/Frontend/CompilerInstance.cpp
index 9e88abbece7f2..09607e6de8ce7 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -34,6 +34,7 @@
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
+#include "clang/Lex/TextEncodingConfig.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "clang/Sema/ParsedAttr.h"
 #include "clang/Sema/Sema.h"
diff --git a/clang/lib/Lex/CMakeLists.txt b/clang/lib/Lex/CMakeLists.txt
index f61737cd68021..106a5d3b126be 100644
--- a/clang/lib/Lex/CMakeLists.txt
+++ b/clang/lib/Lex/CMakeLists.txt
@@ -29,6 +29,7 @@ add_clang_library(clangLex
   Preprocessor.cpp
   PreprocessorLexer.cpp
   ScratchBuffer.cpp
+  TextEncodingConfig.cpp
   TokenConcatenation.cpp
   TokenLexer.cpp
 
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp 
b/clang/lib/Lex/TextEncodingConfig.cpp
new file mode 100644
index 0000000000000..bb3f0d4b4abec
--- /dev/null
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -0,0 +1,20 @@
+//===--- TextEncodingConfig.cpp 
-------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Lex/TextEncodingConfig.h"
+#include "clang/Basic/DiagnosticDriver.h"
+
+using namespace llvm;
+
+llvm::TextEncodingConverter *
+TextEncodingConfig::getConverter(ConversionAction Action) const {
+  switch (Action) {
+  default:
+    return nullptr;
+  }
+}

>From a3b003af2fc9104896f3cd98d1e47982945669b5 Mon Sep 17 00:00:00 2001
From: Alison Zhang <[email protected]>
Date: Tue, 2 Jun 2026 15:46:40 -0400
Subject: [PATCH 2/2] Filetag support

---
 .../clang/Basic/DiagnosticCommonKinds.td      |   3 +
 clang/include/clang/Basic/SourceManager.h     |  18 ++-
 .../include/clang/Frontend/CompilerInstance.h |   1 +
 clang/include/clang/Lex/TextEncodingConfig.h  |  17 ++-
 clang/lib/Basic/SourceManager.cpp             | 115 ++++++++++++++----
 clang/lib/Frontend/CompilerInstance.cpp       |  45 +++++--
 .../lib/Frontend/VerifyDiagnosticConsumer.cpp |   3 +-
 clang/lib/Lex/ModuleMap.cpp                   |   5 +-
 clang/lib/Lex/PPDirectives.cpp                |   8 +-
 clang/lib/Lex/Preprocessor.cpp                |   5 +-
 clang/lib/Lex/TextEncodingConfig.cpp          |  38 ++++++
 clang/lib/Serialization/ASTReader.cpp         |   7 +-
 llvm/include/llvm/Support/TextEncoding.h      |   4 +
 llvm/lib/Support/TextEncoding.cpp             |   3 +-
 14 files changed, 230 insertions(+), 42 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td 
b/clang/include/clang/Basic/DiagnosticCommonKinds.td
index f2ed2f4698b8d..e4e8e079a31f3 100644
--- a/clang/include/clang/Basic/DiagnosticCommonKinds.td
+++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td
@@ -417,6 +417,9 @@ def note_file_sloc_usage : Note<
   "%plural{0:|: plus %2B (%human2B) for macro expansions}2">;
 def note_file_misc_sloc_usage : Note<
   "%0 additional files entered using a total of %1B (%human1B) of space">;
+def warn_charset_conversion_failed : Warning<
+  "conversion from source encoding failed for '%0': %1; interpreting as %2">,
+   InGroup<DiagGroup<"charset-conversion-failed">>;
 
 // Modules
 def err_module_format_unhandled : Error<
diff --git a/clang/include/clang/Basic/SourceManager.h 
b/clang/include/clang/Basic/SourceManager.h
index 4217b8683da1e..33517b9df59f5 100644
--- a/clang/include/clang/Basic/SourceManager.h
+++ b/clang/include/clang/Basic/SourceManager.h
@@ -50,6 +50,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TextEncoding.h"
 #include <cassert>
 #include <cstddef>
 #include <map>
@@ -156,6 +157,11 @@ class alignas(8) ContentCache {
   /// FIXME: Remove this once OrigEntry is a FileEntryRef with a stable name.
   StringRef Filename;
 
+  /// Information on whether this is associated with a FileID for a file (as
+  /// opposed to a buffer) and, if so, what conversion (if any) was requested.
+  llvm::PointerIntPair<llvm::TextEncodingConverter *, 1u, bool>
+      FileIDConverterInfo;
+
   /// A bump pointer allocated array of offsets for each source line.
   ///
   /// This is lazily computed.  The lines are owned by the SourceManager
@@ -919,8 +925,17 @@ class SourceManager : public RefCountedBase<SourceManager> 
{
   FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos,
                       SrcMgr::CharacteristicKind FileCharacter,
                       int LoadedID = 0,
+                      llvm::TextEncodingConverter *Converter, int LoadedID = 0,
                       SourceLocation::UIntTy LoadedOffset = 0);
 
+  FileID createFileID(FileEntryRef SourceFile, SourceLocation IncludePos,
+                      SrcMgr::CharacteristicKind FileCharacter,
+                      int LoadedID = 0,
+                      SourceLocation::UIntTy LoadedOffset = 0) {
+    return createFileID(SourceFile, IncludePos, FileCharacter,
+                        /*Converter=*/nullptr, LoadedID, LoadedOffset);
+  }
+
   /// Create a new FileID that represents the specified memory buffer.
   ///
   /// This does no caching of the buffer and takes ownership of the
@@ -942,7 +957,8 @@ class SourceManager : public RefCountedBase<SourceManager> {
   /// Get the FileID for \p SourceFile if it exists. Otherwise, create a
   /// new FileID for the \p SourceFile.
   FileID getOrCreateFileID(FileEntryRef SourceFile,
-                           SrcMgr::CharacteristicKind FileCharacter);
+                           SrcMgr::CharacteristicKind FileCharacter,
+                           llvm::TextEncodingConverter *Converter = nullptr);
 
   /// Creates an expansion SLocEntry for the substitution of an argument into a
   /// function-like macro's body. Returns the start of the expansion.
diff --git a/clang/include/clang/Frontend/CompilerInstance.h 
b/clang/include/clang/Frontend/CompilerInstance.h
index bb0eddb918623..6010b4bd900e9 100644
--- a/clang/include/clang/Frontend/CompilerInstance.h
+++ b/clang/include/clang/Frontend/CompilerInstance.h
@@ -864,6 +864,7 @@ class CompilerInstance : public ModuleLoader {
   ///
   /// \return True on success.
   static bool InitializeSourceManager(const FrontendInputFile &Input,
+                                      llvm::TextEncodingConverter *,
                                       DiagnosticsEngine &Diags,
                                       FileManager &FileMgr,
                                       SourceManager &SourceMgr);
diff --git a/clang/include/clang/Lex/TextEncodingConfig.h 
b/clang/include/clang/Lex/TextEncodingConfig.h
index a810b9ab61b1a..5cf077d6134a8 100644
--- a/clang/include/clang/Lex/TextEncodingConfig.h
+++ b/clang/include/clang/Lex/TextEncodingConfig.h
@@ -9,15 +9,37 @@
 #ifndef LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
 #define LLVM_CLANG_LEX_TEXTENCODINGCONFIG_H
 
-#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/Diagnostic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/TextEncoding.h"
+#include <memory>
+#include <string>
+#include <system_error>
 
-enum ConversionAction { CA_NoConversion };
+enum ConversionAction { CA_NoConversion, CA_FromInputEncoding };
 
 class TextEncodingConfig {
+  llvm::StringRef InputEncoding;
+  std::string FileTagEncoding;
+  std::unique_ptr<llvm::TextEncodingConverter> FromInputEncodingConverter;
+
 public:
   llvm::TextEncodingConverter *getConverter(ConversionAction Action) const;
+
+#ifdef __MVS__
+  /// Build a converter from the encoding identified by the z/OS file tag
+  /// \p Ccsid to UTF-8. Returns null if the tag already denotes UTF-8, or if
+  /// no converter could be created (an error is reported via \p Diags).
+  static std::unique_ptr<llvm::TextEncodingConverter>
+  createInputConverterFromFiletag(__ccsid_t Ccsid,
+                                  clang::DiagnosticsEngine &Diags);
+#endif
+
+  /// Install \p Converter as the input-encoding converter of \p TEC.
+  static std::error_code
+  setFromInputConverter(TextEncodingConfig &TEC,
+                        std::unique_ptr<llvm::TextEncodingConverter> Converter);
+  llvm::StringRef getInputEncoding() const { return InputEncoding; }
 };
 
 #endif
diff --git a/clang/lib/Basic/SourceManager.cpp 
b/clang/lib/Basic/SourceManager.cpp
index b6cc6ec9365f5..e36050847d40e 100644
--- a/clang/lib/Basic/SourceManager.cpp
+++ b/clang/lib/Basic/SourceManager.cpp
@@ -16,6 +16,7 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Basic/SourceManagerInternals.h"
+#include "clang/Lex/TextEncodingConfig.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
@@ -31,6 +32,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -120,6 +122,13 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
   // return paths.
   IsBufferInvalid = true;
 
+  // If we have a converter, open the file in binary mode to prevent
+  // autoconversion; the explicit conversion to UTF-8 happens further below,
+  // once the buffer has been loaded.
+  llvm::TextEncodingConverter *Converter = FileIDConverterInfo.getPointer();
+  bool IsText = (Converter == nullptr);
-  auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile);
+  auto BufferOrError = FM.getBufferForFile(*ContentsEntry, IsFileVolatile,
+                                           /*RequiresNullTerminator=*/true,
+                                           /*MaybeLimit=*/std::nullopt, IsText);
 
   // If we were unable to open the file, then we are in an inconsistent
@@ -136,7 +145,57 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
 
   Buffer = std::move(*BufferOrError);
 
-  // Check that the file's size fits in an 'unsigned' (with room for a
+  // Unless this is a named pipe (in which case we can handle a mismatch),
+  // check that the file's size is the same as in the file entry (which may
+  // have come from a stat cache).
+  // The buffer will always be larger than the file size on z/OS in the 
presence
+  // of characters outside the base character set.
+  assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
+  if (!ContentsEntry->isNamedPipe() &&
+      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
+    Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
+
+    return std::nullopt;
+  }
+
+  // Convert source from the input charset to UTF-8 if necessary.
+  if (Converter) {
+    StringRef OriginalBuf = Buffer->getBuffer();
+
+    llvm::SmallString<0> UTF8Buf;
+    UTF8Buf.reserve(OriginalBuf.size() + 1);
+
+    const std::error_code ConvEC = Converter->convert(OriginalBuf, UTF8Buf);
+    bool Converted = !ConvEC;
+    if (!Converted) {
+      StringRef InterpretedAs = "UTF-8";
+#ifdef __MVS__
+      // On z/OS, if conversion fails, try converting from IBM-1047 to UTF-8.
+      std::unique_ptr<llvm::TextEncodingConverter> FallbackConverter =
+          TextEncodingConfig::createInputConverterFromFiletag(1047, Diag);
+      if (FallbackConverter) {
+        UTF8Buf.clear();
+        UTF8Buf.reserve(OriginalBuf.size() + 1);
+        Converted = !FallbackConverter->convert(OriginalBuf, UTF8Buf);
+        if (Converted)
+          InterpretedAs = "IBM-1047";
+      }
+#endif
+      Diag.Report(Loc, diag::warn_charset_conversion_failed)
+          << ContentsEntry->getName() << ConvEC.message() << InterpretedAs;
+    }
+
+    // Install the converted text only when a conversion succeeded; otherwise
+    // keep the original bytes so that they are lexed as-is.
+    if (Converted) {
+      // TODO: Reclaim memory if the buffer size exceeds the content.
+      auto NewBuf = std::make_unique<llvm::SmallVectorMemoryBuffer>(
+          std::move(UTF8Buf), Buffer->getBufferIdentifier());
+      Buffer = std::move(NewBuf);
+    }
+  }
+
+  // Check that the buffer's size fits in an 'unsigned' (with room for a
   // past-the-end value). This is deeply regrettable, but various parts of
   // Clang (including elsewhere in this file!) use 'unsigned' to represent file
   // offsets, line numbers, string literal lengths, and so on, and fail
@@ -151,22 +210,15 @@ ContentCache::getBufferOrNone(DiagnosticsEngine &Diag, 
FileManager &FM,
     return std::nullopt;
   }
 
-  // Unless this is a named pipe (in which case we can handle a mismatch),
-  // check that the file's size is the same as in the file entry (which may
-  // have come from a stat cache).
-  // The buffer will always be larger than the file size on z/OS in the 
presence
-  // of characters outside the base character set.
-  assert(Buffer->getBufferSize() >= (size_t)ContentsEntry->getSize());
-  if (!ContentsEntry->isNamedPipe() &&
-      Buffer->getBufferSize() < (size_t)ContentsEntry->getSize()) {
-    Diag.Report(Loc, diag::err_file_modified) << ContentsEntry->getName();
-
-    return std::nullopt;
-  }
-
-  // If the buffer is valid, check to see if it has a UTF Byte Order Mark
-  // (BOM).  We only support UTF-8 with and without a BOM right now.  See
-  // http://en.wikipedia.org/wiki/Byte_order_mark for more information.
+  // If the buffer is valid, check to see if it has a UTF Byte Order Mark (BOM)
+  // Note that any conversion requested using `-finput-charset` (if successful)
+  // has already occurred, so we are expecting UTF-8 with or without a BOM.
+  //
+  // In theory, if we see a non-UTF-8 BOM, we can assume that an appropriate
+  // conversion was not supplied via `-finput-charset` and we could try to
+  // convert based on the BOM.
+  //
+  // See http://en.wikipedia.org/wiki/Byte_order_mark for more information.
   StringRef BufStr = Buffer->getBuffer();
   const char *InvalidBOM = getInvalidBOM(BufStr);
 
@@ -537,15 +589,30 @@ FileID SourceManager::getNextFileID(FileID FID) const {
 /// being \#included from the specified IncludePosition.
 FileID SourceManager::createFileID(FileEntryRef SourceFile,
                                    SourceLocation IncludePos,
+                                   llvm::TextEncodingConverter *Converter,
                                    SrcMgr::CharacteristicKind FileCharacter,
                                    int LoadedID,
                                    SourceLocation::UIntTy LoadedOffset) {
   SrcMgr::ContentCache &IR = getOrCreateContentCache(SourceFile,
                                                      isSystem(FileCharacter));
 
+  #ifndef NDEBUG
+  // Either the content cache has never been used for a FileID (and, if we are
+  // being asked to use a converter, there should be no valid buffer set up for
+  // it) or the conversion (or lack thereof) should be the same as that used
+  // previously.
+  auto [CacheConverter, CacheUsedByFileID] = IR.FileIDConverterInfo;
+  if (CacheUsedByFileID)
+    assert(CacheConverter == Converter);
+  else
+    assert(!Converter || IR.IsBufferInvalid || !IR.getBufferIfLoaded());
+#endif
+  IR.FileIDConverterInfo.setPointerAndInt(Converter, true);
+
   // If this is a named pipe, immediately load the buffer to ensure subsequent
   // calls to ContentCache::getSize() are accurate.
-  if (IR.ContentsEntry->isNamedPipe())
+  // Do the same if character-encoding conversion was requested.
+  if (IR.ContentsEntry->isNamedPipe() || Converter) 
     (void)IR.getBufferOrNone(Diag, getFileManager(), SourceLocation());
 
   return createFileIDImpl(IR, SourceFile.getName(), IncludePos, FileCharacter,
@@ -583,10 +650,12 @@ FileID SourceManager::createFileID(const 
llvm::MemoryBufferRef &Buffer,
 /// new FileID for the \p SourceFile.
 FileID
 SourceManager::getOrCreateFileID(FileEntryRef SourceFile,
-                                 SrcMgr::CharacteristicKind FileCharacter) {
+                                 SrcMgr::CharacteristicKind FileCharacter,
+                                 llvm::TextEncodingConverter *Converter) {
   FileID ID = translateFile(SourceFile);
-  return ID.isValid() ? ID : createFileID(SourceFile, SourceLocation(),
-                                         FileCharacter);
+  return ID.isValid() ? ID
+                      : createFileID(SourceFile, SourceLocation(),
+                                     FileCharacter, Converter);
 }
 
 /// createFileID - Create a new FileID for the specified ContentCache and
@@ -2340,8 +2409,8 @@ SourceManagerForFile::SourceManagerForFile(StringRef 
FileName,
       std::make_unique<DiagnosticsEngine>(DiagnosticIDs::create(), *DiagOpts);
   SourceMgr = std::make_unique<SourceManager>(*Diagnostics, *FileMgr);
   FileEntryRef FE = llvm::cantFail(FileMgr->getFileRef(FileName));
-  FileID ID =
-      SourceMgr->createFileID(FE, SourceLocation(), clang::SrcMgr::C_User);
+  FileID ID = SourceMgr->createFileID(
+      FE, SourceLocation(), clang::SrcMgr::C_User, /*Converter=*/nullptr);
   assert(ID.isValid());
   SourceMgr->setMainFileID(ID);
 }
diff --git a/clang/lib/Frontend/CompilerInstance.cpp 
b/clang/lib/Frontend/CompilerInstance.cpp
index 09607e6de8ce7..fc04f32910ea1 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -51,6 +51,9 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Plugins/PassPlugin.h"
 #include "llvm/Support/AdvisoryLock.h"
+#ifdef __MVS__
+#include "llvm/Support/AutoConvert.h"
+#endif
 #include "llvm/Support/BuryPointer.h"
 #include "llvm/Support/CrashRecoveryContext.h"
 #include "llvm/Support/Errc.h"
@@ -907,15 +910,37 @@ CompilerInstance::createOutputFileImpl(StringRef 
OutputPath, bool Binary,
 // Initialization Utilities
 
 bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input){
-  return InitializeSourceManager(Input, getDiagnostics(), getFileManager(),
-                                 getSourceManager());
+  llvm::TextEncodingConverter *Converter = nullptr;
+  if (hasPreprocessor() && !Input.isBuffer()) {
+    Preprocessor &PP = getPreprocessor();
+
+#ifdef __MVS__
+    // Check for system filetag if we are on z/OS.
+    StringRef InputFile = Input.getFile();
+    llvm::ErrorOr<__ccsid_t> Ccsid = llvm::getzOSFileTag(InputFile);
+    if (!Ccsid.getError() && *Ccsid > 0) {
+      // Create converter from filetag if it exists
+      std::unique_ptr<llvm::TextEncodingConverter> InputConverter =
+          TextEncodingConfig::createInputConverterFromFiletag(*Ccsid, 
getDiagnostics());
+
+      if (InputConverter)
+        TextEncodingConfig::setFromInputConverter(
+            PP.getTextEncodingConfig(), std::move(InputConverter));
+    }
+#endif
+
+    // Retrieve the converter to the internal charset if it exists.
+    Converter = PP.getTextEncodingConfig().getConverter(CA_FromInputEncoding);
+  }
+
+  return InitializeSourceManager(Input, Converter, getDiagnostics(),
+                                 getFileManager(), getSourceManager());  
 }
 
 // static
-bool CompilerInstance::InitializeSourceManager(const FrontendInputFile &Input,
-                                               DiagnosticsEngine &Diags,
-                                               FileManager &FileMgr,
-                                               SourceManager &SourceMgr) {
+bool CompilerInstance::InitializeSourceManager(
+    const FrontendInputFile &Input, llvm::TextEncodingConverter *Converter,
+    DiagnosticsEngine &Diags, FileManager &FileMgr, SourceManager &SourceMgr) {
   SrcMgr::CharacteristicKind Kind =
       Input.getKind().getFormat() == InputKind::ModuleMap
           ? Input.isSystem() ? SrcMgr::C_System_ModuleMap
@@ -931,10 +956,14 @@ bool CompilerInstance::InitializeSourceManager(const 
FrontendInputFile &Input,
 
   StringRef InputFile = Input.getFile();
 
+  // If we have a converter, open the file in binary mode to avoid 
autoconversion.
+  bool IsText = (Converter == nullptr);
+
   // Figure out where to get and map in the main file.
   auto FileOrErr = InputFile == "-"
                        ? FileMgr.getSTDIN()
-                       : FileMgr.getFileRef(InputFile, /*OpenFile=*/true);
+                       : FileMgr.getFileRef(InputFile, /*OpenFile=*/true,
+                                           /*CacheFailure=*/true, IsText);
   if (!FileOrErr) {
     auto EC = llvm::errorToErrorCode(FileOrErr.takeError());
     if (InputFile != "-")
@@ -945,7 +974,7 @@ bool CompilerInstance::InitializeSourceManager(const 
FrontendInputFile &Input,
   }
 
   SourceMgr.setMainFileID(
-      SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind));
+      SourceMgr.createFileID(*FileOrErr, SourceLocation(), Kind, Converter));  
  
 
   assert(SourceMgr.getMainFileID().isValid() &&
          "Couldn't establish MainFileID!");
diff --git a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp 
b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
index 1bfe644b2525a..2afe990761267 100644
--- a/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
+++ b/clang/lib/Frontend/VerifyDiagnosticConsumer.cpp
@@ -611,7 +611,8 @@ static bool ParseDirective(StringRef S, ExpectedData *ED, 
SourceManager &SM,
 
           FileID FID = SM.translateFile(*File);
           if (FID.isInvalid())
-            FID = SM.createFileID(*File, Pos, SrcMgr::C_User);
+            FID = SM.createFileID(*File, Pos, SrcMgr::C_User,
+                                 /*Converter=*/nullptr);
 
           if (PH.Next(Line) && Line > 0)
             ExpectedLoc = SM.translateLineCol(FID, Line, 1);
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index 6c07386f89010..c7c1e04b76ea1 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -1473,7 +1473,10 @@ bool ModuleMap::parseModuleMapFile(FileEntryRef File, 
bool IsSystem,
     if (LocalFID.isInvalid()) {
       auto FileCharacter =
           IsSystem ? SrcMgr::C_System_ModuleMap : SrcMgr::C_User_ModuleMap;
-      LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter);
+      // FIXME: Module map files are also textual "source files". For 
consistency,
+      // conversion should occur.
+      LocalFID = SourceMgr.createFileID(File, ExternModuleLoc, FileCharacter,
+                                /*Converter=*/nullptr);
     }
     ID = LocalFID;
   }
diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp
index eb21a510dcf83..f9636f2e61ae5 100644
--- a/clang/lib/Lex/PPDirectives.cpp
+++ b/clang/lib/Lex/PPDirectives.cpp
@@ -2796,7 +2796,13 @@ Preprocessor::ImportAction 
Preprocessor::HandleHeaderIncludeOrImport(
   // position on the file where it will be included and after the expansions.
   if (IncludePos.isMacroID())
     IncludePos = SourceMgr.getExpansionRange(IncludePos).getEnd();
-  FileID FID = SourceMgr.createFileID(*File, IncludePos, FileCharacter);
+  // Retrieve the converter to the internal charset if it exists.
+  llvm::TextEncodingConverter *Converter =
+      getTextEncodingConfig().getConverter(CA_FromInputEncoding);
+
+  FileID FID =
+      SourceMgr.createFileID(*File, IncludePos, FileCharacter, Converter);
+
   if (!FID.isValid()) {
     TheModuleLoader.HadFatalFailure = true;
     return ImportAction::Failure;
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 1e21b4a94cea3..8173723fe9bfe 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -649,8 +649,9 @@ void Preprocessor::EnterMainSourceFile() {
           << PPOpts.PCHThroughHeader;
       return;
     }
-    setPCHThroughHeaderFileID(
-        SourceMgr.createFileID(*File, SourceLocation(), SrcMgr::C_User));
+    // FIXME: Figure out character-encoding converter treatment.
+    setPCHThroughHeaderFileID(SourceMgr.createFileID(
+        *File, SourceLocation(), SrcMgr::C_User, /*Converter=*/nullptr));
   }
 
   // Skip tokens from the Predefines and if needed the main file.
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp 
b/clang/lib/Lex/TextEncodingConfig.cpp
index bb3f0d4b4abec..a0cf1e62c7ec1 100644
--- a/clang/lib/Lex/TextEncodingConfig.cpp
+++ b/clang/lib/Lex/TextEncodingConfig.cpp
@@ -8,13 +8,54 @@
 
 #include "clang/Lex/TextEncodingConfig.h"
 #include "clang/Basic/DiagnosticDriver.h"
+#include "llvm/Support/AutoConvert.h"
 
 using namespace llvm;
 
 llvm::TextEncodingConverter *
 TextEncodingConfig::getConverter(ConversionAction Action) const {
   switch (Action) {
+  case CA_FromInputEncoding:
+    return FromInputEncodingConverter.get();
   default:
     return nullptr;
   }
 }
+
+#ifdef __MVS__
+/// Create a converter from the encoding named by the z/OS file tag \p Ccsid
+/// to UTF-8. Returns null when the tag is already recognized as UTF-8, or --
+/// after reporting err_drv_invalid_value -- when no converter can be created.
+std::unique_ptr<llvm::TextEncodingConverter>
+TextEncodingConfig::createInputConverterFromFiletag(
+    __ccsid_t Ccsid, clang::DiagnosticsEngine &Diags) {
+  const std::string FileTagEncoding = std::to_string(Ccsid);
+  llvm::StringRef InputEncoding = FileTagEncoding;
+  const char *UTF8 = "UTF-8";
+
+  // No converter is needed if the input is already UTF-8.
+  if (llvm::TextEncodingConverter::getKnownEncoding(InputEncoding) ==
+      llvm::TextEncodingConverter::getKnownEncoding(UTF8))
+    return nullptr;
+
+  ErrorOr<TextEncodingConverter> ErrorOrConverter =
+      llvm::TextEncodingConverter::create(InputEncoding, UTF8);
+  if (!ErrorOrConverter) {
+    Diags.Report(clang::diag::err_drv_invalid_value)
+        << "filetag" << InputEncoding;
+    return nullptr;
+  }
+  return std::make_unique<llvm::TextEncodingConverter>(
+      std::move(*ErrorOrConverter));
+}
+#endif // __MVS__
+
+/// Transfer ownership of \p Converter to \p TEC as its input-encoding
+/// converter, replacing any previous one. Always returns success.
+std::error_code
+TextEncodingConfig::setFromInputConverter(
+    TextEncodingConfig &TEC,
+    std::unique_ptr<llvm::TextEncodingConverter> Converter) {
+  TEC.FromInputEncodingConverter = std::move(Converter);
+  return std::error_code();
+}
diff --git a/clang/lib/Serialization/ASTReader.cpp 
b/clang/lib/Serialization/ASTReader.cpp
index 74a7b51368c28..7bc28a2791067 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -2002,8 +2002,11 @@ bool ASTReader::ReadSLocEntry(int ID) {
     }
     SrcMgr::CharacteristicKind
       FileCharacter = (SrcMgr::CharacteristicKind)Record[2];
-    FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter, ID,
-                                        BaseOffset + Record[0]);
+    // Note: If conversion was originally necessary, OverriddenBuffer should be
+    // true and the associated handling will trigger.
+    FileID FID = SourceMgr.createFileID(*File, IncludeLoc, FileCharacter,
+                                        /*Converter=*/nullptr, ID,
+                                       BaseOffset + Record[0]);
     SrcMgr::FileInfo &FileInfo = SourceMgr.getSLocEntry(FID).getFile();
     FileInfo.NumCreatedFIDs = Record[5];
     if (Record[3])
diff --git a/llvm/include/llvm/Support/TextEncoding.h 
b/llvm/include/llvm/Support/TextEncoding.h
index 8a304910aa5dd..75d4e40bfc786 100644
--- a/llvm/include/llvm/Support/TextEncoding.h
+++ b/llvm/include/llvm/Support/TextEncoding.h
@@ -137,6 +137,9 @@ class TextEncodingConverter {
   }
+
+  // Maps the encoding name to enum constant if possible.
+  static std::optional<TextEncoding> getKnownEncoding(StringRef Name);
 };
 
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Support/TextEncoding.cpp 
b/llvm/lib/Support/TextEncoding.cpp
index d36f02c1300b9..bca1d9c94b057 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -48,7 +48,8 @@ static void normalizeCharSetName(StringRef CSName,
 }
 
 // Maps the encoding name to enum constant if possible.
-static std::optional<TextEncoding> getKnownEncoding(StringRef Name) {
+std::optional<TextEncoding> 
+TextEncodingConverter::getKnownEncoding(StringRef Name) {
   SmallString<16> Normalized;
   normalizeCharSetName(Name, Normalized);
   if (Normalized.equals("utf8"))

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to