https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/195890
>From f720469a901ad09b0cf94603fbcbf17a13946004 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:17:22 -0400 Subject: [PATCH 1/4] add ParserConversionAction, do not translate unevaluated strings --- clang/include/clang/Parse/Parser.h | 1 + clang/include/clang/Sema/Sema.h | 4 +++- clang/lib/Parse/ParseDecl.cpp | 10 ++++++++++ clang/lib/Parse/ParseDeclCXX.cpp | 2 ++ clang/lib/Parse/ParseExpr.cpp | 6 +++--- clang/lib/Parse/Parser.cpp | 4 ++++ clang/lib/Sema/SemaExpr.cpp | 11 ++++++----- clang/test/CodeGen/systemz-charset-diag.cpp | 9 +++++++++ clang/test/CodeGen/systemz-charset.c | 15 +++++++++++++++ 9 files changed, 53 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGen/systemz-charset-diag.cpp diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index cec1dc99e90d8..fa465f9e83efe 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -5662,6 +5662,7 @@ class Parser : public CodeCompletionHandler { bool Finished; }; ObjCImplParsingDataRAII *CurParsedObjCImpl; + ConversionAction ParserConversionAction; /// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them /// for later parsing. diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 72beac7526dc5..e2bc5593efa97 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,6 +55,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" @@ -7372,7 +7373,8 @@ class Sema final : public SemaBase { /// from multiple tokens. However, the common case is that StringToks points /// to one string. 
ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks, - Scope *UDLScope = nullptr); + Scope *UDLScope = nullptr, + ConversionAction Action = CA_ToExecEncoding); ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks); diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 55ea562faacaa..d9ee9de4f5377 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -564,6 +564,9 @@ unsigned Parser::ParseAttributeArgsCommon( nullptr, Sema::ExpressionEvaluationContextRecord::EK_AttrArgument); + SaveAndRestore<ConversionAction> SavedTranslationState( + ParserConversionAction, CA_NoConversion); + ExprResult ArgExpr = ParseAssignmentExpression(); if (ArgExpr.isInvalid()) { SkipUntil(tok::r_paren, StopAtSemi); @@ -644,6 +647,9 @@ void Parser::ParseGNUAttributeArgs( ParsedAttr::Kind AttrKind = ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax()); + SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction, + CA_NoConversion); + if (AttrKind == ParsedAttr::AT_Availability) { ParseAvailabilityAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc, ScopeName, ScopeLoc, Form); @@ -723,6 +729,9 @@ unsigned Parser::ParseClangAttributeArgs( ParsedAttr::Kind AttrKind = ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax()); + SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction, + CA_NoConversion); + switch (AttrKind) { default: return ParseAttributeArgsCommon(AttrName, AttrNameLoc, Attrs, EndLoc, @@ -1546,6 +1555,7 @@ void Parser::ParseExternalSourceSymbolAttribute( SkipUntil(tok::comma, tok::r_paren, StopAtSemi | StopBeforeMatch); continue; } + if (Keyword == Ident_language) { if (HadLanguage) { Diag(KeywordLoc, diag::err_external_source_symbol_duplicate_clause) diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 877161b65b6c3..61c4df70693de 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp 
@@ -1001,6 +1001,8 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { return nullptr; } } else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) { + SaveAndRestore<ConversionAction> SavedTranslationState( + ParserConversionAction, CA_NoConversion); AssertMessage = ParseUnevaluatedStringLiteralExpression(); } else { Diag(Tok, diag::err_expected_string_literal) diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 4c5ad7d87d6f9..7a030550b15f1 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -3023,9 +3023,9 @@ ExprResult Parser::ParseStringLiteralExpression(bool AllowUserDefinedLiteral, } // Pass the set of string tokens, ready for concatenation, to the actions. - return Actions.ActOnStringLiteral(StringToks, - AllowUserDefinedLiteral ? getCurScope() - : nullptr); + return Actions.ActOnStringLiteral( + StringToks, AllowUserDefinedLiteral ? getCurScope() : nullptr, + ParserConversionAction); } ExprResult Parser::ParseGenericSelectionExpression() { diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 5d87453cf219e..2ef6eb2be63b8 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -70,6 +70,8 @@ Parser::Parser(Preprocessor &pp, Sema &actions, bool skipFunctionBodies) NumCachedScopes = 0; CurParsedObjCImpl = nullptr; + ParserConversionAction = CA_ToExecEncoding; + // Add #pragma handlers. These are removed and destroyed in the // destructor. 
initializePragmaHandlers(); @@ -1551,6 +1553,8 @@ void Parser::ParseKNRParamDeclarations(Declarator &D) { } ExprResult Parser::ParseAsmStringLiteral(bool ForAsmLabel) { + SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction, + CA_NoConversion); ExprResult AsmString; if (isTokenStringLiteral()) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 98062afae4577..83d57a917fa1e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2159,8 +2159,8 @@ ExprResult Sema::ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks) { if (getLangOpts().MicrosoftExt) StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks); - StringLiteralParser Literal(StringToks, PP, - StringLiteralEvalMethod::Unevaluated); + StringLiteralParser Literal( + StringToks, PP, StringLiteralEvalMethod::Unevaluated, CA_NoConversion); if (Literal.hadError) return ExprError(); @@ -2231,8 +2231,8 @@ Sema::ExpandFunctionLocalPredefinedMacros(ArrayRef<Token> Toks) { return ExpandedToks; } -ExprResult -Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { +ExprResult Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope, + ConversionAction Action) { assert(!StringToks.empty() && "Must have at least one string!"); // StringToks needs backing storage as it doesn't hold array elements itself @@ -2240,7 +2240,8 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { if (getLangOpts().MicrosoftExt) StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks); - StringLiteralParser Literal(StringToks, PP); + StringLiteralParser Literal(StringToks, PP, + StringLiteralEvalMethod::Evaluated, Action); if (Literal.hadError) return ExprError(); diff --git a/clang/test/CodeGen/systemz-charset-diag.cpp b/clang/test/CodeGen/systemz-charset-diag.cpp new file mode 100644 index 0000000000000..11d60e1ac2793 --- /dev/null +++ 
b/clang/test/CodeGen/systemz-charset-diag.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -triple s390x-none-zos -fexec-charset IBM-1047 %s -std=c++17 -emit-llvm -o - -verify + +static_assert(false, "Error string"); // expected-error {{static assertion failed: Error string}} + +[[deprecated("message")]] void test_deprecated() {return;} // expected-note {{'test_deprecated' has been explicitly marked deprecated here}} + +int main() { + test_deprecated(); // expected-warning {{'test_deprecated' is deprecated: message}} +} diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 897b9d2eeefa1..5279b780531c3 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -56,3 +56,18 @@ const char *Unicode = "ÿ"; // RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR // CHECK-ERROR: error: failed to set fexec-charset to 'invalid' +void test1() { + printf(__FUNCTION__); +} +//CHECK: @__FUNCTION__.test1 = private unnamed_addr constant [6 x i8] c"\A3\85\A2\A3\F1\00" + +#define HELLO "Hello " +#define WORLD "World!" 
+#define HELLO_WORLD HELLO WORLD +const char* hello_macro = HELLO; +//CHECK: c"\C8\85\93\93\96@\00" +//CHECK-UTF8 = c"Hello\00" + +const char* preprocessor_concatenation = HELLO_WORLD; +//CHECK: c"\C8\85\93\93\96@\E6\96\99\93\84Z\00" +//CHECK-UTF8: c"Hello World!\00" >From 5f9b389a8d09367107d54af8cb2e7ec94244bf6e Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:19:11 -0400 Subject: [PATCH 2/4] Add format string handling --- clang/include/clang/AST/Expr.h | 6 ++ clang/include/clang/AST/FormatString.h | 12 +-- clang/include/clang/Basic/TargetInfo.h | 3 + clang/include/clang/Lex/TextEncodingConfig.h | 3 +- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/AST/Expr.cpp | 14 +++ clang/lib/AST/FormatString.cpp | 86 ++++++++++--------- clang/lib/AST/FormatStringParsing.h | 36 +++++--- clang/lib/AST/PrintfFormatString.cpp | 89 +++++++++++++------- clang/lib/AST/ScanfFormatString.cpp | 23 +++-- clang/lib/Basic/TargetInfo.cpp | 3 + clang/lib/Frontend/CompilerInstance.cpp | 2 +- clang/lib/Lex/TextEncodingConfig.cpp | 11 ++- clang/lib/Sema/SemaChecking.cpp | 54 +++++++----- clang/lib/Sema/SemaExpr.cpp | 5 +- clang/test/CodeGen/systemz-charset.c | 2 + llvm/include/llvm/Support/TextEncoding.h | 10 +++ llvm/lib/Support/TextEncoding.cpp | 19 +++++ 18 files changed, 258 insertions(+), 122 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 393fe275c6269..d01afcff4095d 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -28,6 +28,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SyncScope.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/TextEncodingConfig.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" @@ -2066,6 +2067,11 @@ class PredefinedExpr final return getIdentKindName(getIdentKind()); } + static std::string + ComputeNameAndTranslate(PredefinedIdentKind IK, const Decl *CurrentDecl, 
+ TextEncodingConfig &TEC, + bool ForceElaboratedPrinting = false); + static std::string ComputeName(PredefinedIdentKind IK, const Decl *CurrentDecl, bool ForceElaboratedPrinting = false); diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index a3382e1a1d007..a24ade2d71ee9 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -19,6 +19,7 @@ #define LLVM_CLANG_AST_FORMATSTRING_H #include "clang/AST/CanonicalType.h" +#include "llvm/Support/TextEncoding.h" #include <optional> namespace clang { @@ -728,7 +729,8 @@ class FormatStringHandler { virtual bool HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } @@ -744,10 +746,10 @@ class FormatStringHandler { // Scanf-specific handlers. - virtual bool - HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) { + virtual bool HandleInvalidScanfConversionSpecifier( + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index 9f7d2a17a0f8a..ec7d4fcd4d8e3 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -38,6 +38,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Error.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" #include <cassert> @@ -323,6 +324,8 @@ class TargetInfo : public TransferrableTargetInfo, virtual ~TargetInfo(); + llvm::TextEncodingConverter *FormatStrConverter; + /// Retrieve the target options. 
TargetOptions &getTargetOpts() const { assert(TargetOpts && "Missing target options"); diff --git a/clang/include/clang/Lex/TextEncodingConfig.h b/clang/include/clang/Lex/TextEncodingConfig.h index 09967a81beeed..f4ef578eb2991 100644 --- a/clang/include/clang/Lex/TextEncodingConfig.h +++ b/clang/include/clang/Lex/TextEncodingConfig.h @@ -26,7 +26,8 @@ class TextEncodingConfig { llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; static std::error_code setConvertersFromOptions(TextEncodingConfig &TEC, - const clang::LangOptions &Opts); + const clang::LangOptions &Opts, + clang::TargetInfo &TInfo); llvm::StringRef getExecEncoding() { return ExecEncoding; } }; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index e2bc5593efa97..8ac5cc175fd2f 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,7 +55,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" -#include "clang/Lex/LiteralConverter.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 64d61dbc3d128..e067df4cefd7b 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -668,6 +668,20 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) { llvm_unreachable("Unknown ident kind for PredefinedExpr"); } +std::string PredefinedExpr::ComputeNameAndTranslate( + PredefinedIdentKind IK, const Decl *CurrentDecl, TextEncodingConfig &TEC, + bool ForceElaboratedPrinting) { + using namespace clang::charinfo; + std::string Result = ComputeName(IK, CurrentDecl, ForceElaboratedPrinting); + llvm::TextEncodingConverter *Converter = TEC.getConverter(CA_ToExecEncoding); + if (Converter) { + SmallString<128> Converted; + Converter->convert(Result, Converted); + Result = 
std::string(Converted); + } + return Result; +} + // FIXME: Maybe this should use DeclPrinter with a special "print predefined // expr" policy instead. std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 7e1ac0de6dcaf..0d449fb5f0904 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -33,8 +33,9 @@ FormatStringHandler::~FormatStringHandler() {} // scanf format strings. //===----------------------------------------------------------------------===// -OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, - const char *E) { +OptionalAmount clang::analyze_format_string::ParseAmount( + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; UpdateOnReturn<const char *> UpdateBeg(Beg, I); @@ -42,7 +43,7 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, bool hasDigits = false; for (; I != E; ++I) { - char c = *I; + char c = FormatStrConverter.convert(*I); if (c >= '0' && c <= '9') { hasDigits = true; accumulator = (accumulator * 10) + (c - '0'); @@ -60,21 +61,22 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, } OptionalAmount clang::analyze_format_string::ParseNonPositionAmount( - const char *&Beg, const char *E, unsigned &argIndex) { - if (*Beg == '*') { + const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { ++Beg; return OptionalAmount(OptionalAmount::Arg, argIndex++, Beg, 0, false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } OptionalAmount clang::analyze_format_string::ParsePositionAmount( FormatStringHandler &H, const char *Start, const char *&Beg, const char *E, - PositionContext p) { - if (*Beg == '*') { + PositionContext p, const llvm::TextEncodingConverter 
&FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { const char *I = Beg + 1; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() == OptionalAmount::NotSpecified) { H.HandleInvalidPosition(Beg, I - Beg, p); @@ -89,7 +91,7 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount( assert(Amt.getHowSpecified() == OptionalAmount::Constant); - if (*I == '$') { + if (FormatStrConverter.convert(*I) == '$') { // Handle positional arguments // Special case: '*0$', since this is an easy mistake. @@ -109,18 +111,21 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount( return OptionalAmount(false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } bool clang::analyze_format_string::ParseFieldWidth( FormatStringHandler &H, FormatSpecifier &CS, const char *Start, - const char *&Beg, const char *E, unsigned *argIndex) { + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { // FIXME: Support negative field widths. 
if (argIndex) { - CS.setFieldWidth(ParseNonPositionAmount(Beg, E, *argIndex)); + CS.setFieldWidth( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { const OptionalAmount Amt = ParsePositionAmount( - H, Start, Beg, E, analyze_format_string::FieldWidthPos); + H, Start, Beg, E, analyze_format_string::FieldWidthPos, + FormatStrConverter); if (Amt.isInvalid()) return true; @@ -129,14 +134,13 @@ bool clang::analyze_format_string::ParseFieldWidth( return false; } -bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, - FormatSpecifier &FS, - const char *Start, - const char *&Beg, - const char *E) { +bool clang::analyze_format_string::ParseArgPosition( + FormatStringHandler &H, FormatSpecifier &FS, const char *Start, + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (I == E) { // No more characters left? @@ -144,7 +148,8 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return true; } - if (Amt.getHowSpecified() == OptionalAmount::Constant && *(I++) == '$') { + if (Amt.getHowSpecified() == OptionalAmount::Constant && + FormatStrConverter.convert(*(I++)) == '$') { // Warn that positional arguments are non-standard. 
H.HandlePosition(Start, I - Start); @@ -165,16 +170,15 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return false; } -bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, - FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO) { +bool clang::analyze_format_string::ParseVectorModifier( + FormatStringHandler &H, FormatSpecifier &FS, const char *&I, const char *E, + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter) { if (!LO.OpenCL) return false; const char *Start = I; - if (*I == 'v') { + if (FormatStrConverter.convert(*I) == 'v') { ++I; if (I == E) { @@ -182,7 +186,7 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return true; } - OptionalAmount NumElts = ParseAmount(I, E); + OptionalAmount NumElts = ParseAmount(I, E, FormatStrConverter); if (NumElts.getHowSpecified() != OptionalAmount::Constant) { H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -194,22 +198,20 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return false; } -bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO, - bool IsScanf) { +bool clang::analyze_format_string::ParseLengthModifier( + FormatSpecifier &FS, const char *&I, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, bool IsScanf) { LengthModifier::Kind lmKind = LengthModifier::None; const char *lmPosition = I; - switch (*I) { + switch (FormatStrConverter.convert(*I)) { default: return false; case 'h': ++I; - if (I != E && *I == 'h') { + if (I != E && FormatStrConverter.convert(*I) == 'h') { ++I; lmKind = LengthModifier::AsChar; - } else if (I != E && *I == 'l' && LO.OpenCL) { + } else if (I != E && FormatStrConverter.convert(*I) == 'l' && LO.OpenCL) { ++I; lmKind = LengthModifier::AsShortLong; } else { @@ -218,7 +220,7 @@ 
bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, break; case 'l': ++I; - if (I != E && *I == 'l') { + if (I != E && FormatStrConverter.convert(*I) == 'l') { ++I; lmKind = LengthModifier::AsLongLong; } else { @@ -251,7 +253,9 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, // be parsed as the GNU extension 'a' length modifier. If not, this // will be parsed as a conversion specifier. ++I; - if (I != E && (*I == 's' || *I == 'S' || *I == '[')) { + if (I != E && (FormatStrConverter.convert(*I) == 's' || + FormatStrConverter.convert(*I) == 'S' || + FormatStrConverter.convert(*I) == '[')) { lmKind = LengthModifier::AsAllocate; break; } @@ -269,7 +273,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, // scanf: AsInt64 case 'I': if (I + 1 != E && I + 2 != E) { - if (I[1] == '6' && I[2] == '4') { + if (FormatStrConverter.convert(I[1]) == '6' && + FormatStrConverter.convert(I[2]) == '4') { I += 3; lmKind = LengthModifier::AsInt64; break; @@ -277,7 +282,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, if (IsScanf) return false; - if (I[1] == '3' && I[2] == '2') { + if (FormatStrConverter.convert(I[1]) == '3' && + FormatStrConverter.convert(I[2]) == '2') { I += 3; lmKind = LengthModifier::AsInt32; break; diff --git a/clang/lib/AST/FormatStringParsing.h b/clang/lib/AST/FormatStringParsing.h index 401528481a9d6..531bc291e0b5b 100644 --- a/clang/lib/AST/FormatStringParsing.h +++ b/clang/lib/AST/FormatStringParsing.h @@ -35,29 +35,43 @@ template <typename T> class UpdateOnReturn { namespace analyze_format_string { -OptionalAmount ParseAmount(const char *&Beg, const char *E); -OptionalAmount ParseNonPositionAmount(const char *&Beg, const char *E, - unsigned &argIndex); +OptionalAmount +ParseAmount(const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); -OptionalAmount ParsePositionAmount(FormatStringHandler &H, const char 
*Start, - const char *&Beg, const char *E, - PositionContext p); +OptionalAmount +ParseNonPositionAmount(const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); + +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); + +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseFieldWidth(FormatStringHandler &H, FormatSpecifier &CS, const char *Start, const char *&Beg, const char *E, - unsigned *argIndex); + unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseArgPosition(FormatStringHandler &H, FormatSpecifier &CS, - const char *Start, const char *&Beg, const char *E); + const char *Start, const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseVectorModifier(FormatStringHandler &H, FormatSpecifier &FS, - const char *&Beg, const char *E, - const LangOptions &LO); + const char *&Beg, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter); /// Returns true if a LengthModifier was parsed and installed in the /// FormatSpecifier& argument, and false otherwise. 
bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E, - const LangOptions &LO, bool IsScanf = false); + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, + bool IsScanf = false); /// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8 /// string; check that it won't go further than \p FmtStrEnd and write diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 6610a2de9e083..7efcc554ec136 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -35,14 +35,17 @@ typedef clang::analyze_format_string::SpecifierResult<PrintfSpecifier> using analyze_format_string::ParseNonPositionAmount; -static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, - const char *Start, const char *&Beg, const char *E, - unsigned *argIndex) { +static bool +ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, const char *Start, + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { if (argIndex) { - FS.setPrecision(ParseNonPositionAmount(Beg, E, *argIndex)); + FS.setPrecision( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { const OptionalAmount Amt = ParsePositionAmount( - H, Start, Beg, E, analyze_format_string::PrecisionPos); + H, Start, Beg, E, analyze_format_string::PrecisionPos, + FormatStrConverter); if (Amt.isInvalid()) return true; FS.setPrecision(Amt); @@ -50,11 +53,14 @@ static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, return false; } -static bool ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, - const char *FlagBeg, const char *E, bool Warn) { +static bool +ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, const char *FlagBeg, + const char *E, bool Warn, + const llvm::TextEncodingConverter &FormatStrConverter) { StringRef Flag(FlagBeg, E - FlagBeg); // Currently there is only one 
flag. - if (Flag == "tt") { + if (Flag.size() == 2 && FormatStrConverter.convert(FlagBeg[0]) == 't' && + FormatStrConverter.convert(FlagBeg[1]) == 't') { FS.setHasObjCTechnicalTerm(FlagBeg); return false; } @@ -81,6 +87,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, const char *Start = nullptr; UpdateOnReturn<const char *> UpdateBeg(Beg, I); + const llvm::TextEncodingConverter &FormatStrConverter = + *Target.FormatStrConverter; // Look for a '%' character that indicates the start of a format specifier. for (; I != E; ++I) { char c = *I; @@ -89,7 +97,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, H.HandleNullChar(I); return true; } - if (c == '%') { + if (FormatStrConverter.convert(c) == '%') { Start = I++; // Record the start of the format specifier. break; } @@ -107,7 +115,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } PrintfSpecifier FS; - if (ParseArgPosition(H, FS, Start, I, E)) + if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter)) return true; if (I == E) { @@ -117,13 +125,17 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, return true; } - if (*I == '{') { + if (FormatStrConverter.convert(*I) == '{') { ++I; unsigned char PrivacyFlags = 0; StringRef MatchedStr; do { - StringRef Str(I, E - I); + const char *II; + std::string S(I, E - I); + for (unsigned long i = 0; i < S.length(); ++i) + S[i] = FormatStrConverter.convert(S[i]); + StringRef Str(S); std::string Match = "^[[:space:]]*" "(private|public|sensitive|mask\\.[^[:space:],}]*)" "[[:space:]]*(,|})"; @@ -132,25 +144,38 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, if (R.match(Str, &Matches)) { MatchedStr = Matches[1]; + II = I; I += Matches[0].size(); + while (FormatStrConverter.convert(*II) == ' ') + ++II; + // Set the privacy flag if the privacy annotation in the // comma-delimited segment is at least as strict as the 
privacy // annotations in previous comma-delimited segments. if (MatchedStr.starts_with("mask")) { - StringRef MaskType = MatchedStr.substr(sizeof("mask.") - 1); + StringRef MaskType(II + sizeof("mask.") - 1, + MatchedStr.size() - sizeof("mask.") + 1); unsigned Size = MaskType.size(); + if (Warn && (Size == 0 || Size > 8)) H.handleInvalidMaskType(MaskType); FS.setMaskType(MaskType); - } else if (MatchedStr == "sensitive") + } else if (MatchedStr == "sensitive") { + StringRef ProxyMatchedStr(II, sizeof("sensitive") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsSensitive; - else if (PrivacyFlags != - clang::analyze_os_log::OSLogBufferItem::IsSensitive && - MatchedStr == "private") + } else if (PrivacyFlags != + clang::analyze_os_log::OSLogBufferItem::IsSensitive && + MatchedStr == "private") { + StringRef ProxyMatchedStr(II, sizeof("private") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPrivate; - else if (PrivacyFlags == 0 && MatchedStr == "public") + } else if (PrivacyFlags == 0 && MatchedStr == "public") { + StringRef ProxyMatchedStr(II, sizeof("public") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPublic; + } } else { size_t CommaOrBracePos = Str.find_if([](char c) { return c == ',' || c == '}'; }); @@ -165,7 +190,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, I += CommaOrBracePos + 1; } // Continue until the closing brace is found. - } while (*(I - 1) == ','); + } while (FormatStrConverter.convert(*(I - 1)) == ','); // Set the privacy flag. switch (PrivacyFlags) { @@ -188,7 +213,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Look for flags (if any). 
bool hasMore = true; for (; I != E; ++I) { - switch (*I) { + switch (FormatStrConverter.convert(*I)) { default: hasMore = false; break; @@ -225,7 +250,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Look for the field width (if any). if (ParseFieldWidth(H, FS, Start, I, E, - FS.usesPositionalArg() ? nullptr : &argIndex)) + FS.usesPositionalArg() ? nullptr : &argIndex, + FormatStrConverter)) return true; if (I == E) { @@ -236,7 +262,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } // Look for the precision (if any). - if (*I == '.') { + if (FormatStrConverter.convert(*I) == '.') { ++I; if (I == E) { if (Warn) @@ -245,7 +271,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } if (ParsePrecision(H, FS, Start, I, E, - FS.usesPositionalArg() ? nullptr : &argIndex)) + FS.usesPositionalArg() ? nullptr : &argIndex, + FormatStrConverter)) return true; if (I == E) { @@ -256,11 +283,11 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } } - if (ParseVectorModifier(H, FS, I, E, LO)) + if (ParseVectorModifier(H, FS, I, E, LO, FormatStrConverter)) return true; // Look for the length modifier. - if (ParseLengthModifier(FS, I, E, LO) && I == E) { + if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter) && I == E) { // No more characters left? if (Warn) H.HandleIncompleteSpecifier(Start, E - Start); @@ -274,7 +301,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // enables better recovery, and we don't know if // these flags are applicable until later. const char *ObjCModifierFlagsStart = nullptr, *ObjCModifierFlagsEnd = nullptr; - if (*I == '[') { + if (FormatStrConverter.convert(*I) == '[') { ObjCModifierFlagsStart = I; ++I; auto flagStart = I; @@ -286,8 +313,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, return true; } // Did we find the closing ']'? 
- if (*I == ']') { - if (ParseObjCFlags(H, FS, flagStart, I, Warn)) + if (FormatStrConverter.convert(*I) == ']') { + if (ParseObjCFlags(H, FS, flagStart, I, Warn, FormatStrConverter)) return true; ++I; break; @@ -307,7 +334,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Finally, look for the conversion specifier. const char *conversionPosition = I++; ConversionSpecifier::Kind k = ConversionSpecifier::InvalidSpecifier; - switch (*conversionPosition) { + switch (FormatStrConverter.convert(*conversionPosition)) { default: break; // C99: 7.19.6.1 (section 8). @@ -470,7 +497,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, FS.setConversionSpecifier(CS); } // Assume the conversion takes one argument. - return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len); + return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len, + FormatStrConverter); } return PrintfSpecifierResult(Start, FS); } @@ -480,7 +508,6 @@ bool clang::analyze_format_string::ParsePrintfString( const TargetInfo &Target, bool isFreeBSDKPrintf) { unsigned argIndex = 0; - // Keep looking for a format specifier until we have exhausted the string. while (I != E) { const PrintfSpecifierResult &FSR = ParsePrintfSpecifier( diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 90cbbd60bbcf5..c63171844d90d 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -81,7 +81,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, const char *I = Beg; const char *Start = nullptr; UpdateOnReturn<const char *> UpdateBeg(Beg, I); - + const llvm::TextEncodingConverter &FormatStrConverter = + *Target.FormatStrConverter; // Look for a '%' character that indicates the start of a format specifier. 
for (; I != E; ++I) { char c = *I; @@ -90,7 +91,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, H.HandleNullChar(I); return true; } - if (c == '%') { + SmallString<1> ConvertedChar; + FormatStrConverter.convert(StringRef(&c, 1), ConvertedChar); + if (ConvertedChar[0] == '%') { Start = I++; // Record the start of the format specifier. break; } @@ -107,7 +110,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } ScanfSpecifier FS; - if (ParseArgPosition(H, FS, Start, I, E)) + if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter)) return true; if (I == E) { @@ -117,7 +120,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } // Look for '*' flag if it is present. - if (*I == '*') { + if (FormatStrConverter.convert(*I) == '*') { FS.setSuppressAssignment(I); if (++I == E) { H.HandleIncompleteSpecifier(Start, E - Start); @@ -127,7 +130,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, // Look for the field width (if any). Unlike printf, this is either // a fixed integer or isn't present. - const OptionalAmount &Amt = clang::analyze_format_string::ParseAmount(I, E); + const OptionalAmount &Amt = + clang::analyze_format_string::ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() != OptionalAmount::NotSpecified) { assert(Amt.getHowSpecified() == OptionalAmount::Constant); FS.setFieldWidth(Amt); @@ -140,7 +144,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } // Look for the length modifier. - if (ParseLengthModifier(FS, I, E, LO, /*IsScanf=*/true) && I == E) { + if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter, + /*IsScanf=*/true) && + I == E) { // No more characters left? H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -155,7 +161,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, // Finally, look for the conversion specifier. 
const char *conversionPosition = I++; ScanfConversionSpecifier::Kind k = ScanfConversionSpecifier::InvalidSpecifier; - switch (*conversionPosition) { + switch (FormatStrConverter.convert(*conversionPosition)) { default: break; case '%': @@ -262,7 +268,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, FS.setConversionSpecifier(CS); } // Assume the conversion takes one argument. - return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len); + return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len, + FormatStrConverter); } return ScanfSpecifierResult(Start, FS); } diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index e6ae89e0948c5..43efca42886cc 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -194,6 +194,9 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { MaxOpenCLWorkGroupSize = 1024; MaxBitIntWidth.reset(); + + FormatStrConverter = new llvm::TextEncodingConverter( + std::move(*llvm::TextEncodingConverter::createNoopConverter())); } // Out of line virtual dtor for TargetInfo. 
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index c9b5342b7e8d9..83945d203762c 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -550,7 +550,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { PP->setDependencyDirectivesGetter(*GetDependencyDirectives); if (auto EC = TextEncodingConfig::setConvertersFromOptions( - PP->getTextEncodingConfig(), getLangOpts())) + PP->getTextEncodingConfig(), getLangOpts(), getTarget())) PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config) << PP->getTextEncodingConfig().getExecEncoding(); } diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp index b89d5baefcc23..427b75a1c0a8b 100644 --- a/clang/lib/Lex/TextEncodingConfig.cpp +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -23,7 +23,8 @@ TextEncodingConfig::getConverter(ConversionAction Action) const { std::error_code TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC, - const clang::LangOptions &Opts) { + const clang::LangOptions &Opts, + clang::TargetInfo &TInfo) { using namespace llvm; const char *UTF8 = "UTF-8"; @@ -41,5 +42,13 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC, new TextEncodingConverter(std::move(*ErrorOrConverter)); else return ErrorOrConverter.getError(); + + ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding, + TEC.InternalEncoding); + + if (ErrorOrConverter) + TInfo.FormatStrConverter = + new TextEncodingConverter(std::move(*ErrorOrConverter)); + return std::error_code(); } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 4706fa5d3cde0..9b15c23c7494d 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -104,6 +104,7 @@ #include "llvm/Support/Locale.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SaveAndRestore.h" +#include 
"llvm/Support/TextEncoding.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/Triple.h" @@ -7872,10 +7873,10 @@ class CheckFormatHandler : public analyze_format_string::FormatStringHandler { ArrayRef<FixItHint> Fixit = {}); protected: - bool HandleInvalidConversionSpecifier(unsigned argIndex, SourceLocation Loc, - const char *startSpec, - unsigned specifierLen, - const char *csStart, unsigned csLen); + bool HandleInvalidConversionSpecifier( + unsigned argIndex, SourceLocation Loc, const char *startSpec, + unsigned specifierLen, const char *csStart, unsigned csLen, + const llvm::TextEncodingConverter &FormatStrConverter); void HandlePositionalNonpositionalArgs(SourceLocation Loc, const char *startSpec, @@ -8105,7 +8106,8 @@ void UncoveredArgHandler::Diagnose(Sema &S, bool IsFunctionCall, bool CheckFormatHandler::HandleInvalidConversionSpecifier( unsigned argIndex, SourceLocation Loc, const char *startSpec, - unsigned specifierLen, const char *csStart, unsigned csLen) { + unsigned specifierLen, const char *csStart, unsigned csLen, + const llvm::TextEncodingConverter &FormatStrConverter) { bool keepGoing = true; if (argIndex < NumDataArgs) { // Consider the argument coverered, even though the specifier doesn't @@ -8120,7 +8122,13 @@ bool CheckFormatHandler::HandleInvalidConversionSpecifier( keepGoing = false; } - StringRef Specifier(csStart, csLen); + // The csStart points to a character that has already been converted to the + // exec charset, so we have to reverse the conversion to allow diagnostic + // message to match an expected value when using -verify option, + std::string RS(csStart, csLen); + for (unsigned int i = 0; i < RS.size(); ++i) + RS[i] = FormatStrConverter.convert(RS[i]); + StringRef Specifier(RS); // If the specifier in non-printable, it could be the first byte of a UTF-8 // sequence. In that case, print the UTF-8 code point. 
If not, print the byte @@ -8274,7 +8282,8 @@ class CheckPrintfHandler : public CheckFormatHandler { bool HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) override; + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) override; void handleInvalidMaskType(StringRef MaskType) override; @@ -8414,13 +8423,14 @@ class DecomposePrintfHandler : public CheckPrintfHandler { bool CheckPrintfHandler::HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { const analyze_printf::PrintfConversionSpecifier &CS = FS.getConversionSpecifier(); return HandleInvalidConversionSpecifier( FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier, - specifierLen, CS.getStart(), CS.getLength()); + specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter); } void CheckPrintfHandler::handleInvalidMaskType(StringRef MaskType) { @@ -8928,15 +8938,15 @@ bool CheckPrintfHandler::HandlePrintfSpecifier( // Check for using an Objective-C specific conversion specifier // in a non-ObjC literal. if (!allowsObjCArg() && CS.isObjCArg()) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // %P can only be used with os_log. if (FSType != FormatStringType::OSLog && CS.getKind() == ConversionSpecifier::PArg) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // %n is not allowed with os_log. 
@@ -8955,8 +8965,8 @@ bool CheckPrintfHandler::HandlePrintfSpecifier( (CS.getKind() == ConversionSpecifier::PArg || CS.getKind() == ConversionSpecifier::sArg || CS.getKind() == ConversionSpecifier::ObjCObjArg)) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // Check for use of public/private annotation outside of os_log(). @@ -9614,10 +9624,10 @@ class CheckScanfHandler : public CheckFormatHandler { const char *startSpecifier, unsigned specifierLen) override; - bool - HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) override; + bool HandleInvalidScanfConversionSpecifier( + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) override; void HandleIncompleteScanList(const char *start, const char *end) override; }; @@ -9633,13 +9643,15 @@ void CheckScanfHandler::HandleIncompleteScanList(const char *start, bool CheckScanfHandler::HandleInvalidScanfConversionSpecifier( const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { + const analyze_scanf::ScanfConversionSpecifier &CS = FS.getConversionSpecifier(); return HandleInvalidConversionSpecifier( FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier, - specifierLen, CS.getStart(), CS.getLength()); + specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter); } bool CheckScanfHandler::HandleScanfSpecifier( diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 83d57a917fa1e..ee3ceefd8f97e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3636,8 +3636,9 @@ ExprResult 
Sema::BuildPredefinedExpr(SourceLocation Loc, // the string. bool ForceElaboratedPrinting = IK == PredefinedIdentKind::Function && getLangOpts().MSVCCompat; - auto Str = - PredefinedExpr::ComputeName(IK, currentDecl, ForceElaboratedPrinting); + auto Str = PredefinedExpr::ComputeNameAndTranslate( + IK, currentDecl, getPreprocessor().getTextEncodingConfig(), + ForceElaboratedPrinting); unsigned Length = Str.length(); llvm::APInt LengthI(32, Length + 1); diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 5279b780531c3..78ae3353224af 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -1,6 +1,8 @@ // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8 +int printf(char const *, ...); + const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; //CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00" //CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00" diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..8f5a6122ede45 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -105,6 +105,8 @@ class TextEncodingConverter { LLVM_ABI static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To); + LLVM_ABI static ErrorOr<TextEncodingConverter> createNoopConverter(); + TextEncodingConverter(const TextEncodingConverter &) = delete; TextEncodingConverter &operator=(const TextEncodingConverter &) = delete; @@ -135,6 +137,14 @@ class TextEncodingConverter { return std::string(Result); return EC; } + + char convert(char SingleChar) const { + SmallString<1> Result; + auto EC = Converter->convert(StringRef(&SingleChar, 1), Result); + if (!EC) + return 
Result[0]; + return '\0'; + } }; } // namespace llvm diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..5c1d9696686a2 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -356,3 +356,22 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, return std::make_error_code(std::errc::invalid_argument); #endif } + +class TextEncodingConverterNoop final + : public details::TextEncodingConverterImplBase { + +public: + TextEncodingConverterNoop() {} + + std::error_code convertString(StringRef Source, + SmallVectorImpl<char> &Result) override { + Result.assign(Source.begin(), Source.end()); + return std::error_code(); + } + + void reset() override {} +}; + +ErrorOr<TextEncodingConverter> TextEncodingConverter::createNoopConverter() { + return TextEncodingConverter(std::make_unique<TextEncodingConverterNoop>()); +} >From 5d6451c4002332c296c79cff8ee42f0ec2287f27 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:20:45 -0400 Subject: [PATCH 3/4] convert to exec-charset inside getPredefinedStringLiteralFromCache, test __builtin_FILE() --- clang/include/clang/Basic/TargetInfo.h | 2 ++ clang/lib/AST/ASTContext.cpp | 10 +++++++++ clang/lib/Basic/TargetInfo.cpp | 3 +++ clang/lib/Lex/TextEncodingConfig.cpp | 9 +++++---- clang/test/CodeGen/systemz-charset.cpp | 28 ++++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index ec7d4fcd4d8e3..6c0e65a85ee13 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -326,6 +326,8 @@ class TargetInfo : public TransferrableTargetInfo, llvm::TextEncodingConverter *FormatStrConverter; + llvm::TextEncodingConverter *ExecStrConverter; + /// Retrieve the target options. 
TargetOptions &getTargetOpts() const { assert(TargetOpts && "Missing target options"); diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index a0894318dbd53..80e073385ce82 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -13752,6 +13752,16 @@ ASTContext::getPredefinedStringLiteralFromCache(StringRef Key) const { *this, Key, StringLiteralKind::Ordinary, /*Pascal*/ false, getStringLiteralArrayType(CharTy, Key.size()), SourceLocation()); + + llvm::TextEncodingConverter *Converter = getTargetInfo().ExecStrConverter; + if (Converter) { + SmallString<128> Converted; + Converter->convert(Result->getString(), Converted); + Result = StringLiteral::Create( + *this, Converted, StringLiteralKind::Ordinary, /*Pascal*/ false, + getStringLiteralArrayType(CharTy, Converted.size()), SourceLocation()); + } + return Result; } diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 43efca42886cc..0c553033ad069 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -197,6 +197,9 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { FormatStrConverter = new llvm::TextEncodingConverter( std::move(*llvm::TextEncodingConverter::createNoopConverter())); + + ExecStrConverter = new llvm::TextEncodingConverter( + std::move(*llvm::TextEncodingConverter::createNoopConverter())); } // Out of line virtual dtor for TargetInfo. 
diff --git a/clang/lib/Lex/TextEncodingConfig.cpp b/clang/lib/Lex/TextEncodingConfig.cpp index 427b75a1c0a8b..6df88e258ffde 100644 --- a/clang/lib/Lex/TextEncodingConfig.cpp +++ b/clang/lib/Lex/TextEncodingConfig.cpp @@ -37,14 +37,15 @@ TextEncodingConfig::setConvertersFromOptions(TextEncodingConfig &TEC, return std::error_code(); ErrorOr<TextEncodingConverter> ErrorOrConverter = llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding); - if (ErrorOrConverter) + if (ErrorOrConverter) { TEC.ToExecEncodingConverter = new TextEncodingConverter(std::move(*ErrorOrConverter)); - else + TInfo.ExecStrConverter = TEC.ToExecEncodingConverter; + } else return ErrorOrConverter.getError(); - ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding, - TEC.InternalEncoding); + ErrorOrConverter = llvm::TextEncodingConverter::create( + TInfo.getTriple().getDefaultNarrowTextEncoding(), UTF8); if (ErrorOrConverter) TInfo.FormatStrConverter = diff --git a/clang/test/CodeGen/systemz-charset.cpp b/clang/test/CodeGen/systemz-charset.cpp index f7becd5b39492..f8219ca00d20a 100644 --- a/clang/test/CodeGen/systemz-charset.cpp +++ b/clang/test/CodeGen/systemz-charset.cpp @@ -1,46 +1,74 @@ // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -std=c++17 -fexec-charset IBM-1047 -o - | FileCheck %s +// RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -o - | FileCheck %s --check-prefix=CHECK-UTF8 const char *RawString = R"(Hello\n)"; //CHECK: c"\C8\85\93\93\96\E0\95\00" +//CHECK-UTF8: c"Hello\\n\00" const char *MultiLineRawString = R"( Hello There)"; //CHECK: c"\15\C8\85\93\93\96\15\E3\88\85\99\85\00" +//CHECK-UTF8: c"\0AHello\0AThere\00" char UnicodeChar8 = u8'1'; //CHECK: i8 49 +//CHECK-UTF8: i8 49 char16_t UnicodeChar16 = u'1'; //CHECK: i16 49 +//CHECK-UTF8: i16 49 char32_t UnicodeChar32 = U'1'; //CHECK: i32 49 +//CHECK-UTF8: i32 49 const char *EscapeCharacters8 = u8"\a\b\f\n\r\t\v\\\'\"\?"; //CHECK: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" 
+//CHECK-UTF8: c"\07\08\0C\0A\0D\09\0B\\'\22?\00" const char16_t *EscapeCharacters16 = u"\a\b\f\n\r\t\v\\\'\"\?"; //CHECK: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0] +//CHECK-UTF8: [12 x i16] [i16 7, i16 8, i16 12, i16 10, i16 13, i16 9, i16 11, i16 92, i16 39, i16 34, i16 63, i16 0] const char32_t *EscapeCharacters32 = U"\a\b\f\n\r\t\v\\\'\"\?"; //CHECK: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0] +//CHECK-UTF8: [12 x i32] [i32 7, i32 8, i32 12, i32 10, i32 13, i32 9, i32 11, i32 92, i32 39, i32 34, i32 63, i32 0] const char *UnicodeString8 = u8"Hello"; //CHECK: c"Hello\00" +//CHECK-UTF8: c"Hello\00" + const char16_t *UnicodeString16 = u"Hello"; //CHECK: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0] +//CHECK-UTF8: [6 x i16] [i16 72, i16 101, i16 108, i16 108, i16 111, i16 0] + const char32_t *UnicodeString32 = U"Hello"; //CHECK: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0] +//CHECK-UTF8: [6 x i32] [i32 72, i32 101, i32 108, i32 108, i32 111, i32 0] const char *UnicodeRawString8 = u8R"("Hello\")"; //CHECK: c"\22Hello\\\22\00" +//CHECK-UTF8: c"\22Hello\\\22\00" + const char16_t *UnicodeRawString16 = uR"("Hello\")"; //CHECK: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0] +//CHECK-UTF8: [9 x i16] [i16 34, i16 72, i16 101, i16 108, i16 108, i16 111, i16 92, i16 34, i16 0] + const char32_t *UnicodeRawString32 = UR"("Hello\")"; //CHECK: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0] +//CHECK-UTF8: [9 x i32] [i32 34, i32 72, i32 101, i32 108, i32 108, i32 111, i32 92, i32 34, i32 0] const char *UnicodeUCNString8 = u8"\u00E2\u00AC\U000000DF"; //CHECK: c"\C3\A2\C2\AC\C3\9F\00" +//CHECK-UTF8: c"\C3\A2\C2\AC\C3\9F\00" + const char16_t *UnicodeUCNString16 = u"\u00E2\u00AC\U000000DF"; //CHECK: [4 x i16] [i16 226, i16 172, i16 223, i16 0] 
+//CHECK-UTF8: [4 x i16] [i16 226, i16 172, i16 223, i16 0] + const char32_t *UnicodeUCNString32 = U"\u00E2\u00AC\U000000DF"; //CHECK: [4 x i32] [i32 226, i32 172, i32 223, i32 0] +//CHECK-UTF8: [4 x i32] [i32 226, i32 172, i32 223, i32 0] + +const char *file = __builtin_FILE(); +//CHECK: {{.*}}\A2\A8\A2\A3\85\94\A9`\83\88\81\99\A2\85\A3K\83\97\97\00" +//CHECK-UTF8: {{.*}}systemz-charset.cpp\00" >From f29a959da91d4ba9a3944b29ea45ecff2522b110 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:21:38 -0400 Subject: [PATCH 4/4] Enable driver changes for fexec-charset --- clang/docs/LanguageExtensions.rst | 6 +++--- .../clang/Basic/DiagnosticDriverKinds.td | 1 + clang/include/clang/Options/Options.td | 18 +++++++++++++---- clang/lib/Driver/ToolChains/Clang.cpp | 20 +++++++++++++------ clang/test/CodeGen/systemz-charset.c | 1 + clang/test/Driver/cl-options.c | 7 ++++--- clang/test/Driver/clang_f_opts.c | 14 ++++++++++--- llvm/include/llvm/Support/TextEncoding.h | 2 ++ llvm/lib/Support/TextEncoding.cpp | 10 ++++++++++ 9 files changed, 60 insertions(+), 19 deletions(-) diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 03cb02deb5e7f..f2aca70d9d57d 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -421,9 +421,9 @@ Builtin Macros ``__clang_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of - narrow string literals, e.g., ``"hello"``. This macro typically expands to - "UTF-8" (but may change in the future if the - ``-fexec-charset="Encoding-Name"`` option is implemented.) + narrow string literals, e.g., ``"hello"``. This macro expands to the text + encoding specified by ``-fexec-charset`` if any, or a system-specific default + otherwise: ``"IBM-1047"`` on z/OS and ``"UTF-8"`` on all other systems. 
``__clang_wide_literal_encoding__`` Defined to a narrow string literal that represents the current encoding of diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 114ee475c371f..16460583b3770 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -149,6 +149,7 @@ def warn_drv_unsupported_option_part_for_target : Warning< InGroup<OptionIgnored>; def err_drv_unsupported_option_part_for_target : Error< "'%0' in '%1' option is not currently supported for target '%2'">; +def err_drv_unsupported_encoding : Error<"'%0' is not a supported encoding">; def warn_drv_invalid_argument_for_flang : Warning< "'%0' is not valid for Fortran">, InGroup<OptionIgnored>; diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 73bce00b921ea..95163962da647 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -3152,7 +3152,13 @@ def fexperimental_strict_floating_point : Flag<["-"], "fexperimental-strict-floa def finput_charset_EQ : Joined<["-"], "finput-charset=">, Visibility<[ClangOption, FlangOption, FC1Option]>, Group<f_Group>, HelpText<"Specify the default character set for source files">; -def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>; +def fexec_charset_EQ + : Joined<["-"], "fexec-charset=">, + Group<f_Group>, + HelpText< + "Set the execution <encoding> for string and character literals. 
" + "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " + "and possibly those supported by ICU or the host iconv library.">; def finstrument_functions : Flag<["-"], "finstrument-functions">, Group<f_Group>, @@ -9163,9 +9169,13 @@ def _SLASH_showFilenames_ : CLFlag<"showFilenames-">, def _SLASH_source_charset : CLCompileJoined<"source-charset:">, HelpText<"Set source encoding, supports only UTF-8">, Alias<finput_charset_EQ>; -def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">, - HelpText<"Set runtime encoding, supports only UTF-8">, - Alias<fexec_charset_EQ>; +def _SLASH_execution_charset + : CLCompileJoined<"execution-charset:">, + HelpText< + "Set the execution <encoding> for string and character literals. " + "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " + "and possibly those supported by ICU or the host iconv library.">, + Alias<fexec_charset_EQ>; def _SLASH_std : CLCompileJoined<"std:">, HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">; def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index bdffa4fdd7e6b..fe78b60096aa3 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -52,6 +52,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/YAMLParser.h" #include "llvm/TargetParser/AArch64TargetParser.h" #include "llvm/TargetParser/ARMTargetParserCommon.h" @@ -7755,12 +7756,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << value; } - // -fexec_charset=UTF-8 is default. 
Reject others - if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) { - StringRef value = execCharset->getValue(); - if (!value.equals_insensitive("utf-8")) - D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args) - << value; + if (Arg *ExecEncoding = Args.getLastArg(options::OPT_fexec_charset_EQ)) { + StringRef Value = ExecEncoding->getValue(); + if (llvm::TextEncodingConverter::isEncodingSupported(Value)) { + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back(Args.MakeArgString(Value)); + } else { + D.Diag(diag::err_drv_unsupported_encoding) << Value; + } + } else { + // Set the default fexec-charset as the system charset. + CmdArgs.push_back("-fexec-charset"); + CmdArgs.push_back( + Args.MakeArgString(Triple.getDefaultNarrowTextEncoding())); } RenderDiagnosticsOptions(D, Args, CmdArgs); diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 78ae3353224af..dbb36aed49990 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s +// RUN: %clang %s -emit-llvm -S -target s390x-ibm-zos -o - | FileCheck %s // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | FileCheck %s --check-prefix=CHECK-UTF8 int printf(char const *, ...); diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index c0f57ae768252..1a2827012549d 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -250,10 +250,11 @@ // RUN: not %clang_cl /source-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=source-charset-utf-16 %s // source-charset-utf-16: invalid value 'utf-16' in '/source-charset:utf-16' -// /execution-charset: should warn on everything except UTF-8. 
-// RUN: not %clang_cl /execution-charset:utf-16 -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-utf-16 %s -// execution-charset-utf-16: invalid value 'utf-16' in '/execution-charset:utf-16' +// /execution-charset: should error on unsupported charsets. +// RUN: not %clang_cl /execution-charset:invalid-charset -### -- %s 2>&1 | FileCheck -check-prefix=execution-charset-invalid %s +// execution-charset-invalid: 'invalid-charset' is not a supported encoding // + // RUN: %clang_cl /Umymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // RUN: %clang_cl /U mymacro -### -- %s 2>&1 | FileCheck -check-prefix=U %s // U: "-U" "mymacro" diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index 5871f1580d6b7..7fe67068118fc 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -232,8 +232,16 @@ // RUN: not %clang -### -S -finput-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-INPUT-CHARSET %s // CHECK-INVALID-INPUT-CHARSET: error: invalid value 'iso-8859-1' in '-finput-charset=iso-8859-1' -// RUN: not %clang -### -S -fexec-charset=iso-8859-1 -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s -// CHECK-INVALID-EXEC-CHARSET: error: invalid value 'iso-8859-1' in '-fexec-charset=iso-8859-1' +// RUN: not %clang -### -S -fexec-charset=invalid-charset -o /dev/null %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-EXEC-CHARSET %s +// CHECK-INVALID-EXEC-CHARSET: error: 'invalid-charset' is not a supported encoding + +// Test that we support the following exec charsets. The preferred MIME name is +// `IBM1047`, but `IBM-1047` is the name used by z/OS USS utilities such as +// `chtag`. 
+// RUN: %clang -### -S -fexec-charset=UTF-8 -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-EXEC-CHARSET-UTF-8 %s +// RUN: %clang -### -S -fexec-charset=IBM-1047 -o /dev/null %s 2>&1 | FileCheck --check-prefix=CHECK-EXEC-CHARSET-IBM-1047 %s +// CHECK-EXEC-CHARSET-UTF-8: "-fexec-charset" "UTF-8" +// CHECK-EXEC-CHARSET-IBM-1047: "-fexec-charset" "IBM-1047" // Test that we don't error on these. // RUN: not %clang -### -S -Werror \ @@ -247,7 +255,7 @@ // RUN: -fident -fno-ident \ // RUN: -fimplicit-templates -fno-implicit-templates \ // RUN: -finput-charset=UTF-8 \ -// RUN: -fexec-charset=UTF-8 \ +// RUN: -fexec-charset=UTF-8 \ // RUN: -fivopts -fno-ivopts \ // RUN: -fnon-call-exceptions -fno-non-call-exceptions \ // RUN: -fpermissive -fno-permissive \ diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8f5a6122ede45..c120e36d1de6c 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -145,6 +145,8 @@ class TextEncodingConverter { return Result[0]; return '\0'; } + + LLVM_ABI static bool isEncodingSupported(StringRef Name); }; } // namespace llvm diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index 5c1d9696686a2..475799df9070b 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -58,6 +58,16 @@ static std::optional<TextEncoding> getKnownEncoding(StringRef Name) { return std::nullopt; } +bool TextEncodingConverter::isEncodingSupported(StringRef Name) { + if (getKnownEncoding(Name)) + return true; + llvm::ErrorOr<llvm::TextEncodingConverter> ErrorOrConverter = + llvm::TextEncodingConverter::create("UTF-8", Name.data()); + if (ErrorOrConverter) + return true; + return false; +} + [[maybe_unused]] static void HandleOverflow(size_t &Capacity, char *&Output, size_t &OutputLength, SmallVectorImpl<char> &Result) { _______________________________________________ llvm-branch-commits mailing list 
llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
