https://github.com/abhina-sree updated https://github.com/llvm/llvm-project/pull/196568
>From debd0182db40d0aebf12da43b6e26e497b8fbd63 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 19 Jun 2026 08:50:48 -0400 Subject: [PATCH 1/9] use LiteralEncoding internally, address other comments --- clang/include/clang/Basic/DiagnosticLexKinds.td | 2 +- clang/include/clang/Basic/LangOptions.h | 4 ++-- clang/include/clang/Lex/TextEncoding.h | 8 ++++---- clang/include/clang/Options/Options.td | 4 ++-- clang/lib/Frontend/CompilerInstance.cpp | 2 +- clang/lib/Frontend/InitPreprocessor.cpp | 12 ++++++------ clang/lib/Lex/LiteralSupport.cpp | 2 +- clang/lib/Lex/TextEncoding.cpp | 16 ++++++++-------- clang/lib/Sema/SemaExpr.cpp | 2 +- clang/test/CodeGen/systemz-charset-diag.cpp | 2 +- clang/test/CodeGen/systemz-charset.c | 10 ++++++++++ 11 files changed, 37 insertions(+), 27 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index f12fa0205b650..3b0b4d87fc006 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -288,7 +288,7 @@ def ext_string_too_long : Extension<"string literal of length %0 exceeds " def err_character_too_large : Error< "character too large for enclosing character literal type">; def err_exec_charset_conversion_failed - : Error<"conversion to execution encoding failed: '%0'">; + : Error<"conversion to literal encoding failed: '%0'">; def warn_c99_compat_unicode_literal : Warning< "unicode literals are incompatible with C99">, InGroup<C99Compat>, DefaultIgnore; diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 5ec31b356d059..bbf47c34b306a 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -618,8 +618,8 @@ class LangOptions : public LangOptionsBase { /// The allocation token mode. 
std::optional<llvm::AllocTokenMode> AllocTokenMode; - /// Name of the execution encoding to convert the internal encoding to. - std::string ExecEncoding; + /// Name of the literal encoding to convert the internal encoding to. + std::string LiteralEncoding; LangOptions(); diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h index 770cb3c5eff1a..c892d1fadbc38 100644 --- a/clang/include/clang/Lex/TextEncoding.h +++ b/clang/include/clang/Lex/TextEncoding.h @@ -13,18 +13,18 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/TextEncoding.h" -enum ConversionAction { CA_NoConversion, CA_ToExecEncoding }; +enum ConversionAction { CA_NoConversion, CA_ToLiteralEncoding }; class TextEncoding { - llvm::StringRef ExecEncoding; - llvm::TextEncodingConverter *ToExecEncodingConverter = nullptr; + llvm::StringRef LiteralEncoding; + llvm::TextEncodingConverter *ToLiteralEncodingConverter = nullptr; public: llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; static std::error_code setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts); - llvm::StringRef getExecEncoding() { return ExecEncoding; } + llvm::StringRef getLiteralEncoding() { return LiteralEncoding; } }; #endif diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 92b62fa8fceb4..bad318f703935 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -7537,10 +7537,10 @@ def tune_cpu : Separate<["-"], "tune-cpu">, HelpText<"Tune for a specific cpu type">, MarshallingInfoString<TargetOpts<"TuneCPU">>; def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<encoding>">, - HelpText<"Set the execution <encoding> for ordinary string and character literals. " + HelpText<"Set the <encoding> for ordinary string and character literals. 
" "Supported character encodings include ISO-8859-1, UTF-8, IBM1047, " "and possibly those supported by ICU or the host iconv library.">, - MarshallingInfoString<LangOpts<"ExecEncoding">>; + MarshallingInfoString<LangOpts<"LiteralEncoding">>; def target_cpu : Separate<["-"], "target-cpu">, HelpText<"Target a specific cpu type">, MarshallingInfoString<TargetOpts<"CPU">>; diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 952eb73c210ff..f4e0f09035fff 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -558,7 +558,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { if (auto EC = TextEncoding::setConvertersFromOptions(PP->getTextEncoding(), getLangOpts())) PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config) - << PP->getTextEncoding().getExecEncoding(); + << PP->getTextEncoding().getLiteralEncoding(); } // ASTContext diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 15c62e39d9506..eb60e0e674fea 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1036,12 +1036,12 @@ static void InitializePredefinedMacros(const TargetInfo &TI, // Macros to help identify the narrow and wide character sets. This is set // to fexec-charset. If fexec-charset is not specified, the default is the // system charset. - Builder.defineMacro( - "__clang_literal_encoding__", - Twine("\"" + - (LangOpts.ExecEncoding.empty() ? TI.getDefaultOrdinaryTextEncoding() - : LangOpts.ExecEncoding) + - "\"")); + Builder.defineMacro("__clang_literal_encoding__", + Twine("\"" + + (LangOpts.LiteralEncoding.empty() + ? TI.getDefaultOrdinaryTextEncoding() + : LangOpts.LiteralEncoding) + + "\"")); if (TI.getTypeWidth(TI.getWCharType()) >= 32) { // FIXME: 32-bit wchar_t signals UTF-32. 
This may change diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 70070e8bb1f2a..e31dcc8c76db6 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1861,7 +1861,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, const TextEncoding &TE = PP.getTextEncoding(); llvm::TextEncodingConverter *Converter = nullptr; if (isOrdinary()) - Converter = TE.getConverter(CA_ToExecEncoding); + Converter = TE.getConverter(CA_ToLiteralEncoding); // Unicode escapes representing characters that cannot be correctly // represented in a single code unit are disallowed in character literals diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp index ba878800564f0..393caaadd5d37 100644 --- a/clang/lib/Lex/TextEncoding.cpp +++ b/clang/lib/Lex/TextEncoding.cpp @@ -12,8 +12,8 @@ llvm::TextEncodingConverter * TextEncoding::getConverter(ConversionAction Action) const { switch (Action) { - case CA_ToExecEncoding: - return ToExecEncodingConverter; + case CA_ToLiteralEncoding: + return ToLiteralEncodingConverter; default: return nullptr; } @@ -25,17 +25,17 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC, using namespace llvm; const char *UTF8 = "UTF-8"; - TEC.ExecEncoding = - Opts.ExecEncoding.empty() ? UTF8 : Opts.ExecEncoding.c_str(); + TEC.LiteralEncoding = + Opts.LiteralEncoding.empty() ? UTF8 : Opts.LiteralEncoding.c_str(); - // Create converter between internal and exec encoding specified + // Create converter between internal and literal encoding specified // in fexec-charset option. 
- if (TEC.ExecEncoding == UTF8) + if (TEC.LiteralEncoding == UTF8) return std::error_code(); ErrorOr<TextEncodingConverter> ErrorOrConverter = - llvm::TextEncodingConverter::create(UTF8, TEC.ExecEncoding); + llvm::TextEncodingConverter::create(UTF8, TEC.LiteralEncoding); if (ErrorOrConverter) - TEC.ToExecEncodingConverter = + TEC.ToLiteralEncodingConverter = new TextEncodingConverter(std::move(*ErrorOrConverter)); else return ErrorOrConverter.getError(); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index eea63e2497e06..391ad927af439 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2241,7 +2241,7 @@ Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks); StringLiteralParser Literal( - StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToExecEncoding); + StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToLiteralEncoding); if (Literal.hadError) return ExprError(); diff --git a/clang/test/CodeGen/systemz-charset-diag.cpp b/clang/test/CodeGen/systemz-charset-diag.cpp index 5b398b4b58af6..4ed94810150a3 100644 --- a/clang/test/CodeGen/systemz-charset-diag.cpp +++ b/clang/test/CodeGen/systemz-charset-diag.cpp @@ -1,3 +1,3 @@ // RUN: %clang_cc1 -triple s390x-none-zos -fexec-charset IBM-1047 %s -std=c++17 -emit-llvm -o - -verify -const char* Computer = "🖥️"; // expected-error-re {{conversion to execution encoding failed: {{.*}}}} +const char* Computer = "🖥️"; // expected-error-re {{conversion to literal encoding failed: {{.*}}}} diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 897b9d2eeefa1..766b6a83f00ff 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -56,3 +56,13 @@ const char *Unicode = "ÿ"; // RUN: not %clang_cc1 -fexec-charset invalid %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR // CHECK-ERROR: error: failed to set 
fexec-charset to 'invalid' +#define HELLO "Hello " +#define WORLD "World!" +#define HELLO_WORLD HELLO WORLD +const char* hello_macro = HELLO; +//CHECK: c"\C8\85\93\93\96@\00" +//CHECK-UTF8: c"Hello \00" + +const char* preprocessor_concatenation = HELLO_WORLD; +//CHECK: c"\C8\85\93\93\96@\E6\96\99\93\84Z\00" +//CHECK-UTF8: c"Hello World!\00" >From b9055262bdec5d31d8e0be8b2d52eeed7c66ca0c Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:17:22 -0400 Subject: [PATCH 2/9] add ParserConversionAction, do not translate unevaluated strings --- clang/include/clang/Parse/Parser.h | 1 + clang/include/clang/Sema/Sema.h | 4 +++- clang/lib/Parse/ParseDecl.cpp | 10 ++++++++++ clang/lib/Parse/ParseDeclCXX.cpp | 2 ++ clang/lib/Parse/ParseExpr.cpp | 6 +++--- clang/lib/Parse/Parser.cpp | 4 ++++ clang/lib/Sema/SemaExpr.cpp | 12 ++++++------ clang/test/CodeGen/systemz-charset-diag.cpp | 8 ++++++++ clang/test/CodeGen/systemz-charset.c | 5 +++++ 9 files changed, 42 insertions(+), 10 deletions(-) diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index c6c492b4980af..b441998e54040 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -5715,6 +5715,7 @@ class Parser : public CodeCompletionHandler { bool Finished; }; ObjCImplParsingDataRAII *CurParsedObjCImpl; + ConversionAction ParserConversionAction; /// StashAwayMethodOrFunctionBodyTokens - Consume the tokens and store them /// for later parsing. 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index b8d760e7e0975..d54e4ce19166a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,6 +55,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/LiteralConverter.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" @@ -7374,7 +7375,8 @@ class Sema final : public SemaBase { /// from multiple tokens. However, the common case is that StringToks points /// to one string. ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks, - Scope *UDLScope = nullptr); + Scope *UDLScope = nullptr, + ConversionAction Action = CA_ToExecEncoding); ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks); diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 405dddf7991b4..97e0721c02b1b 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -564,6 +564,9 @@ unsigned Parser::ParseAttributeArgsCommon( nullptr, Sema::ExpressionEvaluationContextRecord::EK_AttrArgument); + SaveAndRestore<ConversionAction> SavedTranslationState( + ParserConversionAction, CA_NoConversion); + ExprResult ArgExpr = ParseAssignmentExpression(); if (ArgExpr.isInvalid()) { SkipUntil(tok::r_paren, StopAtSemi); @@ -644,6 +647,9 @@ void Parser::ParseGNUAttributeArgs( ParsedAttr::Kind AttrKind = ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax()); + SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction, + CA_NoConversion); + if (AttrKind == ParsedAttr::AT_Availability) { ParseAvailabilityAttribute(*AttrName, AttrNameLoc, Attrs, EndLoc, ScopeName, ScopeLoc, Form); @@ -723,6 +729,9 @@ unsigned Parser::ParseClangAttributeArgs( ParsedAttr::Kind AttrKind = ParsedAttr::getParsedKind(AttrName, ScopeName, Form.getSyntax()); + SaveAndRestore<ConversionAction> 
SavedTranslationState(ParserConversionAction, + CA_NoConversion); + switch (AttrKind) { default: return ParseAttributeArgsCommon(AttrName, AttrNameLoc, Attrs, EndLoc, @@ -1546,6 +1555,7 @@ void Parser::ParseExternalSourceSymbolAttribute( SkipUntil(tok::comma, tok::r_paren, StopAtSemi | StopBeforeMatch); continue; } + if (Keyword == Ident_language) { if (HadLanguage) { Diag(KeywordLoc, diag::err_external_source_symbol_duplicate_clause) diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 893989bd2398f..388cfa662068a 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -1001,6 +1001,8 @@ Decl *Parser::ParseStaticAssertDeclaration(SourceLocation &DeclEnd) { return nullptr; } } else if (tokenIsLikeStringLiteral(Tok, getLangOpts())) { + SaveAndRestore<ConversionAction> SavedTranslationState( + ParserConversionAction, CA_NoConversion); AssertMessage = ParseUnevaluatedStringLiteralExpression(); } else { Diag(Tok, diag::err_expected_string_literal) diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 2987d32d6e0d2..f8855d06fa343 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -3060,9 +3060,9 @@ ExprResult Parser::ParseStringLiteralExpression(bool AllowUserDefinedLiteral, } // Pass the set of string tokens, ready for concatenation, to the actions. - return Actions.ActOnStringLiteral(StringToks, - AllowUserDefinedLiteral ? getCurScope() - : nullptr); + return Actions.ActOnStringLiteral( + StringToks, AllowUserDefinedLiteral ? 
getCurScope() : nullptr, + ParserConversionAction); } ExprResult Parser::ParseGenericSelectionExpression() { diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 5e1fd4df1a3f0..7ac5e0a36d60e 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -70,6 +70,8 @@ Parser::Parser(Preprocessor &pp, Sema &actions, bool skipFunctionBodies) NumCachedScopes = 0; CurParsedObjCImpl = nullptr; + ParserConversionAction = CA_ToExecEncoding; + // Add #pragma handlers. These are removed and destroyed in the // destructor. initializePragmaHandlers(); @@ -1551,6 +1553,8 @@ void Parser::ParseKNRParamDeclarations(Declarator &D) { } ExprResult Parser::ParseAsmStringLiteral(bool ForAsmLabel) { + SaveAndRestore<ConversionAction> SavedTranslationState(ParserConversionAction, + CA_NoConversion); ExprResult AsmString; if (isTokenStringLiteral()) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 391ad927af439..089fdc5c5b6cc 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -2159,8 +2159,8 @@ ExprResult Sema::ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks) { if (getLangOpts().MicrosoftExt) StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks); - StringLiteralParser Literal(StringToks, PP, - StringLiteralEvalMethod::Unevaluated); + StringLiteralParser Literal( + StringToks, PP, StringLiteralEvalMethod::Unevaluated, CA_NoConversion); if (Literal.hadError) return ExprError(); @@ -2231,8 +2231,8 @@ Sema::ExpandFunctionLocalPredefinedMacros(ArrayRef<Token> Toks) { return ExpandedToks; } -ExprResult -Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { +ExprResult Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope, + ConversionAction Action) { assert(!StringToks.empty() && "Must have at least one string!"); // StringToks needs backing storage as it doesn't hold array elements itself @@ -2240,8 +2240,8 @@ 
Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) { if (getLangOpts().MicrosoftExt) StringToks = ExpandedToks = ExpandFunctionLocalPredefinedMacros(StringToks); - StringLiteralParser Literal( - StringToks, PP, StringLiteralEvalMethod::Evaluated, CA_ToLiteralEncoding); + StringLiteralParser Literal(StringToks, PP, + StringLiteralEvalMethod::Evaluated, Action); if (Literal.hadError) return ExprError(); diff --git a/clang/test/CodeGen/systemz-charset-diag.cpp b/clang/test/CodeGen/systemz-charset-diag.cpp index 4ed94810150a3..ad08e1f391214 100644 --- a/clang/test/CodeGen/systemz-charset-diag.cpp +++ b/clang/test/CodeGen/systemz-charset-diag.cpp @@ -1,3 +1,11 @@ // RUN: %clang_cc1 -triple s390x-none-zos -fexec-charset IBM-1047 %s -std=c++17 -emit-llvm -o - -verify const char* Computer = "🖥️"; // expected-error-re {{conversion to literal encoding failed: {{.*}}}} + +static_assert(false, "Error string"); // expected-error {{static assertion failed: Error string}} + +[[deprecated("message")]] void test_deprecated() {return;} // expected-note {{'test_deprecated' has been explicitly marked deprecated here}} + +int main() { + test_deprecated(); // expected-warning {{'test_deprecated' is deprecated: message}} +} diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 766b6a83f00ff..618b0cc203ab6 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -66,3 +66,8 @@ const char* hello_macro = HELLO; const char* preprocessor_concatenation = HELLO_WORLD; //CHECK: c"\C8\85\93\93\96@\E6\96\99\93\84Z\00" //CHECK-UTF8: c"Hello World!\00" + +void test1() { + printf(__FUNCTION__); +} +//CHECK: @__FUNCTION__.test1 = private unnamed_addr constant [6 x i8] c"\A3\85\A2\A3\F1\00" >From 840f505abbb265ac25f665ab8c8451725f1cd051 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:29:23 -0400 Subject: [PATCH 3/9] Remove old include --- 
clang/include/clang/Sema/Sema.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d54e4ce19166a..aecd0d1c2f5dd 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,7 +55,6 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" -#include "clang/Lex/LiteralConverter.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" >From 7ee3d5c76f9538338d6449f52b2d5cf120afb389 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Mon, 11 May 2026 09:27:48 -0400 Subject: [PATCH 4/9] Fix build failure --- clang/include/clang/Sema/Sema.h | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index aecd0d1c2f5dd..5d00b0c94daa3 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,6 +55,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/TextEncodingConfig.h" #include "clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" >From ff2c43189b2224b2f0e4e3ba68d4d3558149e634 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Tue, 12 May 2026 08:07:08 -0400 Subject: [PATCH 5/9] fix CI --- clang/test/CodeGen/systemz-charset.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/test/CodeGen/systemz-charset.c b/clang/test/CodeGen/systemz-charset.c index 618b0cc203ab6..16f269f8fb2f5 100644 --- a/clang/test/CodeGen/systemz-charset.c +++ b/clang/test/CodeGen/systemz-charset.c @@ -1,6 +1,8 @@ // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset IBM-1047 -o - | FileCheck %s // RUN: %clang_cc1 %s -emit-llvm -triple s390x-none-zos -fexec-charset UTF-8 -DIBM1047_ONLY=1 -o - | 
FileCheck %s --check-prefix=CHECK-UTF8 +int printf(char const *, ...); + const char *UpperCaseLetters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; //CHECK: c"\C1\C2\C3\C4\C5\C6\C7\C8\C9\D1\D2\D3\D4\D5\D6\D7\D8\D9\E2\E3\E4\E5\E6\E7\E8\E9\00" //CHECK-UTF8: c"ABCDEFGHIJKLMNOPQRSTUVWXYZ\00" >From 4011c35715375f7e727819b2fd287ab6644e3f02 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Tue, 12 May 2026 15:21:15 -0400 Subject: [PATCH 6/9] fix CI --- clang/include/clang/AST/Expr.h | 6 ++++++ clang/include/clang/Sema/Sema.h | 4 ++-- clang/lib/AST/Expr.cpp | 15 +++++++++++++++ clang/lib/Parse/ParseDecl.cpp | 1 - clang/lib/Parse/Parser.cpp | 2 +- clang/lib/Sema/SemaExpr.cpp | 5 +++-- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index b91bf4a5375fb..69ac328c8f0a7 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -28,6 +28,7 @@ #include "clang/Basic/LangOptions.h" #include "clang/Basic/SyncScope.h" #include "clang/Basic/TypeTraits.h" +#include "clang/Lex/TextEncoding.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SmallVector.h" @@ -2066,6 +2067,11 @@ class PredefinedExpr final return getIdentKindName(getIdentKind()); } + static std::string + ComputeNameAndTranslate(PredefinedIdentKind IK, const Decl *CurrentDecl, + TextEncoding &TE, + bool ForceElaboratedPrinting = false); + static std::string ComputeName(PredefinedIdentKind IK, const Decl *CurrentDecl, bool ForceElaboratedPrinting = false); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5d00b0c94daa3..f78455769a082 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -55,7 +55,7 @@ #include "clang/Basic/TemplateKinds.h" #include "clang/Basic/TokenKinds.h" #include "clang/Basic/TypeTraits.h" -#include "clang/Lex/TextEncodingConfig.h" +#include "clang/Lex/TextEncoding.h" #include 
"clang/Sema/AnalysisBasedWarnings.h" #include "clang/Sema/Attr.h" #include "clang/Sema/CleanupInfo.h" @@ -7376,7 +7376,7 @@ class Sema final : public SemaBase { /// to one string. ExprResult ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope = nullptr, - ConversionAction Action = CA_ToExecEncoding); + ConversionAction Action = CA_ToLiteralEncoding); ExprResult ActOnUnevaluatedStringLiteral(ArrayRef<Token> StringToks); diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 90747be4208e1..ead2880b9ebec 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -673,6 +673,21 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedIdentKind IK) { llvm_unreachable("Unknown ident kind for PredefinedExpr"); } +std::string PredefinedExpr::ComputeNameAndTranslate( + PredefinedIdentKind IK, const Decl *CurrentDecl, TextEncoding &TE, + bool ForceElaboratedPrinting) { + using namespace clang::charinfo; + std::string Result = ComputeName(IK, CurrentDecl, ForceElaboratedPrinting); + llvm::TextEncodingConverter *Converter = + TE.getConverter(CA_ToLiteralEncoding); + if (Converter) { + SmallString<128> Converted; + Converter->convert(Result, Converted); + Result = std::string(Converted); + } + return Result; +} + // FIXME: Maybe this should use DeclPrinter with a special "print predefined // expr" policy instead. 
std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 97e0721c02b1b..3aa41ebc05397 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -1555,7 +1555,6 @@ void Parser::ParseExternalSourceSymbolAttribute( SkipUntil(tok::comma, tok::r_paren, StopAtSemi | StopBeforeMatch); continue; } - if (Keyword == Ident_language) { if (HadLanguage) { Diag(KeywordLoc, diag::err_external_source_symbol_duplicate_clause) diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 7ac5e0a36d60e..5a199b842fe8e 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -70,7 +70,7 @@ Parser::Parser(Preprocessor &pp, Sema &actions, bool skipFunctionBodies) NumCachedScopes = 0; CurParsedObjCImpl = nullptr; - ParserConversionAction = CA_ToExecEncoding; + ParserConversionAction = CA_ToLiteralEncoding; // Add #pragma handlers. These are removed and destroyed in the // destructor. diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 089fdc5c5b6cc..eac281b523862 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3636,8 +3636,9 @@ ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc, // the string. 
bool ForceElaboratedPrinting = IK == PredefinedIdentKind::Function && getLangOpts().MSVCCompat; - auto Str = - PredefinedExpr::ComputeName(IK, currentDecl, ForceElaboratedPrinting); + auto Str = PredefinedExpr::ComputeNameAndTranslate( + IK, currentDecl, getPreprocessor().getTextEncoding(), + ForceElaboratedPrinting); unsigned Length = Str.length(); llvm::APInt LengthI(32, Length + 1); >From 6b3785f0e03b0344a7459bc185733342b47dcc43 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Fri, 8 May 2026 12:19:11 -0400 Subject: [PATCH 7/9] Add format string handling --- clang/include/clang/AST/FormatString.h | 12 ++-- clang/include/clang/Basic/TargetInfo.h | 3 + clang/include/clang/Lex/TextEncoding.h | 3 +- clang/lib/AST/FormatString.cpp | 86 ++++++++++++----------- clang/lib/AST/FormatStringParsing.h | 36 +++++++--- clang/lib/AST/PrintfFormatString.cpp | 89 +++++++++++++++--------- clang/lib/AST/ScanfFormatString.cpp | 23 +++--- clang/lib/Basic/TargetInfo.cpp | 3 + clang/lib/Frontend/CompilerInstance.cpp | 4 +- clang/lib/Lex/TextEncoding.cpp | 11 ++- clang/lib/Sema/SemaChecking.cpp | 54 ++++++++------ llvm/include/llvm/Support/TextEncoding.h | 10 +++ llvm/lib/Support/TextEncoding.cpp | 19 +++++ 13 files changed, 233 insertions(+), 120 deletions(-) diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index a3382e1a1d007..a24ade2d71ee9 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -19,6 +19,7 @@ #define LLVM_CLANG_AST_FORMATSTRING_H #include "clang/AST/CanonicalType.h" +#include "llvm/Support/TextEncoding.h" #include <optional> namespace clang { @@ -728,7 +729,8 @@ class FormatStringHandler { virtual bool HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } @@ -744,10 
+746,10 @@ class FormatStringHandler { // Scanf-specific handlers. - virtual bool - HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) { + virtual bool HandleInvalidScanfConversionSpecifier( + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { return true; } diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index a4984cffc430a..909bde840d3fa 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -38,6 +38,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Error.h" +#include "llvm/Support/TextEncoding.h" #include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" #include <cassert> @@ -323,6 +324,8 @@ class TargetInfo : public TransferrableTargetInfo, virtual ~TargetInfo(); + llvm::TextEncodingConverter *FormatStrConverter; + /// Retrieve the target options. 
TargetOptions &getTargetOpts() const { assert(TargetOpts && "Missing target options"); diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h index c892d1fadbc38..f525b06cff37b 100644 --- a/clang/include/clang/Lex/TextEncoding.h +++ b/clang/include/clang/Lex/TextEncoding.h @@ -22,7 +22,8 @@ class TextEncoding { public: llvm::TextEncodingConverter *getConverter(ConversionAction Action) const; static std::error_code - setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts); + setConvertersFromOptions(TextEncoding &TE, const clang::LangOptions &Opts, + clang::TargetInfo &TInfo); llvm::StringRef getLiteralEncoding() { return LiteralEncoding; } }; diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 7e1ac0de6dcaf..0d449fb5f0904 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -33,8 +33,9 @@ FormatStringHandler::~FormatStringHandler() {} // scanf format strings. //===----------------------------------------------------------------------===// -OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, - const char *E) { +OptionalAmount clang::analyze_format_string::ParseAmount( + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; UpdateOnReturn<const char *> UpdateBeg(Beg, I); @@ -42,7 +43,7 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, bool hasDigits = false; for (; I != E; ++I) { - char c = *I; + char c = FormatStrConverter.convert(*I); if (c >= '0' && c <= '9') { hasDigits = true; accumulator = (accumulator * 10) + (c - '0'); @@ -60,21 +61,22 @@ OptionalAmount clang::analyze_format_string::ParseAmount(const char *&Beg, } OptionalAmount clang::analyze_format_string::ParseNonPositionAmount( - const char *&Beg, const char *E, unsigned &argIndex) { - if (*Beg == '*') { + const char *&Beg, const char *E, unsigned &argIndex, + const 
llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { ++Beg; return OptionalAmount(OptionalAmount::Arg, argIndex++, Beg, 0, false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } OptionalAmount clang::analyze_format_string::ParsePositionAmount( FormatStringHandler &H, const char *Start, const char *&Beg, const char *E, - PositionContext p) { - if (*Beg == '*') { + PositionContext p, const llvm::TextEncodingConverter &FormatStrConverter) { + if (FormatStrConverter.convert(*Beg) == '*') { const char *I = Beg + 1; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() == OptionalAmount::NotSpecified) { H.HandleInvalidPosition(Beg, I - Beg, p); @@ -89,7 +91,7 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount( assert(Amt.getHowSpecified() == OptionalAmount::Constant); - if (*I == '$') { + if (FormatStrConverter.convert(*I) == '$') { // Handle positional arguments // Special case: '*0$', since this is an easy mistake. @@ -109,18 +111,21 @@ OptionalAmount clang::analyze_format_string::ParsePositionAmount( return OptionalAmount(false); } - return ParseAmount(Beg, E); + return ParseAmount(Beg, E, FormatStrConverter); } bool clang::analyze_format_string::ParseFieldWidth( FormatStringHandler &H, FormatSpecifier &CS, const char *Start, - const char *&Beg, const char *E, unsigned *argIndex) { + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { // FIXME: Support negative field widths. 
if (argIndex) { - CS.setFieldWidth(ParseNonPositionAmount(Beg, E, *argIndex)); + CS.setFieldWidth( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { const OptionalAmount Amt = ParsePositionAmount( - H, Start, Beg, E, analyze_format_string::FieldWidthPos); + H, Start, Beg, E, analyze_format_string::FieldWidthPos, + FormatStrConverter); if (Amt.isInvalid()) return true; @@ -129,14 +134,13 @@ bool clang::analyze_format_string::ParseFieldWidth( return false; } -bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, - FormatSpecifier &FS, - const char *Start, - const char *&Beg, - const char *E) { +bool clang::analyze_format_string::ParseArgPosition( + FormatStringHandler &H, FormatSpecifier &FS, const char *Start, + const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter) { const char *I = Beg; - const OptionalAmount &Amt = ParseAmount(I, E); + const OptionalAmount &Amt = ParseAmount(I, E, FormatStrConverter); if (I == E) { // No more characters left? @@ -144,7 +148,8 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return true; } - if (Amt.getHowSpecified() == OptionalAmount::Constant && *(I++) == '$') { + if (Amt.getHowSpecified() == OptionalAmount::Constant && + FormatStrConverter.convert(*(I++)) == '$') { // Warn that positional arguments are non-standard. 
H.HandlePosition(Start, I - Start); @@ -165,16 +170,15 @@ bool clang::analyze_format_string::ParseArgPosition(FormatStringHandler &H, return false; } -bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, - FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO) { +bool clang::analyze_format_string::ParseVectorModifier( + FormatStringHandler &H, FormatSpecifier &FS, const char *&I, const char *E, + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter) { if (!LO.OpenCL) return false; const char *Start = I; - if (*I == 'v') { + if (FormatStrConverter.convert(*I) == 'v') { ++I; if (I == E) { @@ -182,7 +186,7 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return true; } - OptionalAmount NumElts = ParseAmount(I, E); + OptionalAmount NumElts = ParseAmount(I, E, FormatStrConverter); if (NumElts.getHowSpecified() != OptionalAmount::Constant) { H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -194,22 +198,20 @@ bool clang::analyze_format_string::ParseVectorModifier(FormatStringHandler &H, return false; } -bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, - const char *&I, - const char *E, - const LangOptions &LO, - bool IsScanf) { +bool clang::analyze_format_string::ParseLengthModifier( + FormatSpecifier &FS, const char *&I, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, bool IsScanf) { LengthModifier::Kind lmKind = LengthModifier::None; const char *lmPosition = I; - switch (*I) { + switch (FormatStrConverter.convert(*I)) { default: return false; case 'h': ++I; - if (I != E && *I == 'h') { + if (I != E && FormatStrConverter.convert(*I) == 'h') { ++I; lmKind = LengthModifier::AsChar; - } else if (I != E && *I == 'l' && LO.OpenCL) { + } else if (I != E && FormatStrConverter.convert(*I) == 'l' && LO.OpenCL) { ++I; lmKind = LengthModifier::AsShortLong; } else { @@ -218,7 +220,7 @@ 
bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, break; case 'l': ++I; - if (I != E && *I == 'l') { + if (I != E && FormatStrConverter.convert(*I) == 'l') { ++I; lmKind = LengthModifier::AsLongLong; } else { @@ -251,7 +253,9 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, // be parsed as the GNU extension 'a' length modifier. If not, this // will be parsed as a conversion specifier. ++I; - if (I != E && (*I == 's' || *I == 'S' || *I == '[')) { + if (I != E && (FormatStrConverter.convert(*I) == 's' || + FormatStrConverter.convert(*I) == 'S' || + FormatStrConverter.convert(*I) == '[')) { lmKind = LengthModifier::AsAllocate; break; } @@ -269,7 +273,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, // scanf: AsInt64 case 'I': if (I + 1 != E && I + 2 != E) { - if (I[1] == '6' && I[2] == '4') { + if (FormatStrConverter.convert(I[1]) == '6' && + FormatStrConverter.convert(I[2]) == '4') { I += 3; lmKind = LengthModifier::AsInt64; break; @@ -277,7 +282,8 @@ bool clang::analyze_format_string::ParseLengthModifier(FormatSpecifier &FS, if (IsScanf) return false; - if (I[1] == '3' && I[2] == '2') { + if (FormatStrConverter.convert(I[1]) == '3' && + FormatStrConverter.convert(I[2]) == '2') { I += 3; lmKind = LengthModifier::AsInt32; break; diff --git a/clang/lib/AST/FormatStringParsing.h b/clang/lib/AST/FormatStringParsing.h index 401528481a9d6..531bc291e0b5b 100644 --- a/clang/lib/AST/FormatStringParsing.h +++ b/clang/lib/AST/FormatStringParsing.h @@ -35,29 +35,43 @@ template <typename T> class UpdateOnReturn { namespace analyze_format_string { -OptionalAmount ParseAmount(const char *&Beg, const char *E); -OptionalAmount ParseNonPositionAmount(const char *&Beg, const char *E, - unsigned &argIndex); +OptionalAmount +ParseAmount(const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); -OptionalAmount ParsePositionAmount(FormatStringHandler &H, const char 
*Start, - const char *&Beg, const char *E, - PositionContext p); +OptionalAmount +ParseNonPositionAmount(const char *&Beg, const char *E, unsigned &argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); + +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); + +OptionalAmount +ParsePositionAmount(FormatStringHandler &H, const char *Start, const char *&Beg, + const char *E, PositionContext p, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseFieldWidth(FormatStringHandler &H, FormatSpecifier &CS, const char *Start, const char *&Beg, const char *E, - unsigned *argIndex); + unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseArgPosition(FormatStringHandler &H, FormatSpecifier &CS, - const char *Start, const char *&Beg, const char *E); + const char *Start, const char *&Beg, const char *E, + const llvm::TextEncodingConverter &FormatStrConverter); bool ParseVectorModifier(FormatStringHandler &H, FormatSpecifier &FS, - const char *&Beg, const char *E, - const LangOptions &LO); + const char *&Beg, const char *E, const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter); /// Returns true if a LengthModifier was parsed and installed in the /// FormatSpecifier& argument, and false otherwise. 
bool ParseLengthModifier(FormatSpecifier &FS, const char *&Beg, const char *E, - const LangOptions &LO, bool IsScanf = false); + const LangOptions &LO, + const llvm::TextEncodingConverter &FormatStrConverter, + bool IsScanf = false); /// Returns true if the invalid specifier in \p SpecifierBegin is a UTF-8 /// string; check that it won't go further than \p FmtStrEnd and write diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 6610a2de9e083..7efcc554ec136 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -35,14 +35,17 @@ typedef clang::analyze_format_string::SpecifierResult<PrintfSpecifier> using analyze_format_string::ParseNonPositionAmount; -static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, - const char *Start, const char *&Beg, const char *E, - unsigned *argIndex) { +static bool +ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, const char *Start, + const char *&Beg, const char *E, unsigned *argIndex, + const llvm::TextEncodingConverter &FormatStrConverter) { if (argIndex) { - FS.setPrecision(ParseNonPositionAmount(Beg, E, *argIndex)); + FS.setPrecision( + ParseNonPositionAmount(Beg, E, *argIndex, FormatStrConverter)); } else { const OptionalAmount Amt = ParsePositionAmount( - H, Start, Beg, E, analyze_format_string::PrecisionPos); + H, Start, Beg, E, analyze_format_string::PrecisionPos, + FormatStrConverter); if (Amt.isInvalid()) return true; FS.setPrecision(Amt); @@ -50,11 +53,14 @@ static bool ParsePrecision(FormatStringHandler &H, PrintfSpecifier &FS, return false; } -static bool ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, - const char *FlagBeg, const char *E, bool Warn) { +static bool +ParseObjCFlags(FormatStringHandler &H, PrintfSpecifier &FS, const char *FlagBeg, + const char *E, bool Warn, + const llvm::TextEncodingConverter &FormatStrConverter) { StringRef Flag(FlagBeg, E - FlagBeg); // Currently there is only one 
flag. - if (Flag == "tt") { + if (Flag.size() == 2 && FormatStrConverter.convert(FlagBeg[0]) == 't' && + FormatStrConverter.convert(FlagBeg[1]) == 't') { FS.setHasObjCTechnicalTerm(FlagBeg); return false; } @@ -81,6 +87,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, const char *Start = nullptr; UpdateOnReturn<const char *> UpdateBeg(Beg, I); + const llvm::TextEncodingConverter &FormatStrConverter = + *Target.FormatStrConverter; // Look for a '%' character that indicates the start of a format specifier. for (; I != E; ++I) { char c = *I; @@ -89,7 +97,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, H.HandleNullChar(I); return true; } - if (c == '%') { + if (FormatStrConverter.convert(c) == '%') { Start = I++; // Record the start of the format specifier. break; } @@ -107,7 +115,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } PrintfSpecifier FS; - if (ParseArgPosition(H, FS, Start, I, E)) + if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter)) return true; if (I == E) { @@ -117,13 +125,17 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, return true; } - if (*I == '{') { + if (FormatStrConverter.convert(*I) == '{') { ++I; unsigned char PrivacyFlags = 0; StringRef MatchedStr; do { - StringRef Str(I, E - I); + const char *II; + std::string S(I, E - I); + for (unsigned long i = 0; i < S.length(); ++i) + S[i] = FormatStrConverter.convert(S[i]); + StringRef Str(S); std::string Match = "^[[:space:]]*" "(private|public|sensitive|mask\\.[^[:space:],}]*)" "[[:space:]]*(,|})"; @@ -132,25 +144,38 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, if (R.match(Str, &Matches)) { MatchedStr = Matches[1]; + II = I; I += Matches[0].size(); + while (FormatStrConverter.convert(*II) == ' ') + ++II; + // Set the privacy flag if the privacy annotation in the // comma-delimited segment is at least as strict as the 
privacy // annotations in previous comma-delimited segments. if (MatchedStr.starts_with("mask")) { - StringRef MaskType = MatchedStr.substr(sizeof("mask.") - 1); + StringRef MaskType(II + sizeof("mask.") - 1, + MatchedStr.size() - sizeof("mask.") + 1); unsigned Size = MaskType.size(); + if (Warn && (Size == 0 || Size > 8)) H.handleInvalidMaskType(MaskType); FS.setMaskType(MaskType); - } else if (MatchedStr == "sensitive") + } else if (MatchedStr == "sensitive") { + StringRef ProxyMatchedStr(II, sizeof("sensitive") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsSensitive; - else if (PrivacyFlags != - clang::analyze_os_log::OSLogBufferItem::IsSensitive && - MatchedStr == "private") + } else if (PrivacyFlags != + clang::analyze_os_log::OSLogBufferItem::IsSensitive && + MatchedStr == "private") { + StringRef ProxyMatchedStr(II, sizeof("private") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPrivate; - else if (PrivacyFlags == 0 && MatchedStr == "public") + } else if (PrivacyFlags == 0 && MatchedStr == "public") { + StringRef ProxyMatchedStr(II, sizeof("public") - 1); + MatchedStr = ProxyMatchedStr; PrivacyFlags = clang::analyze_os_log::OSLogBufferItem::IsPublic; + } } else { size_t CommaOrBracePos = Str.find_if([](char c) { return c == ',' || c == '}'; }); @@ -165,7 +190,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, I += CommaOrBracePos + 1; } // Continue until the closing brace is found. - } while (*(I - 1) == ','); + } while (FormatStrConverter.convert(*(I - 1)) == ','); // Set the privacy flag. switch (PrivacyFlags) { @@ -188,7 +213,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Look for flags (if any). 
bool hasMore = true; for (; I != E; ++I) { - switch (*I) { + switch (FormatStrConverter.convert(*I)) { default: hasMore = false; break; @@ -225,7 +250,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Look for the field width (if any). if (ParseFieldWidth(H, FS, Start, I, E, - FS.usesPositionalArg() ? nullptr : &argIndex)) + FS.usesPositionalArg() ? nullptr : &argIndex, + FormatStrConverter)) return true; if (I == E) { @@ -236,7 +262,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } // Look for the precision (if any). - if (*I == '.') { + if (FormatStrConverter.convert(*I) == '.') { ++I; if (I == E) { if (Warn) @@ -245,7 +271,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } if (ParsePrecision(H, FS, Start, I, E, - FS.usesPositionalArg() ? nullptr : &argIndex)) + FS.usesPositionalArg() ? nullptr : &argIndex, + FormatStrConverter)) return true; if (I == E) { @@ -256,11 +283,11 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, } } - if (ParseVectorModifier(H, FS, I, E, LO)) + if (ParseVectorModifier(H, FS, I, E, LO, FormatStrConverter)) return true; // Look for the length modifier. - if (ParseLengthModifier(FS, I, E, LO) && I == E) { + if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter) && I == E) { // No more characters left? if (Warn) H.HandleIncompleteSpecifier(Start, E - Start); @@ -274,7 +301,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // enables better recovery, and we don't know if // these flags are applicable until later. const char *ObjCModifierFlagsStart = nullptr, *ObjCModifierFlagsEnd = nullptr; - if (*I == '[') { + if (FormatStrConverter.convert(*I) == '[') { ObjCModifierFlagsStart = I; ++I; auto flagStart = I; @@ -286,8 +313,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, return true; } // Did we find the closing ']'? 
- if (*I == ']') { - if (ParseObjCFlags(H, FS, flagStart, I, Warn)) + if (FormatStrConverter.convert(*I) == ']') { + if (ParseObjCFlags(H, FS, flagStart, I, Warn, FormatStrConverter)) return true; ++I; break; @@ -307,7 +334,7 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, // Finally, look for the conversion specifier. const char *conversionPosition = I++; ConversionSpecifier::Kind k = ConversionSpecifier::InvalidSpecifier; - switch (*conversionPosition) { + switch (FormatStrConverter.convert(*conversionPosition)) { default: break; // C99: 7.19.6.1 (section 8). @@ -470,7 +497,8 @@ ParsePrintfSpecifier(FormatStringHandler &H, const char *&Beg, const char *E, FS.setConversionSpecifier(CS); } // Assume the conversion takes one argument. - return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len); + return !H.HandleInvalidPrintfConversionSpecifier(FS, Start, Len, + FormatStrConverter); } return PrintfSpecifierResult(Start, FS); } @@ -480,7 +508,6 @@ bool clang::analyze_format_string::ParsePrintfString( const TargetInfo &Target, bool isFreeBSDKPrintf) { unsigned argIndex = 0; - // Keep looking for a format specifier until we have exhausted the string. while (I != E) { const PrintfSpecifierResult &FSR = ParsePrintfSpecifier( diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 90cbbd60bbcf5..c63171844d90d 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -81,7 +81,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, const char *I = Beg; const char *Start = nullptr; UpdateOnReturn<const char *> UpdateBeg(Beg, I); - + const llvm::TextEncodingConverter &FormatStrConverter = + *Target.FormatStrConverter; // Look for a '%' character that indicates the start of a format specifier. 
for (; I != E; ++I) { char c = *I; @@ -90,7 +91,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, H.HandleNullChar(I); return true; } - if (c == '%') { + SmallString<1> ConvertedChar; + FormatStrConverter.convert(StringRef(&c, 1), ConvertedChar); + if (ConvertedChar[0] == '%') { Start = I++; // Record the start of the format specifier. break; } @@ -107,7 +110,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } ScanfSpecifier FS; - if (ParseArgPosition(H, FS, Start, I, E)) + if (ParseArgPosition(H, FS, Start, I, E, FormatStrConverter)) return true; if (I == E) { @@ -117,7 +120,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } // Look for '*' flag if it is present. - if (*I == '*') { + if (FormatStrConverter.convert(*I) == '*') { FS.setSuppressAssignment(I); if (++I == E) { H.HandleIncompleteSpecifier(Start, E - Start); @@ -127,7 +130,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, // Look for the field width (if any). Unlike printf, this is either // a fixed integer or isn't present. - const OptionalAmount &Amt = clang::analyze_format_string::ParseAmount(I, E); + const OptionalAmount &Amt = + clang::analyze_format_string::ParseAmount(I, E, FormatStrConverter); if (Amt.getHowSpecified() != OptionalAmount::NotSpecified) { assert(Amt.getHowSpecified() == OptionalAmount::Constant); FS.setFieldWidth(Amt); @@ -140,7 +144,9 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, } // Look for the length modifier. - if (ParseLengthModifier(FS, I, E, LO, /*IsScanf=*/true) && I == E) { + if (ParseLengthModifier(FS, I, E, LO, FormatStrConverter, + /*IsScanf=*/true) && + I == E) { // No more characters left? H.HandleIncompleteSpecifier(Start, E - Start); return true; @@ -155,7 +161,7 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, // Finally, look for the conversion specifier. 
const char *conversionPosition = I++; ScanfConversionSpecifier::Kind k = ScanfConversionSpecifier::InvalidSpecifier; - switch (*conversionPosition) { + switch (FormatStrConverter.convert(*conversionPosition)) { default: break; case '%': @@ -262,7 +268,8 @@ static ScanfSpecifierResult ParseScanfSpecifier(FormatStringHandler &H, FS.setConversionSpecifier(CS); } // Assume the conversion takes one argument. - return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len); + return !H.HandleInvalidScanfConversionSpecifier(FS, Beg, Len, + FormatStrConverter); } return ScanfSpecifierResult(Start, FS); } diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 854d23cadaea2..0864d6855068a 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -194,6 +194,9 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { MaxOpenCLWorkGroupSize = 1024; MaxBitIntWidth.reset(); + + FormatStrConverter = new llvm::TextEncodingConverter( + std::move(*llvm::TextEncodingConverter::createNoopConverter())); } // Out of line virtual dtor for TargetInfo. 
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index f4e0f09035fff..ef2899c47f3c8 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -555,8 +555,8 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { if (GetDependencyDirectives) PP->setDependencyDirectivesGetter(*GetDependencyDirectives); - if (auto EC = TextEncoding::setConvertersFromOptions(PP->getTextEncoding(), - getLangOpts())) + if (auto EC = TextEncoding::setConvertersFromOptions( + PP->getTextEncoding(), getLangOpts(), getTarget())) PP->getDiagnostics().Report(clang::diag::err_fe_text_encoding_config) << PP->getTextEncoding().getLiteralEncoding(); } diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp index 393caaadd5d37..ec9945ec789bd 100644 --- a/clang/lib/Lex/TextEncoding.cpp +++ b/clang/lib/Lex/TextEncoding.cpp @@ -21,7 +21,8 @@ TextEncoding::getConverter(ConversionAction Action) const { std::error_code TextEncoding::setConvertersFromOptions(TextEncoding &TEC, - const clang::LangOptions &Opts) { + const clang::LangOptions &Opts, + clang::TargetInfo &TInfo) { using namespace llvm; const char *UTF8 = "UTF-8"; @@ -39,5 +40,13 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC, new TextEncodingConverter(std::move(*ErrorOrConverter)); else return ErrorOrConverter.getError(); + + ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding, + TEC.InternalEncoding); + + if (ErrorOrConverter) + TInfo.FormatStrConverter = + new TextEncodingConverter(std::move(*ErrorOrConverter)); + return std::error_code(); } diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 8a8c9cc9d2c23..d9cae43a69fdc 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -104,6 +104,7 @@ #include "llvm/Support/Locale.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SaveAndRestore.h" +#include 
"llvm/Support/TextEncoding.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/Triple.h" @@ -7930,10 +7931,10 @@ class CheckFormatHandler : public analyze_format_string::FormatStringHandler { ArrayRef<FixItHint> Fixit = {}); protected: - bool HandleInvalidConversionSpecifier(unsigned argIndex, SourceLocation Loc, - const char *startSpec, - unsigned specifierLen, - const char *csStart, unsigned csLen); + bool HandleInvalidConversionSpecifier( + unsigned argIndex, SourceLocation Loc, const char *startSpec, + unsigned specifierLen, const char *csStart, unsigned csLen, + const llvm::TextEncodingConverter &FormatStrConverter); void HandlePositionalNonpositionalArgs(SourceLocation Loc, const char *startSpec, @@ -8163,7 +8164,8 @@ void UncoveredArgHandler::Diagnose(Sema &S, bool IsFunctionCall, bool CheckFormatHandler::HandleInvalidConversionSpecifier( unsigned argIndex, SourceLocation Loc, const char *startSpec, - unsigned specifierLen, const char *csStart, unsigned csLen) { + unsigned specifierLen, const char *csStart, unsigned csLen, + const llvm::TextEncodingConverter &FormatStrConverter) { bool keepGoing = true; if (argIndex < NumDataArgs) { // Consider the argument coverered, even though the specifier doesn't @@ -8178,7 +8180,13 @@ bool CheckFormatHandler::HandleInvalidConversionSpecifier( keepGoing = false; } - StringRef Specifier(csStart, csLen); + // The csStart points to a character that has already been converted to the + // exec charset, so we have to reverse the conversion to allow diagnostic + // message to match an expected value when using -verify option, + std::string RS(csStart, csLen); + for (unsigned int i = 0; i < RS.size(); ++i) + RS[i] = FormatStrConverter.convert(RS[i]); + StringRef Specifier(RS); // If the specifier in non-printable, it could be the first byte of a UTF-8 // sequence. In that case, print the UTF-8 code point. 
If not, print the byte @@ -8332,7 +8340,8 @@ class CheckPrintfHandler : public CheckFormatHandler { bool HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) override; + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) override; void handleInvalidMaskType(StringRef MaskType) override; @@ -8472,13 +8481,14 @@ class DecomposePrintfHandler : public CheckPrintfHandler { bool CheckPrintfHandler::HandleInvalidPrintfConversionSpecifier( const analyze_printf::PrintfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { const analyze_printf::PrintfConversionSpecifier &CS = FS.getConversionSpecifier(); return HandleInvalidConversionSpecifier( FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier, - specifierLen, CS.getStart(), CS.getLength()); + specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter); } void CheckPrintfHandler::handleInvalidMaskType(StringRef MaskType) { @@ -8986,15 +8996,15 @@ bool CheckPrintfHandler::HandlePrintfSpecifier( // Check for using an Objective-C specific conversion specifier // in a non-ObjC literal. if (!allowsObjCArg() && CS.isObjCArg()) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // %P can only be used with os_log. if (FSType != FormatStringType::OSLog && CS.getKind() == ConversionSpecifier::PArg) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // %n is not allowed with os_log. 
@@ -9013,8 +9023,8 @@ bool CheckPrintfHandler::HandlePrintfSpecifier( (CS.getKind() == ConversionSpecifier::PArg || CS.getKind() == ConversionSpecifier::sArg || CS.getKind() == ConversionSpecifier::ObjCObjArg)) { - return HandleInvalidPrintfConversionSpecifier(FS, startSpecifier, - specifierLen); + return HandleInvalidPrintfConversionSpecifier( + FS, startSpecifier, specifierLen, *Target.FormatStrConverter); } // Check for use of public/private annotation outside of os_log(). @@ -9687,10 +9697,10 @@ class CheckScanfHandler : public CheckFormatHandler { const char *startSpecifier, unsigned specifierLen) override; - bool - HandleInvalidScanfConversionSpecifier(const analyze_scanf::ScanfSpecifier &FS, - const char *startSpecifier, - unsigned specifierLen) override; + bool HandleInvalidScanfConversionSpecifier( + const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) override; void HandleIncompleteScanList(const char *start, const char *end) override; }; @@ -9706,13 +9716,15 @@ void CheckScanfHandler::HandleIncompleteScanList(const char *start, bool CheckScanfHandler::HandleInvalidScanfConversionSpecifier( const analyze_scanf::ScanfSpecifier &FS, const char *startSpecifier, - unsigned specifierLen) { + unsigned specifierLen, + const llvm::TextEncodingConverter &FormatStrConverter) { + const analyze_scanf::ScanfConversionSpecifier &CS = FS.getConversionSpecifier(); return HandleInvalidConversionSpecifier( FS.getArgIndex(), getLocationOfByte(CS.getStart()), startSpecifier, - specifierLen, CS.getStart(), CS.getLength()); + specifierLen, CS.getStart(), CS.getLength(), FormatStrConverter); } bool CheckScanfHandler::HandleScanfSpecifier( diff --git a/llvm/include/llvm/Support/TextEncoding.h b/llvm/include/llvm/Support/TextEncoding.h index 8a304910aa5dd..8f5a6122ede45 100644 --- a/llvm/include/llvm/Support/TextEncoding.h +++ b/llvm/include/llvm/Support/TextEncoding.h @@ -105,6 
+105,8 @@ class TextEncodingConverter { LLVM_ABI static ErrorOr<TextEncodingConverter> create(StringRef From, StringRef To); + LLVM_ABI static ErrorOr<TextEncodingConverter> createNoopConverter(); + TextEncodingConverter(const TextEncodingConverter &) = delete; TextEncodingConverter &operator=(const TextEncodingConverter &) = delete; @@ -135,6 +137,14 @@ class TextEncodingConverter { return std::string(Result); return EC; } + + char convert(char SingleChar) const { + SmallString<1> Result; + auto EC = Converter->convert(StringRef(&SingleChar, 1), Result); + if (!EC) + return Result[0]; + return '\0'; + } }; } // namespace llvm diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp index d36f02c1300b9..5c1d9696686a2 100644 --- a/llvm/lib/Support/TextEncoding.cpp +++ b/llvm/lib/Support/TextEncoding.cpp @@ -356,3 +356,22 @@ ErrorOr<TextEncodingConverter> TextEncodingConverter::create(StringRef From, return std::make_error_code(std::errc::invalid_argument); #endif } + +class TextEncodingConverterNoop final + : public details::TextEncodingConverterImplBase { + +public: + TextEncodingConverterNoop() {} + + std::error_code convertString(StringRef Source, + SmallVectorImpl<char> &Result) override { + Result.assign(Source.begin(), Source.end()); + return std::error_code(); + } + + void reset() override {} +}; + +ErrorOr<TextEncodingConverter> TextEncodingConverter::createNoopConverter() { + return TextEncodingConverter(std::make_unique<TextEncodingConverterNoop>()); +} >From 48c2a16354369421987de1ab9e69770b99586654 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Wed, 13 May 2026 15:10:35 -0400 Subject: [PATCH 8/9] fix CI --- clang/include/clang/Lex/TextEncoding.h | 1 + clang/lib/Lex/TextEncoding.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Lex/TextEncoding.h b/clang/include/clang/Lex/TextEncoding.h index f525b06cff37b..097e96371338b 100644 --- 
a/clang/include/clang/Lex/TextEncoding.h +++ b/clang/include/clang/Lex/TextEncoding.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_LEX_TEXTENCODING_H #include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/TextEncoding.h" diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp index ec9945ec789bd..682b45816cc7e 100644 --- a/clang/lib/Lex/TextEncoding.cpp +++ b/clang/lib/Lex/TextEncoding.cpp @@ -41,8 +41,8 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC, else return ErrorOrConverter.getError(); - ErrorOrConverter = llvm::TextEncodingConverter::create(TEC.SystemEncoding, - TEC.InternalEncoding); + ErrorOrConverter = llvm::TextEncodingConverter::create( + TInfo.getDefaultNarrowTextEncoding(), UTF8); if (ErrorOrConverter) TInfo.FormatStrConverter = >From d96b76449eed75eaa209aa69be39412e6ce5fbe7 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan <[email protected]> Date: Thu, 28 May 2026 15:16:49 -0400 Subject: [PATCH 9/9] do not convert character by character --- clang/lib/Lex/TextEncoding.cpp | 2 +- clang/lib/Sema/SemaChecking.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/clang/lib/Lex/TextEncoding.cpp b/clang/lib/Lex/TextEncoding.cpp index 682b45816cc7e..df3e4dbcaf4b4 100644 --- a/clang/lib/Lex/TextEncoding.cpp +++ b/clang/lib/Lex/TextEncoding.cpp @@ -42,7 +42,7 @@ TextEncoding::setConvertersFromOptions(TextEncoding &TEC, return ErrorOrConverter.getError(); ErrorOrConverter = llvm::TextEncodingConverter::create( - TInfo.getDefaultNarrowTextEncoding(), UTF8); + TInfo.getDefaultOrdinaryTextEncoding(), UTF8); if (ErrorOrConverter) TInfo.FormatStrConverter = diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d9cae43a69fdc..6ec6979440369 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -8183,10 +8183,12 @@ bool CheckFormatHandler::HandleInvalidConversionSpecifier( // 
The csStart points to a character that has already been converted to the // exec charset, so we have to reverse the conversion to allow diagnostic // message to match an expected value when using -verify option, - std::string RS(csStart, csLen); - for (unsigned int i = 0; i < RS.size(); ++i) - RS[i] = FormatStrConverter.convert(RS[i]); - StringRef Specifier(RS); + SmallString<4> RS; + auto EC = FormatStrConverter.convert(StringRef(csStart, csLen), RS); + if (EC) { + keepGoing = false; + } + llvm::StringRef Specifier(RS); // If the specifier in non-printable, it could be the first byte of a UTF-8 // sequence. In that case, print the UTF-8 code point. If not, print the byte _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
