https://github.com/cor3ntin created 
https://github.com/llvm/llvm-project/pull/164654

UTF-16 to UTF-16 conversions seems widespread,
and lone surrogate have a distinct representation in UTF-32.

Lets not warn on this case to make the warning easier to adopt. This follows 
SG-16 guideline

https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3695r2.html#changes-since-r1

Fixes #163719

>From 9b3789ae8e9827956d618a45a30c97197d1f0b3f Mon Sep 17 00:00:00 2001
From: Corentin Jabot <[email protected]>
Date: Sun, 19 Oct 2025 09:26:22 +0200
Subject: [PATCH] [Clang] Do not warn on UTF-16 -> UTF-32 conversions.
 (#163927)

UTF-16 to UTF-16 conversions seems widespread,
and lone surrogate have a distinct representation in UTF-32.

Lets not warn on this case to make the warning easier to adopt. This
follows SG-16 guideline


https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3695r2.html#changes-since-r1

Fixes #163719
---
 clang/lib/Sema/SemaChecking.cpp                          | 9 ++++++++-
 clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp | 8 ++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index dd5b710d7e1d4..41bcf8fd493fc 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -12014,13 +12014,20 @@ static void 
DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
                                                    SourceLocation CC) {
   assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() 
&&
          Source != Target);
+
+  // Lone surrogates have a distinct representation in UTF-32.
+  // Converting between UTF-16 and UTF-32 codepoints seems very widespread,
+  // so don't warn on such conversion.
+  if (Source->isChar16Type() && Target->isChar32Type())
+    return;
+
   Expr::EvalResult Result;
   if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects,
                        S.isConstantEvaluatedContext())) {
     llvm::APSInt Value(32);
     Value = Result.Val.getInt();
     bool IsASCII = Value <= 0x7F;
-    bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF);
+    bool IsBMP = Value <= 0xDFFF || (Value >= 0xE000 && Value <= 0xFFFF);
     bool ConversionPreservesSemantics =
         IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP);
 
diff --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp 
b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
index fcff006d0e028..f17f20ca25295 100644
--- a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
+++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
@@ -14,7 +14,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
     c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 
'char16_t' may lose precision and change the meaning of the represented code 
unit}}
 
     c32(u8);  // expected-warning {{implicit conversion from 'char8_t' to 
'char32_t' may change the meaning of the represented code unit}}
-    c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 
'char32_t' may change the meaning of the represented code unit}}
+    c32(u16);
     c32(u32);
 
 
@@ -30,7 +30,7 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
     c16(char32_t(0x7f));
     c16(char32_t(0x80));
     c16(char32_t(0xD7FF));
-    c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 
'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}}
+    c16(char32_t(0xD800));
     c16(char32_t(0xE000));
     c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 
'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}}
 
@@ -44,8 +44,8 @@ void test(char8_t u8, char16_t u16, char32_t u32) {
     c32(char16_t(0x80));
 
     c32(char16_t(0xD7FF));
-    c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 
'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}}
-    c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 
'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}}
+    c32(char16_t(0xD800));
+    c32(char16_t(0xDFFF));
     c32(char16_t(0xE000));
     c32(char16_t(u'☕'));
 

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to