Author: cor3ntin
Date: 2025-05-15T18:16:05+02:00
New Revision: 381a649fb991eadb0c594de2d8b6166fcc11345a

URL: 
https://github.com/llvm/llvm-project/commit/381a649fb991eadb0c594de2d8b6166fcc11345a
DIFF: 
https://github.com/llvm/llvm-project/commit/381a649fb991eadb0c594de2d8b6166fcc11345a.diff

LOG: [Clang] Add warnings when mixing different charN_t types (#138708)

The charN_t types represent code units of different UTF encodings. Therefore,
the values of two objects of different charN_t types do not necessarily
represent the same characters.

In order to avoid comparing apples and oranges, we add new warnings to
warn on:
  - Implicit conversions
  - Comparisons
  - Other cases involving arithmetic conversions

We only produce the warning if we cannot establish the comparison would
be safe through constant evaluation.

The new `-Wcharacter-conversion` warning is enabled by default.

Note that this PR intentionally doesn't touch char/wchar_t, but it
would be worth considering extending the new warnings to these types
(in a follow-up).

Additionally, most arithmetic operations on charN_t don't really make
sense (i.e., what does it mean to add code units?), so we could add
warnings for that.

Fixes #138526

Added: 
    clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp

Modified: 
    clang/docs/ReleaseNotes.rst
    clang/include/clang/AST/ASTDiagnostic.h
    clang/include/clang/AST/Type.h
    clang/include/clang/Basic/DiagnosticGroups.td
    clang/include/clang/Basic/DiagnosticSemaKinds.td
    clang/lib/AST/ASTDiagnostic.cpp
    clang/lib/AST/Type.cpp
    clang/lib/Sema/SemaChecking.cpp
    clang/lib/Sema/SemaExpr.cpp
    libcxx/include/print
    libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
    libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
    libcxx/test/std/localization/codecvt_unicode.pass.cpp
    
libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
    
libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
    
libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
    
libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
    
libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
    libcxx/utils/libcxx/test/features.py
    llvm/include/llvm/Support/ConvertUTF.h
    llvm/lib/Support/ConvertUTFWrapper.cpp

Removed: 
    


################################################################################
diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e9ad11325db02..f271a8303cbc9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -534,6 +534,10 @@ Improvements to Clang's diagnostics
   packing may differ under the MS struct ABI (#GH117428).
 
 
+- A new ``-Wcharacter-conversion`` warns where comparing or implicitly converting
+  between different Unicode character types (``char8_t``, ``char16_t``, ``char32_t``).
+  This warning only triggers in C++ as these types are aliases in C. (#GH138526)
+
 Improvements to Clang's time-trace
 ----------------------------------
 

diff  --git a/clang/include/clang/AST/ASTDiagnostic.h 
b/clang/include/clang/AST/ASTDiagnostic.h
index ef22249828629..baa410e3e4a03 100644
--- a/clang/include/clang/AST/ASTDiagnostic.h
+++ b/clang/include/clang/AST/ASTDiagnostic.h
@@ -38,6 +38,9 @@ namespace clang {
   /// is initialized before passing it in.
   QualType desugarForDiagnostic(ASTContext &Context, QualType QT,
                                 bool &ShouldAKA);
+
+  std::string FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T);
+
 }  // end namespace clang
 
 #endif

diff  --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 180f3623983de..5c8c0e1cf1d00 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2521,6 +2521,7 @@ class alignas(TypeAlignment) Type : public 
ExtQualsTypeCommonBase {
   bool isChar16Type() const;
   bool isChar32Type() const;
   bool isAnyCharacterType() const;
+  bool isUnicodeCharacterType() const;
   bool isIntegralType(const ASTContext &Ctx) const;
 
   /// Determine whether this type is an integral or enumeration type.

diff  --git a/clang/include/clang/Basic/DiagnosticGroups.td 
b/clang/include/clang/Basic/DiagnosticGroups.td
index 5a3e756f07ecc..616f2555931f5 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -111,6 +111,7 @@ def EnumConversion : DiagGroup<"enum-conversion",
                                 ImplicitEnumEnumCast,
                                 EnumFloatConversion,
                                 EnumCompareConditional]>;
+def CharacterConversion : DiagGroup<"character-conversion">;
 def DeprecatedOFast : DiagGroup<"deprecated-ofast">;
 def ObjCSignedCharBoolImplicitIntConversion :
   DiagGroup<"objc-signed-char-bool-implicit-int-conversion">;
@@ -1119,6 +1120,7 @@ def Parentheses : DiagGroup<"parentheses",
 //   - __null-to-integer conversion warnings are on by default
 def Conversion : DiagGroup<"conversion",
                            [BoolConversion,
+                            CharacterConversion,
                             ConstantConversion,
                             EnumConversion,
                             BitFieldEnumConversion,

diff  --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td 
b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 6e940a318b61d..f0bd5a1174020 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -4369,6 +4369,29 @@ def warn_address_of_reference_bool_conversion : Warning<
   "code; pointer may be assumed to always convert to true">,
   InGroup<UndefinedBoolConversion>;
 
+def warn_impcast_unicode_char_type
+    : Warning<"implicit conversion from %0 to %1 may change the meaning of the 
"
+              "represented code unit">,
+      InGroup<CharacterConversion>;
+def warn_impcast_unicode_precision
+    : Warning<"implicit conversion from %0 to %1 may lose precision and change 
"
+              "the meaning of the represented code unit">,
+      InGroup<CharacterConversion>;
+def warn_impcast_unicode_char_type_constant
+    : Warning<"implicit conversion from %0 to %1 changes the meaning of the "
+              "%select{code unit|code point}2 '%3'">,
+      InGroup<CharacterConversion>;
+
+def warn_comparison_unicode_mixed_types
+    : Warning<"comparing values of 
diff erent Unicode code unit types %0 and %1 "
+              "may compare 
diff erent code points">,
+      InGroup<CharacterConversion>;
+
+def warn_comparison_unicode_mixed_types_constant
+    : Warning<"comparing values of 
diff erent Unicode code unit types %0 and %1 "
+              "compares unrelated code units '%2' and '%3'">,
+      InGroup<CharacterConversion>;
+
 def warn_xor_used_as_pow : Warning<
   "result of '%0' is %1; did you mean exponentiation?">,
   InGroup<XorUsedAsPow>;
@@ -6834,7 +6857,7 @@ def err_counted_by_on_incomplete_type_on_use : Error <
 
 def note_counted_by_consider_completing_pointee_ty : Note<
   "consider providing a complete definition for %0">;
-  
+
 def note_counted_by_consider_using_sized_by : Note<
   "consider using '__sized_by%select{|_or_null}0' instead of "
   "'__counted_by%select{|_or_null}0'">;
@@ -7733,6 +7756,11 @@ def warn_comparison_of_mixed_enum_types_switch : Warning<
   "%
diff { ($ and $)|}0,1">,
   InGroup<EnumCompareSwitch>;
 
+def warn_arith_conv_mixed_unicode_types
+    : Warning<"%sub{select_arith_conv_kind}0 "
+              "
diff erent Unicode character types %1 and %2">,
+      InGroup<CharacterConversion>;
+
 def err_typecheck_assign_const : Error<
   "%select{"
   "cannot assign to return value because function %1 returns a const value|"

diff  --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp
index 6cb09b0492ac9..a00d5801f054b 100644
--- a/clang/lib/AST/ASTDiagnostic.cpp
+++ b/clang/lib/AST/ASTDiagnostic.cpp
@@ -20,6 +20,8 @@
 #include "clang/AST/TemplateBase.h"
 #include "clang/AST/Type.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -2190,3 +2192,31 @@ static bool FormatTemplateTypeDiff(ASTContext &Context, 
QualType FromType,
   TD.DiffTemplate();
   return TD.Emit();
 }
+
+std::string clang::FormatUTFCodeUnitAsCodepoint(unsigned Value, QualType T) {
+  auto IsSingleCodeUnitCP = [](unsigned Value, QualType T) {
+    if (T->isChar8Type()) {
+      assert(Value <= 0xFF && "not a valid UTF-8 code unit");
+      return Value <= 0x7F;
+    }
+    if (T->isChar16Type()) {
+      assert(Value <= 0xFFFF && "not a valid UTF-16 code unit");
+      return llvm::IsSingleCodeUnitUTF16Codepoint(Value);
+    }
+    assert(T->isChar32Type());
+    return llvm::IsSingleCodeUnitUTF32Codepoint(Value);
+  };
+  llvm::SmallVector<char, 16> Str;
+  if (!IsSingleCodeUnitCP(Value, T)) {
+    llvm::raw_svector_ostream OS(Str);
+    OS << "<" << llvm::format_hex(Value, 1, /*Upper=*/true) << ">";
+    return std::string(Str.begin(), Str.end());
+  }
+
+  char Buffer[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+  char *Ptr = Buffer;
+  [[maybe_unused]] bool Converted = llvm::ConvertCodePointToUTF8(Value, Ptr);
+  assert(Converted && "trying to encode invalid code unit");
+  EscapeStringForDiagnostic(StringRef(Buffer, Ptr - Buffer), Str);
+  return std::string(Str.begin(), Str.end());
+}

diff  --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index aee1e9a5ecb03..a20bc3ffba823 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2193,6 +2193,20 @@ bool Type::isAnyCharacterType() const {
   }
 }
 
+bool Type::isUnicodeCharacterType() const {
+  const auto *BT = dyn_cast<BuiltinType>(CanonicalType);
+  if (!BT)
+    return false;
+  switch (BT->getKind()) {
+  default:
+    return false;
+  case BuiltinType::Char8:
+  case BuiltinType::Char16:
+  case BuiltinType::Char32:
+    return true;
+  }
+}
+
 /// isSignedIntegerType - Return true if this is an integer type that is
 /// signed, according to C99 6.2.5p4 [char, signed char, short, int, long..],
 /// an enum decl which has a signed representation

diff  --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index d7c62b44a5c50..84b84de28c511 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -14,6 +14,7 @@
 #include "CheckExprLifetime.h"
 #include "clang/AST/APValue.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTDiagnostic.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/AttrIterator.h"
 #include "clang/AST/CharUnits.h"
@@ -11871,6 +11872,47 @@ static void DiagnoseIntInBoolContext(Sema &S, Expr *E) 
{
   }
 }
 
+static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source,
+                                                   const Type *Target, Expr *E,
+                                                   QualType T,
+                                                   SourceLocation CC) {
+  assert(Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType() 
&&
+         Source != Target);
+  Expr::EvalResult Result;
+  if (E->EvaluateAsInt(Result, S.getASTContext(), Expr::SE_AllowSideEffects,
+                       S.isConstantEvaluatedContext())) {
+    llvm::APSInt Value(32);
+    Value = Result.Val.getInt();
+    bool IsASCII = Value <= 0x7F;
+    bool IsBMP = Value <= 0xD7FF || (Value >= 0xE000 && Value <= 0xFFFF);
+    bool ConversionPreservesSemantics =
+        IsASCII || (!Source->isChar8Type() && !Target->isChar8Type() && IsBMP);
+
+    if (!ConversionPreservesSemantics) {
+      auto IsSingleCodeUnitCP = [](const QualType &T,
+                                   const llvm::APSInt &Value) {
+        if (T->isChar8Type())
+          return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
+        if (T->isChar16Type())
+          return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+        assert(T->isChar32Type());
+        return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
+      };
+
+      S.Diag(CC, diag::warn_impcast_unicode_char_type_constant)
+          << E->getType() << T
+          << IsSingleCodeUnitCP(E->getType().getUnqualifiedType(), Value)
+          << FormatUTFCodeUnitAsCodepoint(Value.getExtValue(), E->getType());
+    }
+  } else {
+    bool LosesPrecision = S.getASTContext().getIntWidth(E->getType()) >
+                          S.getASTContext().getIntWidth(T);
+    DiagnoseImpCast(S, E, T, CC,
+                    LosesPrecision ? diag::warn_impcast_unicode_precision
+                                   : diag::warn_impcast_unicode_char_type);
+  }
+}
+
 void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC,
                                    bool *ICContext, bool IsListInit) {
   if (E->isTypeDependent() || E->isValueDependent()) return;
@@ -12208,6 +12250,11 @@ void Sema::CheckImplicitConversion(Expr *E, QualType 
T, SourceLocation CC,
 
   DiscardMisalignedMemberAddress(Target, E);
 
+  if (Source->isUnicodeCharacterType() && Target->isUnicodeCharacterType()) {
+    DiagnoseMixedUnicodeImplicitConversion(*this, Source, Target, E, T, CC);
+    return;
+  }
+
   if (Target->isBooleanType())
     DiagnoseIntInBoolContext(*this, E);
 

diff  --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 2c81f7c583eb6..ae0626ebadb18 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -15,6 +15,7 @@
 #include "UsedDeclVisitor.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/AST/ASTContext.h"
+#include "clang/AST/ASTDiagnostic.h"
 #include "clang/AST/ASTLambda.h"
 #include "clang/AST/ASTMutationListener.h"
 #include "clang/AST/CXXInheritance.h"
@@ -1568,6 +1569,79 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, 
Expr *RHS,
   }
 }
 
+static void CheckUnicodeArithmeticConversions(Sema &SemaRef, Expr *LHS,
+                                              Expr *RHS, SourceLocation Loc,
+                                              ArithConvKind ACK) {
+  QualType LHSType = LHS->getType().getUnqualifiedType();
+  QualType RHSType = RHS->getType().getUnqualifiedType();
+
+  if (!SemaRef.getLangOpts().CPlusPlus || !LHSType->isUnicodeCharacterType() ||
+      !RHSType->isUnicodeCharacterType())
+    return;
+
+  if (ACK == ArithConvKind::Comparison) {
+    if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
+      return;
+
+    auto IsSingleCodeUnitCP = [](const QualType &T, const llvm::APSInt &Value) 
{
+      if (T->isChar8Type())
+        return llvm::IsSingleCodeUnitUTF8Codepoint(Value.getExtValue());
+      if (T->isChar16Type())
+        return llvm::IsSingleCodeUnitUTF16Codepoint(Value.getExtValue());
+      assert(T->isChar32Type());
+      return llvm::IsSingleCodeUnitUTF32Codepoint(Value.getExtValue());
+    };
+
+    Expr::EvalResult LHSRes, RHSRes;
+    bool LHSSuccess = LHS->EvaluateAsInt(LHSRes, SemaRef.getASTContext(),
+                                         Expr::SE_AllowSideEffects,
+                                         SemaRef.isConstantEvaluatedContext());
+    bool RHSuccess = RHS->EvaluateAsInt(RHSRes, SemaRef.getASTContext(),
+                                        Expr::SE_AllowSideEffects,
+                                        SemaRef.isConstantEvaluatedContext());
+
+    // Don't warn if the one known value is representable
+    // in the type of both expressions.
+    if (LHSSuccess != RHSuccess) {
+      Expr::EvalResult &Res = LHSSuccess ? LHSRes : RHSRes;
+      if (IsSingleCodeUnitCP(LHSType, Res.Val.getInt()) &&
+          IsSingleCodeUnitCP(RHSType, Res.Val.getInt()))
+        return;
+    }
+
+    if (!LHSSuccess || !RHSuccess) {
+      SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types)
+          << LHS->getSourceRange() << RHS->getSourceRange() << LHSType
+          << RHSType;
+      return;
+    }
+
+    llvm::APSInt LHSValue(32);
+    LHSValue = LHSRes.Val.getInt();
+    llvm::APSInt RHSValue(32);
+    RHSValue = RHSRes.Val.getInt();
+
+    bool LHSSafe = IsSingleCodeUnitCP(LHSType, LHSValue);
+    bool RHSSafe = IsSingleCodeUnitCP(RHSType, RHSValue);
+    if (LHSSafe && RHSSafe)
+      return;
+
+    SemaRef.Diag(Loc, diag::warn_comparison_unicode_mixed_types_constant)
+        << LHS->getSourceRange() << RHS->getSourceRange() << LHSType << RHSType
+        << FormatUTFCodeUnitAsCodepoint(LHSValue.getExtValue(), LHSType)
+        << FormatUTFCodeUnitAsCodepoint(RHSValue.getExtValue(), RHSType);
+    return;
+  }
+
+  if (SemaRef.getASTContext().hasSameType(LHSType, RHSType))
+    return;
+
+  SemaRef.Diag(Loc, diag::warn_arith_conv_mixed_unicode_types)
+      << LHS->getSourceRange() << RHS->getSourceRange() << ACK << LHSType
+      << RHSType;
+  return;
+}
+
 /// UsualArithmeticConversions - Performs various conversions that are common 
to
 /// binary operators (C99 6.3.1.8). If both operands aren't arithmetic, this
 /// routine returns the first non-arithmetic type found. The client is
@@ -1575,8 +1649,11 @@ void Sema::checkEnumArithmeticConversions(Expr *LHS, 
Expr *RHS,
 QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
                                           SourceLocation Loc,
                                           ArithConvKind ACK) {
+
   checkEnumArithmeticConversions(LHS.get(), RHS.get(), Loc, ACK);
 
+  CheckUnicodeArithmeticConversions(*this, LHS.get(), RHS.get(), Loc, ACK);
+
   if (ACK != ArithConvKind::CompAssign) {
     LHS = UsualUnaryConversions(LHS.get());
     if (LHS.isInvalid())

diff  --git a/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp 
b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
new file mode 100644
index 0000000000000..fcff006d0e028
--- /dev/null
+++ b/clang/test/SemaCXX/warn-implicit-unicode-conversions.cpp
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -verify -fsyntax-only -std=c++20 -Wconversion %s
+
+void c8(char8_t);
+void c16(char16_t);
+void c32(char32_t);
+
+void test(char8_t u8, char16_t u16, char32_t u32) {
+    c8(u8);
+    c8(u16); // expected-warning {{implicit conversion from 'char16_t' to 
'char8_t' may lose precision and change the meaning of the represented code 
unit}}
+    c8(u32); // expected-warning {{implicit conversion from 'char32_t' to 
'char8_t' may lose precision and change the meaning of the represented code 
unit}}
+
+    c16(u8);  // expected-warning {{implicit conversion from 'char8_t' to 
'char16_t' may change the meaning of the represented code unit}}
+    c16(u16);
+    c16(u32); // expected-warning {{implicit conversion from 'char32_t' to 
'char16_t' may lose precision and change the meaning of the represented code 
unit}}
+
+    c32(u8);  // expected-warning {{implicit conversion from 'char8_t' to 
'char32_t' may change the meaning of the represented code unit}}
+    c32(u16); // expected-warning {{implicit conversion from 'char16_t' to 
'char32_t' may change the meaning of the represented code unit}}
+    c32(u32);
+
+
+    c8(char32_t(0x7f));
+    c8(char32_t(0x80));   // expected-warning {{implicit conversion from 
'char32_t' to 'char8_t' changes the meaning of the code point '<U+0080>'}}
+
+    c8(char16_t(0x7f));
+    c8(char16_t(0x80));   // expected-warning {{implicit conversion from 
'char16_t' to 'char8_t' changes the meaning of the code point '<U+0080>'}}
+    c8(char16_t(0xD800)); // expected-warning {{implicit conversion from 
'char16_t' to 'char8_t' changes the meaning of the code unit '<0xD800>'}}
+    c8(char16_t(0xE000)); // expected-warning {{implicit conversion from 
'char16_t' to 'char8_t' changes the meaning of the code point '<U+E000>'}}
+
+
+    c16(char32_t(0x7f));
+    c16(char32_t(0x80));
+    c16(char32_t(0xD7FF));
+    c16(char32_t(0xD800)); // expected-warning {{implicit conversion from 
'char32_t' to 'char16_t' changes the meaning of the code unit '<0xD800>'}}
+    c16(char32_t(0xE000));
+    c16(char32_t(U'🐉')); // expected-warning {{implicit conversion from 
'char32_t' to 'char16_t' changes the meaning of the code point '🐉'}}
+
+
+    c32(char8_t(0x7f));
+    c32(char8_t(0x80)); // expected-warning {{implicit conversion from 
'char8_t' to 'char32_t' changes the meaning of the code unit '<0x80>'}}
+    c32(char8_t(0xFF)); // expected-warning {{implicit conversion from 
'char8_t' to 'char32_t' changes the meaning of the code unit '<0xFF>'}}
+
+
+    c32(char16_t(0x7f));
+    c32(char16_t(0x80));
+
+    c32(char16_t(0xD7FF));
+    c32(char16_t(0xD800)); // expected-warning {{implicit conversion from 
'char16_t' to 'char32_t' changes the meaning of the code unit '<0xD800>'}}
+    c32(char16_t(0xDFFF)); // expected-warning {{implicit conversion from 
'char16_t' to 'char32_t' changes the meaning of the code unit '<0xDFFF>'}}
+    c32(char16_t(0xE000));
+    c32(char16_t(u'☕'));
+
+    (void)static_cast<char32_t>(char8_t(0x80)); //no warnings for explicit 
conversions.
+
+    using Char8 = char8_t;
+    Char8 c81 = u16; // expected-warning {{implicit conversion from 'char16_t' 
to 'Char8' (aka 'char8_t') may lose precision and change the meaning of the 
represented code unit}}
+
+    [[maybe_unused]] char c = u16; // expected-warning {{implicit conversion 
loses integer precision: 'char16_t' to 'char'}}
+
+    // FIXME: We should apply the same logic to wchar
+    [[maybe_unused]] wchar_t wc = u16;
+    [[maybe_unused]] wchar_t wc2 = u8;
+}
+
+void test_comp(char8_t u8, char16_t u16, char32_t u32) {
+    (void)(u8 == u8' ');
+    (void)(u8 == u' ');
+    (void)(u8 == U' ');
+
+    (void)(u16 == u8' ');
+    (void)(u16 == U' ');
+
+    (void)(u32 == u8' ');
+    (void)(u32 == u' ');
+    (void)(u32 == U' ');
+
+    (void)(u8 == u'\u00FF'); // expected-warning{{comparing values of 
diff erent Unicode code unit types 'char8_t' and 'char16_t' may compare 
diff erent code points}}
+    (void)(u8 == U'\u00FF'); // expected-warning{{comparing values of 
diff erent Unicode code unit types 'char8_t' and 'char32_t' may compare 
diff erent code points}}
+
+    (void)(u16 == u8'\xFF'); // expected-warning{{comparing values of 
diff erent Unicode code unit types 'char16_t' and 'char8_t' may compare 
diff erent code points}}
+    (void)(u16 == u'\u00FF');
+    (void)(u16 == U'\u00FF');
+    (void)(u16 == U'\xD800'); // expected-warning{{comparing values of 
diff erent Unicode code unit types 'char16_t' and 'char32_t' may compare 
diff erent code points}}
+
+    (void)(u32 == u8'\xFF');  // expected-warning{{comparing values of 
diff erent Unicode code unit types 'char32_t' and 'char8_t' may compare 
diff erent code points}}
+    (void)(u32 == u'\u00FF');
+    (void)(u32 == u'\xD800'); // expected-warning{{comparing values of 
diff erent Unicode code unit types 'char32_t' and 'char16_t' may compare 
diff erent code points}}
+
+    (void)(char8_t(0x7f) == char8_t(0x7f));
+    (void)(char8_t(0x7f) == char16_t(0x7f));
+    (void)(char8_t(0x7f) == char32_t(0x7f));
+
+    (void)(char8_t(0x80) == char8_t(0x80));
+    (void)(char8_t(0x80) == char16_t(0x80)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char8_t' and 'char16_t' compares unrelated 
code units '<0x80>' and '<U+0080>}}
+    (void)(char8_t(0x80) == char32_t(0x80)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char8_t' and 'char32_t' compares unrelated 
code units '<0x80>' and '<U+0080>}}
+
+    (void)(char8_t(0x80) == char8_t(0x7f));
+    (void)(char8_t(0x80) == char16_t(0x7f)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char8_t' and 'char16_t' compares unrelated 
code units '<0x80>' and '<U+007F>'}}
+    (void)(char8_t(0x80) == char32_t(0x7f)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char8_t' and 'char32_t' compares unrelated 
code units '<0x80>' and '<U+007F>'}}
+
+
+    (void)(char16_t(0x7f) < char8_t(0x7f));
+    (void)(char16_t(0x7f) < char16_t(0x7f));
+    (void)(char16_t(0x7f) < char32_t(0x7f));
+
+    (void)(char16_t(0x80) < char8_t(0x80)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char16_t' and 'char8_t' compares unrelated 
code units '<U+0080>' and '<0x80>'}}
+    (void)(char16_t(0x80) < char16_t(0x80));
+    (void)(char16_t(0x80) < char32_t(0x80));
+
+    (void)(char16_t(0x80) == char8_t(0x7f));
+    (void)(char16_t(0x80) < char16_t(0x7f));
+    (void)(char16_t(0x80) < char32_t(0x7f));
+
+
+    (void)(char32_t(0x7f) < char8_t(0x7f));
+    (void)(char32_t(0x7f) < char16_t(0x7f));
+    (void)(char32_t(0x7f) < char32_t(0x7f));
+
+    (void)(char32_t(0x80) < char8_t(0x80)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char32_t' and 'char8_t' compares unrelated 
code units '<U+0080>' and '<0x80>'}}
+    (void)(char32_t(0x80) < char16_t(0x80));
+    (void)(char32_t(0x80) < char32_t(0x80));
+
+    (void)(char32_t(0x80) == char8_t(0x7f));
+    (void)(char32_t(0x80) < char16_t(0x7f));
+    (void)(char32_t(0x80) < char32_t(0x7f));
+
+
+    (void)(char32_t(U'🐉') <= char16_t(0xD800)); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char32_t' and 'char16_t' compares unrelated 
code units '🐉' and '<0xD800>'}}
+    (void)(char32_t(U'🐉') <= char16_t(0xD7FF));
+
+    (void)(char16_t(0xD800) >= char32_t(U'🐉')); // expected-warning{{comparing 
values of 
diff erent Unicode code unit types 'char16_t' and 'char32_t' compares unrelated 
code units '<0xD800>' and '🐉'}}
+    (void)(char16_t(0xD7FF) >= char32_t(U'🐉'));
+}
+
+void check_arithmetic(char8_t u8, char16_t u16, char32_t u32) {
+
+    (void)(u8 + u8);
+    (void)(u16 += u16);
+    (void)(u32 & u32);
+    (void)(1 ? u16 : u16);
+
+    (void)(u8 + u16);  // expected-warning {{arithmetic between 
diff erent Unicode character types 'char8_t' and 'char16_t'}}
+    (void)(u8 += u16); // expected-warning {{compound assignment of 
diff erent Unicode character types 'char8_t' and 'char16_t'}}
+    (void)(u8 & u16);  // expected-warning {{bitwise operation between 
diff erent Unicode character types 'char8_t' and 'char16_t'}}
+    (void)(1 ? u8 : u16);  // expected-warning {{conditional expression 
between 
diff erent Unicode character types 'char8_t' and 'char16_t'}}
+
+
+    (void)(u16 * u32);  // expected-warning {{arithmetic between 
diff erent Unicode character types 'char16_t' and 'char32_t'}}
+    (void)(u16 -= u32); // expected-warning {{compound assignment of 
diff erent Unicode character types 'char16_t' and 'char32_t'}}
+    (void)(u16 | u32);  // expected-warning {{bitwise operation between 
diff erent Unicode character types 'char16_t' and 'char32_t'}}
+    (void)(1 ? u32 : u16);  // expected-warning {{conditional expression 
between 
diff erent Unicode character types 'char32_t' and 'char16_t'}}
+}

diff  --git a/libcxx/include/print b/libcxx/include/print
index 61c3ebcd98cb8..be05d30e0147f 100644
--- a/libcxx/include/print
+++ b/libcxx/include/print
@@ -123,7 +123,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __encode(_OutIt& 
__out_it, char32_t __value
   _LIBCPP_ASSERT_UNCATEGORIZED(__is_scalar_value(__value), "an invalid unicode 
scalar value results in invalid UTF-16");
 
   if (__value < 0x10000) {
-    *__out_it++ = __value;
+    *__out_it++ = static_cast<iter_value_t<_OutIt>>(__value);
     return;
   }
 

diff  --git 
a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp 
b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
index 02cc84c288828..859532d4b79c7 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.equal/equal.pass.cpp
@@ -20,6 +20,8 @@
 
 // We test the cartesian product, so we sometimes compare differently signed types
 // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): 
-Wno-character-conversion
+
 // MSVC warning C4242: 'argument': conversion from 'int' to 'const _Ty', 
possible loss of data
 // MSVC warning C4244: 'argument': conversion from 'wchar_t' to 'const _Ty', 
possible loss of data
 // MSVC warning C4389: '==': signed/unsigned mismatch

diff  --git 
a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp 
b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
index 3aaeb9c2f345f..989edcb3f6eed 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
@@ -8,6 +8,7 @@
 
 // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare
 // ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-sign-compare
+// ADDITIONAL_COMPILE_FLAGS(character-conversion-warnings): 
-Wno-character-conversion
 // MSVC warning C4245: conversion from 'int' to 'wchar_t', signed/unsigned 
mismatch
 // MSVC warning C4305: truncation from 'int' to 'bool'
 // MSVC warning C4310: cast truncates constant value

diff  --git a/libcxx/test/std/localization/codecvt_unicode.pass.cpp 
b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
index e54c0c2a4610a..fc5625d8ce4e9 100644
--- a/libcxx/test/std/localization/codecvt_unicode.pass.cpp
+++ b/libcxx/test/std/localization/codecvt_unicode.pass.cpp
@@ -484,7 +484,7 @@ template <class InternT, class ExternT>
 void utf8_to_utf16_in_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) 
{
   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
-  const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const InternT expected[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
   static_assert(array_size(input) == 11, "");
   static_assert(array_size(expected) == 6, "");
 
@@ -549,7 +549,7 @@ template <class InternT, class ExternT>
 void utf8_to_utf16_in_partial(const std::codecvt<InternT, ExternT, mbstate_t>& 
cvt) {
   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
   const unsigned char input[] = "b\u0448\uAAAA\U0010AAAA";
-  const char16_t expected[]   = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const InternT expected[]    = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
   static_assert(array_size(input) == 11, "");
   static_assert(array_size(expected) == 6, "");
 
@@ -618,7 +618,7 @@ template <class InternT, class ExternT>
 void utf8_to_utf16_in_error(const std::codecvt<InternT, ExternT, mbstate_t>& 
cvt) {
   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP, 4-byte CP
   const unsigned char input[] = "b\u0448\uD700\U0010AAAA";
-  const char16_t expected[]   = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
+  const InternT expected[]    = {'b', 0x0448, 0xD700, 0xDBEA, 0xDEAA, 0};
   static_assert(array_size(input) == 11, "");
   static_assert(array_size(expected) == 6, "");
 
@@ -765,7 +765,7 @@ void utf8_to_utf16_in(const std::codecvt<InternT, ExternT, 
mbstate_t>& cvt) {
 template <class InternT, class ExternT>
 void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& 
cvt) {
   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
-  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const InternT input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
   static_assert(array_size(input) == 6, "");
   static_assert(array_size(expected) == 11, "");
@@ -801,7 +801,7 @@ void utf16_to_utf8_out_ok(const std::codecvt<InternT, ExternT, mbstate_t>& cvt)
 template <class InternT, class ExternT>
 void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
-  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const InternT input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
   static_assert(array_size(input) == 6, "");
   static_assert(array_size(expected) == 11, "");
@@ -860,7 +860,7 @@ void utf16_to_utf8_out_partial(const std::codecvt<InternT, ExternT, mbstate_t>&
 template <class InternT, class ExternT>
 void utf16_to_utf8_out_error(const std::codecvt<InternT, ExternT, mbstate_t>& cvt) {
   // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP
-  const char16_t input[]         = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
+  const InternT input[]          = {'b', 0x0448, 0xAAAA, 0xDBEA, 0xDEAA, 0};
   const unsigned char expected[] = "b\u0448\uAAAA\U0010AAAA";
   static_assert(array_size(input) == 6, "");
   static_assert(array_size(expected) == 11, "");

diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
index c34e864220e12..86a08ee32cb45 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_in.pass.cpp
@@ -33,6 +33,6 @@ int main(int, char**) {
   assert(from_next - from == 9);
   assert(to_next - to == 9);
   for (unsigned i = 0; i < 9; ++i)
-    assert(to[i] == from[i]);
+    assert(to[i] == static_cast<char16_t>(from[i]));
   return 0;
 }

diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
index c39e64de7a59f..d5c0c3cf31244 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char16_t_char8_t_out.pass.cpp
@@ -34,6 +34,6 @@ int main(int, char**) {
   assert(from_next - from == 9);
   assert(to_next - to == 9);
   for (unsigned i = 0; i < 9; ++i)
-    assert(to[i] == from[i]);
+    assert(static_cast<char16_t>(to[i]) == from[i]);
   return 0;
 }

diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
index e848f8a10912e..e6af982c10e99 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_in.pass.cpp
@@ -33,6 +33,6 @@ int main(int, char**) {
   assert(from_next - from == 9);
   assert(to_next - to == 9);
   for (unsigned i = 0; i < 9; ++i)
-    assert(to[i] == from[i]);
+    assert(to[i] == static_cast<char32_t>(from[i]));
   return 0;
 }

diff --git a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
index 7a31c9ef10558..3cf46a436e2e7 100644
--- a/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
+++ b/libcxx/test/std/localization/locale.categories/category.ctype/locale.codecvt/locale.codecvt.members/char32_t_char8_t_out.pass.cpp
@@ -34,6 +34,6 @@ int main(int, char**) {
   assert(from_next - from == 9);
   assert(to_next - to == 9);
   for (unsigned i = 0; i < 9; ++i)
-    assert(to[i] == from[i]);
+    assert(static_cast<char32_t>(to[i]) == from[i]);
   return 0;
 }

diff --git a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
index e3bc9c3c100d4..971fcd68cc8e6 100644
--- a/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
+++ b/libcxx/test/std/strings/char.traits/char.traits.specializations/char.traits.specializations.char8_t/assign2.pass.cpp
@@ -19,9 +19,9 @@
 
 #ifndef TEST_HAS_NO_CHAR8_T
 constexpr bool test_constexpr() {
-  char8_t c = u'1';
+  char8_t c = u8'1';
   std::char_traits<char8_t>::assign(c, u'a');
-  return c == u'a';
+  return c == u8'a';
 }
 
 int main(int, char**) {

diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py
index 10fc4b0afde6b..74746e37d3bc4 100644
--- a/libcxx/utils/libcxx/test/features.py
+++ b/libcxx/utils/libcxx/test/features.py
@@ -144,6 +144,10 @@ def _mingwSupportsModules(cfg):
         when=lambda cfg: hasCompileFlag(cfg, "-Wuser-defined-warnings"),
         actions=[AddCompileFlag("-Wuser-defined-warnings")],
     ),
+    Feature(
+        name="character-conversion-warnings",
+        when=lambda cfg: hasCompileFlag(cfg, "-Wcharacter-conversion"),
+    ),
     # Tests to validate whether the compiler has a way to set the maximum number
     # of steps during constant evaluation. Since the flag differs per compiler
     # store the "valid" flag as a feature. This allows passing the proper compile

diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h
index dd446f280a483..3bb238e7df2ed 100644
--- a/llvm/include/llvm/Support/ConvertUTF.h
+++ b/llvm/include/llvm/Support/ConvertUTF.h
@@ -346,6 +346,10 @@ LLVM_ABI bool convertUTF32ToUTF8String(ArrayRef<UTF32> Src, std::string &Out);
 LLVM_ABI bool convertUTF8ToUTF16String(StringRef SrcUTF8,
                                        SmallVectorImpl<UTF16> &DstUTF16);
 
+bool IsSingleCodeUnitUTF8Codepoint(unsigned);
+bool IsSingleCodeUnitUTF16Codepoint(unsigned);
+bool IsSingleCodeUnitUTF32Codepoint(unsigned);
+
 #if defined(_WIN32)
 namespace sys {
 namespace windows {

diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp
index 4952fe65d7767..76ead00c977bd 100644
--- a/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -303,5 +303,15 @@ bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
   }
 }
 
+bool IsSingleCodeUnitUTF8Codepoint(unsigned V) { return V <= 0x7F; }
+
+bool IsSingleCodeUnitUTF16Codepoint(unsigned V) {
+  return V <= 0xD7FF || (V >= 0xE000 && V <= 0xFFFF);
+}
+
+bool IsSingleCodeUnitUTF32Codepoint(unsigned V) {
+  return V <= 0xD7FF || (V >= 0xE000 && V <= 0x10FFFF);
+}
+
 } // end namespace llvm
 


        
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to