https://github.com/dzbarsky created 
https://github.com/llvm/llvm-project/pull/202626

Encode confusable replacement strings once in a shared UTF-8 byte array. Each 
sorted mapping becomes an eight-byte code point, 16-bit offset, and 16-bit 
length entry instead of reserving nineteen UTF-32 code points for every 
replacement.

ConfusableIdentifierCheck appends the UTF-8 slice directly after the binary 
search, removing the per-identifier UTF-32 to UTF-8 conversion. Add 
compile-time assertions for the eight-byte entry layout and the 64 KiB 
replacement-data limit, plus generator-side overflow checks.

The logical table shrinks from 508,400 to 63,327 bytes, saving 445,073 bytes. A 
stripped standalone clang-tidy shrinks from 91,116,608 to 90,654,176 bytes, 
saving 462,432 bytes (0.508%).

A lookup microbenchmark improves from 79.9 to 40.4 ns median. A 
100,000-identifier clang-tidy benchmark improves from 0.343 to 0.314 seconds 
mean.

Validate all 6,355 replacements byte-for-byte against confusables.txt, 
deterministic regeneration, byte-identical baseline and candidate diagnostics, 
and the focused misc-confusable-identifiers test with multi-code-point and 
non-ASCII replacements. Compile and link the generator, consumer object, and 
standalone clang-tidy.

Work towards #202616

>From f16a5f2ee0935f6800a577a56517e42fea2bee97 Mon Sep 17 00:00:00 2001
From: David Zbarsky <[email protected]>
Date: Mon, 8 Jun 2026 23:45:57 -0400
Subject: [PATCH] [clang-tidy] Compact the confusable identifier table

Encode confusable replacement strings once in a shared UTF-8 byte array. Each 
sorted mapping becomes an eight-byte code point, 16-bit offset, and 16-bit 
length entry instead of reserving nineteen UTF-32 code points for every 
replacement.

ConfusableIdentifierCheck appends the UTF-8 slice directly after the binary 
search, removing the per-identifier UTF-32 to UTF-8 conversion. Add 
compile-time assertions for the eight-byte entry layout and the 64 KiB 
replacement-data limit, plus generator-side overflow checks.

The logical table shrinks from 508,400 to 63,327 bytes, saving 445,073 bytes. A 
stripped standalone clang-tidy shrinks from 91,116,608 to 90,654,176 bytes, 
saving 462,432 bytes (0.508%).

A lookup microbenchmark improves from 79.9 to 40.4 ns median. A 
100,000-identifier clang-tidy benchmark improves from 0.343 to 0.314 seconds 
mean.

Validate all 6,355 replacements byte-for-byte against confusables.txt, 
deterministic regeneration, byte-identical baseline and candidate diagnostics, 
and the focused misc-confusable-identifiers test with multi-code-point and 
non-ASCII replacements. Compile and link the generator, consumer object, and 
standalone clang-tidy.
---
 .../misc/ConfusableIdentifierCheck.cpp        | 37 +++++----
 .../ConfusableTable/BuildConfusableTable.cpp  | 76 +++++++++++++------
 .../checkers/misc/confusable-identifiers.cpp  | 10 +++
 3 files changed, 81 insertions(+), 42 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp 
b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
index f386bd613dfc6..7647c509ea202 100644
--- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
@@ -13,12 +13,23 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/ConvertUTF.h"
 
+#include <cstdint>
+
 namespace {
 // Preprocessed version of
 // https://www.unicode.org/Public/security/latest/confusables.txt
 //
-// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
+// This contains a sorted array of code points and slices into a shared UTF-8
+// replacement array.
+struct ConfusableEntry {
+  llvm::UTF32 CodePoint;
+  uint16_t Offset;
+  uint16_t Length;
+};
+static_assert(sizeof(ConfusableEntry) == 8);
+
 #include "Confusables.inc"
+static_assert(sizeof(ConfusableReplacementData) <= 1ULL << 16);
 } // namespace
 
 namespace clang::tidy::misc {
@@ -63,26 +74,14 @@ static SmallString<64U> skeleton(StringRef Name) {
       break;
     }
 
-    const StringRef Key(Prev, Curr - Prev);
-    auto *Where = llvm::lower_bound(ConfusableEntries, CodePoint,
-                                    [](decltype(ConfusableEntries[0]) X,
-                                       UTF32 Y) { return X.codepoint < Y; });
-    if (Where == std::end(ConfusableEntries) || CodePoint != Where->codepoint) 
{
+    auto *Where = llvm::lower_bound(
+        ConfusableEntries, CodePoint,
+        [](const ConfusableEntry &X, UTF32 Y) { return X.CodePoint < Y; });
+    if (Where == std::end(ConfusableEntries) || CodePoint != Where->CodePoint) 
{
       Skeleton.append(Prev, Curr);
     } else {
-      UTF8 Buffer[32];
-      UTF8 *BufferStart = std::begin(Buffer);
-      UTF8 *IBuffer = BufferStart;
-      const UTF32 *ValuesStart = std::begin(Where->values);
-      const UTF32 *ValuesEnd = llvm::find(Where->values, '\0');
-      if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer,
-                             std::end(Buffer),
-                             strictConversion) != conversionOK) {
-        errs() << "Unicode conversion issue\n";
-        break;
-      }
-      Skeleton.append(reinterpret_cast<char *>(BufferStart),
-                      reinterpret_cast<char *>(IBuffer));
+      Skeleton.append(
+          StringRef(ConfusableReplacementData + Where->Offset, Where->Length));
     }
   }
   return Skeleton;
diff --git 
a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp 
b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
index 0c2f289d16922..af5791c21923e 100644
--- a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
+++ b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp
@@ -8,12 +8,23 @@
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 
+#include <cstdint>
+#include <limits>
+
 using namespace llvm;
 
+namespace {
+struct ConfusableEntry {
+  UTF32 CodePoint;
+  std::string Replacement;
+};
+} // namespace
+
 int main(int argc, char *argv[]) {
   auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true);
   if (!ErrorOrBuffer)
@@ -24,7 +35,7 @@ int main(int argc, char *argv[]) {
   SmallVector<StringRef> Lines;
   SplitString(Content, Lines, "\r\n");
 
-  std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries;
+  std::vector<ConfusableEntry> Entries;
   SmallVector<StringRef> Values;
   for (const StringRef Line : Lines) {
     if (Line.starts_with('#'))
@@ -41,42 +52,61 @@ int main(int argc, char *argv[]) {
     llvm::UTF32 CodePoint = 0;
     From.getAsInteger(16, CodePoint);
 
-    SmallVector<llvm::UTF32> To;
+    std::string Replacement;
     SmallVector<StringRef> ToN;
     Values[1].split(ToN, ' ', -1, false);
     for (const StringRef ToI : ToN) {
       llvm::UTF32 ToCodePoint = 0;
       ToI.trim().getAsInteger(16, ToCodePoint);
-      To.push_back(ToCodePoint);
+      char Encoded[UNI_MAX_UTF8_BYTES_PER_CODE_POINT];
+      char *EncodedEnd = Encoded;
+      if (!ConvertCodePointToUTF8(ToCodePoint, EncodedEnd)) {
+        errs() << "Failed to encode code point: " << ToI << "\n";
+        return 3;
+      }
+      Replacement.append(Encoded, EncodedEnd);
     }
-    // Sentinel
-    To.push_back(0);
 
-    Entries.emplace_back(CodePoint, To);
+    Entries.push_back({CodePoint, std::move(Replacement)});
   }
-  llvm::sort(Entries);
+  llvm::sort(Entries,
+             [](const ConfusableEntry &LHS, const ConfusableEntry &RHS) {
+               return LHS.CodePoint < RHS.CodePoint;
+             });
 
-  const unsigned LargestValue =
-      llvm::max_element(Entries, [](const auto &Entry0, const auto &Entry1) {
-        return Entry0.second.size() < Entry1.second.size();
-      })->second.size();
+  StringMap<uint16_t> ReplacementOffsets;
+  std::string ReplacementData;
+  for (const ConfusableEntry &Entry : Entries) {
+    if (ReplacementOffsets.contains(Entry.Replacement))
+      continue;
+    if (ReplacementData.size() + Entry.Replacement.size() >
+        std::numeric_limits<uint16_t>::max()) {
+      errs() << "Confusable replacement data exceeds 16-bit offsets\n";
+      return 4;
+    }
+    ReplacementOffsets.try_emplace(
+        Entry.Replacement, static_cast<uint16_t>(ReplacementData.size()));
+    ReplacementData.append(Entry.Replacement);
+  }
 
   std::error_code Ec;
   llvm::raw_fd_ostream Os(argv[2], Ec);
 
-  // FIXME: If memory consumption and/or lookup time becomes a constraint, it
-  // maybe worth using a more elaborate data structure.
-  Os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue
-     << "];} "
-        "ConfusableEntries[] = {\n";
-  for (const auto &Values : Entries) {
-    Os << "  { ";
-    Os << Values.first;
-    Os << ", {";
-    for (auto CP : Values.second)
-      Os << CP << ", ";
+  static constexpr char HexDigits[] = "0123456789ABCDEF";
+  Os << "constexpr char ConfusableReplacementData[] =\n";
+  for (size_t I = 0; I < ReplacementData.size(); I += 32) {
+    Os << "    \"";
+    for (unsigned char C : StringRef(ReplacementData).substr(I, 32))
+      Os << "\\x" << HexDigits[C >> 4] << HexDigits[C & 0x0F];
+    Os << "\"\n";
+  }
+  Os << "    ;\n\n";
 
-    Os << "}},\n";
+  Os << "constexpr ConfusableEntry ConfusableEntries[] = {\n";
+  for (const ConfusableEntry &Entry : Entries) {
+    const auto It = ReplacementOffsets.find(Entry.Replacement);
+    Os << "  {" << Entry.CodePoint << ", " << It->second << ", "
+       << Entry.Replacement.size() << "},\n";
   }
   Os << "};\n";
   return 0;
diff --git 
a/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp 
b/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp
index fb7eeed7e715f..82520eac4523b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp
@@ -17,6 +17,16 @@ int ll;
 // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'll' is confusable with 'l1' 
[misc-confusable-identifiers]
 // CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here
 
+int rn;
+int m;
+// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'm' is confusable with 'rn' 
[misc-confusable-identifiers]
+// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here
+
+int μ;
+int µ;
+// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'µ' is confusable with 'μ' 
[misc-confusable-identifiers]
+// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here
+
 bool f0(const char *q1, const char *ql) {
   // CHECK-MESSAGES: :[[#@LINE-1]]:37: warning: 'ql' is confusable with 'q1' 
[misc-confusable-identifiers]
   // CHECK-MESSAGES: :[[#@LINE-2]]:21: note: other declaration found here

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to