https://github.com/dzbarsky created https://github.com/llvm/llvm-project/pull/202626
Encode confusable replacement strings once in a shared UTF-8 byte array. Each sorted mapping becomes an eight-byte code point, 16-bit offset, and 16-bit length entry instead of reserving nineteen UTF-32 code points for every replacement. ConfusableIdentifierCheck appends the UTF-8 slice directly after the binary search, removing the per-identifier UTF-32 to UTF-8 conversion. Add compile-time assertions for the eight-byte entry layout and the 64 KiB replacement-data limit, plus generator-side overflow checks. The logical table shrinks from 508,400 to 63,327 bytes, saving 445,073 bytes. A stripped standalone clang-tidy shrinks from 91,116,608 to 90,654,176 bytes, saving 462,432 bytes (0.508%). A lookup microbenchmark improves from 79.9 to 40.4 ns median. A 100,000-identifier clang-tidy benchmark improves from 0.343 to 0.314 seconds mean. Validate all 6,355 replacements byte-for-byte against confusables.txt, deterministic regeneration, byte-identical baseline and candidate diagnostics, and the focused misc-confusable-identifiers test with multi-code-point and non-ASCII replacements. Compile and link the generator, consumer object, and standalone clang-tidy. Work towards #202616 >From f16a5f2ee0935f6800a577a56517e42fea2bee97 Mon Sep 17 00:00:00 2001 From: David Zbarsky <[email protected]> Date: Mon, 8 Jun 2026 23:45:57 -0400 Subject: [PATCH] [clang-tidy] Compact the confusable identifier table Encode confusable replacement strings once in a shared UTF-8 byte array. Each sorted mapping becomes an eight-byte code point, 16-bit offset, and 16-bit length entry instead of reserving nineteen UTF-32 code points for every replacement. ConfusableIdentifierCheck appends the UTF-8 slice directly after the binary search, removing the per-identifier UTF-32 to UTF-8 conversion. Add compile-time assertions for the eight-byte entry layout and the 64 KiB replacement-data limit, plus generator-side overflow checks. The logical table shrinks from 508,400 to 63,327 bytes, saving 445,073 bytes. A stripped standalone clang-tidy shrinks from 91,116,608 to 90,654,176 bytes, saving 462,432 bytes (0.508%). A lookup microbenchmark improves from 79.9 to 40.4 ns median. A 100,000-identifier clang-tidy benchmark improves from 0.343 to 0.314 seconds mean. Validate all 6,355 replacements byte-for-byte against confusables.txt, deterministic regeneration, byte-identical baseline and candidate diagnostics, and the focused misc-confusable-identifiers test with multi-code-point and non-ASCII replacements. Compile and link the generator, consumer object, and standalone clang-tidy. --- .../misc/ConfusableIdentifierCheck.cpp | 37 +++++---- .../ConfusableTable/BuildConfusableTable.cpp | 76 +++++++++++++------ .../checkers/misc/confusable-identifiers.cpp | 10 +++ 3 files changed, 81 insertions(+), 42 deletions(-) diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp index f386bd613dfc6..7647c509ea202 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp @@ -13,12 +13,23 @@ #include "llvm/ADT/SmallString.h" #include "llvm/Support/ConvertUTF.h" +#include <cstdint> + namespace { // Preprocessed version of // https://www.unicode.org/Public/security/latest/confusables.txt // -// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];} +// This contains a sorted array of code points and slices into a shared UTF-8 +// replacement array. +struct ConfusableEntry { + llvm::UTF32 CodePoint; + uint16_t Offset; + uint16_t Length; +}; +static_assert(sizeof(ConfusableEntry) == 8); + #include "Confusables.inc" +static_assert(sizeof(ConfusableReplacementData) <= 1ULL << 16); } // namespace namespace clang::tidy::misc { @@ -63,26 +74,14 @@ static SmallString<64U> skeleton(StringRef Name) { break; } - const StringRef Key(Prev, Curr - Prev); - auto *Where = llvm::lower_bound(ConfusableEntries, CodePoint, - [](decltype(ConfusableEntries[0]) X, - UTF32 Y) { return X.codepoint < Y; }); - if (Where == std::end(ConfusableEntries) || CodePoint != Where->codepoint) { + auto *Where = llvm::lower_bound( + ConfusableEntries, CodePoint, + [](const ConfusableEntry &X, UTF32 Y) { return X.CodePoint < Y; }); + if (Where == std::end(ConfusableEntries) || CodePoint != Where->CodePoint) { Skeleton.append(Prev, Curr); } else { - UTF8 Buffer[32]; - UTF8 *BufferStart = std::begin(Buffer); - UTF8 *IBuffer = BufferStart; - const UTF32 *ValuesStart = std::begin(Where->values); - const UTF32 *ValuesEnd = llvm::find(Where->values, '\0'); - if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer, - std::end(Buffer), - strictConversion) != conversionOK) { - errs() << "Unicode conversion issue\n"; - break; - } - Skeleton.append(reinterpret_cast<char *>(BufferStart), - reinterpret_cast<char *>(IBuffer)); + Skeleton.append( + StringRef(ConfusableReplacementData + Where->Offset, Where->Length)); } } return Skeleton; diff --git a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp index 0c2f289d16922..af5791c21923e 100644 --- a/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp +++ b/clang-tools-extra/clang-tidy/misc/ConfusableTable/BuildConfusableTable.cpp @@ -8,12 +8,23 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include <cstdint> +#include <limits> + using namespace llvm; +namespace { +struct ConfusableEntry { + UTF32 CodePoint; + std::string Replacement; +}; +} // namespace + int main(int argc, char *argv[]) { auto ErrorOrBuffer = MemoryBuffer::getFile(argv[1], true); if (!ErrorOrBuffer) @@ -24,7 +35,7 @@ int main(int argc, char *argv[]) { SmallVector<StringRef> Lines; SplitString(Content, Lines, "\r\n"); - std::vector<std::pair<llvm::UTF32, SmallVector<llvm::UTF32>>> Entries; + std::vector<ConfusableEntry> Entries; SmallVector<StringRef> Values; for (const StringRef Line : Lines) { if (Line.starts_with('#')) @@ -41,42 +52,61 @@ int main(int argc, char *argv[]) { llvm::UTF32 CodePoint = 0; From.getAsInteger(16, CodePoint); - SmallVector<llvm::UTF32> To; + std::string Replacement; SmallVector<StringRef> ToN; Values[1].split(ToN, ' ', -1, false); for (const StringRef ToI : ToN) { llvm::UTF32 ToCodePoint = 0; ToI.trim().getAsInteger(16, ToCodePoint); - To.push_back(ToCodePoint); + char Encoded[UNI_MAX_UTF8_BYTES_PER_CODE_POINT]; + char *EncodedEnd = Encoded; + if (!ConvertCodePointToUTF8(ToCodePoint, EncodedEnd)) { + errs() << "Failed to encode code point: " << ToI << "\n"; + return 3; + } + Replacement.append(Encoded, EncodedEnd); } - // Sentinel - To.push_back(0); - Entries.emplace_back(CodePoint, To); + Entries.push_back({CodePoint, std::move(Replacement)}); } - llvm::sort(Entries); + llvm::sort(Entries, + [](const ConfusableEntry &LHS, const ConfusableEntry &RHS) { + return LHS.CodePoint < RHS.CodePoint; + }); - const unsigned LargestValue = - llvm::max_element(Entries, [](const auto &Entry0, const auto &Entry1) { - return Entry0.second.size() < Entry1.second.size(); - })->second.size(); + StringMap<uint16_t> ReplacementOffsets; + std::string ReplacementData; + for (const ConfusableEntry &Entry : Entries) { + if (ReplacementOffsets.contains(Entry.Replacement)) + continue; + if (ReplacementData.size() + Entry.Replacement.size() > + std::numeric_limits<uint16_t>::max()) { + errs() << "Confusable replacement data exceeds 16-bit offsets\n"; + return 4; + } + ReplacementOffsets.try_emplace( + Entry.Replacement, static_cast<uint16_t>(ReplacementData.size())); + ReplacementData.append(Entry.Replacement); + } std::error_code Ec; llvm::raw_fd_ostream Os(argv[2], Ec); - // FIXME: If memory consumption and/or lookup time becomes a constraint, it - // maybe worth using a more elaborate data structure. - Os << "struct {llvm::UTF32 codepoint; llvm::UTF32 values[" << LargestValue - << "];} " - "ConfusableEntries[] = {\n"; - for (const auto &Values : Entries) { - Os << " { "; - Os << Values.first; - Os << ", {"; - for (auto CP : Values.second) - Os << CP << ", "; + static constexpr char HexDigits[] = "0123456789ABCDEF"; + Os << "constexpr char ConfusableReplacementData[] =\n"; + for (size_t I = 0; I < ReplacementData.size(); I += 32) { + Os << " \""; + for (unsigned char C : StringRef(ReplacementData).substr(I, 32)) + Os << "\\x" << HexDigits[C >> 4] << HexDigits[C & 0x0F]; + Os << "\"\n"; + } + Os << " ;\n\n"; - Os << "}},\n"; + Os << "constexpr ConfusableEntry ConfusableEntries[] = {\n"; + for (const ConfusableEntry &Entry : Entries) { + const auto It = ReplacementOffsets.find(Entry.Replacement); + Os << " {" << Entry.CodePoint << ", " << It->second << ", " + << Entry.Replacement.size() << "},\n"; } Os << "};\n"; return 0; diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp index fb7eeed7e715f..82520eac4523b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/confusable-identifiers.cpp @@ -17,6 +17,16 @@ int ll; // CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'll' is confusable with 'l1' [misc-confusable-identifiers] // CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here +int rn; +int m; +// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'm' is confusable with 'rn' [misc-confusable-identifiers] +// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here + +int μ; +int µ; +// CHECK-MESSAGES: :[[#@LINE-1]]:5: warning: 'µ' is confusable with 'μ' [misc-confusable-identifiers] +// CHECK-MESSAGES: :[[#@LINE-3]]:5: note: other declaration found here + bool f0(const char *q1, const char *ql) { // CHECK-MESSAGES: :[[#@LINE-1]]:37: warning: 'ql' is confusable with 'q1' [misc-confusable-identifiers] // CHECK-MESSAGES: :[[#@LINE-2]]:21: note: other declaration found here _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
