dzbarsky (https://github.com/dzbarsky) created pull request https://github.com/llvm/llvm-project/pull/202641
GenericTaintChecker constructs its built-in rules from a large initializer of CallDescription and GenericTaintRule objects. The initializer expands into substantial constructor code even though the rules are static metadata. Store the built-in rules in compact descriptors and construct the existing objects when the checker is first used. A shared name table replaces repeated StringRef construction, and LLVM_ATTRIBUTE_MINSIZE keeps the one-time decoder compact. Static assertions fix the descriptor layouts at 6 and 18 bytes. In a Release build with assertions on arm64 macOS, this changes: GenericTaintChecker.cpp.o: 245,104 -> 119,576 bytes (-125,528) clang: 130,015,696 -> 129,933,280 bytes (-82,416) clang stripped: 107,999,520 -> 107,916,960 bytes (-82,560) LLVM multicall: 162,032,848 -> 161,933,920 bytes (-98,928) multicall stripped: 132,075,728 -> 131,976,656 bytes (-99,072) Thirty paired batches of 200 fresh analyzer invocations measured a 0.99% CPU-time improvement, with a 95% confidence interval from 0.26% to 1.75%. A separate 500-invocation sample retired 2.7% fewer instructions and used 4.2% fewer cycles. All 60 focused Static Analyzer tests passed, including the GenericTaint, standard-library summary, stream, errno, malloc, PCH, and SARIF tests. Work towards #202616 From 00730783426bf1cd444114e727a794e5503d7d1c Mon Sep 17 00:00:00 2001 From: David Zbarsky <[email protected]> Date: Tue, 9 Jun 2026 05:33:28 -0400 Subject: [PATCH] [clang][StaticAnalyzer] Compact GenericTaint rule descriptors GenericTaintChecker constructs its built-in rules from a large initializer of CallDescription and GenericTaintRule objects. The initializer expands into substantial constructor code even though the rules are static metadata. Store the built-in rules in compact descriptors and construct the existing objects when the checker is first used. A shared name table replaces repeated StringRef construction, and LLVM_ATTRIBUTE_MINSIZE keeps the one-time decoder compact. 
Static assertions fix the descriptor layouts at 6 and 18 bytes. In a Release build with assertions on arm64 macOS, this changes: GenericTaintChecker.cpp.o: 245,104 -> 119,576 bytes (-125,528) clang: 130,015,696 -> 129,933,280 bytes (-82,416) clang stripped: 107,999,520 -> 107,916,960 bytes (-82,560) LLVM multicall: 162,032,848 -> 161,933,920 bytes (-98,928) multicall stripped: 132,075,728 -> 131,976,656 bytes (-99,072) Thirty paired batches of 200 fresh analyzer invocations measured a 0.99% CPU-time improvement, with a 95% confidence interval from 0.26% to 1.75%. A separate 500-invocation sample retired 2.7% fewer instructions and used 4.2% fewer cycles. All 60 focused Static Analyzer tests passed, including the GenericTaint, standard-library summary, stream, errno, malloc, PCH, and SARIF tests. --- .../Checkers/GenericTaintChecker.cpp | 483 ++++++++++-------- 1 file changed, 262 insertions(+), 221 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp index c121052e03081..4ae5f513ed6c3 100644 --- a/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp @@ -404,7 +404,7 @@ class GenericTaintChecker /// The taint rules are initalized with the help of a CheckerContext to /// access user-provided configuration. 
- void initTaintRules(CheckerContext &C) const; + LLVM_ATTRIBUTE_MINSIZE void initTaintRules(CheckerContext &C) const; // TODO: The two separate `CallDescriptionMap`s were introduced when // `CallDescription` was unable to restrict matches to the global namespace @@ -561,6 +561,223 @@ GenericTaintRuleParser::parseConfiguration(const std::string &Option, return Rules; } +enum class TaintRuleKind : uint8_t { Source, Prop, Sink }; +enum class TaintRuleMessage : uint8_t { + None, + SanitizeSystemArgs, + UncontrolledFormatString, +}; + +struct ArgSetDescriptor { + int8_t Args[4]; + uint8_t Count; + int8_t VariadicIndex; +}; + +struct TaintRuleDescriptor { + uint16_t NameOffset; + uint8_t NameLength; + uint8_t Mode; + TaintRuleKind Kind; + TaintRuleMessage Message; + ArgSetDescriptor First; + ArgSetDescriptor Second; +}; + +static_assert(sizeof(ArgSetDescriptor) == 6); +static_assert(sizeof(TaintRuleDescriptor) == 18); + +// Keep each rule in a compact static descriptor. Construct the dynamic +// CallDescription and GenericTaintRule objects once when the checker is +// first used. +// clang-format off +#define TAINT_ARGS(A0, A1, A2, A3, Count, Variadic) \ + {{A0, A1, A2, A3}, Count, Variadic} +#define TAINT_CLIB static_cast<uint8_t>(CDM::CLibrary) +#define TAINT_CLIB_HARDENED \ + static_cast<uint8_t>(CDM::CLibraryMaybeHardened) +#define TAINT_SOURCE TaintRuleKind::Source +#define TAINT_PROP TaintRuleKind::Prop +#define TAINT_SINK TaintRuleKind::Sink +#define TAINT_NO_MESSAGE TaintRuleMessage::None +#define TAINT_SANITIZE_SYSTEM_ARGS TaintRuleMessage::SanitizeSystemArgs +#define TAINT_UNCONTROLLED_FORMAT_STRING \ + TaintRuleMessage::UncontrolledFormatString +#define TAINT_RULES(M) \ + /* Sources. 
*/ \ + M(fdopen, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(fopen, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(freopen, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getch, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getchar, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getchar_unlocked, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(gets, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(gets_s, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(scanf, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 0, 1), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(scanf_s, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 0, 1), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(wgetch, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + /* _IO_getc could be a propagator, but that would require modeling all */ \ + /* possible sources of the _IO_FILE * argument. 
*/ \ + M(_IO_getc, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getcwd, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getwd, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(readlink, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(1, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(readlinkat, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(2, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(get_current_dir_name, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(gethostname, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getnameinfo, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(2, 4, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getseuserbyname, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(1, 2, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getgroups, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(1, -1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getlogin, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(-1, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(getlogin_r, TAINT_CLIB, TAINT_SOURCE, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + /* Propagators. 
*/ \ + M(accept, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(atoi, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(atol, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(atoll, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(fgetc, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(fgetln, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(fgets, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(2, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(fgetws, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(2, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(fscanf, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, 2)) \ + M(fscanf_s, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, 2)) \ + M(sscanf, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, 2)) \ + M(sscanf_s, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, 2)) \ + M(getc, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(getc_unlocked, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(getdelim, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(3, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 1, -2)) \ + /* TODO: This also matches std::getline(); rule it out explicitly. 
*/ \ + M(getline, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(2, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 1, -2)) \ + M(getw, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(pread, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 2, 3, 4, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(read, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 2, 0, 0, 2, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(fread, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(3, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(recv, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(recvfrom, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(ttyname, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(ttyname_r, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(basename, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(dirname, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(fnmatch, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(mbtowc, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(wctomb, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(wcwidth, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(memcmp, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(memcpy, TAINT_CLIB_HARDENED, 
TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 2, 0, 0, 2, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(memmove, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 2, 0, 0, 2, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(bcopy, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 2, 0, 0, 2, -2), TAINT_ARGS(1, 0, 0, 0, 1, -2)) \ + /* These search functions only propagate taint from the haystack. */ \ + M(memmem, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strstr, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strcasestr, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(memchr, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(memrchr, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(rawmemchr, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strchr, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strrchr, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strchrnul, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(index, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(rindex, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + /* FIXME: For arrays, only the first array element gets tainted. 
*/ \ + M(qsort, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 1, -2)) \ + M(qsort_r, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 1, -2)) \ + M(strcmp, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strcasecmp, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strncmp, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strncasecmp, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strspn, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strcspn, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strpbrk, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strndup, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strndupa, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strdup, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strdupa, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(wcsdup, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + /* strlen, wcslen, strnlen, and similar functions intentionally do not */ \ + /* propagate taint. See https://github.com/llvm/llvm-project/pull/66086. 
*/ \ + M(strtol, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(strtoll, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(strtoul, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(strtoull, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(1, -1, 0, 0, 2, -2)) \ + M(tolower, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(toupper, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isalnum, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isalpha, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isascii, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isblank, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(iscntrl, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isdigit, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isgraph, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(islower, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isprint, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(ispunct, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isspace, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 
0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isupper, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(isxdigit, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(-1, 0, 0, 0, 1, -2)) \ + M(strcpy, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(stpcpy, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 0, 0, 0, 1, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(strcat, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(wcsncat, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(strncpy, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 2, 0, 0, 2, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(strncat, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(strlcpy, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 2, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 1, -2)) \ + M(strlcat, TAINT_CLIB_HARDENED, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(0, 0, 0, 0, 1, -2)) \ + /* The hardened sprintf variants insert parameters in the middle, so */ \ + /* CLibraryMaybeHardened cannot model them together with the base calls. */ \ + M(snprintf, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 2, 0, 0, 2, 3), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(sprintf, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 0, 0, 0, 1, 2), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(__snprintf_chk, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(1, 4, 0, 0, 2, 5), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + M(__sprintf_chk, TAINT_CLIB, TAINT_PROP, TAINT_NO_MESSAGE, TAINT_ARGS(3, 0, 0, 0, 1, 4), TAINT_ARGS(0, -1, 0, 0, 2, -2)) \ + /* Sinks. 
*/ \ + M(system, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(popen, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execl, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 0, 0, 0, 0, 0), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execle, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 0, 0, 0, 0, 0), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execlp, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 0, 0, 0, 0, 0), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execv, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execve, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(fexecve, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execvp, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 1, 0, 0, 2, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(execvpe, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 1, 2, 0, 3, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(dlopen, TAINT_CLIB, TAINT_SINK, TAINT_SANITIZE_SYSTEM_ARGS, TAINT_ARGS(0, 0, 0, 0, 1, -2), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + /* Allocation functions are intentionally not unconditional sinks because */ \ + /* that produces false positives; specialized checkers should model them. */ \ + M(setproctitle, TAINT_CLIB, TAINT_SINK, TAINT_UNCONTROLLED_FORMAT_STRING, TAINT_ARGS(0, 0, 0, 0, 1, 1), TAINT_ARGS(0, 0, 0, 0, 0, -2)) \ + M(setproctitle_fast, TAINT_CLIB, TAINT_SINK, TAINT_UNCONTROLLED_FORMAT_STRING, TAINT_ARGS(0, 0, 0, 0, 1, 1), TAINT_ARGS(0, 0, 0, 0, 0, -2)) +// clang-format on + +struct TaintRuleNameTable { +#define TAINT_RULE_NAME(Name, ...) 
char N_##Name[sizeof(#Name)]; + TAINT_RULES(TAINT_RULE_NAME) +#undef TAINT_RULE_NAME +}; + +constexpr TaintRuleNameTable TaintRuleNames = { +#define TAINT_RULE_NAME(Name, ...) #Name, + TAINT_RULES(TAINT_RULE_NAME) +#undef TAINT_RULE_NAME +}; + +static_assert(sizeof(TaintRuleNameTable) <= + std::numeric_limits<uint16_t>::max()); + +#define TAINT_RULE_DESCRIPTOR(Name, ...) \ + {static_cast<uint16_t>(offsetof(TaintRuleNameTable, N_##Name)), \ + static_cast<uint8_t>(sizeof(TaintRuleNames.N_##Name) - 1), __VA_ARGS__}, +constexpr TaintRuleDescriptor TaintRuleDescriptors[] = { + TAINT_RULES(TAINT_RULE_DESCRIPTOR)}; +#undef TAINT_RULE_DESCRIPTOR +#undef TAINT_RULES +#undef TAINT_UNCONTROLLED_FORMAT_STRING +#undef TAINT_SANITIZE_SYSTEM_ARGS +#undef TAINT_NO_MESSAGE +#undef TAINT_SINK +#undef TAINT_PROP +#undef TAINT_SOURCE +#undef TAINT_CLIB_HARDENED +#undef TAINT_CLIB +#undef TAINT_ARGS + void GenericTaintChecker::initTaintRules(CheckerContext &C) const { // Check for exact name match for functions without builtin substitutes. // Use qualified name, because these are C functions without namespace. 
@@ -572,226 +789,50 @@ void GenericTaintChecker::initTaintRules(CheckerContext &C) const { std::vector<std::pair<CallDescription, GenericTaintRule>>; using TR = GenericTaintRule; - RulesConstructionTy GlobalCRules{ - // Sources - {{CDM::CLibrary, {"fdopen"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"fopen"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"freopen"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getch"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getchar"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getchar_unlocked"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"gets"}}, TR::Source({{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"gets_s"}}, TR::Source({{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"scanf"}}, TR::Source({{}, 1})}, - {{CDM::CLibrary, {"scanf_s"}}, TR::Source({{}, 1})}, - {{CDM::CLibrary, {"wgetch"}}, TR::Source({{ReturnValueIndex}})}, - // Sometimes the line between taint sources and propagators is blurry. - // _IO_getc is choosen to be a source, but could also be a propagator. - // This way it is simpler, as modeling it as a propagator would require - // to model the possible sources of _IO_FILE * values, which the _IO_getc - // function takes as parameters. 
- {{CDM::CLibrary, {"_IO_getc"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getcwd"}}, TR::Source({{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"getwd"}}, TR::Source({{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"readlink"}}, TR::Source({{1, ReturnValueIndex}})}, - {{CDM::CLibrary, {"readlinkat"}}, TR::Source({{2, ReturnValueIndex}})}, - {{CDM::CLibrary, {"get_current_dir_name"}}, - TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"gethostname"}}, TR::Source({{0}})}, - {{CDM::CLibrary, {"getnameinfo"}}, TR::Source({{2, 4}})}, - {{CDM::CLibrary, {"getseuserbyname"}}, TR::Source({{1, 2}})}, - {{CDM::CLibrary, {"getgroups"}}, TR::Source({{1, ReturnValueIndex}})}, - {{CDM::CLibrary, {"getlogin"}}, TR::Source({{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getlogin_r"}}, TR::Source({{0}})}, - - // Props - {{CDM::CLibrary, {"accept"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"atoi"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"atol"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"atoll"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"fgetc"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"fgetln"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"fgets"}}, - TR::Prop({{2}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"fgetws"}}, - TR::Prop({{2}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"fscanf"}}, TR::Prop({{0}}, {{}, 2})}, - {{CDM::CLibrary, {"fscanf_s"}}, TR::Prop({{0}}, {{}, 2})}, - {{CDM::CLibrary, {"sscanf"}}, TR::Prop({{0}}, {{}, 2})}, - {{CDM::CLibrary, {"sscanf_s"}}, TR::Prop({{0}}, {{}, 2})}, - - {{CDM::CLibrary, {"getc"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getc_unlocked"}}, - TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"getdelim"}}, TR::Prop({{3}}, {{0}})}, - // TODO: this intends to match the C function `getline()`, but the call - // description also matches the C++ 
function `std::getline()`; it should - // be ruled out by some additional logic. - {{CDM::CLibrary, {"getline"}}, TR::Prop({{2}}, {{0}})}, - {{CDM::CLibrary, {"getw"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"pread"}}, - TR::Prop({{0, 1, 2, 3}}, {{1, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"read"}}, - TR::Prop({{0, 2}}, {{1, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"fread"}}, - TR::Prop({{3}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"recv"}}, - TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"recvfrom"}}, - TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - - {{CDM::CLibrary, {"ttyname"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"ttyname_r"}}, - TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - - {{CDM::CLibrary, {"basename"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"dirname"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"fnmatch"}}, TR::Prop({{1}}, {{ReturnValueIndex}})}, - - {{CDM::CLibrary, {"mbtowc"}}, TR::Prop({{1}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"wctomb"}}, TR::Prop({{1}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibrary, {"wcwidth"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - {{CDM::CLibrary, {"memcmp"}}, - TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"memcpy"}}, - TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"memmove"}}, - TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"bcopy"}}, TR::Prop({{0, 2}}, {{1}})}, - - // Note: "memmem" and its variants search for a byte sequence ("needle") - // in a larger area ("haystack"). Currently we only propagate taint from - // the haystack to the result, but in theory tampering with the needle - // could also produce incorrect results. 
- {{CDM::CLibrary, {"memmem"}}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strstr"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strcasestr"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - // Analogously, the following functions search for a byte within a buffer - // and we only propagate taint from the buffer to the result. - {{CDM::CLibraryMaybeHardened, {"memchr"}}, - TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"memrchr"}}, - TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"rawmemchr"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strchr"}}, - TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strrchr"}}, - TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strchrnul"}}, - TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"index"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"rindex"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - // FIXME: In case of arrays, only the first element of the array gets - // tainted. 
- {{CDM::CLibrary, {"qsort"}}, TR::Prop({{0}}, {{0}})}, - {{CDM::CLibrary, {"qsort_r"}}, TR::Prop({{0}}, {{0}})}, - - {{CDM::CLibrary, {"strcmp"}}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strcasecmp"}}, - TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strncmp"}}, - TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strncasecmp"}}, - TR::Prop({{0, 1, 2}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strspn"}}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strcspn"}}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strpbrk"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - {{CDM::CLibrary, {"strndup"}}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strndupa"}}, TR::Prop({{0, 1}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strdup"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"strdupa"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"wcsdup"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - // strlen, wcslen, strnlen and alike intentionally don't propagate taint. 
- // See the details here: https://github.com/llvm/llvm-project/pull/66086 - - {{CDM::CLibrary, {"strtol"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - {{CDM::CLibrary, {"strtoll"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - {{CDM::CLibrary, {"strtoul"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - {{CDM::CLibrary, {"strtoull"}}, TR::Prop({{0}}, {{1, ReturnValueIndex}})}, - - {{CDM::CLibrary, {"tolower"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"toupper"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - {{CDM::CLibrary, {"isalnum"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isalpha"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isascii"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isblank"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"iscntrl"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isdigit"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isgraph"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"islower"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isprint"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"ispunct"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isspace"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isupper"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - {{CDM::CLibrary, {"isxdigit"}}, TR::Prop({{0}}, {{ReturnValueIndex}})}, - - {{CDM::CLibraryMaybeHardened, {"strcpy"}}, - TR::Prop({{1}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"stpcpy"}}, - TR::Prop({{1}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strcat"}}, - TR::Prop({{0, 1}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"wcsncat"}}, - TR::Prop({{0, 1}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strncpy"}}, - TR::Prop({{1, 2}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strncat"}}, - 
TR::Prop({{0, 1, 2}}, {{0, ReturnValueIndex}})}, - {{CDM::CLibraryMaybeHardened, {"strlcpy"}}, TR::Prop({{1, 2}}, {{0}})}, - {{CDM::CLibraryMaybeHardened, {"strlcat"}}, TR::Prop({{0, 1, 2}}, {{0}})}, - - // Usually the matching mode `CDM::CLibraryMaybeHardened` is sufficient - // for unified handling of a function `FOO()` and its hardened variant - // `__FOO_chk()`, but in the "sprintf" family the extra parameters of the - // hardened variants are inserted into the middle of the parameter list, - // so that would not work in their case. - // int snprintf(char * str, size_t maxlen, const char * format, ...); - {{CDM::CLibrary, {"snprintf"}}, - TR::Prop({{1, 2}, 3}, {{0, ReturnValueIndex}})}, - // int sprintf(char * str, const char * format, ...); - {{CDM::CLibrary, {"sprintf"}}, - TR::Prop({{1}, 2}, {{0, ReturnValueIndex}})}, - // int __snprintf_chk(char * str, size_t maxlen, int flag, size_t strlen, - // const char * format, ...); - {{CDM::CLibrary, {"__snprintf_chk"}}, - TR::Prop({{1, 4}, 5}, {{0, ReturnValueIndex}})}, - // int __sprintf_chk(char * str, int flag, size_t strlen, const char * - // format, ...); - {{CDM::CLibrary, {"__sprintf_chk"}}, - TR::Prop({{3}, 4}, {{0, ReturnValueIndex}})}, - - // Sinks - {{CDM::CLibrary, {"system"}}, TR::Sink({{0}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"popen"}}, TR::Sink({{0}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execl"}}, TR::Sink({{}, {0}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execle"}}, TR::Sink({{}, {0}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execlp"}}, TR::Sink({{}, {0}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execv"}}, TR::Sink({{0, 1}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execve"}}, - TR::Sink({{0, 1, 2}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"fexecve"}}, - TR::Sink({{0, 1, 2}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execvp"}}, TR::Sink({{0, 1}}, MsgSanitizeSystemArgs)}, - {{CDM::CLibrary, {"execvpe"}}, - TR::Sink({{0, 1, 2}}, MsgSanitizeSystemArgs)}, 
- {{CDM::CLibrary, {"dlopen"}}, TR::Sink({{0}}, MsgSanitizeSystemArgs)}, - - // malloc, calloc, alloca, realloc, memccpy - // are intentionally not marked as taint sinks because unconditional - // reporting for these functions generates many false positives. - // These taint sinks should be implemented in other checkers with more - // sophisticated sanitation heuristics. - - {{CDM::CLibrary, {"setproctitle"}}, - TR::Sink({{0}, 1}, MsgUncontrolledFormatString)}, - {{CDM::CLibrary, {"setproctitle_fast"}}, - TR::Sink({{0}, 1}, MsgUncontrolledFormatString)}}; + RulesConstructionTy GlobalCRules; + GlobalCRules.reserve(std::size(TaintRuleDescriptors) + 2); + + auto MakeArgSet = [](const ArgSetDescriptor &Desc) { + ArgVecTy Args; + Args.append(Desc.Args, Desc.Args + Desc.Count); + std::optional<ArgIdxTy> VariadicIndex; + if (Desc.VariadicIndex != -2) + VariadicIndex = Desc.VariadicIndex; + return ArgSet(std::move(Args), VariadicIndex); + }; + + for (const TaintRuleDescriptor &Desc : TaintRuleDescriptors) { + StringRef Name(reinterpret_cast<const char *>(&TaintRuleNames) + + Desc.NameOffset, + Desc.NameLength); + CallDescription Call(static_cast<CDM>(Desc.Mode), {Name}); + ArgSet First = MakeArgSet(Desc.First); + ArgSet Second = MakeArgSet(Desc.Second); + GenericTaintRule Rule = [&]() { + switch (Desc.Kind) { + case TaintRuleKind::Source: + return TR::Source(std::move(First)); + case TaintRuleKind::Prop: + return TR::Prop(std::move(First), std::move(Second)); + case TaintRuleKind::Sink: { + std::optional<StringRef> Message; + switch (Desc.Message) { + case TaintRuleMessage::None: + break; + case TaintRuleMessage::SanitizeSystemArgs: + Message = MsgSanitizeSystemArgs; + break; + case TaintRuleMessage::UncontrolledFormatString: + Message = MsgUncontrolledFormatString; + break; + } + return TR::Sink(std::move(First), Message); + } + } + llvm_unreachable("unknown taint rule kind"); + }(); + GlobalCRules.emplace_back(std::move(Call), std::move(Rule)); + } if 
(TR::UntrustedEnv(C)) { // void setproctitle_init(int argc, char *argv[], char *envp[]) _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
