llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-libcxx Author: Akira Hatanaka (ahatanak) <details> <summary>Changes</summary> std::__find routes a search to the libc wmemchr (via __constexpr_wmemchr) whenever the element type has the same size and alignment as wchar_t. That is wrong under -fshort-wchar on a platform whose native wchar_t is 4 bytes: wchar_t shrinks to 2 bytes, so _Tp=char16_t satisfies that condition, and the search is routed to wmemchr, which still reads 4-byte elements. Only take the wmemchr path when wchar_t is still its native type, i.e., unmodified by -fshort-wchar. The check uses the new __native_wchar_t alias in <cwchar> (from __WCHAR_NATIVE_TYPE__, falling back to wchar_t on older compilers). Normal builds keep the wmemchr fast path unchanged. Fixes https://github.com/llvm/llvm-project/issues/195149 rdar://175090927 --- Full diff: https://github.com/llvm/llvm-project/pull/203621.diff 10 Files Affected: - (modified) clang/docs/LanguageExtensions.rst (+8) - (modified) clang/docs/ReleaseNotes.rst (+7) - (modified) clang/include/clang/Basic/TargetInfo.h (+12) - (modified) clang/lib/Basic/TargetInfo.cpp (+8) - (modified) clang/lib/Frontend/InitPreprocessor.cpp (+1) - (modified) clang/test/Preprocessor/init-aarch64.c (+1) - (modified) clang/test/Preprocessor/init.c (+1) - (modified) libcxx/include/__algorithm/find.h (+7-1) - (modified) libcxx/include/cwchar (+9) - (added) libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp (+42) ``````````diff diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index fbb9947f39d3e..c0beb6ddecd02 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -393,6 +393,14 @@ Builtin Macros Defined to an integral value that is the include depth of the file currently being translated. For the main file, this value is zero. +``__WCHAR_NATIVE_TYPE__`` + clang-specific extension defined to the platform's native type for + ``wchar_t``, i.e., the type ``wchar_t`` would have without ``-fshort-wchar``. + This matches ``__WCHAR_TYPE__`` unless ``-fshort-wchar`` is in effect. This + lets code detect when ``wchar_t`` is different from its native type, + e.g., to decide whether dispatching to a ``wchar_t``-based runtime function + such as ``wmemchr`` is safe. + ``__TIMESTAMP__`` Defined to the date and time of the last modification of the current source file. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index cf4826f50e5a5..12d09cb361825 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -312,6 +312,13 @@ Non-comprehensive list of changes in this release - ``typeid`` on references and pointers of ``final`` types no longer emits a vtable lookup at runtime. +- Added a new predefined macro ``__WCHAR_NATIVE_TYPE__``, expanding to the + platform's native type for ``wchar_t`` (the type ``wchar_t`` would have + without ``-fshort-wchar``). It matches ``__WCHAR_TYPE__`` unless + ``-fshort-wchar`` is in effect, letting code detect when ``wchar_t`` is + different from its native type. + + - Updated support for Unicode from 15.1 to 18.0. New Compiler Flags diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index cc226403877e2..8615da60803f7 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -163,6 +163,11 @@ struct TransferrableTargetInfo { Char16Type, Char32Type, Int64Type, Int16Type, SigAtomicType, ProcessIDType; + /// The platform's native type for wchar_t, i.e., the type wchar_t would have + /// without -fshort-wchar. This matches WCharType unless -fshort-wchar is in + /// effect. + IntType WideCharNativeType; + /// Whether Objective-C's built-in boolean type should be signed char. /// /// Otherwise, when this flag is not set, the normal built-in boolean type is @@ -417,6 +422,13 @@ class TargetInfo : public TransferrableTargetInfo, return getCorrespondingUnsignedType(IntPtrType); } IntType getWCharType() const { return WCharType; } + + /// Return the platform's native type for wchar_t, i.e., the type wchar_t + /// would have without -fshort-wchar. + IntType getWideCharNativeType() const { + return WideCharNativeType == NoInt ? WCharType : WideCharNativeType; + } + IntType getWIntType() const { return WIntType; } IntType getChar16Type() const { return Char16Type; } IntType getChar32Type() const { return Char32Type; } diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 854d23cadaea2..62fb6c8175484 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -137,6 +137,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { IntMaxType = SignedLongLong; IntPtrType = SignedLong; WCharType = SignedInt; + WideCharNativeType = NoInt; WIntType = SignedInt; Char16Type = UnsignedShort; Char32Type = UnsignedInt; @@ -423,6 +424,13 @@ void TargetInfo::adjust(DiagnosticsEngine &Diags, LangOptions &Opts, if (Opts.NoBitFieldTypeAlign) UseBitFieldTypeAlignment = false; + // Capture the platform-native wchar_t before -fshort-wchar can override + // WCharType below. adjust() may run more than once on the same target, so + // only record it the first time, while WCharType still holds the target + // default. + if (WideCharNativeType == NoInt) + WideCharNativeType = WCharType; + switch (Opts.WCharSize) { default: llvm_unreachable("invalid wchar_t width"); case 0: break; diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 3f0468a938149..f516c5159dba7 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1168,6 +1168,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI, DefineType("__SIZE_TYPE__", TI.getSizeType(), Builder); DefineFmt(LangOpts, "__SIZE", TI.getSizeType(), TI, Builder); DefineType("__WCHAR_TYPE__", TI.getWCharType(), Builder); + DefineType("__WCHAR_NATIVE_TYPE__", TI.getWideCharNativeType(), Builder); DefineType("__WINT_TYPE__", TI.getWIntType(), Builder); DefineTypeSizeAndWidth("__SIG_ATOMIC", TI.getSigAtomicType(), TI, Builder); if (LangOpts.C23) diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 09e3fc926a309..3ec78a7651480 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -393,6 +393,7 @@ // AARCH64-NEXT: #define __USER_LABEL_PREFIX__ // AARCH64-NEXT: #define __VERSION__ "{{.*}}" // AARCH64-NEXT: #define __WCHAR_MAX__ 4294967295U +// AARCH64-NEXT: #define __WCHAR_NATIVE_TYPE__ unsigned int // AARCH64-NEXT: #define __WCHAR_TYPE__ unsigned int // AARCH64-NEXT: #define __WCHAR_UNSIGNED__ 1 // AARCH64-NEXT: #define __WCHAR_WIDTH__ 32 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 80b7a6399e5f4..cc67db4fa068e 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -2076,6 +2076,7 @@ // WEBASSEMBLY-NEXT:#define __USER_LABEL_PREFIX__ // WEBASSEMBLY-NEXT:#define __VERSION__ "{{.*}}" // WEBASSEMBLY-NEXT:#define __WCHAR_MAX__ 2147483647 +// WEBASSEMBLY-NEXT:#define __WCHAR_NATIVE_TYPE__ int // WEBASSEMBLY-NEXT:#define __WCHAR_TYPE__ int // WEBASSEMBLY-NOT:#define __WCHAR_UNSIGNED__ // WEBASSEMBLY-NEXT:#define __WCHAR_WIDTH__ 32 diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h index f677fb2c7392d..66657a9056537 100644 --- a/libcxx/include/__algorithm/find.h +++ b/libcxx/include/__algorithm/find.h @@ -127,7 +127,13 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __find(_Tp* __first, _T return __last; } # if _LIBCPP_HAS_WIDE_CHARACTERS - else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t)) { + // __builtin_wmemchr lowers to a libc call that walks native-sized wchar_t + // elements. Only take this path when wchar_t still has its platform-native + // size and alignment. Otherwise (e.g., under -fshort-wchar) fall through to the + // vectorized integral path, which honors the current wchar_t size. + else if constexpr (sizeof(_Tp) == sizeof(wchar_t) && _LIBCPP_ALIGNOF(_Tp) >= _LIBCPP_ALIGNOF(wchar_t) && + sizeof(wchar_t) == sizeof(__native_wchar_t) && _LIBCPP_ALIGNOF(wchar_t) == + _LIBCPP_ALIGNOF(__native_wchar_t)) { if (auto __ret = std::__constexpr_wmemchr(__first, __value, __last - __first)) return __ret; return __last; diff --git a/libcxx/include/cwchar b/libcxx/include/cwchar index e2534977a7a3c..cc7e7fd8128bd 100644 --- a/libcxx/include/cwchar +++ b/libcxx/include/cwchar @@ -197,6 +197,15 @@ using ::putwchar _LIBCPP_USING_IF_EXISTS; using ::vwprintf _LIBCPP_USING_IF_EXISTS; using ::wprintf _LIBCPP_USING_IF_EXISTS; +// Names the platform-native wchar_t (the type wchar_t would have without +// -fshort-wchar). Falls back to wchar_t on compilers that predate +// __WCHAR_NATIVE_TYPE__ (Clang < 23), preserving prior behavior. +# ifdef __WCHAR_NATIVE_TYPE__ +using __native_wchar_t = __WCHAR_NATIVE_TYPE__; +# else +using __native_wchar_t = wchar_t; +# endif + inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 size_t __constexpr_wcslen(const wchar_t* __str) { # if __has_builtin(__builtin_wcslen) return __builtin_wcslen(__str); diff --git a/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp new file mode 100644 index 0000000000000..a261f7fd30c9c --- /dev/null +++ b/libcxx/test/libcxx/strings/basic.string/string.ops/string_find/short_wchar.pass.cpp @@ -0,0 +1,42 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Regression test for llvm/llvm-project#195149: u16string::find used to +// dispatch through __builtin_wmemchr when sizeof(char16_t) == sizeof(wchar_t). +// Under -fshort-wchar on a platform whose native wchar_t is 4 bytes +// (e.g., Linux/Darwin), the libc wmemchr keeps walking 4-byte elements, so the +// search returned wrong results. __find now gates the wmemchr fast path on the +// platform-native wchar_t size (via __WCHAR_NATIVE_TYPE__) so the runtime +// libcall is taken only when it is binary-compatible with what wmemchr expects. +// +// Only meaningful where the platform-native wchar_t differs from 2 bytes; on +// Windows (native 2-byte wchar_t) the optimization is always safe. + +// ADDITIONAL_COMPILE_FLAGS: -fshort-wchar + +#include <cassert> +#include <string> + +#include "test_macros.h" + +TEST_CONSTEXPR_CXX20 bool test() { + std::u16string s = u"hello"; + std::u16string t = u"goodbye"; + assert(s.find(u'o') == 4); + assert(t.find(u'b') == 4); + assert(s.find(u'z') == std::u16string::npos); + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 20 + static_assert(test()); +#endif + return 0; +} `````````` </details> https://github.com/llvm/llvm-project/pull/203621 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
