// Reproducer:
//   % clang++-16 -std=c++17 -Wfatal-errors -Wall -Wextra -Werror -O1 -o fails '-DHWY_DISABLED_TARGETS=(HWY_NEON|HWY_SVE|HWY_SVE2|HWY_SVE_256|HWY_SVE2_128)' math_test4.cc -lhwy -lhwy_contrib -lhwy_test
//   % valgrind ./fails
// Copyright 2020 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_

// Memory allocator with support for alignment and offsets.

#include <memory>
#include <utility>

#include "hwy/base.h"

namespace hwy {

// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
// requires a literal. This matches typical L1 cache line sizes, which prevents
// false sharing.
#define HWY_ALIGNMENT 64

// Pointers to functions equivalent to malloc/free with an opaque void* passed
// to them as the first argument.
using AllocPtr = void* (*)(void* opaque, size_t bytes);
using FreePtr = void (*)(void* opaque, void* memory);

// Returns null or a pointer to at least `payload_size` (which can be zero)
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc_ptr` with the passed `opaque_ptr` pointer to
// obtain memory, or malloc() if `alloc_ptr` is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr);

// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr);

// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the
// std::default_delete but for aligned objects. For a similar deleter equivalent
// to free() for aligned memory see AlignedFreer().
class AlignedDeleter { public: AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {} AlignedDeleter(FreePtr free_ptr, void* opaque_ptr) : free_(free_ptr), opaque_ptr_(opaque_ptr) {} template <typename T> void operator()(T* aligned_pointer) const { return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_, TypedArrayDeleter<T>); } private: template <typename T> static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) { size_t elems = size_in_bytes / sizeof(T); for (size_t i = 0; i < elems; i++) { // Explicitly call the destructor on each element. (static_cast<T*>(ptr) + i)->~T(); } } // Function prototype that calls the destructor for each element in a typed // array. TypeArrayDeleter<T> would match this prototype. using ArrayDeleter = void (*)(void* t_ptr, size_t t_size); HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr, void* opaque_ptr, ArrayDeleter deleter); FreePtr free_; void* opaque_ptr_; }; // Unique pointer to T with custom aligned deleter. This can be a single // element U or an array of element if T is a U[]. The custom aligned deleter // will call the destructor on U or each element of a U[] in the array case. template <typename T> using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>; // Aligned memory equivalent of make_unique<T> using the custom allocators // alloc/free with the passed `opaque` pointer. This function calls the // constructor with the passed Args... and calls the destructor of the object // when the AlignedUniquePtr is destroyed. template <typename T, typename... Args> AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque)); return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...), AlignedDeleter(free, opaque)); } // Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free // functions. template <typename T, typename... 
Args> AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) { T* ptr = static_cast<T*>(AllocateAlignedBytes( sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr)); return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...), AlignedDeleter()); } // Helpers for array allocators (avoids overflow) namespace detail { // Returns x such that 1u << x == n (if n is a power of two). static inline constexpr size_t ShiftCount(size_t n) { return (n <= 1) ? 0 : 1 + ShiftCount(n / 2); } template <typename T> T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) { constexpr size_t size = sizeof(T); constexpr bool is_pow2 = (size & (size - 1)) == 0; constexpr size_t bits = ShiftCount(size); static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); const size_t bytes = is_pow2 ? items << bits : items * size; const size_t check = is_pow2 ? bytes >> bits : bytes / size; if (check != items) { return nullptr; // overflowed } return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr)); } } // namespace detail // Aligned memory equivalent of make_unique<T[]> for array types using the // custom allocators alloc/free. This function calls the constructor with the // passed Args... on every created item. The destructor of each element will be // called when the AlignedUniquePtr is destroyed. template <typename T, typename... Args> AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc( size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) { T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque); if (ptr != nullptr) { for (size_t i = 0; i < items; i++) { new (ptr + i) T(std::forward<Args>(args)...); } } return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque)); } template <typename T, typename... Args> AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... 
args) { return MakeUniqueAlignedArrayWithAlloc<T, Args...>( items, nullptr, nullptr, nullptr, std::forward<Args>(args)...); } // Custom deleter for std::unique_ptr equivalent to using free() as a deleter // but for aligned memory. class AlignedFreer { public: // Pass address of this to ctor to skip deleting externally-owned memory. static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {} AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {} AlignedFreer(FreePtr free_ptr, void* opaque_ptr) : free_(free_ptr), opaque_ptr_(opaque_ptr) {} template <typename T> void operator()(T* aligned_pointer) const { // TODO(deymo): assert that we are using a POD type T. FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_); } private: FreePtr free_; void* opaque_ptr_; }; // Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD // data use AlignedUniquePtr. template <typename T> using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>; // Allocate an aligned and uninitialized array of POD values as a unique_ptr. // Upon destruction of the unique_ptr the aligned array will be freed. template <typename T> AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc, FreePtr free, void* opaque) { return AlignedFreeUniquePtr<T[]>( detail::AllocateAlignedItems<T>(items, alloc, opaque), AlignedFreer(free, opaque)); } // Same as previous AllocateAligned(), using default allocate/free functions. template <typename T> AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) { return AllocateAligned<T>(items, nullptr, nullptr, nullptr); } } // namespace hwy #endif // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_ // Copyright 2020 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Main header required before using vector types.

// IWYU pragma: begin_exports
#include "hwy/base.h"
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#include "hwy/targets.h"
// IWYU pragma: end_exports

// This include guard is checked by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
// after/outside this include guard.
#ifndef HWY_HIGHWAY_INCLUDED
#define HWY_HIGHWAY_INCLUDED

namespace hwy {

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 7

//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).

// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
// Expands to its third argument. HWY_CHOOSE_FULL appends
// "HWY_FULL2, HWY_FULL1," to the user's arguments, so the third argument is
// HWY_FULL1 when HWY_FULL received one argument and HWY_FULL2 when it
// received two.
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
// Trailing comma avoids -pedantic false alarm
#define HWY_CHOOSE_FULL(...) \
  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
// Selects HWY_FULL1 or HWY_FULL2 by argument count, then invokes the selected
// macro with the original arguments.
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>

//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch

// Evaluates to 0 inside a translation unit if it is generating anything but the
// static target (the last one if multiple targets are enabled). Used to prevent
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
// compile once anyway, so this is 1 unless it is or has been included.
#ifndef HWY_ONCE
#define HWY_ONCE 1
#endif

// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
// Note: the chain below has no #else, so HWY_STATIC_DISPATCH remains undefined
// if HWY_STATIC_TARGET matches none of the listed targets.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC9
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC10
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
#endif

// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif

// One HWY_CHOOSE_<target> macro per target: the address of the function in
// that target's namespace if the target's bit is set in HWY_TARGETS, otherwise
// nullptr.
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
#else
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
#else
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC9
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
#else
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC10
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
#else
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE2
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
#else
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_ZEN4
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_SPR
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
#endif

// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif

// Provides a static member function which is what is called during the first
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
// this function are the first entry in the tables created by HWY_EXPORT.
template <typename RetType, typename... Args>
struct FunctionCache {
 public:
  // Signature of the exported function: RetType(Args...).
  typedef RetType(FunctionType)(Args...);

#if HWY_DISPATCH_WORKAROUND
  // Workaround variant: updates the chosen target from SupportedTargets(),
  // then calls the single function `func` supplied as template argument
  // (HWY_EXPORT passes the HWY_STATIC_DISPATCH function here).
  template <FunctionType* const func>
  static RetType ChooseAndCall(Args... args) {
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update(SupportedTargets());
    return (*func)(args...);
  }
#else
  // A template function that when instantiated has the same signature as the
  // function being called. This function initializes the bit array of targets
  // supported by the current CPU and then calls the appropriate entry within
  // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
  // exported functions, even those defined by different translation units,
  // will dispatch directly to the best available target.
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update(SupportedTargets());
    // GetIndex() is read after Update(), so this indexes the table with the
    // freshly chosen target.
    return (table[chosen_target.GetIndex()])(args...);
  }
#endif  // HWY_DISPATCH_WORKAROUND
};

// Used to deduce the template parameters RetType and Args from a function.
// The function pointer argument is only used for deduction; a
// default-constructed FunctionCache is returned (HWY_EXPORT uses it inside
// decltype).
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
  return FunctionCache<RetType, Args...>();
}

// Name of the per-function dispatch table: FUNC_NAME concatenated with
// HighwayDispatchTable.
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)

// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
//   #include "hwy/highway.h"
//   HWY_BEFORE_NAMESPACE();
//   namespace skeleton {
//   namespace HWY_NAMESPACE {
//
//   void MyFunction(int a, char b, const char* c) { ... }
//
//   // NOLINTNEXTLINE(google-readability-namespace-comments)
//   }  // namespace HWY_NAMESPACE
//   }  // namespace skeleton
//   HWY_AFTER_NAMESPACE();
//
//   namespace skeleton {
//   HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
//
//   void MyFunction(int a, char b, const char* c) {
//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
//   }
//   }  // namespace skeleton
//

// (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 holds when at most one bit of
// HWY_TARGETS is set, i.e. only a single target is being compiled.
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME)                                             \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
      HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
// With a single target, "dynamic" dispatch resolves directly to the static
// target; the table above is not consulted.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)

#else

// Simplified version for MSVC 2017: function pointer instead of table.
#if HWY_DISPATCH_WORKAROUND

#define HWY_EXPORT(FUNC_NAME)                                                \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
      /* The first entry in the table initializes the global cache and       \
       * calls the function from HWY_STATIC_TARGET. */                       \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,      \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
  }

#else

// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME)                                                \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
      /* The first entry in the table initializes the global cache and       \
       * calls the appropriate function. */                                  \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,        \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
  }

#endif  // HWY_DISPATCH_WORKAROUND

// Calls (or, for HWY_DYNAMIC_POINTER, yields) the table entry selected by the
// current ChosenTarget index.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
  (HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64

}  // namespace hwy

#endif  // HWY_HIGHWAY_INCLUDED

//------------------------------------------------------------------------------

// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
// to include them once per target, which is ensured by the toggle check.
// Because ops/*.h are included under it, they do not need their own guard.
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
// Flip HWY_HIGHWAY_PER_TARGET so the comparison above is false until
// HWY_TARGET_TOGGLE is flipped as well.
#ifdef HWY_HIGHWAY_PER_TARGET
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif

// These define ops inside namespace hwy::HWY_NAMESPACE.
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 #include "hwy/ops/x86_128-inl.h" #elif HWY_TARGET == HWY_AVX2 #include "hwy/ops/x86_256-inl.h" #elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \ HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR #include "hwy/ops/x86_512-inl.h" #elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \ HWY_TARGET == HWY_PPC10 #include "hwy/ops/ppc_vsx-inl.h" #elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES #include "hwy/ops/arm_neon-inl.h" #elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \ HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 #include "hwy/ops/arm_sve-inl.h" #elif HWY_TARGET == HWY_WASM_EMU256 #include "hwy/ops/wasm_256-inl.h" #elif HWY_TARGET == HWY_WASM #include "hwy/ops/wasm_128-inl.h" #elif HWY_TARGET == HWY_RVV #include "hwy/ops/rvv-inl.h" #elif HWY_TARGET == HWY_EMU128 #include "hwy/ops/emu128-inl.h" #elif HWY_TARGET == HWY_SCALAR #include "hwy/ops/scalar-inl.h" #else #pragma message("HWY_TARGET does not match any known target") #endif // HWY_TARGET #include "hwy/ops/generic_ops-inl.h" #endif // HWY_HIGHWAY_PER_TARGET // Copyright 2022 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef HWY_PRINT_H_ #define HWY_PRINT_H_ // Helpers for printing vector lanes. 
#include <stddef.h>
#include <stdio.h>

#include "hwy/base.h"
#include "hwy/highway_export.h"

namespace hwy {

namespace detail {

// For implementing value comparisons etc. as type-erased functions to reduce
// template bloat.
struct TypeInfo {
  size_t sizeof_t;  // sizeof(T)
  bool is_float;    // IsFloat<T>()
  bool is_signed;   // IsSigned<T>()
  bool is_bf16;     // IsSame<T, bfloat16_t>()
};

// Builds the TypeInfo describing T.
template <typename T>
HWY_INLINE TypeInfo MakeTypeInfo() {
  TypeInfo info;
  info.sizeof_t = sizeof(T);
  info.is_float = IsFloat<T>();
  info.is_signed = IsSigned<T>();
  info.is_bf16 = IsSame<T, bfloat16_t>();
  return info;
}

// Type-erased helpers defined out-of-line. The `string100` parameter name and
// the 100-byte buffer in PrintValue below suggest callers must provide at
// least 100 bytes — TODO confirm against the implementation.
HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
                            char* string100);

HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
                              const void* array_void, size_t N,
                              size_t lane_u = 0, size_t max_lanes = 7);

}  // namespace detail

// Formats `value` via detail::ToString and writes it, followed by a comma, to
// stderr.
template <typename T>
HWY_NOINLINE void PrintValue(T value) {
  char str[100];
  detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
  fprintf(stderr, "%s,", str);
}

// Forwards to detail::PrintArray with an empty caption, lane_u = 0 and
// max_lanes = count.
template <typename T>
HWY_NOINLINE void PrintArray(const T* value, size_t count) {
  detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
                     count);
}

}  // namespace hwy

#endif  // HWY_PRINT_H_
// NOTE(review): everything below appears to be a machine-reduced test case
// (see the reproducer command at the top of the file). Presumably the odd
// constructs (unused variables, `#if 1`, constants that do not match their
// names) are what is left after reduction and should not be "cleaned up"
// without re-checking that the issue still reproduces.
namespace hwy {
namespace N_NEON_WITHOUT_AES {
// Reduced Atan2: not a complete arctangent. Returns CopySign(t, y), where t is
// `if_yinf` in lanes where y is infinite and zero in all other lanes.
template <class D, class V, class M = MFromD<D>>
V Atan2(D d, V y, V x) {
  // Note: kPi2 is initialized from kHalf (0.5), not from pi/2.
  V kHalf = Set(d, 0.5), kPi = Set(d, 3.14159265358979323846264), kPi2 = kHalf,
    k0 = Zero(d);
  // y_0 and nan are declared but never assigned or read.
  M y_0, x_neg = Lt(x, k0), y_inf = IsInf(y), x_inf = IsInf(x), nan;
  V if_xneg_pi = IfThenElseZero(x_neg, kPi);
  V if_yinf = IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi);
#if 1
  V t0 = Zero(d);
  V t = IfThenElse(y_inf, if_yinf, t0);
  (void)t;
#else
  // Disabled variant: would read `t` in its own initializer.
  V t = IfThenElse(y_inf, if_yinf, t);
#endif
  return CopySign(t, y);
}
}  // namespace N_NEON_WITHOUT_AES
}  // namespace hwy
namespace hwy {
namespace detail {
// Declared here and defined elsewhere (the reproducer links -lhwy_test).
bool IsEqual(const TypeInfo &, const void *, const void *);
void PrintMismatchAndAbort(const TypeInfo &, const void *, const void *,
                           const char *, const char *, int, size_t,
                           size_t = 1);
}  // namespace detail
// Single zero-initialized char whose address is passed to
// PrintMismatchAndAbort below (presumably as the target-name string).
char AssertEqual_target_name;
// Compares `expected` and `actual` through the type-erased IsEqual and calls
// PrintMismatchAndAbort on mismatch.
template <typename T>
void AssertEqual(T expected, T actual, const char *filename, int line) {
  auto info = detail::MakeTypeInfo<T>();
  if (!IsEqual(info, &expected, &actual))
    PrintMismatchAndAbort(info, &expected, &actual, &AssertEqual_target_name,
                          filename, line, 0);
}
namespace N_NEON_WITHOUT_AES {
// Stores the bits of masks `a` and `b` into two byte buffers and asserts that
// StoreMaskBits reported the same byte count for both. The stored bits
// themselves are not compared in this reduced version.
template <class D>
void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
                     const char *filename, int line) {
  size_t N = Lanes(d);
  Rebind<uint8_t, D> d8;
  size_t N8 = Lanes(d8);
  // HWY_MAX(...) supplies the parenthesized call argument here, i.e. this
  // allocates max(size_t{}, N8) bytes.
  auto bits_a = AllocateAligned<uint8_t> HWY_MAX(size_t{}, N8);
  auto bits_b = AllocateAligned<uint8_t>(N8);
  size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
  size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
  AssertEqual(num_bytes_a, num_bytes_b, filename, line);
  size_t remainder = N;
  if (remainder) {
  }
}
#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
// Test functor: obtains the inputs from Atan2TestCases, evaluates Atan2 one
// vector at a time and checks NaN masks and a range condition.
struct TestAtan2 {
  template <typename T, class D>
  void operator()(T t, D d) {
    size_t N = Lanes(d);
    size_t padded;
    AlignedFreeUniquePtr<T[]> in_y, in_x, expected;
    Atan2TestCases(t, d, padded, in_y, in_x, expected);
    Vec<D> tolerance = Set(d, T());  // T() is zero
    // NOTE(review): this loop loads all `padded` elements, but
    // Atan2TestCases below only writes element 0 of each array — confirm
    // whether reading the remaining (uninitialized) elements is intended.
    for (size_t i = 0; i < padded; i += N) {
      Vec<D> y = Load(d, &in_y[i]);
      Vec<D> x = Load(d, &in_x[i]);
      Vec<D> actual = Atan2(d, y, x);
      Vec<D> vexpected = Load(d, &expected[i]);
      Mask<D> exp_nan = IsNaN(vexpected);
      Mask<D> act_nan = IsNaN(actual);
      HWY_ASSERT_MASK_EQ(d, exp_nan, act_nan);
      Mask<D> ge = Ge(actual, Sub(vexpected, tolerance));
      // The upper bound compares against `tolerance` itself, not
      // vexpected + tolerance.
      Mask<D> le = Le(actual, tolerance);
      Mask<D> ok = And(le, ge);
      if (!AllTrue(d, ok)) HWY_ASSERT(0);
    }
  }
};
// Runs TestAtan2 with CappedTag<T, kMinArg>, then recurses with kMul / 2.
// The recursive instantiation passes kPow2 in the kMinArg position and leaves
// kPow2 at its default; recursion ends at the kMul == 0 specialization below.
template <typename T, size_t kMul, size_t kMinArg, int kPow2 = 0>
struct ForeachCappedR {
  static void Do(size_t min_lanes, size_t max_lanes) {
    CappedTag<T, kMinArg> d;
    TestAtan2()(double(), d);
    ForeachCappedR<double, kMul / 2, kPow2>::Do(min_lanes, max_lanes);
  }
};
// Base case (kMul == 0): does nothing.
template <typename T, size_t kMinArg, int kPow2>
struct ForeachCappedR<T, 0, kMinArg, kPow2> {
  static void Do(size_t, size_t) {}
};
// Entry functor: ignores `t` and starts ForeachCappedR<double, 1, 1>.
struct ForPartialVectors {
  void operator()(double t) {
    (void)t;
    ForeachCappedR<double, 1, 1>::Do(1, 1);
  }
};
// Allocates the three arrays (each `padded` elements, uninitialized per
// AllocateAligned) and writes only element 0 of each from test_cases[0].
template <typename T, class D>
void Atan2TestCases(T, D d, size_t &padded, AlignedFreeUniquePtr<T[]> &out_y,
                    AlignedFreeUniquePtr<T[]> &out_x,
                    AlignedFreeUniquePtr<T[]> &out_expected) {
  struct YX {
    T y;
    T x;
    T expected;
  };
  // Note: `nan` is initialized to 0 here, not to a NaN value.
  T pos(1E5), neg(1E7), n0(0.0), inf = GetLane(Inf(d)), nan = 0,
                                 pi(3.141592653589793238);
  YX test_cases[]{{-inf, inf, -pi / 4}, {neg, inf, n0}, {pos, nan, nan}};
  // sizeof(0) is sizeof(int), not the number of entries in test_cases.
  size_t kNumTestCases = sizeof(0);
  size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, N);
  out_y = AllocateAligned<T>(padded);
  out_x = AllocateAligned<T>(padded);
  out_expected = AllocateAligned<T>(padded);
  // Only index 0 is populated; elements 1..padded-1 stay uninitialized.
  size_t i = 0;
  out_y[i] = test_cases[i].y;
  out_x[i] = test_cases[i].x;
  out_expected[i] = test_cases[i].expected;
}
// Runs the reduced Atan2 test via ForPartialVectors.
void TestAllAtan2() {
  ForPartialVectors func;
  func(double());
}
}  // namespace N_NEON_WITHOUT_AES
}  // namespace hwy
int main() { hwy::N_NEON_WITHOUT_AES::TestAllAtan2(); }